--------------------------------------------------------------------------------
/doc/source/nflwin.rst:
--------------------------------------------------------------------------------
1 | nflwin package
2 | ==============
3 |
4 | Submodules
5 | ----------
6 |
7 | nflwin.model module
8 | -------------------
9 |
10 | .. automodule:: nflwin.model
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | nflwin.preprocessing module
16 | ---------------------------
17 |
18 | .. automodule:: nflwin.preprocessing
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | nflwin.utilities module
24 | -----------------------
25 |
26 | .. automodule:: nflwin.utilities
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 |
32 | Module contents
33 | ---------------
34 |
35 | .. automodule:: nflwin
36 | :members:
37 | :undoc-members:
38 | :show-inheritance:
39 |
--------------------------------------------------------------------------------
/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 |
4 | mflag=" -m"
5 | mflagval=" not requires_db"
6 | python_version=`python -c "import sys; print(sys.version_info.major)"`
7 | pytest_command="py.test"
8 | if [ $python_version == "3" ]; then
9 | pytest_command="pytest"
10 | fi
11 |
12 | while getopts ":d" opt; do
13 | case $opt in
14 | d)
15 | echo "Running all tests..."
16 | mflagval=''
17 | mflag=''
18 | ;;
19 | \?)
20 | echo ""
21 | echo ""
22 | echo "Invalid option: -$OPTARG"
23 | echo "Usage:"
24 | echo "-----------------"
25 | echo "-d: run tests which require nfldb database access"
26 | echo ""
27 | ;;
28 | esac
29 | done
30 |
31 | python -m ${pytest_command}${mflag}"${mflagval}" --cov=nflwin --cov-config .coveragerc --cov-report term-missing nflwin/tests/
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Andrew Schechtman-Rook
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | NFLWin
2 | ===============
3 |
4 | |Build Status| |Doc Status|
5 |
6 |
7 | Estimate Win Probability (WP) for plays in NFL games:
8 |
9 | .. code-block:: python
10 |
11 | >>> import pandas as pd
12 | >>> from nflwin.model import WPModel
13 | >>> standard_model = WPModel.load_model()
14 | >>> plays = pd.DataFrame({
15 | ... "quarter": ["Q1", "Q2", "Q4"],
16 | ... "seconds_elapsed": [0, 0, 600],
17 | ... "offense_team": ["NYJ", "NYJ", "NE"],
18 | ... "yardline": [-20, 20, 35],
19 | ... "down": [1, 3, 3],
20 | ... "yards_to_go": [10, 2, 10],
21 | ... "home_team": ["NYJ", "NYJ", "NYJ"],
22 | ... "away_team": ["NE", "NE", "NE"],
23 | ... "curr_home_score": [0, 0, 21],
24 | ... "curr_away_score": [0, 0, 10]
25 | ... })
26 | >>> standard_model.predict_wp(plays)
27 | array([ 0.58300397, 0.64321796, 0.18195466])
28 |
29 | For full documentation, including information about methods and accuracy, click `here <http://nflwin.readthedocs.io/>`_.
30 |
31 | License
32 | ---------------
33 | MIT. See `license file <https://github.com/AndrewRook/NFLWin/blob/master/LICENSE>`_.
34 |
35 | .. |Build Status| image:: https://travis-ci.org/AndrewRook/NFLWin.svg?branch=master
36 | :target: https://travis-ci.org/AndrewRook/NFLWin
37 | :alt: Build Status
38 | .. |Doc Status| image:: https://readthedocs.org/projects/nflwin/badge/?version=latest
39 | :target: http://nflwin.readthedocs.io/en/latest/?badge=latest
40 | :alt: Documentation Status
41 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | #From http://conda.pydata.org/docs/travis.html
2 | language: python
3 | python:
4 | # We don't actually use the Travis Python, but this keeps it organized.
5 | - "2.7"
6 | install:
7 | - sudo apt-get update
8 | # We do this conditionally because it saves us some downloading if the
9 | # version is the same.
10 | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
11 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh;
12 | else
13 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
14 | fi
15 | - bash miniconda.sh -b -p $HOME/miniconda
16 | - export PATH="$HOME/miniconda/bin:$PATH"
17 | - hash -r
18 | - conda config --set always_yes yes --set changeps1 no
19 | - conda update -q conda
20 | # Useful for debugging any issues with conda
21 | - conda info -a
22 |
23 | - conda create -n nflwin_py2 -y -c conda-forge python=2 numpy scipy pandas pytest pytest-cov scikit-learn=0.19
24 | - source activate nflwin_py2
25 | #- pip install nfldb==0.2.15 nflgame==1.2.19
26 | - python setup.py install
27 |
28 | - source deactivate
29 | - conda create -n nflwin_py3 -y -c conda-forge python=3 numpy scipy pandas pytest pytest-cov scikit-learn=0.19
30 | - source activate nflwin_py3
31 | - python setup.py install
32 |
33 | script:
34 | # Your test script goes here
35 | - source activate nflwin_py3
36 | - ./run_tests.sh
37 | - source activate nflwin_py2
38 | - ./run_tests.sh
39 |
--------------------------------------------------------------------------------
/doc/source/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ===============
3 | NFLWin only supports Python 2, as nfldb is currently incompatible
4 | with Python 3. The bulk of NFLWin should work natively with Python 3;
5 | however, that is currently untested. Pull requests ensuring this
6 | compatibility would be welcome.
7 |
8 |
9 | Releases
10 | ----------------------
11 | Stable releases of NFLWin are available on PyPI::
12 |
13 | $ pip install nflwin
14 |
15 | The default install provides exactly the tools necessary to make
16 | predictions using the standard WP model as well as make new
17 | models. However, it does not include the dependencies necessary for
18 | :ref:`using nfldb <nfldb-install>`, producing diagnostic plots, or contributing to the
19 | package.
20 |
21 | Installing NFLWin with those extra dependencies is accomplished by
22 | adding a parameter in square brackets::
23 |
24 | $ pip install nflwin[plotting] #Adds matplotlib for plotting
25 | $ pip install nflwin[nfldb] #Dependencies for using nfldb
26 | $ pip install nflwin[dev] #Everything you need to develop on NFLWin
27 |
28 | .. note::
29 | NFLWin depends on the scipy library, which is notoriously difficult
30 | to install via pip or from source. One option if you're having
31 | difficulty getting scipy installed is to use the `Conda
32 | <http://conda.pydata.org/>`_ package manager. After installing
33 | Conda, you can create a new environment and install dependencies
34 | manually before pip installing NFLWin::
35 |
36 | $ conda create -n nflwin-env python=2.7 numpy scipy scikit-learn pandas
37 |
38 | Bleeding Edge
39 | ---------------------------
40 | If you want the most recent development version, you can install directly
41 | from GitHub::
42 |
43 | $ pip install git+https://github.com/AndrewRook/NFLWin.git@master#egg=nflwin
44 |
45 | You can append the arguments for the extra dependencies in the same
46 | way as for the installation from PyPI.
47 |
48 | .. note::
49 | GitHub installs **do not** come with the default model. If you want
50 | to use a GitHub install with the default model, you'll need to
51 | install NFLWin from PyPI somewhere else and then copy the model
52 | into the model directory from your GitHub install. If you need to
53 | figure out where that directory is, print
54 | ``model.WPModel.model_directory``.
55 |
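
A quick way to find that directory is to print the attribute named in the note above (a minimal sketch; the default model ships as the ``default_model.nflwin*`` files listed in ``setup.py``)::

    >>> from nflwin import model
    >>> print(model.WPModel.model_directory)  # copy the default model files into this directory
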
--------------------------------------------------------------------------------
/make_default_model.py:
--------------------------------------------------------------------------------
1 | """A simple script to create, train, validate, and save the default model"""
2 | from __future__ import division, print_function
3 |
4 | import datetime as dt
5 | import time
6 | import os
7 |
8 | from nflwin import model
9 |
10 | def main():
11 | start = time.time()
12 | win_probability_model = model.WPModel()
13 |
14 | training_seasons = [2009, 2010, 2011, 2012, 2013, 2014]
15 | validation_seasons = [2015]
16 | season_types = ["Regular", "Postseason"]
17 |
18 | win_probability_model.train_model(training_seasons=training_seasons,
19 | training_season_types=season_types)
20 | print("Took {0:.2f}s to build model".format(time.time() - start))
21 |
22 | start = time.time()
23 | max_deviation, residual_area = win_probability_model.validate_model(validation_seasons=validation_seasons,
24 | validation_season_types=season_types)
25 | print("Took {0:.2f}s to validate model, with a max residual of {1:.2f} and a residual area of {2:.2f}"
26 | .format(time.time() - start, max_deviation, residual_area))
27 |
28 | win_probability_model.save_model()
29 |
30 | ax = win_probability_model.plot_validation(label="max deviation={0:.2f}, \n"
31 | "residual total area={1:.2f}"
32 | "".format(max_deviation, residual_area))
33 | curr_datetime = dt.datetime.now()
34 | ax.set_title("Model Generated At: " + curr_datetime.strftime("%Y-%m-%d %H:%M:%S"))
35 | ax.legend(loc="lower right", fontsize=10)
36 | ax.text(0.02, 0.98, ("Data from: {0:s}\n"
37 | "Training season(s): {1:s}\n"
38 | "Validation season(s): {2:s}"
39 | "".format(", ".join(season_types),
40 | ", ".join(str(year) for year in training_seasons),
41 | ", ".join(str(year) for year in validation_seasons))),
42 | ha="left", va="top", fontsize=10, transform=ax.transAxes)
43 |
44 | this_filepath = os.path.dirname(os.path.abspath(__file__))
45 | save_filepath = os.path.join(this_filepath, "doc", "source", "_static", "validation_plot.png")
46 | ax.figure.savefig(save_filepath)
47 |
48 |
49 | if __name__ == "__main__":
50 | main()
51 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import tarfile
4 | import warnings
5 | from setuptools import setup, find_packages
6 | from setuptools.command.install import install as _install
7 |
8 | ###################################################################
9 | #Boilerplate I modified from the internet
10 |
11 | VERSION_FILE = "nflwin/_version.py"
12 | version_string = open(VERSION_FILE, "r").read()
13 | version_re = r"^__version__ = [u]{0,1}['\"]([^'\"]*)['\"]"
14 | version_match = re.search(version_re, version_string, re.M)
15 | if version_match:
16 | VERSION = version_match.group(1)
17 | else:
18 | raise RuntimeError("Unable to find version string in {0}".format(VERSION_FILE))
19 |
20 | NAME = "nflwin"
21 | PACKAGES = find_packages(where=".")
22 | META_PATH = os.path.join(NAME, "__init__.py")
23 | KEYWORDS = ['NFL','WP','Win Probability']
24 | CLASSIFIERS = [
25 | "Development Status :: 4 - Beta",
26 | "Natural Language :: English",
27 | "License :: OSI Approved :: MIT License",
28 | "Operating System :: OS Independent",
29 | "Programming Language :: Python",
30 | "Programming Language :: Python :: 2",
31 | "Programming Language :: Python :: 2.7",
32 | ]
33 | INSTALL_REQUIRES = ['numpy',
34 | 'scipy',
35 | 'pandas',
36 | 'scikit-learn',
37 | 'joblib',]
38 |
39 | EXTRAS_REQUIRE = {
40 | "plotting": ["matplotlib"],
41 | "nfldb": ["nfldb", "sqlalchemy"],
42 | "dev": ["matplotlib", "nfldb", "sqlalchemy", "pytest", "pytest-cov", "sphinx", "numpydoc"]
43 | }
44 |
45 | PACKAGE_DATA = {"nflwin": ["models/default_model.nflwin*"]}
46 |
47 | HERE = os.path.abspath(os.path.dirname(__file__))
48 | README = None
49 | with open(os.path.join(HERE, 'README.rst'),'r') as f:
50 | README = f.read()
51 |
52 | ###################################################################
53 |
54 | if __name__ == "__main__":
55 | setup(
56 | name=NAME,
57 | description='A Python implementation of NFL Win Probability (WP)',
58 | license='MIT',
59 | url='https://github.com/AndrewRook/NFLWin',
60 | version=VERSION,
61 | author='Andrew Schechtman-Rook',
62 | author_email='footballastronomer@gmail.com',
63 | maintainer='Andrew Schechtman-Rook',
64 | maintainer_email='footballastronomer@gmail.com',
65 | keywords=KEYWORDS,
66 | long_description=README,
67 | packages=PACKAGES,
68 | package_data=PACKAGE_DATA,
69 | classifiers=CLASSIFIERS,
70 | install_requires=INSTALL_REQUIRES,
71 | extras_require=EXTRAS_REQUIRE
72 | )
73 |
--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
1 | ==================================
2 | NFLWin
3 | ==================================
4 |
5 | NFLWin is designed from the ground up to provide two things:
6 |
7 | * A simple-to-use interface for users to compute Win Probabilities
8 | (WP) for NFL plays based on a built-in WP model.
9 | * A robust framework for improving estimates of WP.
10 |
11 | NFLWin builds on `scikit-learn's <http://scikit-learn.org/>`_
12 | ``fit``-``transform`` idiom, allowing for pipelines that take in raw
13 | box score data and return estimated WPs - all data
14 | preprocessing takes place behind the scenes. Additionally,
15 | these preprocessing steps can be easily reordered, replaced, and/or
16 | extended, allowing for rapid iteration and prototyping of potential
17 | improvements to the WP model.
18 |
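For a concrete taste of that idiom, the default pipeline can be constructed explicitly (a short sketch mirroring the doctest in the nfldb section of these docs):

.. code-block:: python

    >>> from nflwin.model import WPModel
    >>> model = WPModel()
    >>> model.create_default_pipeline() #doctest: +ELLIPSIS
    Pipeline(...)
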
19 | NFLWin also has built-in support for efficiently querying data from
20 | `nfldb <https://github.com/BurntSushi/nfldb>`_ directly into a format
21 | useable by the built-in WP model, although the model is fully
22 | data-source-agnostic as long as the data is formatted properly for the
23 | model to parse.
24 |
25 | Quickstart
26 | ---------------
27 |
28 | NFLWin is ``pip``-installable::
29 |
30 | $ pip install nflwin
31 |
32 | .. note:: NFLWin depends on `SciPy <https://www.scipy.org/>`_, which
33 | is notoriously difficult to install properly via
34 | ``pip``. You may wish to use the `Conda
35 | <http://conda.pydata.org/>`_ package manager to install
36 | SciPy before installing NFLWin.
37 |
38 | When installed via ``pip``, NFLWin comes with a working Win Probability model out-of-the-box:
39 |
40 | .. code-block:: python
41 |
42 | >>> from nflwin.model import WPModel
43 | >>> standard_model = WPModel.load_model()
44 |
45 | The default model can be inspected to learn what data it requires:
46 |
47 | .. code-block:: python
48 |
49 | >>> standard_model.column_descriptions
50 | {'home_team': 'Abbreviation for the home team', 'yardline': "The yardline, given by (yards from own goalline - 50). -49 is your own 1 while 49 is the opponent's 1.", 'seconds_elapsed': 'Seconds elapsed in the quarter', 'down': 'The current down', 'curr_away_score': 'Abbreviation for the visiting team', 'offense_team': 'Abbreviation for the offensive team', 'yards_to_go': 'Yards to a first down (or the endzone)', 'quarter': 'The quarter'}
51 |
52 |
53 |
54 | NFLWin operates on `Pandas <http://pandas.pydata.org/>`_ DataFrames:
55 |
56 | .. code-block:: python
57 |
58 | >>> import pandas as pd
59 | >>> plays = pd.DataFrame({
60 | ... "quarter": ["Q1", "Q2", "Q4"],
61 | ... "seconds_elapsed": [0, 0, 600],
62 | ... "offense_team": ["NYJ", "NYJ", "NE"],
63 | ... "yardline": [-20, 20, 35],
64 | ... "down": [1, 3, 3],
65 | ... "yards_to_go": [10, 2, 10],
66 | ... "home_team": ["NYJ", "NYJ", "NYJ"],
67 | ... "away_team": ["NE", "NE", "NE"],
68 | ... "curr_home_score": [0, 0, 21],
69 | ... "curr_away_score": [0, 0, 10]
70 | ... })
71 |
72 | Once data is loaded, using the model to predict WP is easy:
73 |
74 | .. code-block:: python
75 |
76 | >>> standard_model.predict_wp(plays)
77 | array([ 0.58300397, 0.64321796, 0.18195466])
78 |
79 | Current Default Model
80 | ---------------------
81 |
82 | .. image:: _static/validation_plot.png
83 |
84 | Why NFLWin?
85 | --------------
86 | Put simply, there are no other options: while WP models have been
87 | widely used in NFL analytics for years, the analytics community has
88 | almost totally dropped the ball in making these models available for the
89 | general public or even explaining their algorithms at all.
90 |
91 | For a (much) longer explanation, see the `PhD Football blog
92 | `_.
93 |
94 |
95 | Resources
96 | ------------
97 |
98 | .. toctree::
99 | :maxdepth: 2
100 | :hidden:
101 | :caption: Links
102 |
103 | installation.rst
104 | model.rst
105 | Using nfldb <nfldb>
106 | Developer Documentation <dev>
107 | Full API Documentation <nflwin>
108 |
109 |
--------------------------------------------------------------------------------
/increment_version.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ########################################################################
4 | # This script intelligently increments NFLWin's version,
5 | # based on the rules of semantic versioning.
6 | # It does the following:
7 | # 1. Parse command line arguments to determine whether to
8 | # increment major, minor, or patch version.
9 | # 2. Makes sure it's not on the master branch.
10 | # 3. Makes sure there aren't any changes that have been
11 | # staged but not committed.
12 | # 4. Makes sure there aren't any changes that have been
13 | # committed but not pushed.
14 | # 5. Makes sure all unit tests pass.
15 | # 6. Compares current version in nflwin/_version.py to most recent
16 | # git tag to make sure they're the same.
17 | # 7. Figures out what the new version should be.
18 | # 8. Updates nflwin/_version.py to the new version.
19 | # 9. Uploads package to PyPI.
20 | #10. Adds and commits nflwin/_version.py with commit message
21 | # "bumped [TYPE] version to [VERSION]", where [TYPE] is major, minor, or patch.
22 | #11. Tags latest commit with version number (no 'v').
23 | #12. Pushes commit and tag.
24 | ########################################################################
25 |
26 | set -e
27 |
28 | #Parse command line arguments:
29 | if [ "$#" -ne 1 ]; then
30 | echo "Syntax: ./increment_version.sh [major|minor|patch]"
31 | exit 1
32 | fi
33 |
34 | VERSION_TYPE=`echo "$1" | tr '[:upper:]' '[:lower:]'`
35 |
36 | if [ "$VERSION_TYPE" != "major" -a "$VERSION_TYPE" != "minor" -a "$VERSION_TYPE" != "patch" ]; then
37 | echo "Version type must be one of 'major', 'minor', or 'patch'"
38 | exit 1
39 | fi
40 |
41 | #Ensure we're not on master:
42 | CURRENT_BRANCH=`git rev-parse --abbrev-ref HEAD`
43 | if [ "$CURRENT_BRANCH" == "master" ]; then
44 | echo "Must not be on master branch"
45 | exit 1
46 | fi
47 |
48 | #Make sure there aren't any staged changes:
49 | STAGED_CHANGES_FLAG=`git status | grep "Changes to be committed" | wc -l`
50 | if [ $STAGED_CHANGES_FLAG -ne 0 ]; then
51 | echo "Must not have any staged changes"
52 | exit 1
53 | fi
54 |
55 | #Make sure there aren't any unpushed changes:
56 | git pull #Do this first to sync things
57 |
58 | UP_TO_DATE_FLAG=`git status | sed -n 2p | grep "Your branch is up-to-date with" | wc -l`
59 | if [ $UP_TO_DATE_FLAG -eq 0 ]; then
60 | echo "Must not have any unpushed changes"
61 | exit 1
62 | fi
63 |
64 | #Make sure all unit tests pass:
65 | ./run_tests.sh #Will return 1 if any tests fail, thus triggering the set -e flag.
66 |
67 | #Get version in nflwin/_version.py
68 | VERSION_PY=`grep "^__version__" nflwin/_version.py | awk '{print $NF}' | tr -d \" | tr -d u`
69 |
70 | #Get version in git:
71 | VERSION_GIT=`git describe --tags $(git rev-list --tags --max-count=1)`
72 |
73 | #Ensure versions are the same:
74 | if [ $VERSION_PY != $VERSION_GIT ]; then
75 | echo "Versions must match! Python version=${VERSION_PY}, git version=${VERSION_GIT}"
76 | exit 1
77 | fi
78 |
79 | #Determines what new version should be:
80 | MAJOR=`echo $VERSION_PY | awk -F"." '{print $1}'`
81 | MINOR=`echo $VERSION_PY | awk -F"." '{print $2}'`
82 | PATCH=`echo $VERSION_PY | awk -F"." '{print $3}'`
83 | if [ $VERSION_TYPE == "patch" ]; then
84 | PATCH=$(expr $PATCH + 1)
85 | elif [ $VERSION_TYPE == "minor" ]; then
86 | MINOR=$(expr $MINOR + 1)
87 | PATCH=0
88 | else
89 | MAJOR=$(expr $MAJOR + 1)
90 | MINOR=0
91 | PATCH=0
92 | fi
93 | NEW_VERSION="$MAJOR.$MINOR.$PATCH"
94 |
95 | #Update nflwin/_version.py:
96 | sed -i.bak "s/${VERSION_PY}/${NEW_VERSION}/" nflwin/_version.py
97 | rm nflwin/_version.py.bak
98 |
99 | #Upload package to PyPI:
100 | python setup.py sdist upload -r pypi
101 |
102 | #Stage and commit nflwin/_version.py
103 | git add nflwin/_version.py
104 | git commit -m "bumped ${VERSION_TYPE} version to ${NEW_VERSION}"
105 |
106 | #Tag the commit:
107 | git tag -a ${NEW_VERSION} -m "bumped ${VERSION_TYPE}"
108 |
109 | #Push the commit and tag:
110 | git push
111 | git push origin ${NEW_VERSION}
112 |
113 | echo "finished!"
114 |
115 | exit 0
116 |
--------------------------------------------------------------------------------
/doc/source/dev.rst:
--------------------------------------------------------------------------------
1 | For Developers
2 | =========================
3 |
4 | This section of the documentation covers things that will be useful for those already contributing to NFLWin.
5 |
6 | .. note::
7 | Unless stated otherwise assume that all filepaths given in this section start at the root directory for the repo.
8 |
9 | Testing Documentation
10 | ------------------------------------------
11 |
12 | Documentation for NFLWin is hosted at `Read the Docs <http://nflwin.readthedocs.io/>`_, and is built automatically when changes are made on the master branch or a release is cut. However, it's often valuable to build and view NFLWin's documentation locally as you write. To do this, run the following::
13 |
14 | $ ./build_local_documentation.sh
15 |
16 | When that command finishes, open up ``doc/index.html`` in your browser of choice to see the site.
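
On a Mac, for example, that can be as simple as::

    $ open doc/index.html
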
17 |
18 | Updating the Default Model
19 | --------------------------------------
20 |
21 | NFLWin comes with a pre-trained model, but if the code generating that model is updated **the model itself is not**. So you have to update it yourself. The good news, however, is that there's a script for that::
22 |
23 | $ python make_default_model.py
24 |
25 | .. note::
26 | This script hardcodes in the seasons to use for training and
27 | testing samples. After each season those will likely need to be
28 | updated to use the most up-to-date data.
29 |
30 | .. note::
31 | This script requires ``matplotlib`` in order to run, as it produces a
32 | validation plot for the documentation.
33 |
34 | Cutting a New Release
35 | ----------------------------------
36 | NFLWin uses `semantic versioning <http://semver.org/>`_, which basically boils down to the following (taken directly from that page):
37 |
38 | Given a version number MAJOR.MINOR.PATCH, increment the:
39 |
40 | 1. MAJOR version when you make incompatible API changes,
41 | 2. MINOR version when you add functionality in a backwards-compatible manner, and
42 | 3. PATCH version when you make backwards-compatible bug fixes.
43 |
44 | Basically, unless you change something drastic you leave the major version alone (the exception being going to version 1.0.0, which indicates the first release where the interface is considered "stable").
45 |
46 | The trick here is to note that information about a new release must live in a few places:
47 |
48 | * In ``nflwin/_version.py`` as the value of the ``__version__`` variable.
49 | * As a tagged commit.
50 | * As a release on GitHub.
51 | * As an upload to PyPI.
52 | * (If necessary) as a documented release on Read the Docs.
53 |
54 | Changing the version in one place but not in others can have relatively minor but fairly annoying consequences. To help manage the release cutting process there is a shell script that automates significant parts of this process::
55 |
56 | $ ./increment_version.sh [major|minor|patch]
57 |
58 | This script does a bunch of things, namely:
59 |
60 | 1. Parse command line arguments to determine whether to
61 | increment major, minor, or patch version.
62 | 2. Makes sure it's not on the master branch.
63 | 3. Makes sure there aren't any changes that have been
64 | staged but not committed.
65 | 4. Makes sure there aren't any changes that have been
66 | committed but not pushed.
67 | 5. Makes sure all unit tests pass.
68 | 6. Compares current version in nflwin/_version.py to most recent
69 | git tag to make sure they're the same.
70 | 7. Figures out what the new version should be.
71 | 8. Updates nflwin/_version.py to the new version.
72 | 9. Uploads package to PyPI.
73 | 10. Adds and commits nflwin/_version.py with commit message
74 | "bumped [TYPE] version to [VERSION]", where [TYPE] is major, minor, or patch.
75 | 11. Tags latest commit with version number (no 'v').
76 | 12. Pushes commit and tag.
77 |
78 | It will exit if **anything** returns a non-zero exit status, and since it waits until the very end to upload anything to PyPI or GitHub, in most cases you can fix any error you run into and simply re-run the script.
79 |
80 | The process for cutting a release is as follows:
81 |
82 | 1. Make double sure that you're on a branch that's not ``master`` and you're ready to cut a new release (general good practice is to branch off from master *just* for the purpose of making a new release).
83 | 2. Run the ``increment_version.sh`` script.
84 | 3. Fix any errors, then rerun the script until it passes.
85 | 4. Make a PR on GitHub into master, and merge it in (self-merge is ok if branch is just updating version).
86 | 5. Make release notes for new release on GitHub.
87 | 6. (If necessary) go to Read the Docs and activate the new release.
88 |
--------------------------------------------------------------------------------
/doc/source/nfldb.rst:
--------------------------------------------------------------------------------
1 | .. _nfldb-install:
2 |
3 | Using Data From nfldb
4 | =======================================
5 |
6 | NFLWin comes with robust support for querying data from `nfldb
7 | <https://github.com/BurntSushi/nfldb>`_, a package designed to
8 | facilitate downloading and accessing play-by-play data. There are
9 | functions to query the nfldb database in :py:mod:`nflwin.utilities`,
10 | and :py:class:`nflwin.model.WPModel` has keyword arguments that allow
11 | you to directly use nfldb data to fit and validate a WP model. Using
12 | nfldb is totally optional: a default model is already fit and ready to
13 | use, and NFLWin is fully compatible with any source for play-by-play
14 | data. However, nfldb is one of the few free sources of up-to-date NFL
15 | data and so it may be a useful resource to have.
16 |
17 |
18 | Installing nfldb
19 | --------------------------------
20 |
21 | nfldb is pip-installable, and can be installed as an extra dependency
22 | (``pip install nflwin[nfldb]``). Without setting up the nfldb
23 | Postgres database first, however, the pip install will succeed but
24 | nfldb will be unusable. What's more, trying to set up the database
25 | *after* installing nfldb may fail as well.
26 |
27 | The nfldb wiki has `fairly decent installation instructions
28 | <https://github.com/BurntSushi/nfldb/wiki/Installation>`_, but I know
29 | that when I went through the installation process I had to interpret
30 | and adjust several steps. I'd at least recommend reading through the
31 | wiki first, but in case it's useful
32 | I've listed the steps I followed below (for reference I was on Mac OS 10.10).
33 |
34 |
35 | Installing Postgres
36 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
37 | I had an old install kicking around, so I first had to clean that up.
38 | Since I was using `Homebrew <http://brew.sh/>`_::
39 |
40 | $ brew uninstall --force postgresql
41 | $ rm -rf /usr/local/var/postgres/ # where I'd installed the prior DB
42 |
43 | Then install a fresh version::
44 |
45 | $ brew update
46 | $ brew install postgresql
47 |
48 |
49 | Start Postgres and Create a Default DB
50 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
51 | You can choose to run Postgres at startup, but I don't use it that
52 | often so I choose not to do those steps - I just run it in the
53 | foreground with this command::
54 |
55 | $ postgres -D /usr/local/var/postgres
56 |
57 | Or in the background with this command::
58 |
59 | $ pg_ctl -D /usr/local/var/postgres -l logfile start
60 |
61 | If you don't create a default database based on your username,
62 | launching ``psql`` will fail with a ``psql: FATAL: database
63 | "USERNAME" does not exist`` error::
64 |
65 | $ createdb `whoami`
66 |
67 | Check that the install and configuration went well by launching
68 | Postgres as your default user::
69 |
70 | $ psql
71 | psql (9.5.2)
72 | Type "help" for help.
73 |
74 | USERNAME=#
75 |
76 | Next, add a password::
77 |
78 | USERNAME=# ALTER ROLE "USERNAME" WITH ENCRYPTED PASSWORD 'choose a
79 | superuser password';
80 | USERNAME=# \q;
81 |
82 | Edit the ``pg_hba.conf`` file found in your database directory (in my case the
83 | file was
84 | ``/usr/local/var/postgres/pg_hba.conf``), and change all instances of
85 | ``trust`` to ``md5``.
86 |
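One way to make that change in bulk (assuming the same file location as above; ``-i.bak`` keeps a backup copy of the original) is::

    $ sed -i.bak 's/trust/md5/g' /usr/local/var/postgres/pg_hba.conf

Restart Postgres afterwards so the new authentication settings take effect.
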
87 | Create nfldb Postgres User and Database
88 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
89 | Start by making a user::
90 |
91 | $ createuser -U USERNAME -E -P nfldb
92 |
93 | where you replace ``USERNAME`` with your actual username. Make up a
94 | new password. Then make the nfldb database::
95 |
96 | $ createdb -U USERNAME -O nfldb nfldb
97 |
98 | You'll need to enter the password for the USERNAME account. Next, add
99 | the fuzzy string matching extension::
100 |
101 | $ psql -U USERNAME -c 'CREATE EXTENSION fuzzystrmatch;' nfldb
102 |
103 | You should now be able to connect to the nfldb database as the
104 | nfldb user::
105 |
106 | $ psql -U nfldb nfldb
107 |
108 | From this point you should be able to follow along with the
109 | instructions from `nfldb
110 | <https://github.com/BurntSushi/nfldb/wiki>`_.
111 |
112 | Using nfldb
113 | ----------------------
114 |
115 | Once nfldb is properly installed, you can use it with NFLWin in a
116 | couple of different ways.
117 |
118 | Querying Data
119 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^
120 | nfldb comes with a robust set of options to query its database, but
121 | they tend to be designed more for ad hoc querying of small amounts of
122 | data or computing aggregate statistics. It's possible to use built-in
123 | nfldb queries to get the data NFLWin needs, but it's *slow*. So NFLWin
124 | has built-in support for bulk queries of nfldb in the
125 | :py:mod:`nflwin.utilities` module::
126 |
127 | >>> from nflwin import utilities
128 | >>> data = utilities.get_nfldb_play_data(season_years=[2010],
129 | ... season_types=["Regular", "Postseason"])
130 | >>> data.head()
131 | gsis_id drive_id play_id offense_team yardline down yards_to_go \
132 | 0 2010090900 1 35 MIN -20.0 0 0
133 | 1 2010090900 1 57 NO -27.0 1 10
134 | 2 2010090900 1 81 NO 1.0 1 10
135 | 3 2010090900 1 109 NO 13.0 1 10
136 | 4 2010090900 1 135 NO 13.0 2 10
137 |
138 | home_team away_team offense_won quarter seconds_elapsed curr_home_score \
139 | 0 NO MIN False Q1 0.0 0
140 | 1 NO MIN True Q1 4.0 0
141 | 2 NO MIN True Q1 39.0 0
142 | 3 NO MIN True Q1 79.0 0
143 | 4 NO MIN True Q1 84.0 0
144 |
145 | curr_away_score
146 | 0 0
147 | 1 0
148 | 2 0
149 | 3 0
150 | 4 0
151 |
152 | You can see the docstring of :func:`nflwin.utilities.get_nfldb_play_data` for more details, but basically ``get_nfldb_play_data`` queries
153 | the nfldb database directly for columns relevant to estimating WP,
154 | does some simple parsing/preprocessing to get them in the right format,
155 | then returns them as a dataframe. Keyword arguments control what parts
156 | of seasons are queried.
157 |
158 | Integration with WPModel
159 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
160 |
161 | While you can train NFLWin's win probability model
162 | (:py:class:`nflwin.model.WPModel`) with whatever data you want, it
163 | comes with keyword arguments that allow you to query nfldb
164 | directly. For instance, to train the default model on the 2009 and 2010
165 | regular seasons from nfldb, you'd enter the following::
166 |
167 | >>> from nflwin.model import WPModel
168 | >>> model = WPModel()
169 | >>> model.create_default_pipeline() #doctest: +ELLIPSIS
170 | Pipeline(...)
171 | >>> model.train_model(source_data="nfldb",
172 | ... training_seasons=[2009, 2010],
173 | ... training_season_types=["Regular"])
174 |
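
Validation works the same way; for example, to check that model against the 2011 regular season (an arbitrary choice here), use the matching validation keywords::

    >>> model.validate_model(source_data="nfldb",
    ...                      validation_seasons=[2011],
    ...                      validation_season_types=["Regular"])
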
--------------------------------------------------------------------------------
/doc/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | REM Command file for Sphinx documentation
4 |
5 | if "%SPHINXBUILD%" == "" (
6 | set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source
10 | set I18NSPHINXOPTS=%SPHINXOPTS% source
11 | if NOT "%PAPER%" == "" (
12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 |
16 | if "%1" == "" goto help
17 |
18 | if "%1" == "help" (
19 | :help
20 | echo.Please use `make ^<target^>` where ^<target^> is one of
21 | echo. html to make standalone HTML files
22 | echo. dirhtml to make HTML files named index.html in directories
23 | echo. singlehtml to make a single large HTML file
24 | echo. pickle to make pickle files
25 | echo. json to make JSON files
26 | echo. htmlhelp to make HTML files and a HTML help project
27 | echo. qthelp to make HTML files and a qthelp project
28 | echo. devhelp to make HTML files and a Devhelp project
29 | echo. epub to make an epub
30 | echo. epub3 to make an epub3
31 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
32 | echo. text to make text files
33 | echo. man to make manual pages
34 | echo. texinfo to make Texinfo files
35 | echo. gettext to make PO message catalogs
36 | echo. changes to make an overview over all changed/added/deprecated items
37 | echo. xml to make Docutils-native XML files
38 | echo. pseudoxml to make pseudoxml-XML files for display purposes
39 | echo. linkcheck to check all external links for integrity
40 | echo. doctest to run all doctests embedded in the documentation if enabled
41 | echo. coverage to run coverage check of the documentation if enabled
42 | echo. dummy to check syntax errors of document sources
43 | goto end
44 | )
45 |
46 | if "%1" == "clean" (
47 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
48 | del /q /s %BUILDDIR%\*
49 | goto end
50 | )
51 |
52 |
53 | REM Check if sphinx-build is available and fallback to Python version if any
54 | %SPHINXBUILD% 1>NUL 2>NUL
55 | if errorlevel 9009 goto sphinx_python
56 | goto sphinx_ok
57 |
58 | :sphinx_python
59 |
60 | set SPHINXBUILD=python -m sphinx.__init__
61 | %SPHINXBUILD% 2> nul
62 | if errorlevel 9009 (
63 | echo.
64 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
65 | echo.installed, then set the SPHINXBUILD environment variable to point
66 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
67 | echo.may add the Sphinx directory to PATH.
68 | echo.
69 | echo.If you don't have Sphinx installed, grab it from
70 | echo.http://sphinx-doc.org/
71 | exit /b 1
72 | )
73 |
74 | :sphinx_ok
75 |
76 |
77 | if "%1" == "html" (
78 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
79 | if errorlevel 1 exit /b 1
80 | echo.
81 | echo.Build finished. The HTML pages are in %BUILDDIR%/html.
82 | goto end
83 | )
84 |
85 | if "%1" == "dirhtml" (
86 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
87 | if errorlevel 1 exit /b 1
88 | echo.
89 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
90 | goto end
91 | )
92 |
93 | if "%1" == "singlehtml" (
94 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
95 | if errorlevel 1 exit /b 1
96 | echo.
97 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
98 | goto end
99 | )
100 |
101 | if "%1" == "pickle" (
102 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
103 | if errorlevel 1 exit /b 1
104 | echo.
105 | echo.Build finished; now you can process the pickle files.
106 | goto end
107 | )
108 |
109 | if "%1" == "json" (
110 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
111 | if errorlevel 1 exit /b 1
112 | echo.
113 | echo.Build finished; now you can process the JSON files.
114 | goto end
115 | )
116 |
117 | if "%1" == "htmlhelp" (
118 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
119 | if errorlevel 1 exit /b 1
120 | echo.
121 | echo.Build finished; now you can run HTML Help Workshop with the ^
122 | .hhp project file in %BUILDDIR%/htmlhelp.
123 | goto end
124 | )
125 |
126 | if "%1" == "qthelp" (
127 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
128 | if errorlevel 1 exit /b 1
129 | echo.
130 | echo.Build finished; now you can run "qcollectiongenerator" with the ^
131 | .qhcp project file in %BUILDDIR%/qthelp, like this:
132 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\NFLWin.qhcp
133 | echo.To view the help file:
134 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\NFLWin.qhc
135 | goto end
136 | )
137 |
138 | if "%1" == "devhelp" (
139 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
140 | if errorlevel 1 exit /b 1
141 | echo.
142 | echo.Build finished.
143 | goto end
144 | )
145 |
146 | if "%1" == "epub" (
147 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
148 | if errorlevel 1 exit /b 1
149 | echo.
150 | echo.Build finished. The epub file is in %BUILDDIR%/epub.
151 | goto end
152 | )
153 |
154 | if "%1" == "epub3" (
155 | %SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3
156 | if errorlevel 1 exit /b 1
157 | echo.
158 | echo.Build finished. The epub3 file is in %BUILDDIR%/epub3.
159 | goto end
160 | )
161 |
162 | if "%1" == "latex" (
163 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
164 | if errorlevel 1 exit /b 1
165 | echo.
166 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
167 | goto end
168 | )
169 |
170 | if "%1" == "latexpdf" (
171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
172 | cd %BUILDDIR%/latex
173 | make all-pdf
174 | cd %~dp0
175 | echo.
176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex.
177 | goto end
178 | )
179 |
180 | if "%1" == "latexpdfja" (
181 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
182 | cd %BUILDDIR%/latex
183 | make all-pdf-ja
184 | cd %~dp0
185 | echo.
186 | echo.Build finished; the PDF files are in %BUILDDIR%/latex.
187 | goto end
188 | )
189 |
190 | if "%1" == "text" (
191 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
192 | if errorlevel 1 exit /b 1
193 | echo.
194 | echo.Build finished. The text files are in %BUILDDIR%/text.
195 | goto end
196 | )
197 |
198 | if "%1" == "man" (
199 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
200 | if errorlevel 1 exit /b 1
201 | echo.
202 | echo.Build finished. The manual pages are in %BUILDDIR%/man.
203 | goto end
204 | )
205 |
206 | if "%1" == "texinfo" (
207 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
208 | if errorlevel 1 exit /b 1
209 | echo.
210 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
211 | goto end
212 | )
213 |
214 | if "%1" == "gettext" (
215 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
216 | if errorlevel 1 exit /b 1
217 | echo.
218 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
219 | goto end
220 | )
221 |
222 | if "%1" == "changes" (
223 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
224 | if errorlevel 1 exit /b 1
225 | echo.
226 | echo.The overview file is in %BUILDDIR%/changes.
227 | goto end
228 | )
229 |
230 | if "%1" == "linkcheck" (
231 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
232 | if errorlevel 1 exit /b 1
233 | echo.
234 | echo.Link check complete; look for any errors in the above output ^
235 | or in %BUILDDIR%/linkcheck/output.txt.
236 | goto end
237 | )
238 |
239 | if "%1" == "doctest" (
240 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
241 | if errorlevel 1 exit /b 1
242 | echo.
243 | echo.Testing of doctests in the sources finished, look at the ^
244 | results in %BUILDDIR%/doctest/output.txt.
245 | goto end
246 | )
247 |
248 | if "%1" == "coverage" (
249 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
250 | if errorlevel 1 exit /b 1
251 | echo.
252 | echo.Testing of coverage in the sources finished, look at the ^
253 | results in %BUILDDIR%/coverage/python.txt.
254 | goto end
255 | )
256 |
257 | if "%1" == "xml" (
258 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
259 | if errorlevel 1 exit /b 1
260 | echo.
261 | echo.Build finished. The XML files are in %BUILDDIR%/xml.
262 | goto end
263 | )
264 |
265 | if "%1" == "pseudoxml" (
266 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
267 | if errorlevel 1 exit /b 1
268 | echo.
269 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
270 | goto end
271 | )
272 |
273 | if "%1" == "dummy" (
274 | %SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy
275 | if errorlevel 1 exit /b 1
276 | echo.
277 | echo.Build finished. Dummy builder generates no files.
278 | goto end
279 | )
280 |
281 | :end
282 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = build
9 |
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don\'t have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 |
15 | # Internal variables.
16 | PAPEROPT_a4 = -D latex_paper_size=a4
17 | PAPEROPT_letter = -D latex_paper_size=letter
18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
19 | # the i18n builder cannot share the environment and doctrees with the others
20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
21 |
22 | .PHONY: help
23 | help:
24 | @echo "Please use \`make <target>' where <target> is one of"
25 | @echo " html to make standalone HTML files"
26 | @echo " dirhtml to make HTML files named index.html in directories"
27 | @echo " singlehtml to make a single large HTML file"
28 | @echo " pickle to make pickle files"
29 | @echo " json to make JSON files"
30 | @echo " htmlhelp to make HTML files and a HTML help project"
31 | @echo " qthelp to make HTML files and a qthelp project"
32 | @echo " applehelp to make an Apple Help Book"
33 | @echo " devhelp to make HTML files and a Devhelp project"
34 | @echo " epub to make an epub"
35 | @echo " epub3 to make an epub3"
36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
37 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
39 | @echo " text to make text files"
40 | @echo " man to make manual pages"
41 | @echo " texinfo to make Texinfo files"
42 | @echo " info to make Texinfo files and run them through makeinfo"
43 | @echo " gettext to make PO message catalogs"
44 | @echo " changes to make an overview of all changed/added/deprecated items"
45 | @echo " xml to make Docutils-native XML files"
46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes"
47 | @echo " linkcheck to check all external links for integrity"
48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
49 | @echo " coverage to run coverage check of the documentation (if enabled)"
50 | @echo " dummy to check syntax errors of document sources"
51 |
52 | .PHONY: clean
53 | clean:
54 | rm -rf $(BUILDDIR)/*
55 |
56 | .PHONY: html
57 | html:
58 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
59 | @echo
60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
61 |
62 | .PHONY: dirhtml
63 | dirhtml:
64 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
65 | @echo
66 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
67 |
68 | .PHONY: singlehtml
69 | singlehtml:
70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
71 | @echo
72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
73 |
74 | .PHONY: pickle
75 | pickle:
76 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
77 | @echo
78 | @echo "Build finished; now you can process the pickle files."
79 |
80 | .PHONY: json
81 | json:
82 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
83 | @echo
84 | @echo "Build finished; now you can process the JSON files."
85 |
86 | .PHONY: htmlhelp
87 | htmlhelp:
88 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
89 | @echo
90 | @echo "Build finished; now you can run HTML Help Workshop with the" \
91 | ".hhp project file in $(BUILDDIR)/htmlhelp."
92 |
93 | .PHONY: qthelp
94 | qthelp:
95 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
96 | @echo
97 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \
98 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
99 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/NFLWin.qhcp"
100 | @echo "To view the help file:"
101 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/NFLWin.qhc"
102 |
103 | .PHONY: applehelp
104 | applehelp:
105 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
106 | @echo
107 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
108 | @echo "N.B. You won't be able to view it unless you put it in" \
109 | "~/Library/Documentation/Help or install it in your application" \
110 | "bundle."
111 |
112 | .PHONY: devhelp
113 | devhelp:
114 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
115 | @echo
116 | @echo "Build finished."
117 | @echo "To view the help file:"
118 | @echo "# mkdir -p $$HOME/.local/share/devhelp/NFLWin"
119 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/NFLWin"
120 | @echo "# devhelp"
121 |
122 | .PHONY: epub
123 | epub:
124 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
125 | @echo
126 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
127 |
128 | .PHONY: epub3
129 | epub3:
130 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3
131 | @echo
132 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3."
133 |
134 | .PHONY: latex
135 | latex:
136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
137 | @echo
138 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
139 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
140 | "(use \`make latexpdf' here to do that automatically)."
141 |
142 | .PHONY: latexpdf
143 | latexpdf:
144 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
145 | @echo "Running LaTeX files through pdflatex..."
146 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
147 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
148 |
149 | .PHONY: latexpdfja
150 | latexpdfja:
151 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
152 | @echo "Running LaTeX files through platex and dvipdfmx..."
153 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
154 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
155 |
156 | .PHONY: text
157 | text:
158 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
159 | @echo
160 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
161 |
162 | .PHONY: man
163 | man:
164 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
165 | @echo
166 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
167 |
168 | .PHONY: texinfo
169 | texinfo:
170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
171 | @echo
172 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
173 | @echo "Run \`make' in that directory to run these through makeinfo" \
174 | "(use \`make info' here to do that automatically)."
175 |
176 | .PHONY: info
177 | info:
178 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
179 | @echo "Running Texinfo files through makeinfo..."
180 | make -C $(BUILDDIR)/texinfo info
181 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
182 |
183 | .PHONY: gettext
184 | gettext:
185 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
186 | @echo
187 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
188 |
189 | .PHONY: changes
190 | changes:
191 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
192 | @echo
193 | @echo "The overview file is in $(BUILDDIR)/changes."
194 |
195 | .PHONY: linkcheck
196 | linkcheck:
197 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
198 | @echo
199 | @echo "Link check complete; look for any errors in the above output " \
200 | "or in $(BUILDDIR)/linkcheck/output.txt."
201 |
202 | .PHONY: doctest
203 | doctest:
204 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
205 | @echo "Testing of doctests in the sources finished, look at the " \
206 | "results in $(BUILDDIR)/doctest/output.txt."
207 |
208 | .PHONY: coverage
209 | coverage:
210 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
211 | @echo "Testing of coverage in the sources finished, look at the " \
212 | "results in $(BUILDDIR)/coverage/python.txt."
213 |
214 | .PHONY: xml
215 | xml:
216 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
217 | @echo
218 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
219 |
220 | .PHONY: pseudoxml
221 | pseudoxml:
222 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
223 | @echo
224 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
225 |
226 | .PHONY: dummy
227 | dummy:
228 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy
229 | @echo
230 | @echo "Build finished. Dummy builder generates no files."
231 |
--------------------------------------------------------------------------------
/doc/source/model.rst:
--------------------------------------------------------------------------------
1 | Creating a New WP Model
2 | ==============================
3 | While NFLWin ships with a fairly robust default model, there is always
4 | room for improvement. Maybe there's a new dataset you want to use to
5 | train the model, a new feature you want to add, or a new machine
6 | learning model you want to evaluate.
7 |
8 | Good news! NFLWin makes it easy to train a new model, whether you just
9 | want to refresh the data or to do an entire refit from scratch. We'll
10 | start with the simplest case:
11 |
12 | Default Model, New Data
13 | -----------------------
14 | Refreshing the data with NFLWin is a snap. If you want to change the
15 | data used by the default model but keep the source as nfldb, all you
16 | have to do is override the default keyword arguments when calling the
17 | :meth:`~nflwin.model.WPModel.train_model` and :meth:`~nflwin.model.WPModel.validate_model`
18 | methods. For instance, if for some insane reason you wanted to train on the 2009 and 2010 regular
19 | seasons and validate on the 2011 and 2012 playoffs, you would do the following:
20 |
21 | .. code-block:: python
22 |
23 | >>> from nflwin.model import WPModel
24 | >>> new_data_model = WPModel()
25 | >>> new_data_model.train_model(training_seasons=[2009, 2010], training_season_types=["Regular"])
26 | >>> new_data_model.validate_model(validation_seasons=[2011, 2012], validation_season_types=["Postseason"])
27 | (21.355462918011327, 565.56909036318007)
28 |
29 | If you want to supply your own data, that's easy too - simply set the
30 | `source_data` kwarg of :meth:`~nflwin.model.WPModel.train_model` and
31 | :meth:`~nflwin.model.WPModel.validate_model` to be a Pandas DataFrame of your training and validation data (respectively):
32 |
33 | ..
34 | from nflwin.utilities import get_nfldb_play_data
35 | training_data = get_nfldb_play_data(season_years=[2012, 2013])
36 | validation_data = get_nfldb_play_data(season_years=[2014])
37 |
38 | .. code-block:: python
39 |
40 | >>> from nflwin.model import WPModel
41 | >>> new_data_model = WPModel()
42 | >>> training_data.head()
43 | gsis_id drive_id play_id offense_team yardline down yards_to_go \
44 | 0 2012090500 1 35 DAL -15.0 0 0
45 | 1 2012090500 1 57 NYG -34.0 1 10
46 | 2 2012090500 1 79 NYG -34.0 2 10
47 | 3 2012090500 1 103 NYG -29.0 3 5
48 | 4 2012090500 1 125 NYG -29.0 4 5
49 |
50 | home_team away_team offense_won quarter seconds_elapsed curr_home_score \
51 | 0 NYG DAL True Q1 0.0 0
52 | 1 NYG DAL False Q1 4.0 0
53 | 2 NYG DAL False Q1 11.0 0
54 | 3 NYG DAL False Q1 55.0 0
55 | 4 NYG DAL False Q1 62.0 0
56 |
57 | curr_away_score
58 | 0 0
59 | 1 0
60 | 2 0
61 | 3 0
62 | 4 0
63 | >>> new_data_model.train_model(source_data=training_data)
64 | >>> validation_data.head()
65 | gsis_id drive_id play_id offense_team yardline down yards_to_go \
66 | 0 2014090400 1 36 SEA -15.0 0 0
67 | 1 2014090400 1 58 GB -37.0 1 10
68 | 2 2014090400 1 79 GB -31.0 2 4
69 | 3 2014090400 1 111 GB -26.0 1 10
70 | 4 2014090400 1 132 GB -11.0 1 10
71 |
72 | home_team away_team offense_won quarter seconds_elapsed curr_home_score \
73 | 0 SEA GB True Q1 0.0 0
74 | 1 SEA GB False Q1 4.0 0
75 | 2 SEA GB False Q1 30.0 0
76 | 3 SEA GB False Q1 49.0 0
77 | 4 SEA GB False Q1 88.0 0
78 |
79 | curr_away_score
80 | 0 0
81 | 1 0
82 | 2 0
83 | 3 0
84 | 4 0
85 | >>> new_data_model.validate_model(source_data=validation_data)
86 | (8.9344062502671591, 265.7971863696315)
87 |
88 | Building a New Model
89 | --------------------
90 | If you want to construct a totally new model, that's possible
91 | too. Just instantiate
92 | :class:`~nflwin.model.WPModel`, then replace the
93 | :attr:`~nflwin.model.WPModel.model` attribute with either a
94 | scikit-learn `classifier
95 | <http://scikit-learn.org/stable/supervised_learning.html>`_
96 | or `Pipeline
97 | <http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html>`_. From
98 | that point :meth:`~nflwin.model.WPModel.train_model` and
99 | :meth:`~nflwin.model.WPModel.validate_model` should work as normal.
100 |
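A minimal sketch of that pattern (the classifier choice and seasons here are arbitrary, and the example assumes nfldb access for the default ``source_data`` kwarg; in practice you will usually want a full Pipeline with preprocessing steps, since the raw play-by-play data contains non-numeric columns):

.. code-block:: python

    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from nflwin.model import WPModel
    >>> custom_model = WPModel()
    >>> custom_model.model = RandomForestClassifier(n_estimators=100)
    >>> custom_model.train_model(training_seasons=[2013], training_season_types=["Regular"])
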
101 | .. note::
102 | If you create your own model, the
103 | :attr:`~nflwin.model.WPModel.column_descriptions` attribute will no longer be
104 | accurate unless you update it manually.
105 |
106 | .. note::
107 | If your model uses a data structure other than a Pandas DataFrame,
108 | you will not be able to use the ``source_data="nfldb"`` default
109 | kwarg of :meth:`~nflwin.model.WPModel.train_model` and
110 | :meth:`~nflwin.model.WPModel.validate_model`. If you want to use nfldb
111 | data, query it through :func:`nflwin.utilities.get_nfldb_play_data`
112 | first and convert it from a DataFrame to the format required by your model.
113 |
114 | Using NFLWin's Preprocessors
115 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
116 | While you can completely roll your own WP model from scratch, NFLWin
117 | comes with several classes designed to aid in preprocessing your
118 | data. These can be found in the appropriately named
119 | :mod:`~nflwin.preprocessing` module. Each of these preprocessors inherits
120 | from scikit-learn's BaseEstimator class, and therefore is fully
121 | compatible with scikit-learn Pipelines. Available preprocessors
122 | include:
123 |
124 | * :class:`~nflwin.preprocessing.ComputeElapsedTime`: Convert the time
125 | elapsed in a quarter into the total seconds elapsed in the game.
126 | * :class:`~nflwin.preprocessing.ComputeIfOffenseIsHome`: Create an
127 | indicator variable for whether or not the offense is the home team.
128 | * :class:`~nflwin.preprocessing.CreateScoreDifferential`: Create a
129 |   column with the difference between the offense and defense point
130 |   totals (offense minus defense), computed from the home and away
131 |   scores plus an indicator of whether the offense is the home team.
132 | * :class:`~nflwin.preprocessing.MapToInt`: Map a column of values to
133 | integers. Useful for string columns (e.g. a quarter column with "Q1", "Q2", etc).
134 | * :class:`~nflwin.preprocessing.CheckColumnNames`: Ensure that only the desired columns get passed to
135 |   the model, and in the right order. Useful for guaranteeing that the
136 |   underlying numpy arrays of a Pandas DataFrame used for model
137 |   validation are laid out in the same order as they were when the
138 |   model was trained.
139 |
140 | To see examples of these preprocessors in use to build a model, look
141 | at :meth:`nflwin.model.WPModel.create_default_pipeline`.
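   |
   | As a rough sketch, a pipeline built from a few of these preprocessors
   | might look like the following (the column names match those returned
   | by :func:`nflwin.utilities.get_nfldb_play_data`; a real model would
   | chain additional preprocessing steps and finish with a scikit-learn
   | classifier):
   |
   | .. code-block:: python
   |
   |     >>> from sklearn.pipeline import Pipeline
   |     >>> from nflwin import preprocessing
   |     >>> pipe = Pipeline([
   |     ...     ("elapsed_time", preprocessing.ComputeElapsedTime("quarter", "seconds_elapsed")),
   |     ...     ("offense_home", preprocessing.ComputeIfOffenseIsHome("offense_team", "home_team")),
   |     ...     ("quarter_to_int", preprocessing.MapToInt("quarter")),
   |     ... ])
   |     >>> preprocessed_data = pipe.fit_transform(training_data)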
142 |
143 | Model I/O
144 | ---------
145 | To save a model to disk, use the
146 | :meth:`nflwin.model.WPModel.save_model` method.
147 |
148 | .. note::
149 | If you do not provide
150 |     a filename, the default model will be overwritten; to recover it
151 |     you will need to reinstall NFLWin (which will in turn overwrite
152 |     any non-default models you have saved inside the install).
153 |
154 | To load a model from disk, use the
155 | :meth:`nflwin.model.WPModel.load_model` class method. By default this
156 | will load the standard model that comes bundled with pip installs of
157 | NFLWin. Simply specify the ``filename`` kwarg to load a non-standard
158 | model.
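   |
   | For example, to round-trip a model under a non-default name (the
   | filename ``my_custom_model.nflwin`` is just a placeholder):
   |
   | .. code-block:: python
   |
   |     >>> new_data_model.save_model(filename="my_custom_model.nflwin")
   |     >>> from nflwin.model import WPModel
   |     >>> loaded_model = WPModel.load_model(filename="my_custom_model.nflwin")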
159 |
160 | .. note::
161 | By default, models are saved to and loaded from the path given by
162 | :attr:`nflwin.model.WPModel.model_directory`, which by default is
163 | located inside your NFLWin install.
164 |
165 | Estimating Quality of Fit
166 | -------------------------
167 | When you care about the probabilities produced by a classification
168 | model rather than its yes/no predictions, it is challenging to
169 | estimate the model's quality. This is an area I'm actively looking to
170 | improve, but for now NFLWin does the following.
171 |
172 | First, it takes the probabilities given by the model for each play in the
173 | validation set, then produces a `kernel density estimate
174 | `_ (KDE) of all
175 | the plays as well as just the ones that were predicted
176 | correctly. The ratio of these two KDEs is the actual WP measured
177 | from the test data set at a given *predicted* WP. While all of this is
178 | measured in :meth:`~nflwin.model.WPModel.validate_model`, you can plot
179 | it for yourself by calling the
180 | :meth:`~nflwin.model.WPModel.plot_validation` method, which will
181 | generate a plot like that shown on the home page.
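   |
   | For example, continuing with the model trained above (this assumes
   | matplotlib is installed, and that the validation plot is drawn on the
   | current pyplot figure):
   |
   | .. code-block:: python
   |
   |     >>> import matplotlib.pyplot as plt
   |     >>> new_data_model.validate_model(source_data=validation_data)
   |     >>> new_data_model.plot_validation()
   |     >>> plt.show()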
182 |
183 | From there NFLWin computes both the maximum deviation at any given
184 | predicted WP and the total area between the WP estimated from the
185 | validation data and what would be expected if the model were
186 | perfect; those two numbers are what
187 | :meth:`~nflwin.model.WPModel.validate_model` actually returns. This is
188 | not ideal, since it does not directly estimate uncertainties in
189 | the model, but it's the best I've been able to come up with so far. If anyone
190 | has an idea for how to do this better I would welcome it enthusiastically.
191 |
--------------------------------------------------------------------------------
/nflwin/tests/test_model.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 |
3 | import os
4 | import collections
5 |
6 | import numpy as np
7 | import pandas as pd
8 | import pytest
9 |
10 | from nflwin import model
11 |
12 |
13 | class TestDefaults(object):
14 | """Tests for defaults."""
15 |
16 | def test_column_descriptions_set(self):
17 | wpmodel = model.WPModel()
18 | assert isinstance(wpmodel.column_descriptions, collections.Mapping)
19 |
20 | class TestModelTrain(object):
21 | """Tests for the train_model method."""
22 |
23 | def test_bad_string(self):
24 | wpmodel = model.WPModel()
25 | with pytest.raises(ValueError):
26 | wpmodel.train_model(source_data="this is a bad string")
27 |
28 | def test_dataframe_input(self):
29 | wpmodel = model.WPModel()
30 | test_data = {'offense_won': {0: True, 1: False, 2: False,
31 | 3: False, 4: False, 5: True,
32 | 6: True, 7: True, 8: True, 9: False},
33 | 'home_team': {0: 'NYG', 1: 'NYG', 2: 'NYG', 3: 'NYG',
34 | 4: 'NYG', 5: 'NYG', 6: 'NYG', 7: 'NYG',
35 | 8: 'NYG', 9: 'NYG'},
36 | 'away_team': {0: 'DAL', 1: 'DAL', 2: 'DAL', 3: 'DAL',
37 | 4: 'DAL', 5: 'DAL', 6: 'DAL', 7: 'DAL',
38 | 8: 'DAL', 9: 'DAL'},
39 | 'gsis_id': {0: '2012090500', 1: '2012090500', 2: '2012090500',
40 | 3: '2012090500', 4: '2012090500', 5: '2012090500',
41 | 6: '2012090500', 7: '2012090500', 8: '2012090500',
42 | 9: '2012090500'},
43 | 'play_id': {0: 35, 1: 57, 2: 79, 3: 103, 4: 125, 5: 150,
44 | 6: 171, 7: 190, 8: 212, 9: 252},
45 | 'seconds_elapsed': {0: 0.0, 1: 4.0, 2: 11.0, 3: 55.0, 4: 62.0,
46 | 5: 76.0, 6: 113.0, 7: 153.0, 8: 159.0, 9: 171.0},
47 | 'down': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 1, 6: 2, 7: 3, 8: 4, 9: 1},
48 | 'curr_home_score': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0},
49 | 'offense_team': {0: 'DAL', 1: 'NYG', 2: 'NYG', 3: 'NYG',
50 | 4: 'NYG', 5: 'DAL', 6: 'DAL', 7: 'DAL',
51 | 8: 'DAL', 9: 'NYG'},
52 | 'curr_away_score': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0},
53 | 'yardline': {0: -15.0, 1: -34.0, 2: -34.0, 3: -29.0,
54 | 4: -29.0, 5: -26.0, 6: -23.0, 7: -31.0, 8: -31.0, 9: -37.0},
55 | 'drive_id': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 2, 6: 2, 7: 2, 8: 2, 9: 3},
56 | 'yards_to_go': {0: 0, 1: 10, 2: 10, 3: 5, 4: 5, 5: 10, 6: 7, 7: 15, 8: 15, 9: 10},
57 | 'quarter': {0: 'Q1', 1: 'Q1', 2: 'Q1', 3: 'Q1', 4: 'Q1',
58 | 5: 'Q1', 6: 'Q1', 7: 'Q1', 8: 'Q1', 9: 'Q1'}
59 | }
60 | test_df = pd.DataFrame(test_data)
61 | wpmodel.train_model(source_data=test_df)
62 |
63 | class TestModelValidate(object):
64 | """Tests for the validate_model method."""
65 |
66 | def setup_method(self, method):
67 | self.test_data = {'offense_won': {0: True, 1: False, 2: False,
68 | 3: False, 4: False, 5: True,
69 | 6: True, 7: True, 8: True, 9: False},
70 | 'home_team': {0: 'NYG', 1: 'NYG', 2: 'NYG', 3: 'NYG',
71 | 4: 'NYG', 5: 'NYG', 6: 'NYG', 7: 'NYG',
72 | 8: 'NYG', 9: 'NYG'},
73 | 'away_team': {0: 'DAL', 1: 'DAL', 2: 'DAL', 3: 'DAL',
74 | 4: 'DAL', 5: 'DAL', 6: 'DAL', 7: 'DAL',
75 | 8: 'DAL', 9: 'DAL'},
76 | 'gsis_id': {0: '2012090500', 1: '2012090500', 2: '2012090500',
77 | 3: '2012090500', 4: '2012090500', 5: '2012090500',
78 | 6: '2012090500', 7: '2012090500', 8: '2012090500',
79 | 9: '2012090500'},
80 | 'play_id': {0: 35, 1: 57, 2: 79, 3: 103, 4: 125, 5: 150,
81 | 6: 171, 7: 190, 8: 212, 9: 252},
82 | 'seconds_elapsed': {0: 0.0, 1: 4.0, 2: 11.0, 3: 55.0, 4: 62.0,
83 | 5: 76.0, 6: 113.0, 7: 153.0, 8: 159.0, 9: 171.0},
84 | 'down': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 1, 6: 2, 7: 3, 8: 4, 9: 1},
85 | 'curr_home_score': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0},
86 | 'offense_team': {0: 'DAL', 1: 'NYG', 2: 'NYG', 3: 'NYG',
87 | 4: 'NYG', 5: 'DAL', 6: 'DAL', 7: 'DAL',
88 | 8: 'DAL', 9: 'NYG'},
89 | 'curr_away_score': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0},
90 | 'yardline': {0: -15.0, 1: -34.0, 2: -34.0, 3: -29.0,
91 | 4: -29.0, 5: -26.0, 6: -23.0, 7: -31.0, 8: -31.0, 9: -37.0},
92 | 'drive_id': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 2, 6: 2, 7: 2, 8: 2, 9: 3},
93 | 'yards_to_go': {0: 0, 1: 10, 2: 10, 3: 5, 4: 5, 5: 10, 6: 7, 7: 15, 8: 15, 9: 10},
94 | 'quarter': {0: 'Q1', 1: 'Q1', 2: 'Q1', 3: 'Q1', 4: 'Q1',
95 | 5: 'Q1', 6: 'Q1', 7: 'Q1', 8: 'Q1', 9: 'Q1'}
96 | }
97 | self.test_df = pd.DataFrame(self.test_data)
98 |
99 | def test_bad_string(self):
100 | wpmodel = model.WPModel()
101 | wpmodel.train_model(source_data=self.test_df)
102 | with pytest.raises(ValueError):
103 | wpmodel.validate_model(source_data="this is bad data")
104 |
105 |
106 | def test_dataframe_input(self):
107 | wpmodel = model.WPModel()
108 | wpmodel.train_model(source_data=self.test_df)
109 | wpmodel.validate_model(source_data=self.test_df)
110 |
111 | class TestTestDistribution(object):
112 | """Tests the _test_distribution static method of WPModel."""
113 |
114 | def test_simple_case(self):
115 | input_probabilities = [0.1, 0.2, 0.3]
116 | input_predicted_win_percents = [0.1, 0.2, 0.3]
117 | input_num_plays_used = [10, 10, 10]
118 |
119 | expected_output = 1.0
120 |
121 | assert (expected_output -
122 | model.WPModel._test_distribution(input_probabilities,
123 | input_predicted_win_percents,
124 | input_num_plays_used)
125 | ) < 1e-5
126 |
127 | def test_more_complicated_case(self):
128 | input_probabilities = [0.1, 0.2, 0.4]
129 | input_predicted_win_percents = [0.1, 0.2, 0.3]
130 | input_num_plays_used = [10, 10, 100000]
131 |
132 | expected_output = 0.0
133 |
134 | assert (expected_output -
135 | model.WPModel._test_distribution(input_probabilities,
136 | input_predicted_win_percents,
137 | input_num_plays_used)
138 | ) < 1e-5
139 |
140 |
141 | class TestModelIO(object):
142 | """Tests functions that deal with model saving and loading"""
143 |
144 | def teardown_method(self, method):
145 |
146 | try:
147 | os.remove(self.expected_path)
148 | except OSError:
149 | pass
150 |
151 | def test_model_save_default(self):
152 | instance = model.WPModel()
153 | model_name = "test_model_asljasljt.nflwin"
154 | instance._default_model_filename = model_name
155 |
156 | self.expected_path = os.path.join(
157 | os.path.join(
158 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "models")
159 | , model_name)
160 |
161 | assert os.path.isfile(self.expected_path) is False
162 |
163 | instance.save_model()
164 |
165 | assert os.path.isfile(self.expected_path) is True
166 |
167 | def test_model_save_specified(self):
168 | instance = model.WPModel()
169 | model_name = "test_model_qerooiua.nflwin"
170 |
171 | self.expected_path = os.path.join(
172 | os.path.join(
173 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "models")
174 | , model_name)
175 |
176 | assert os.path.isfile(self.expected_path) is False
177 |
178 | instance.save_model(filename=model_name)
179 |
180 | assert os.path.isfile(self.expected_path) is True
181 |
182 | def test_model_load_default(self):
183 | instance = model.WPModel()
184 | model_name = "test_model_asljasljt.nflwin"
185 | instance._default_model_filename = model_name
186 |
187 | self.expected_path = os.path.join(
188 | os.path.join(
189 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "models")
190 | , model_name)
191 |
192 | assert os.path.isfile(self.expected_path) is False
193 |
194 | instance.save_model()
195 |
196 | WPModel_class = model.WPModel
197 | WPModel_class._default_model_filename = model_name
198 |
199 | loaded_instance = WPModel_class.load_model()
200 |
201 | assert isinstance(loaded_instance, model.WPModel)
202 |
203 | def test_model_load_specified(self):
204 | instance = model.WPModel()
205 | model_name = "test_model_qerooiua.nflwin"
206 |
207 | self.expected_path = os.path.join(
208 | os.path.join(
209 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "models")
210 | , model_name)
211 |
212 | assert os.path.isfile(self.expected_path) is False
213 |
214 | instance.save_model(filename=model_name)
215 |
216 | loaded_instance = model.WPModel.load_model(filename=model_name)
217 |
218 | assert isinstance(loaded_instance, model.WPModel)
219 |
220 |
221 |
--------------------------------------------------------------------------------
/nflwin/utilities.py:
--------------------------------------------------------------------------------
1 | """Utility functions that don't fit in the main modules"""
2 | from __future__ import print_function, division
3 |
4 | import numpy as np
5 | import pandas as pd
6 |
7 |
8 | def connect_nfldb():
9 | """Connect to the nfldb database.
10 |
11 | Rather than using the builtin method we make our own,
12 | since we're going to use SQLAlchemy as the engine. However,
13 | we can still make use of the information in the nfldb config
14 | file to get information like username and password, which
15 | means this function doesn't need any arguments.
16 |
17 | Parameters
18 | ----------
19 | None
20 |
21 | Returns
22 | -------
23 | SQLAlchemy engine object
24 | A connected engine, ready to be used to query the DB.
25 |
26 | Raises
27 | ------
28 | IOError
29 | If it can't find the config file.
30 | """
31 | import nfldb
32 | import sqlalchemy as sql
33 | db_config, paths_tried = nfldb.db.config()
34 | if db_config is None:
35 | raise IOError("get_play_data: could not find database config! Looked"
36 | " in these places: {0}".format(paths_tried))
37 | db_config["drivername"] = "postgres"
38 | db_config["username"] = db_config["user"]
39 | del db_config["user"]
40 | del db_config["timezone"]
41 |
42 | engine = sql.create_engine(sql.engine.url.URL(**db_config))
43 |
44 | return engine
45 |
46 |
47 | def get_nfldb_play_data(season_years=None, season_types=("Regular", "Postseason")):
48 | """Get play-by-play data from the nfldb database.
49 |
50 | We use a specialized query and then postprocessing because, while possible to
51 | do using the objects created by ``nfldb``, it is *orders of magnitude slower*.
52 | This is due to the more general nature of ``nfldb``, which is not really designed
53 | for this kind of data mining. Since we need to get a lot of data in a single way,
54 | it's much simpler to interact at a lower level with the underlying postgres
55 | database.
56 |
57 |
58 | Parameters
59 | ----------
60 | season_years : list (default=None)
61 | A list of all years to get data for (earliest year in nfldb is 2009).
62 | If ``None``, get data from all available seasons.
63 | season_types : list (default=["Regular", "Postseason"])
64 | A list of all parts of seasons to get data for (acceptable values are
65 | "Preseason", "Regular", and "Postseason"). If ``None``, get data from
66 | all three season types.
67 |
68 | Returns
69 | -------
70 | Pandas DataFrame
71 | The play by play data, with the following columns:
72 |
73 | * **gsis_id:** The official NFL GSIS_ID for the game.
74 | * **drive_id:** The id of the drive, starts at 1 and increases by 1 for each new drive.
75 | * **play_id:** The id of the play in ``nfldb``. Note that sequential plays have
76 | increasing but not necessarily sequential values. With ``drive_id`` and ``gsis_id``,
77 | works as a unique identifier for a given play.
78 | * **quarter:** The quarter, prepended with "Q" (e.g. ``Q1`` means the first quarter).
79 | Overtime periods are denoted as ``OT``, ``OT2``, and theoretically ``OT3`` if one were to
80 | ever be played.
81 | * **seconds_elapsed:** seconds elapsed since the start of the quarter.
82 | * **offense_team:** The abbreviation of the team currently with possession of the ball.
83 | * **yardline:** The current field position. Goes from -49 to 49, where negative numbers
84 | indicate that the team with possession is on its own side of the field.
85 | * **down:** The down. kickoffs, extra points, and similar have a down of 0.
86 | * **yards_to_go:** How many yards needed in order to get a first down (or touchdown).
87 | * **home_team:** The abbreviation of the home team.
88 | * **away_team:** The abbreviation of the away team.
89 | * **curr_home_score:** The home team's score at the start of the play.
90 | * **curr_away_score:** The away team's score at the start of the play.
91 | * **offense_won:** A boolean - ``True`` if the offense won the game, ``False`` otherwise. (The
92 | database query skips tied games.)
93 |
94 | Notes
95 | -----
96 | ``gsis_id``, ``drive_id``, and ``play_id`` are not necessary to make the model, but
97 | are included because they can be useful for computing things like WPA.
98 | """
99 |
100 | engine = connect_nfldb()
101 |
102 | sql_string = _make_nfldb_query_string(season_years=season_years, season_types=season_types)
103 |
104 | plays_df = pd.read_sql(sql_string, engine)
105 |
106 | #Fix yardline, quarter and time elapsed:
107 | def yardline_time_fix(row):
108 | try:
109 | yardline = float(row['yardline'][1:-1])
110 | except TypeError:
111 | yardline = np.nan
112 | split_time = row['time'].split(",")
113 | return yardline, split_time[0][1:], float(split_time[1][:-1])
114 |
115 | plays_df[['yardline', 'quarter', 'seconds_elapsed']] = pd.DataFrame(plays_df.apply(yardline_time_fix, axis=1).values.tolist())
116 | plays_df.drop('time', axis=1, inplace=True)
117 |
118 | #Set NaN downs (kickoffs, etc) to 0:
119 | plays_df['down'] = plays_df['down'].fillna(value=0).astype(np.int8)
120 |
121 |
122 | #Aggregate scores:
123 | plays_df = _aggregate_nfldb_scores(plays_df)
124 |
125 | return plays_df
126 |
127 | def _aggregate_nfldb_scores(play_df):
128 | """Aggregate the raw nfldb data to get the score of every play."""
129 |
130 | # First, add the yardline of the subsequent play to the df
131 | play_df['next_yardline'] = play_df['yardline'].shift(-1)
132 |
133 | #Set up the dictionary to keep track of things:
134 | curr_home_score = 0
135 | curr_away_score = 0
136 | curr_gsis_id = play_df.iloc[0].gsis_id
137 | argdict = {"curr_home_score": 0, "curr_away_score": 0, "curr_gsis_id": play_df.iloc[0].gsis_id}
138 |
139 | #Define an internal function to actually compute the score of a given play:
140 | def compute_current_scores(play, argdict):
141 | #If new game, set scores to zero:
142 | if play.gsis_id != argdict['curr_gsis_id']:
143 | argdict['curr_home_score'] = 0
144 | argdict['curr_away_score'] = 0
145 | argdict['curr_gsis_id'] = play.gsis_id
146 |
147 | #Get current score at start of play:
148 | home_score_to_return = argdict['curr_home_score']
149 | away_score_to_return = argdict['curr_away_score']
150 |
151 | #Check if an extra point is missing from the data:
152 | if play.offense_play_points == 6 and play.next_yardline < 0:
153 | play.offense_play_points += 1
154 | if play.defense_play_points == 6 and play.next_yardline < 0:
155 | play.defense_play_points += 1
156 |
157 | #Update scores, if necessary:
158 | if play.offense_team == play.home_team:
159 | argdict['curr_home_score'] += play.offense_play_points
160 | argdict['curr_away_score'] += play.defense_play_points
161 | else:
162 | argdict['curr_home_score'] += play.defense_play_points
163 | argdict['curr_away_score'] += play.offense_play_points
164 | return home_score_to_return, away_score_to_return
165 |
166 | #Apply function to data:
167 | #TODO (AndrewRook): Make the .apply function go faster, currently it's a large bottleneck
168 | aggregate_scores = play_df.apply(compute_current_scores, axis=1, args=(argdict,))
169 | aggregate_scores = pd.DataFrame(aggregate_scores.values.tolist())
170 | play_df[['curr_home_score', 'curr_away_score']] = aggregate_scores
171 |
172 | #Drop unnecessary columns:
173 | play_df.drop(labels=["next_yardline", "offense_play_points", "defense_play_points"],
174 | axis=1, inplace=True)
175 |
176 | return play_df
177 |
178 |
179 | def _make_nfldb_query_string(season_years=None, season_types=None):
180 | """Construct the query string to get all the play data.
181 |
182 | This way is a little more compact and robust than specifying
183 | the string in the function that uses it.
184 |
185 | """
186 |
187 | play_fields = ['gsis_id', 'drive_id', 'play_id',
188 | 'time', 'pos_team AS offense_team', 'yardline', 'down',
189 | 'yards_to_go']
190 |
191 | offense_play_points = ("GREATEST("
192 | "(agg_play.fumbles_rec_tds * 6), "
193 | "(agg_play.kicking_rec_tds * 6), "
194 | "(agg_play.passing_tds * 6), "
195 | "(agg_play.receiving_tds * 6), "
196 | "(agg_play.rushing_tds * 6), "
197 | "(agg_play.kicking_xpmade * 1), "
198 | "(agg_play.passing_twoptm * 2), "
199 | "(agg_play.receiving_twoptm * 2), "
200 | "(agg_play.rushing_twoptm * 2), "
201 | "(agg_play.kicking_fgm * 3)) "
202 | "AS offense_play_points")
203 | defense_play_points = ("GREATEST("
204 | "(agg_play.defense_frec_tds * 6), "
205 | "(agg_play.defense_int_tds * 6), "
206 | "(agg_play.defense_misc_tds * 6), "
207 | "(agg_play.kickret_tds * 6), "
208 | "(agg_play.puntret_tds * 6), "
209 | "(agg_play.defense_safe * 2)) "
210 | "AS defense_play_points")
211 |
212 | game_fields = ("game.home_team, game.away_team, "
213 | "((game.home_score > game.away_score AND play.pos_team = game.home_team) "
214 | "OR (game.away_score > game.home_score AND play.pos_team = game.away_team)) AS offense_won")
215 |
216 | where_clause = ("WHERE game.home_score != game.away_score "
217 | "AND game.finished = TRUE "
218 | "AND play.pos_team != 'UNK' "
219 | "AND (play.time).phase not in ('Pregame', 'Half', 'Final')")
220 |
221 | if season_years is not None:
222 | where_clause += " AND game.season_year"
223 | if len(season_years) == 1:
224 | where_clause += " = {0}".format(season_years[0])
225 | else:
226 | where_clause += (" in ({0})"
227 | "".format(",".join([str(year) for year in season_years])))
228 | if season_types is not None:
229 | where_clause += " AND game.season_type"
230 | if len(season_types) == 1:
231 | where_clause += " = '{0}'".format(season_types[0])
232 | else:
233 | where_clause += " in ('{0}')".format("','".join(season_types))
234 |
235 | query_string = "SELECT "
236 | query_string += "play." + ", play.".join(play_fields)
237 | query_string += ", " + offense_play_points
238 | query_string += ", " + defense_play_points
239 | query_string += ", " + game_fields
240 | query_string += " FROM play INNER JOIN agg_play"
241 | query_string += (" ON play.gsis_id = agg_play.gsis_id"
242 | " AND play.drive_id = agg_play.drive_id"
243 | " AND play.play_id = agg_play.play_id")
244 | query_string += " INNER JOIN game on play.gsis_id = game.gsis_id"
245 | query_string += " " + where_clause
246 | query_string += " ORDER BY play.gsis_id, play.drive_id, play.play_id;"
247 |
248 | return query_string
249 |
--------------------------------------------------------------------------------
/doc/source/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # NFLWin documentation build configuration file, created by
4 | # sphinx-quickstart on Thu Jun 16 22:35:58 2016.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | import sys
16 | import os
17 |
18 | # If extensions (or modules to document with autodoc) are in another directory,
19 | # add these directories to sys.path here. If the directory is relative to the
20 | # documentation root, use os.path.abspath to make it absolute, like shown here.
21 | PROJECT_DIRECTORY = os.path.dirname(
22 | os.path.dirname(
23 | os.path.dirname(
24 | os.path.abspath(__file__)
25 | )
26 | )
27 | )
28 | sys.path.insert(0, PROJECT_DIRECTORY)
29 |
30 | # -- General configuration ------------------------------------------------
31 |
32 | # If your documentation needs a minimal Sphinx version, state it here.
33 | #needs_sphinx = '1.0'
34 |
35 | # Add any Sphinx extension module names here, as strings. They can be
36 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
37 | # ones.
38 | extensions = [
39 | 'sphinx.ext.autodoc',
40 | 'sphinx.ext.doctest',
41 | 'sphinx.ext.viewcode',
42 | 'sphinx.ext.githubpages',
43 | 'sphinx.ext.autosummary',
44 | 'numpydoc',
45 | ]
46 |
47 | #some magic (http://stackoverflow.com/questions/12206334/sphinx-autosummary-toctree-contains-reference-to-nonexisting-document-warnings) to suppress spurious warnings:
48 | numpydoc_show_class_members = False
49 |
50 | # Add any paths that contain templates here, relative to this directory.
51 | templates_path = ['_templates']
52 |
53 | # The suffix(es) of source filenames.
54 | # You can specify multiple suffix as a list of string:
55 | # source_suffix = ['.rst', '.md']
56 | source_suffix = '.rst'
57 |
58 | # The encoding of source files.
59 | #source_encoding = 'utf-8-sig'
60 |
61 | # The master toctree document.
62 | master_doc = 'index'
63 |
64 | # General information about the project.
65 | project = u'NFLWin'
66 | copyright = u'2016, Andrew Schechtman-Rook'
67 | author = u'Andrew Schechtman-Rook'
68 |
69 | # The version info for the project you're documenting, acts as replacement for
70 | # |version| and |release|, also used in various other places throughout the
71 | # built documents.
72 | #
73 | # The short X.Y version.
74 | from nflwin import __version__
75 | version = __version__
76 | # The full version, including alpha/beta/rc tags.
77 | release = __version__
78 |
79 | # The language for content autogenerated by Sphinx. Refer to documentation
80 | # for a list of supported languages.
81 | #
82 | # This is also used if you do content translation via gettext catalogs.
83 | # Usually you set "language" from the command line for these cases.
84 | language = None
85 |
86 | # There are two options for replacing |today|: either, you set today to some
87 | # non-false value, then it is used:
88 | #today = ''
89 | # Else, today_fmt is used as the format for a strftime call.
90 | #today_fmt = '%B %d, %Y'
91 |
92 | # List of patterns, relative to source directory, that match files and
93 | # directories to ignore when looking for source files.
94 | # This patterns also effect to html_static_path and html_extra_path
95 | exclude_patterns = []
96 |
97 | # The reST default role (used for this markup: `text`) to use for all
98 | # documents.
99 | #default_role = None
100 |
101 | # If true, '()' will be appended to :func: etc. cross-reference text.
102 | #add_function_parentheses = True
103 |
104 | # If true, the current module name will be prepended to all description
105 | # unit titles (such as .. function::).
106 | #add_module_names = True
107 |
108 | # If true, sectionauthor and moduleauthor directives will be shown in the
109 | # output. They are ignored by default.
110 | #show_authors = False
111 |
112 | # The name of the Pygments (syntax highlighting) style to use.
113 | pygments_style = 'sphinx'
114 |
115 | # A list of ignored prefixes for module index sorting.
116 | #modindex_common_prefix = []
117 |
118 | # If true, keep warnings as "system message" paragraphs in the built documents.
119 | #keep_warnings = False
120 |
121 | # If true, `todo` and `todoList` produce output, else they produce nothing.
122 | todo_include_todos = False
123 |
124 |
125 | # -- Options for HTML output ----------------------------------------------
126 |
127 | # The theme to use for HTML and HTML Help pages. See the documentation for
128 | # a list of builtin themes.
129 | html_theme = 'sphinx_rtd_theme'
130 |
131 | # Theme options are theme-specific and customize the look and feel of a theme
132 | # further. For a list of options available for each theme, see the
133 | # documentation.
134 | #html_theme_options = {}
135 |
136 | # Add any paths that contain custom themes here, relative to this directory.
137 | #html_theme_path = []
138 |
139 | # The name for this set of Sphinx documents.
140 | # " v documentation" by default.
141 | #html_title = u'NFLWin v0.1.0'
142 |
143 | # A shorter title for the navigation bar. Default is the same as html_title.
144 | #html_short_title = None
145 |
146 | # The name of an image file (relative to this directory) to place at the top
147 | # of the sidebar.
148 | #html_logo = None
149 |
150 | # The name of an image file (relative to this directory) to use as a favicon of
151 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
152 | # pixels large.
153 | #html_favicon = None
154 |
155 | # Add any paths that contain custom static files (such as style sheets) here,
156 | # relative to this directory. They are copied after the builtin static files,
157 | # so a file named "default.css" will overwrite the builtin "default.css".
158 | html_static_path = ['_static']
159 |
160 | # Add any extra paths that contain custom files (such as robots.txt or
161 | # .htaccess) here, relative to this directory. These files are copied
162 | # directly to the root of the documentation.
163 | #html_extra_path = []
164 |
165 | # If not None, a 'Last updated on:' timestamp is inserted at every page
166 | # bottom, using the given strftime format.
167 | # The empty string is equivalent to '%b %d, %Y'.
168 | #html_last_updated_fmt = None
169 |
170 | # If true, SmartyPants will be used to convert quotes and dashes to
171 | # typographically correct entities.
172 | #html_use_smartypants = True
173 |
174 | # Custom sidebar templates, maps document names to template names.
175 | #html_sidebars = {}
176 |
177 | # Additional templates that should be rendered to pages, maps page names to
178 | # template names.
179 | #html_additional_pages = {}
180 |
181 | # If false, no module index is generated.
182 | #html_domain_indices = True
183 |
184 | # If false, no index is generated.
185 | #html_use_index = True
186 |
187 | # If true, the index is split into individual pages for each letter.
188 | #html_split_index = False
189 |
190 | # If true, links to the reST sources are added to the pages.
191 | #html_show_sourcelink = True
192 |
193 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
194 | #html_show_sphinx = True
195 |
196 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
197 | #html_show_copyright = True
198 |
199 | # If true, an OpenSearch description file will be output, and all pages will
200 | # contain a tag referring to it. The value of this option must be the
201 | # base URL from which the finished HTML is served.
202 | #html_use_opensearch = ''
203 |
204 | # This is the file name suffix for HTML files (e.g. ".xhtml").
205 | #html_file_suffix = None
206 |
207 | # Language to be used for generating the HTML full-text search index.
208 | # Sphinx supports the following languages:
209 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
210 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
211 | #html_search_language = 'en'
212 |
213 | # A dictionary with options for the search language support, empty by default.
214 | # 'ja' uses this config value.
215 | # 'zh' user can custom change `jieba` dictionary path.
216 | #html_search_options = {'type': 'default'}
217 |
218 | # The name of a javascript file (relative to the configuration directory) that
219 | # implements a search results scorer. If empty, the default will be used.
220 | #html_search_scorer = 'scorer.js'
221 |
222 | # Output file base name for HTML help builder.
223 | htmlhelp_basename = 'NFLWindoc'
224 |
225 | # -- Options for LaTeX output ---------------------------------------------
226 |
227 | latex_elements = {
228 | # The paper size ('letterpaper' or 'a4paper').
229 | #'papersize': 'letterpaper',
230 |
231 | # The font size ('10pt', '11pt' or '12pt').
232 | #'pointsize': '10pt',
233 |
234 | # Additional stuff for the LaTeX preamble.
235 | #'preamble': '',
236 |
237 | # Latex figure (float) alignment
238 | #'figure_align': 'htbp',
239 | }
240 |
241 | # Grouping the document tree into LaTeX files. List of tuples
242 | # (source start file, target name, title,
243 | # author, documentclass [howto, manual, or own class]).
244 | latex_documents = [
245 | (master_doc, 'NFLWin.tex', u'NFLWin Documentation',
246 | u'Andrew Schechtman-Rook', 'manual'),
247 | ]
248 |
249 | # The name of an image file (relative to this directory) to place at the top of
250 | # the title page.
251 | #latex_logo = None
252 |
253 | # For "manual" documents, if this is true, then toplevel headings are parts,
254 | # not chapters.
255 | #latex_use_parts = False
256 |
257 | # If true, show page references after internal links.
258 | #latex_show_pagerefs = False
259 |
260 | # If true, show URL addresses after external links.
261 | #latex_show_urls = False
262 |
263 | # Documents to append as an appendix to all manuals.
264 | #latex_appendices = []
265 |
266 | # If false, no module index is generated.
267 | #latex_domain_indices = True
268 |
269 |
270 | # -- Options for manual page output ---------------------------------------
271 |
272 | # One entry per manual page. List of tuples
273 | # (source start file, name, description, authors, manual section).
274 | man_pages = [
275 | (master_doc, 'nflwin', u'NFLWin Documentation',
276 | [author], 1)
277 | ]
278 |
279 | # If true, show URL addresses after external links.
280 | #man_show_urls = False
281 |
282 |
283 | # -- Options for Texinfo output -------------------------------------------
284 |
285 | # Grouping the document tree into Texinfo files. List of tuples
286 | # (source start file, target name, title, author,
287 | # dir menu entry, description, category)
288 | texinfo_documents = [
289 | (master_doc, 'NFLWin', u'NFLWin Documentation',
290 | author, 'NFLWin', 'One line description of project.',
291 | 'Miscellaneous'),
292 | ]
293 |
294 | # Documents to append as an appendix to all manuals.
295 | #texinfo_appendices = []
296 |
297 | # If false, no module index is generated.
298 | #texinfo_domain_indices = True
299 |
300 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
301 | #texinfo_show_urls = 'footnote'
302 |
303 | # If true, do not generate a @detailmenu in the "Top" node's menu.
304 | #texinfo_no_detailmenu = False
305 |
306 | #Run apidoc if inside a ReadTheDocs environment:
307 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
308 | if on_rtd:
309 | os.system("sphinx-apidoc -f -o doc/source nflwin/ nflwin/tests")
310 |
--------------------------------------------------------------------------------
/nflwin/tests/test_utilities.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 |
3 | try:
4 | import nfldb
5 | nfldb_missing=False
6 | except ImportError:
7 | nfldb_missing=True
8 |
9 | import numpy as np
10 | import pandas as pd
11 | import pytest
12 |
13 | import nflwin.utilities as utils
14 |
15 | class TestGetNFLDBPlayData(object):
16 | """Testing the ability to get play data from nfldb."""
17 |
18 | #TODO (AndrewRook): Need to test if the sql query actually works
19 |
20 | def setup_method(self, method):
21 | self.test_df = pd.DataFrame({
22 | 'gsis_id': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
23 | 'drive_id': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
24 | 'play_id': [1, 2, 3, 4, 5, 1, 2, 3, 4, 5],
25 | 'time': ["(Q1,0)", "(Q1,152)", "(Q1,354)", "(Q1,354)", "(Q2,0)",
26 | "(OT,840)", "(OT,840)", "(OT2,875)", "(OT3,900)", "(OT,900)"],
27 | 'offense_team': ["HOU", "KC", "KC", "HOU", "HOU", "UNK", "DEN", "DEN", "CAR", "UNK"],
28 | 'yardline': ["(-15)", "(35)", "(-15)", "(-30)", "(-26)",
29 | None, "(48)", "(-15)", "(-18)", None],
30 | 'down': [np.nan, np.nan, np.nan, 1.0, 2.0, np.nan, 1.0, np.nan, 1.0, np.nan],
31 | 'yards_to_go': [0, 0, 0, 10, 6, 0, 2, 0, 10, 0],
32 | 'offense_play_points': [0, 1, 0, 0, 0, 0, 6, 0, 0, 0],
33 | 'defense_play_points': [6, 0, 0, 0, 0, 0, 0, 0, 0, 0],
34 | 'home_team': ["HOU", "HOU", "HOU", "HOU", "HOU", "DEN", "DEN", "DEN", "DEN", "DEN"],
35 | 'away_team': ["KC", "KC", "KC", "KC", "KC", "CAR", "CAR", "CAR", "CAR", "CAR"],
36 | 'offense_won': [False, False, False, False, False, True, True, True, True, True]
37 | })
38 |
39 | def test_standard_play_mock(self,monkeypatch):
40 | def mockreturn_engine():
41 | return True
42 | def mockreturn_query_string(season_years, season_types):
43 | return True
44 | def mockreturn_read_sql(sql_string, engine):
45 | return self.test_df
46 | monkeypatch.setattr(utils, 'connect_nfldb', mockreturn_engine)
47 | monkeypatch.setattr(utils, '_make_nfldb_query_string', mockreturn_query_string)
48 | monkeypatch.setattr(pd, 'read_sql', mockreturn_read_sql)
49 |
50 | expected_df = pd.DataFrame({
51 | 'gsis_id': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
52 | 'drive_id': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
53 | 'play_id': [1, 2, 3, 4, 5, 1, 2, 3, 4, 5],
54 | 'seconds_elapsed': [0.0, 152.0, 354.0, 354.0, 0.0,
55 | 840.0, 840.0, 875.0, 900.0, 900.0],
56 | 'offense_team': ["HOU", "KC", "KC", "HOU", "HOU", "UNK", "DEN", "DEN", "CAR", "UNK"],
57 | 'yardline': [-15, 35, -15, -30, -26,
58 | np.nan, 48, -15, -18, np.nan],
59 | 'down': [0, 0, 0, 1, 2, 0, 1, 0, 1, 0],
60 | 'yards_to_go': [0, 0, 0, 10, 6, 0, 2, 0, 10, 0],
61 | 'home_team': ["HOU", "HOU", "HOU", "HOU", "HOU", "DEN", "DEN", "DEN", "DEN", "DEN"],
62 | 'away_team': ["KC", "KC", "KC", "KC", "KC", "CAR", "CAR", "CAR", "CAR", "CAR"],
63 | 'offense_won': [False, False, False, False, False, True, True, True, True, True],
64 | 'quarter': ["Q1", "Q1", "Q1", "Q1", "Q2", "OT", "OT", "OT2", "OT3", "OT"],
65 | 'curr_home_score': [0, 0, 0, 0, 0, 0, 0, 7, 7, 7],
66 | 'curr_away_score': [0, 6, 7, 7, 7, 0, 0, 0, 0, 0]
67 | })
68 | expected_df['down'] = expected_df['down'].astype(np.int8)
69 |
70 | pd.util.testing.assert_frame_equal(utils.get_nfldb_play_data().sort_index(axis=1),
71 | expected_df.sort_index(axis=1))
72 |
73 | @pytest.mark.requires_db
74 | def test_2015_playoffs_query(self):
75 | queried_df = utils.get_nfldb_play_data(season_years=[2015], season_types=["Postseason"])
76 | expected_df = pd.DataFrame({
77 | 'gsis_id': ['2016010900', '2016010900', '2016010900', '2016010900', '2016010900'],
78 | 'drive_id': [1, 1, 2, 2, 2],
79 | 'play_id': [36, 54, 70, 88, 109],
80 | 'seconds_elapsed': [0., 11., 11., 11., 45.],
81 | 'offense_team': ['HOU', 'KC', 'KC', 'HOU', 'HOU'],
82 | 'yardline': [-15., 35, -15, -30, -26],
83 | 'down': [0, 0, 0, 1, 2],
84 | 'yards_to_go': [0, 0, 0, 10, 6],
85 | 'home_team': ['HOU', 'HOU', 'HOU', 'HOU', 'HOU'],
86 | 'away_team': ['KC', 'KC', 'KC', 'KC', 'KC'],
87 | 'offense_won': [False, True, True, False, False],
88 | 'quarter': ['Q1', 'Q1', 'Q1', 'Q1', 'Q1'],
89 | 'curr_home_score': [0, 0, 0, 0, 0],
90 | 'curr_away_score': [0, 6, 7, 7, 7]
91 | })
92 | expected_df['down'] = expected_df['down'].astype(np.int8)
93 | pd.util.testing.assert_frame_equal(queried_df[:5].sort_index(axis=1),
94 | expected_df.sort_index(axis=1), check_column_type=False)
95 |
96 | @pytest.mark.requires_db
97 | def test_2009_regular_season_query(self):
98 | queried_df = utils.get_nfldb_play_data(season_years=[2009], season_types=["Regular"])
99 | expected_df = pd.DataFrame({
100 | 'gsis_id': ['2009091000', '2009091000', '2009091000', '2009091000', '2009091000'],
101 | 'drive_id': [1, 1, 1, 1, 1],
102 | 'play_id': [46, 68, 92, 113, 139],
103 | 'seconds_elapsed': [0., 7, 44, 85, 93],
104 | 'offense_team': ['TEN', 'PIT', 'PIT', 'PIT', 'PIT'],
105 | 'yardline': [-20., -8, -3, -6, -6],
106 | 'down': [0, 1, 2, 3, 4],
107 | 'yards_to_go': [0, 10, 5, 8, 8],
108 | 'home_team': ['PIT', 'PIT', 'PIT', 'PIT', 'PIT'],
109 | 'away_team': ['TEN', 'TEN', 'TEN', 'TEN', 'TEN'],
110 | 'offense_won': [False, True, True, True, True],
111 | 'quarter': ['Q1', 'Q1', 'Q1', 'Q1', 'Q1'],
112 | 'curr_home_score': [0, 0, 0, 0, 0],
113 | 'curr_away_score': [0, 0, 0, 0, 0]
114 | })
115 | expected_df['down'] = expected_df['down'].astype(np.int8)
116 | pd.util.testing.assert_frame_equal(queried_df[:5].sort_index(axis=1),
117 | expected_df.sort_index(axis=1), check_column_type=False)
118 |
119 | @pytest.mark.requires_db
120 | class TestConnectNFLDB(object):
121 | """testing the connect_nfldb function"""
122 | def setup_method(self, method):
123 | self.curr_config_home = nfldb.db._config_home
124 |
125 | def teardown_method(self, method):
126 | nfldb.db._config_home = self.curr_config_home
127 |
128 | def test_no_config_error(self):
129 | nfldb.db._config_home = "/boogaboogabooga"
130 |
131 | with pytest.raises(IOError):
132 | utils.connect_nfldb()
133 |
134 | @pytest.mark.requires_db
135 | def test_engine_works(self):
136 | engine = utils.connect_nfldb()
137 | test_query = ("SELECT play.description "
138 | "from play "
139 | "WHERE play.gsis_id = '2009080950' AND play.play_id=721;")
140 |
141 | plays_df = pd.read_sql(test_query, engine)
142 |
143 | assert (plays_df.iloc[0]['description'] ==
144 | u'(6:55) L.White left guard for 3 yards, TOUCHDOWN.')
145 |
146 | class TestMakeNFLDBQueryString(object):
147 | """testing the _make_nfldb_query_string function"""
148 |
149 | def test_no_args(self):
150 | expected_string = ("SELECT play.gsis_id, play.drive_id, "
151 | "play.play_id, play.time, play.pos_team AS offense_team, "
152 | "play.yardline, play.down, play.yards_to_go, "
153 | "GREATEST("
154 | "(agg_play.fumbles_rec_tds * 6), "
155 | "(agg_play.kicking_rec_tds * 6), "
156 | "(agg_play.passing_tds * 6), "
157 | "(agg_play.receiving_tds * 6), "
158 | "(agg_play.rushing_tds * 6), "
159 | "(agg_play.kicking_xpmade * 1), "
160 | "(agg_play.passing_twoptm * 2), "
161 | "(agg_play.receiving_twoptm * 2), "
162 | "(agg_play.rushing_twoptm * 2), "
163 | "(agg_play.kicking_fgm * 3)) AS offense_play_points, "
164 | "GREATEST("
165 | "(agg_play.defense_frec_tds * 6), "
166 | "(agg_play.defense_int_tds * 6), "
167 | "(agg_play.defense_misc_tds * 6), "
168 | "(agg_play.kickret_tds * 6), "
169 | "(agg_play.puntret_tds * 6), "
170 | "(agg_play.defense_safe * 2)) AS defense_play_points, "
171 | "game.home_team, game.away_team, "
172 | "((game.home_score > game.away_score AND play.pos_team = game.home_team) OR "
173 | "(game.away_score > game.home_score AND play.pos_team = game.away_team))"
174 | " AS offense_won "
175 | "FROM play INNER JOIN agg_play "
176 | "ON play.gsis_id = agg_play.gsis_id "
177 | "AND play.drive_id = agg_play.drive_id "
178 | "AND play.play_id = agg_play.play_id "
179 | "INNER JOIN game on play.gsis_id = game.gsis_id "
180 | "WHERE game.home_score != game.away_score AND game.finished = TRUE "
181 | "AND play.pos_team != 'UNK' "
182 | "AND (play.time).phase not in ('Pregame', 'Half', 'Final') "
183 | "ORDER BY play.gsis_id, play.drive_id, play.play_id;")
184 | assert expected_string == utils._make_nfldb_query_string()
185 |
186 | def test_single_year(self):
187 | """Test that adding a single year constraint works"""
188 | expected_substring = ("WHERE game.home_score != game.away_score "
189 | "AND game.finished = TRUE "
190 | "AND play.pos_team != 'UNK' "
191 | "AND (play.time).phase not in ('Pregame', 'Half', 'Final') "
192 | "AND game.season_year = 2013")
193 | assert expected_substring in utils._make_nfldb_query_string(season_years=[2013])
194 |
195 | def test_single_season_type(self):
196 | """Test that adding a single season type constraint works"""
197 | expected_substring = ("WHERE game.home_score != game.away_score "
198 | "AND game.finished = TRUE "
199 | "AND play.pos_team != 'UNK' "
200 | "AND (play.time).phase not in ('Pregame', 'Half', 'Final') "
201 | "AND game.season_type = 'Regular'")
202 | assert expected_substring in utils._make_nfldb_query_string(season_types=["Regular"])
203 |
204 | def test_multiple_year(self):
205 | """Test that adding a multiple year constraint works"""
206 | expected_substring = ("WHERE game.home_score != game.away_score "
207 | "AND game.finished = TRUE "
208 | "AND play.pos_team != 'UNK' "
209 | "AND (play.time).phase not in ('Pregame', 'Half', 'Final') "
210 | "AND game.season_year in (2013,2010)")
211 | assert expected_substring in utils._make_nfldb_query_string(season_years=[2013, 2010])
212 |
213 | def test_multiple_season_type(self):
214 |         """Test that adding a multiple season type constraint works"""
215 | expected_substring = ("WHERE game.home_score != game.away_score "
216 | "AND game.finished = TRUE "
217 | "AND play.pos_team != 'UNK' "
218 | "AND (play.time).phase not in ('Pregame', 'Half', 'Final') "
219 |                               "AND game.season_type in ('Regular','Postseason')"
220 | assert expected_substring in utils._make_nfldb_query_string(season_types=["Regular", "Postseason"])
221 |
222 |
223 | class TestAggregateNFLDBScores(object):
224 | """Testing the _aggregate_nfldb_scores function"""
225 |
226 | def test_single_game_offense_points(self):
227 | input_df = pd.DataFrame({'gsis_id': [0, 0, 0, 0, 0, 0, 0, 0],
228 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15],
229 | 'offense_team': ['KC', 'KC', 'KC', 'KC', 'NE', 'NE', 'NE', 'NE'],
230 | 'home_team': ['KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC'],
231 | 'away_team': ['NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE'],
232 | 'offense_play_points': [0, 0, 3, 0, 0, 6, 1, 0],
233 | 'defense_play_points': [0, 0, 0, 0, 0, 0, 0, 0]
234 | })
235 | expected_df = pd.DataFrame({'gsis_id': [0, 0, 0, 0, 0, 0, 0, 0],
236 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15],
237 | 'offense_team': ['KC', 'KC', 'KC', 'KC', 'NE', 'NE', 'NE', 'NE'],
238 | 'home_team': ['KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC'],
239 | 'away_team': ['NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE']
240 | })
241 | #Have to append the score columns manually:
242 | expected_df[['curr_home_score', 'curr_away_score']] = pd.DataFrame([(0, 0),
243 | (0, 0),
244 | (0, 0),
245 | (3, 0),
246 | (3, 0),
247 | (3, 0),
248 | (3, 6),
249 | (3, 7),])
250 |
251 | input_df = utils._aggregate_nfldb_scores(input_df)
252 | pd.util.testing.assert_frame_equal(input_df, expected_df)
253 |
254 | def test_single_game_defense_points(self):
255 | input_df = pd.DataFrame({'gsis_id': [0, 0, 0, 0, 0, 0, 0, 0],
256 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15],
257 | 'offense_team': ['KC', 'KC', 'KC', 'KC', 'NE', 'NE', 'NE', 'NE'],
258 | 'away_team': ['KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC'],
259 | 'home_team': ['NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE'],
260 | 'offense_play_points': [0, 0, 3, 0, 0, 6, 1, 0],
261 | 'defense_play_points': [0, 0, 0, 0, 0, 0, 0, 0]
262 | })
263 | expected_df = pd.DataFrame({'gsis_id': [0, 0, 0, 0, 0, 0, 0, 0],
264 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15],
265 | 'offense_team': ['KC', 'KC', 'KC', 'KC', 'NE', 'NE', 'NE', 'NE'],
266 | 'away_team': ['KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC'],
267 | 'home_team': ['NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE']
268 | })
269 | #Have to append the score columns manually:
270 | expected_df[['curr_home_score', 'curr_away_score']] = pd.DataFrame([(0, 0),
271 | (0, 0),
272 | (0, 0),
273 | (0, 3),
274 | (0, 3),
275 | (0, 3),
276 | (6, 3),
277 | (7, 3),])
278 |
279 | input_df = utils._aggregate_nfldb_scores(input_df)
280 | pd.util.testing.assert_frame_equal(input_df, expected_df)
281 |
282 | def test_multiple_games(self):
283 | input_df = pd.DataFrame({'gsis_id': [0, 0, 0, 1, 1, 1, 1, 1],
284 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15],
285 | 'offense_team': ['KC', 'KC', 'KC', 'NYJ', 'NE', 'NE', 'NE', 'NE'],
286 | 'home_team': ['KC', 'KC', 'KC', 'NYJ', 'NYJ', 'NYJ', 'NYJ', 'NYJ'],
287 | 'away_team': ['DEN', 'DEN', 'DEN', 'NE', 'NE', 'NE', 'NE', 'NE'],
288 | 'offense_play_points': [0, 0, 3, 0, 0, 6, 1, 0],
289 | 'defense_play_points': [0, 0, 0, 0, 0, 0, 0, 0]
290 | })
291 | expected_df = pd.DataFrame({'gsis_id': [0, 0, 0, 1, 1, 1, 1, 1],
292 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15],
293 | 'offense_team': ['KC', 'KC', 'KC', 'NYJ', 'NE', 'NE', 'NE', 'NE'],
294 | 'home_team': ['KC', 'KC', 'KC', 'NYJ', 'NYJ', 'NYJ', 'NYJ', 'NYJ'],
295 | 'away_team': ['DEN', 'DEN', 'DEN', 'NE', 'NE', 'NE', 'NE', 'NE']
296 | })
297 | #Have to append the score columns manually:
298 | expected_df[['curr_home_score', 'curr_away_score']] = pd.DataFrame([(0, 0),
299 | (0, 0),
300 | (0, 0),
301 | (0, 0),
302 | (0, 0),
303 | (0, 0),
304 | (0, 6),
305 | (0, 7),])
306 |
307 | input_df = utils._aggregate_nfldb_scores(input_df)
308 | pd.util.testing.assert_frame_equal(input_df, expected_df)
309 |
310 | def test_missing_xp(self):
311 | input_df = pd.DataFrame({'gsis_id': [0, 0, 0, 0, 0, 0, 0, 0],
312 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15],
313 | 'offense_team': ['KC', 'KC', 'KC', 'NE', 'KC', 'KC', 'KC', 'KC'],
314 | 'home_team': ['KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC'],
315 | 'away_team': ['NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE'],
316 | 'offense_play_points': [0, 0, 0, 0, 0, 0, 6, 0],
317 | 'defense_play_points': [0, 0, 6, 0, 0, 0, 0, 0]
318 | })
319 | expected_df = pd.DataFrame({'gsis_id': [0, 0, 0, 0, 0, 0, 0, 0],
320 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15],
321 | 'offense_team': ['KC', 'KC', 'KC', 'NE', 'KC', 'KC', 'KC', 'KC'],
322 | 'home_team': ['KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC'],
323 | 'away_team': ['NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE']
324 | })
325 | #Have to append the score columns manually:
326 | expected_df[['curr_home_score', 'curr_away_score']] = pd.DataFrame([(0, 0),
327 | (0, 0),
328 | (0, 0),
329 | (0, 7),
330 | (0, 7),
331 | (0, 7),
332 | (0, 7),
333 | (7, 7),])
334 |
335 | input_df = utils._aggregate_nfldb_scores(input_df)
336 | pd.util.testing.assert_frame_equal(input_df, expected_df)
337 |
338 |
339 |
340 |
341 |
342 |
--------------------------------------------------------------------------------
/nflwin/preprocessing.py:
--------------------------------------------------------------------------------
1 | """Tools to get raw data ready for modeling."""
2 | from __future__ import print_function, division
3 |
4 | import numpy as np
5 | import pandas as pd
6 |
7 | from sklearn.base import BaseEstimator
8 | from sklearn.preprocessing import OneHotEncoder
9 | from sklearn.utils.validation import NotFittedError
10 |
11 | class ComputeElapsedTime(BaseEstimator):
12 | """Compute the total elapsed time from the start of the game.
13 |
14 | Parameters
15 | ----------
16 | quarter_colname : string
17 | Which column indicates what quarter it is.
18 | quarter_time_colname : string
19 | Which column indicates how much time has elapsed in the current quarter.
20 | quarter_to_second_mapping : dict (default=``{"Q1": 0, "Q2": 900, "Q3": 1800, "Q4": 2700,
21 | "OT": 3600, "OT2": 4500, "OT3": 5400}``)
22 | What mapping to use between the string values in the quarter column and the seconds they
23 | correspond to. Mostly useful if your data had quarters listed as something like "Quarter 1"
24 | or "q1" instead of the values from ``nfldb``.
25 | total_time_colname : string (default="total_elapsed_time")
26 | What column name to store the total elapsed time under.
27 | copy : boolean (default=True)
28 |         If ``False``, add the new column in place.
29 | """
30 | def __init__(self, quarter_colname, quarter_time_colname,
31 | quarter_to_second_mapping={"Q1": 0, "Q2": 900, "Q3": 1800, "Q4": 2700,
32 | "OT": 3600, "OT2": 4500, "OT3": 5400},
33 | total_time_colname="total_elapsed_time", copy=True):
34 | self.quarter_colname = quarter_colname
35 | self.quarter_time_colname = quarter_time_colname
36 | self.quarter_to_second_mapping = quarter_to_second_mapping
37 | self.total_time_colname = total_time_colname
38 | self.copy = copy
39 |
40 | def fit(self, X, y=None):
41 | return self
42 |
43 |
44 | def transform(self, X, y=None):
45 | """Create the new column.
46 |
47 | Parameters
48 | ----------
49 | X : Pandas DataFrame, of shape(number of plays, number of features)
50 | NFL play data.
51 | y : Numpy array, with length = number of plays, or None
52 | 1 if the home team won, 0 if not.
53 | (Used as part of Scikit-learn's ``Pipeline``)
54 |
55 | Returns
56 | -------
57 | X : Pandas DataFrame, of shape(number of plays, number of features + 1)
58 | The input DataFrame, with the new column added.
59 |
60 | Raises
61 | ------
62 | KeyError
63 | If ``quarter_colname`` or ``quarter_time_colname`` don't exist, or
64 | if ``total_time_colname`` **does** exist.
65 | TypeError
66 | If the total time elapsed is not a numeric column, which typically indicates
67 | that the mapping did not apply to every row.
68 | """
69 |
70 | if self.quarter_colname not in X.columns:
71 | raise KeyError("ComputeElapsedTime: quarter_colname {0} does not exist in dataset."
72 | .format(self.quarter_colname))
73 | if self.quarter_time_colname not in X.columns:
74 | raise KeyError("ComputeElapsedTime: quarter_time_colname {0} does not exist in dataset."
75 | .format(self.quarter_time_colname))
76 |
77 | if self.total_time_colname in X.columns:
78 | raise KeyError("ComputeElapsedTime: total_time_colname {0} already exists in dataset."
79 | .format(self.total_time_colname))
80 |
81 | if self.copy:
82 | X = X.copy()
83 |
84 | try:
85 | time_elapsed = X[self.quarter_colname].replace(self.quarter_to_second_mapping) + X[self.quarter_time_colname]
86 | except TypeError:
87 | raise TypeError("ComputeElapsedTime: Total time elapsed not numeric. Check your mapping from quarter name to time.")
88 |
89 | X[self.total_time_colname] = time_elapsed.astype(np.int)
90 |
91 | return X
92 |
93 |
94 | class ComputeIfOffenseIsHome(BaseEstimator):
95 | """Determine if the team currently with possession is the home team.
96 |
97 |
98 | Parameters
99 | ----------
100 | offense_team_colname : string
101 | Which column indicates what team was on offense.
102 | home_team_colname : string
103 | Which column indicates what team was the home team.
104 | offense_home_team_colname : string (default="is_offense_home")
105 | What column to store whether or not the offense was the home team.
106 | copy : boolean (default=True)
107 |         If ``False``, add the new column in place.
108 | """
109 | def __init__(self, offense_team_colname,
110 | home_team_colname,
111 | offense_home_team_colname="is_offense_home",
112 | copy=True):
113 | self.offense_team_colname = offense_team_colname
114 | self.home_team_colname = home_team_colname
115 | self.offense_home_team_colname = offense_home_team_colname
116 | self.copy = copy
117 |
118 | def fit(self, X, y=None):
119 | return self
120 |
121 | def transform(self, X, y=None):
122 | """Create the new column.
123 |
124 | Parameters
125 | ----------
126 | X : Pandas DataFrame, of shape(number of plays, number of features)
127 | NFL play data.
128 | y : Numpy array, with length = number of plays, or None
129 | 1 if the home team won, 0 if not.
130 | (Used as part of Scikit-learn's ``Pipeline``)
131 |
132 | Returns
133 | -------
134 | X : Pandas DataFrame, of shape(number of plays, number of features + 1)
135 | The input DataFrame, with the new column added.
136 |
137 | Raises
138 | ------
139 | KeyError
140 | If ``offense_team_colname`` or ``home_team_colname`` don't exist, or
141 | if ``offense_home_team_colname`` **does** exist.
142 | """
143 |
144 | if self.home_team_colname not in X.columns:
145 |             raise KeyError("ComputeIfOffenseIsHome: home_team_colname {0} does not exist in dataset."
146 | .format(self.home_team_colname))
147 | if self.offense_team_colname not in X.columns:
148 |             raise KeyError("ComputeIfOffenseIsHome: offense_team_colname {0} does not exist in dataset."
149 | .format(self.offense_team_colname))
150 |
151 | if self.offense_home_team_colname in X.columns:
152 |             raise KeyError("ComputeIfOffenseIsHome: offense_home_team_colname {0} already exists in dataset."
153 | .format(self.offense_home_team_colname))
154 |
155 | if self.copy:
156 | X = X.copy()
157 |
158 | X[self.offense_home_team_colname] = (X[self.home_team_colname] == X[self.offense_team_colname])
159 |
160 | return X
161 |
162 |
163 | class MapToInt(BaseEstimator):
164 | """Map a column of values to integers.
165 |
166 | Mapping to integer is nice if you know a column
167 | only has a few specific values in it, but you need
168 | to convert it to integers before one-hot encoding it.
169 |
170 | Parameters
171 | ----------
172 | colname : string
173 | The name of the column to perform the mapping on.
174 | copy : boolean (default=True)
175 | If ``False``, apply the mapping in-place.
176 |
177 | Attributes
178 | ----------
179 | mapping : dict
180 | Keys are the unique values of the column, values are the
181 | integers those values will be mapped to.
182 |
183 | Note
184 | ----
185 | The ``transform`` method DOES NOT CHECK to see if the input
186 | DataFrame only contains values in ``mapping``. Any values not
187 | in ``mapping`` will be left alone, which can cause subtle bugs
188 | if you're not careful.
189 | """
190 |
191 | def __init__(self, colname, copy=True):
192 | self.colname = colname
193 | self.copy = copy
194 | self.mapping = None
195 |
196 | def fit(self, X, y=None):
197 | """Find all unique strings and construct the mapping.
198 |
199 | Parameters
200 | ----------
201 | X : Pandas DataFrame, of shape(number of plays, number of features)
202 | NFL play data.
203 | y : Numpy array, with length = number of plays, or None
204 | 1 if the home team won, 0 if not.
205 | (Used as part of Scikit-learn's ``Pipeline``)
206 |
207 | Returns
208 | -------
209 | self : For compatibility with Scikit-learn's ``Pipeline``.
210 |
211 | Raises
212 | ------
213 | KeyError
214 | If ``colname`` is not in ``X``.
215 |
216 | """
217 | if self.colname not in X.columns:
218 |             raise KeyError("MapToInt: Required column {0} "
219 | "not present in data".format(self.colname))
220 | unique_values = X[self.colname].unique()
221 |
222 | self.mapping = {unique_values[i]: i for i in range(len(unique_values))}
223 |
224 | try:
225 | del self.mapping[np.nan]
226 | except KeyError:
227 | pass
228 |
229 | return self
230 |
231 | def transform(self, X, y=None):
232 | """Apply the mapping to the data.
233 |
234 | Parameters
235 | ----------
236 | X : Pandas DataFrame, of shape(number of plays, number of features)
237 | NFL play data.
238 | y : Numpy array, with length = number of plays, or None
239 | 1 if the home team won, 0 if not.
240 | (Used as part of Scikit-learn's ``Pipeline``)
241 |
242 | Returns
243 | -------
244 | X : Pandas DataFrame, of shape(number of plays, number of features)
245 | The input DataFrame, with the mapping applied.
246 |
247 | Raises
248 | ------
249 | NotFittedError
250 | If ``transform`` is called before ``fit``.
251 | KeyError
252 | If ``colname`` is not in ``X``.
253 | """
254 | if not self.mapping:
255 |             raise NotFittedError("MapToInt: Must fit before transform.")
256 |
257 | if self.colname not in X.columns:
258 |             raise KeyError("MapToInt: Required column {0} "
259 | "not present in data".format(self.colname))
260 |
261 | if self.copy:
262 | X = X.copy()
263 |
264 | X[self.colname].replace(self.mapping, inplace=True)
265 |
266 | return X
267 |
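# A minimal usage sketch for ``MapToInt`` (illustrative; the "down" values are made up):
#
#   import pandas as pd
#   plays = pd.DataFrame({"down": ["first", "second", "first", "third"]})
#   mapper = MapToInt("down")
#   plays = mapper.fit(plays).transform(plays)
#   # mapper.mapping is {"first": 0, "second": 1, "third": 2}, and plays["down"]
#   # now holds those integers instead of the original strings.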
268 |
269 | class OneHotEncoderFromDataFrame(BaseEstimator):
270 | """One-hot encode a DataFrame.
271 |
272 | This cleaner wraps the standard scikit-learn OneHotEncoder,
273 | handling the transfer between column name and column index.
274 |
275 | Parameters
276 | ----------
277 | categorical_feature_names : "all" or array of column names.
278 | Specify what features are treated as categorical.
279 | * "all" (default): All features are treated as categorical.
280 | * array of column names: Array of categorical feature names.
281 | dtype : number type, default=np.float.
282 | Desired dtype of output.
283 | handle_unknown : str, "error" (default) or "ignore".
284 | Whether to raise an error or ignore if an unknown categorical feature
285 | is present during transform.
286 | copy : boolean (default=True)
287 | If ``False``, apply the encoding in-place.
288 | """
289 |
290 | @property
291 | def dtype(self):
292 | return self._dtype
293 | @dtype.setter
294 | def dtype(self, dtype):
295 | self._dtype = dtype
296 | self.onehot.dtype = self._dtype
297 |
298 | @property
299 | def handle_unknown(self):
300 | return self._handle_unknown
301 | @handle_unknown.setter
302 | def handle_unknown(self, handle_unknown):
303 | self._handle_unknown = handle_unknown
304 | self.onehot.handle_unknown = self._handle_unknown
305 |
306 | def __init__(self,
307 | categorical_feature_names="all",
308 | dtype=np.float,
309 | handle_unknown="error",
310 | copy=True):
311 | self.onehot = OneHotEncoder(sparse=False, n_values="auto",
312 | categorical_features="all") #We'll subset the DF
313 | self.categorical_feature_names = categorical_feature_names
314 | self.dtype = dtype
315 | self.handle_unknown = handle_unknown
316 | self.copy = copy
317 |
318 | def fit(self, X, y=None):
319 | """Convert the column names to indices, then compute the one hot encoding.
320 |
321 | Parameters
322 | ----------
323 | X : Pandas DataFrame, of shape(number of plays, number of features)
324 | NFL play data.
325 | y : Numpy array, with length = number of plays, or None
326 | 1 if the home team won, 0 if not.
327 | (Used as part of Scikit-learn's ``Pipeline``)
328 |
329 | Returns
330 | -------
331 | self : For compatibility with Scikit-learn's ``Pipeline``.
332 | """
333 |
334 | if self.categorical_feature_names == "all":
335 | self.categorical_feature_names = X.columns
336 |
337 | #Get all columns that need to be encoded:
338 | data_to_encode = X[self.categorical_feature_names]
339 |
340 |
341 | self.onehot.fit(data_to_encode)
342 |
343 | return self
344 |
345 | def transform(self, X, y=None):
346 | """Apply the encoding to the data.
347 |
348 | Parameters
349 | ----------
350 | X : Pandas DataFrame, of shape(number of plays, number of features)
351 | NFL play data.
352 | y : Numpy array, with length = number of plays, or None
353 | 1 if the home team won, 0 if not.
354 | (Used as part of Scikit-learn's ``Pipeline``)
355 |
356 | Returns
357 | -------
358 | X : Pandas DataFrame, of shape(number of plays, number of new features)
359 | The input DataFrame, with the encoding applied.
360 | """
361 | if self.copy:
362 | X = X.copy()
363 |
364 | data_to_transform = X[self.categorical_feature_names]
365 | transformed_data = self.onehot.transform(data_to_transform)
366 |
367 | #TODO (AndrewRook): Find good column names for the encoded columns.
368 | colnames = ["onehot_col{0}".format(i+1) for i in range(transformed_data.shape[1])]
369 | #Create a dataframe from the transformed columns (setting the index is critical for
370 | #merging with data containing non-standard indexes)
371 | transformed_df = pd.DataFrame(transformed_data, columns=colnames, index=X.index)
372 |
373 | X.drop(self.categorical_feature_names, axis=1, inplace=True)
374 | X[transformed_df.columns] = transformed_df
375 |
376 | return X
377 |
378 |
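# A minimal usage sketch for ``OneHotEncoderFromDataFrame`` (illustrative; the column
# names and values are made up, and the encoded columns come back named
# "onehot_col1", "onehot_col2", ... as generated in ``transform`` above):
#
#   import pandas as pd
#   plays = pd.DataFrame({"down": [1, 2, 1, 3], "yardline": [-20, 0, 15, 30]})
#   encoder = OneHotEncoderFromDataFrame(categorical_feature_names=["down"])
#   plays = encoder.fit(plays).transform(plays)
#   # "down" is dropped and replaced by onehot_col1-onehot_col3; "yardline" is untouched.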
379 |
380 | class CreateScoreDifferential(BaseEstimator):
381 |     """Convert home and away scores into a score differential from the offense's perspective (offense - defense).
382 |
383 | Parameters
384 | ----------
385 | home_score_colname : string
386 | The name of the column containing the score of the home team.
387 | away_score_colname : string
388 | The name of the column containing the score of the away team.
389 | offense_home_colname : string
390 | The name of the column indicating if the offense is home.
391 | score_differential_colname : string (default=``"score_differential"``)
392 | The name of column containing the score differential. Must not already
393 | exist in the DataFrame.
394 | copy : boolean (default = ``True``)
395 | If ``False``, add the score differential in place.
396 | """
397 | def __init__(self, home_score_colname,
398 | away_score_colname,
399 | offense_home_colname,
400 | score_differential_colname="score_differential",
401 | copy=True):
402 | self.home_score_colname = home_score_colname
403 | self.away_score_colname = away_score_colname
404 | self.offense_home_colname = offense_home_colname
405 | self.score_differential_colname = score_differential_colname
406 | self.copy = copy
407 |
408 | def fit(self, X, y=None):
409 | return self
410 |
411 | def transform(self, X, y=None):
412 | """Create the score differential column.
413 |
414 | Parameters
415 | ----------
416 | X : Pandas DataFrame, of shape(number of plays, number of features)
417 | NFL play data.
418 | y : Numpy array, with length = number of plays, or None
419 | 1 if the home team won, 0 if not.
420 | (Used as part of Scikit-learn's ``Pipeline``)
421 |
422 | Returns
423 | -------
424 | X : Pandas DataFrame, of shape(number of plays, number of features + 1)
425 | The input DataFrame, with the score differential column added.
426 | """
427 | try:
428 | score_differential = ((X[self.home_score_colname] - X[self.away_score_colname]) *
429 | (2 * X[self.offense_home_colname] - 1))
430 | except KeyError:
431 | raise KeyError("CreateScoreDifferential: data missing required column. Must "
432 | "include columns named {0}, {1}, and {2}".format(self.home_score_colname,
433 | self.away_score_colname,
434 | self.offense_home_colname))
435 | if self.score_differential_colname in X.columns:
436 | raise KeyError("CreateScoreDifferential: column {0} already in DataFrame, and can't "
437 | "be used for the score differential".format(self.score_differential_colname))
438 |
439 | if self.copy:
440 | X = X.copy()
441 |
442 | X[self.score_differential_colname] = score_differential
443 |
444 | return X
445 |
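# A minimal usage sketch for ``CreateScoreDifferential`` (illustrative; the column
# names and scores are made up):
#
#   import pandas as pd
#   plays = pd.DataFrame({"curr_home_score": [7, 14], "curr_away_score": [3, 21],
#                         "is_offense_home": [True, False]})
#   step = CreateScoreDifferential("curr_home_score", "curr_away_score", "is_offense_home")
#   plays = step.fit(plays).transform(plays)
#   # plays["score_differential"] is [4, 7]: positive whenever the offense is ahead.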
446 |
447 |
448 | class CheckColumnNames(BaseEstimator):
449 | """Make sure user has the right column names, in the right order.
450 |
451 | This is a useful first step to make sure that nothing
452 | is going to break downstream, but can also be used effectively
453 | to drop columns that are no longer necessary.
454 |
455 | Parameters
456 | ----------
457 | column_names : ``None``, or list of strings
458 | A list of column names that need to be present in the scoring
459 | data. All other columns will be stripped out. The order of the
460 | columns will be applied to any scoring
461 | data as well, in order to handle the fact that pandas lets
462 | you play fast and loose with column order. If ``None``,
463 | will obtain every column in the DataFrame passed to the
464 | ``fit`` method.
465 | copy : boolean (default=``True``)
466 |         If ``False``, subset and reorder the columns in place.
467 |
468 | """
469 | def __init__(self, column_names=None, copy=True):
470 | self.column_names = column_names
471 | self.copy = copy
472 | self._fit = True
473 | self.user_specified_columns = False
474 | if self.column_names is None:
475 | self._fit = False
476 | else:
477 | self.user_specified_columns = True
478 |
479 |
480 | def fit(self, X, y=None):
481 | """Grab the column names from a Pandas DataFrame.
482 |
483 | Parameters
484 | ----------
485 | X : Pandas DataFrame, of shape(number of plays, number of features)
486 | NFL play data.
487 | y : Numpy array, with length = number of plays, or None
488 | 1 if the home team won, 0 if not.
489 | (Used as part of Scikit-learn's ``Pipeline``)
490 |
491 | Returns
492 | -------
493 | self : For compatibility with Scikit-learn's ``Pipeline``.
494 | """
495 | if not self.user_specified_columns:
496 | self.column_names = X.columns
497 | self._fit = True
498 |
499 | return self
500 |
501 | def transform(self, X, y=None):
502 | """Apply the column ordering to the data.
503 |
504 | Parameters
505 | ----------
506 | X : Pandas DataFrame, of shape(number of plays, number of features)
507 | NFL play data.
508 | y : Numpy array, with length = number of plays, or None
509 | 1 if the home team won, 0 if not.
510 | (Used as part of Scikit-learn's ``Pipeline``)
511 |
512 | Returns
513 | -------
514 | X : Pandas DataFrame, of shape(number of plays, ``len(column_names)``)
515 | The input DataFrame, properly ordered and with extraneous
516 | columns dropped
517 |
518 | Raises
519 | ------
520 | KeyError
521 | If the input data frame doesn't have all the columns specified
522 | by ``column_names``.
523 | NotFittedError
524 | If ``transform`` is called before ``fit``.
525 | """
526 | if not self._fit:
527 |             raise NotFittedError("CheckColumnNames: Call 'fit' before 'transform'.")
528 |
529 | if self.copy:
530 | X = X.copy()
531 |
532 | try:
533 |
534 | return X[self.column_names]
535 | except KeyError:
536 |             raise KeyError("CheckColumnNames: DataFrame does not have required columns. "
537 | "Must contain at least {0}".format(self.column_names))
538 |
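# A minimal usage sketch for ``CheckColumnNames`` (illustrative; the column names are
# made up, and the column order remembered at fit time depends on the training frame):
#
#   import pandas as pd
#   training = pd.DataFrame({"down": [1, 2], "yardline": [-20, 30]})
#   checker = CheckColumnNames()
#   checker.fit(training)  # remembers the training columns and their order
#   scoring = pd.DataFrame({"yardline": [10], "extra_column": [0], "down": [3]})
#   scoring = checker.transform(scoring)
#   # "extra_column" is dropped and the remaining columns match the fit-time order.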
--------------------------------------------------------------------------------
/nflwin/model.py:
--------------------------------------------------------------------------------
1 | """Tools for creating and running the model."""
2 | from __future__ import print_function, division
3 |
4 | import os
5 |
6 | import numpy as np
7 | from scipy import integrate
8 | from scipy import stats
9 |
10 | import joblib
11 |
12 | from sklearn.ensemble import RandomForestClassifier
13 | from sklearn.linear_model import LogisticRegression
14 | from sklearn.calibration import CalibratedClassifierCV
15 | from sklearn.model_selection import train_test_split, GridSearchCV
16 | from sklearn.metrics import brier_score_loss
17 | from sklearn.neighbors import KernelDensity
18 | from sklearn.pipeline import Pipeline
19 | from sklearn.utils.validation import NotFittedError
20 |
21 | from . import preprocessing, utilities
22 |
23 | class WPModel(object):
24 | """The object that computes win probabilities.
25 |
26 | In addition to holding the model itself, it defines some columns names likely to be
27 | used in the model as parameters to allow other users to more easily figure out which
28 | columns go into the model.
29 |
30 | Parameters
31 | ----------
32 | copy_data : boolean (default=``True``)
33 | Whether or not to copy data when fitting and applying the model. Running the model
34 | in-place (``copy_data=False``) will be faster and have a smaller memory footprint,
35 | but if not done carefully can lead to data integrity issues.
36 |
37 | Attributes
38 | ----------
39 | model : A Scikit-learn pipeline (or equivalent)
40 | The actual model used to compute WP. Upon initialization it will be set to
41 | a default model, but can be overridden by the user.
42 | column_descriptions : dictionary
43 | A dictionary whose keys are the names of the columns used in the model, and the values are
44 |         string descriptions of what the columns mean. Set at initialization to describe the default model;
45 | if you create your own model you'll need to update this attribute manually.
46 | training_seasons : A list of ints, or ``None`` (default=``None``)
47 | If the model was trained using data downloaded from nfldb, a list of the seasons
48 | used to train the model. If nfldb was **not** used, an empty list. If no model
49 | has been trained yet, ``None``.
50 | training_season_types : A list of strings or ``None`` (default=``None``)
51 | Same as ``training_seasons``, except for the portions of the seasons used in training the
52 | model ("Preseason", "Regular", and/or "Postseason").
53 | validation_seasons : same as ``training_seasons``, but for validation data.
54 | validation_season_types : same as ``training_season_types``, but for validation data.
55 | sample_probabilities : A numpy array of floats or ``None`` (default=``None``)
56 | After the model has been validated, contains the sampled predicted probabilities used to
57 | compute the validation statistic.
58 | predicted_win_percents : A numpy array of floats or ``None`` (default=``None``)
59 | After the model has been validated, contains the actual probabilities in the test
60 | set at each probability in ``sample_probabilities``.
61 | num_plays_used : A numpy array of floats or ``None`` (default=``None``)
62 | After the model has been validated, contains the number of plays used to compute each
63 | element of ``predicted_win_percents``.
64 | model_directory : string
65 | The directory where all models will be saved to or loaded from.
66 |
67 | """
68 | model_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
69 | _default_model_filename = "default_model.nflwin"
70 |
71 | def __init__(self,
72 | copy_data=True
73 | ):
74 | self.copy_data = copy_data
75 |
76 | self.model = self.create_default_pipeline()
77 | self._training_seasons = None
78 | self._training_season_types = None
79 | self._validation_seasons = None
80 | self._validation_season_types = None
81 |
82 | self._sample_probabilities = None
83 | self._predicted_win_percents = None
84 | self._num_plays_used = None
85 |
86 |
87 | @property
88 | def training_seasons(self):
89 | return self._training_seasons
90 | @property
91 |     def training_season_types(self):
92 | return self._training_season_types
93 | @property
94 | def validation_seasons(self):
95 | return self._validation_seasons
96 | @property
97 |     def validation_season_types(self):
98 | return self._validation_season_types
99 |
100 | @property
101 | def sample_probabilities(self):
102 | return self._sample_probabilities
103 | @property
104 | def predicted_win_percents(self):
105 | return self._predicted_win_percents
106 | @property
107 | def num_plays_used(self):
108 | return self._num_plays_used
109 |
110 | def train_model(self,
111 | source_data="nfldb",
112 | training_seasons=(2009, 2010, 2011, 2012, 2013, 2014),
113 | training_season_types=("Regular", "Postseason"),
114 | target_colname="offense_won"):
115 | """Train the model.
116 |
117 | Once a modeling pipeline is set up (either the default or something
118 | custom-generated), historical data needs to be fed into it in order to
119 | "fit" the model so that it can then be used to predict future results.
120 | This method implements a simple wrapper around the core Scikit-learn functionality
121 | which does this.
122 |
123 | The default is to use data from the nfldb database, however that can be changed
124 | to a simple Pandas DataFrame if desired (for instance if you wish to use data
125 | from another source).
126 |
127 | There is no particular output from this function, rather the parameters governing
128 | the fit of the model are saved inside the model object itself. If you want to get an
129 | estimate of the quality of the fit, use the ``validate_model`` method after running
130 | this method.
131 |
132 | Notes
133 | -----
134 | If you are loading in the default model, **there is no need to re-run this method**.
135 | In fact, doing so will likely result in weird errors and could corrupt the model if you
136 | were to try to save it back to disk.
137 |
138 | Parameters
139 | ----------
140 | source_data : the string ``"nfldb"`` or a Pandas DataFrame (default=``"nfldb"``)
141 | The data to be used to train the model. If ``"nfldb"``, will query the nfldb
142 | database for the training data (note that this requires a correctly configured
143 | installation of nfldb's database).
144 | training_seasons : list of ints (default=``[2009, 2010, 2011, 2012, 2013, 2014]``)
145 | What seasons to use to train the model if getting data from the nfldb database.
146 | If ``source_data`` is not ``"nfldb"``, this argument will be ignored.
147 | **NOTE:** it is critical not to use all possible data in order to train the
148 | model - some will need to be reserved for a final validation (see the
149 | ``validate_model`` method). A good dataset to reserve
150 | for validation is the most recent one or two NFL seasons.
151 | training_season_types : list of strings (default=``["Regular", "Postseason"]``)
152 | If querying from the nfldb database, what parts of the seasons to use.
153 | Options are "Preseason", "Regular", and "Postseason". If ``source_data`` is not
154 | ``"nfldb"``, this argument will be ignored.
155 | target_colname : string or integer (default=``"offense_won"``)
156 | The name of the target variable column.
157 |
158 | Returns
159 | -------
160 | ``None``
161 | """
162 | self._training_seasons = []
163 | self._training_season_types = []
164 | if isinstance(source_data, str):
165 | if source_data == "nfldb":
166 | source_data = utilities.get_nfldb_play_data(season_years=training_seasons,
167 | season_types=training_season_types)
168 | self._training_seasons = training_seasons
169 | self._training_season_types = training_season_types
170 | else:
171 | raise ValueError("WPModel: if source_data is a string, it must be 'nfldb'")
172 | target_col = source_data[target_colname]
173 | feature_cols = source_data.drop(target_colname, axis=1)
174 | self.model.fit(feature_cols, target_col)
175 |
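    # A minimal usage sketch (illustrative; pulling from nfldb assumes a working nfldb
    # installation, and the season choices are just examples):
    #
    #   from nflwin.model import WPModel
    #   wp_model = WPModel()
    #   wp_model.train_model(training_seasons=[2009, 2010, 2011, 2012, 2013, 2014])
    #
    # Or, with a pre-built DataFrame (the hypothetical ``my_plays`` must contain the
    # feature columns plus an "offense_won" target column):
    #
    #   wp_model.train_model(source_data=my_plays)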
176 | def validate_model(self,
177 | source_data="nfldb",
178 | validation_seasons=(2015,),
179 | validation_season_types=("Regular", "Postseason"),
180 | target_colname="offense_won"):
181 | """Validate the model.
182 |
183 | Once a modeling pipeline is trained, a different dataset must be fed into the trained model
184 | to validate the quality of the fit.
185 | This method implements a simple wrapper around the core Scikit-learn functionality
186 | which does this.
187 |
188 | The default is to use data from the nfldb database, however that can be changed
189 | to a simple Pandas DataFrame if desired (for instance if you wish to use data
190 | from another source).
191 |
192 |         This method was designed to produce a p value representing the confidence at which we can reject the
193 |         null hypothesis that the model predicts the appropriate win probabilities; the current implementation instead
194 |         returns summary deviation statistics (see Returns). In either case the first step is to smooth the predicted win probabilities of both all test data and
195 |         just the data where the offense won with a Gaussian `kernel density
196 |         estimate <https://en.wikipedia.org/wiki/Kernel_density_estimation>`_
197 | with standard deviation = 0.01. Once the data is smooth, ratios at each percentage point from 1% to 99% are computed (i.e.
198 | what fraction of the time did the offense win when the model says they have a 1% chance of winning, 2% chance, etc.). Each of
199 | these ratios should be well approximated by the binomial distribution, since they are essentially independent (not perfectly
200 |         but hopefully close enough) weighted coin flips, giving a p value. From there `Fisher's method <https://en.wikipedia.org/wiki/Fisher%27s_method>`_
201 | is used to combine the p values into a global p value. A p value close to zero means that the model is unlikely to be
202 | properly predicting the correct win probabilities. A p value close to one, **while not proof that the model is correct**,
203 | means that the model is at least not inconsistent with the hypothesis that it predicts good win probabilities.
204 |
205 | Parameters
206 | ----------
207 | source_data : the string ``"nfldb"`` or a Pandas DataFrame (default=``"nfldb"``)
208 |             The data to be used to validate the model. If ``"nfldb"``, will query the nfldb
209 |             database for the validation data (note that this requires a correctly configured
210 | installation of nfldb's database).
211 |         validation_seasons : list of ints (default=``[2015]``)
212 | What seasons to use to validate the model if getting data from the nfldb database.
213 | If ``source_data`` is not ``"nfldb"``, this argument will be ignored.
214 | **NOTE:** it is critical not to use the same data to validate the model as was used
215 | in the fit. Generally a good data set to use for validation is one from a time
216 | period more recent than was used to train the model. For instance, if the model was trained
217 | on data from 2009-2014, data from the 2015 season would be a sensible choice to validate the model.
218 |         validation_season_types : list of strings (default=``["Regular", "Postseason"]``)
219 | If querying from the nfldb database, what parts of the seasons to use.
220 | Options are "Preseason", "Regular", and "Postseason". If ``source_data`` is not
221 | ``"nfldb"``, this argument will be ignored.
222 | target_colname : string or integer (default=``"offense_won"``)
223 | The name of the target variable column.
224 |
225 | Returns
226 | -------
227 |         tuple of two floats
228 |             ``(max_deviation, residual_area)``: the largest discrepancy between the smoothed predicted and observed
229 |             win percentages at any point, and the total area between the predicted and observed curves.
230 |
231 | Raises
232 | ------
233 | NotFittedError
234 | If the model hasn't been fit.
235 |
236 | Notes
237 | -----
238 | Probabilities are computed between 1 and 99 percent because a single incorrect prediction at 100% or 0% automatically drives
239 | the global p value to zero. Since the model is being smoothed this situation can occur even when there are no model predictions
240 | at those extreme values, and therefore leads to erroneous p values.
241 |
242 | While it seems reasonable (to me at least), I am not totally certain that this approach is entirely correct.
243 | It's certainly sub-optimal in that you would ideally reject the null hypothesis that the model predictions
244 | **aren't** appropriate, but that seems to be a much harder problem (and one that would need much more test
245 | data to beat down the uncertainties involved). I'm also not sure if using Fisher's method is appropriate here,
246 | and I wonder if it might be necessary to Monte Carlo this. I would welcome input from others on better ways to do this.
247 |
248 | """
249 |
250 | if self.training_seasons is None:
251 | raise NotFittedError("Must fit model before validating.")
252 |
253 | self._validation_seasons = []
254 | self._validation_season_types = []
255 | if isinstance(source_data, str):
256 | if source_data == "nfldb":
257 | source_data = utilities.get_nfldb_play_data(season_years=validation_seasons,
258 | season_types=validation_season_types)
259 | self._validation_seasons = validation_seasons
260 | self._validation_season_types = validation_season_types
261 | else:
262 | raise ValueError("WPModel: if source_data is a string, it must be 'nfldb'")
263 |
264 | target_col = source_data[target_colname]
265 | feature_cols = source_data.drop(target_colname, axis=1)
266 | predicted_probabilities = self.model.predict_proba(feature_cols)[:,1]
267 |
268 | self._sample_probabilities, self._predicted_win_percents, self._num_plays_used = (
269 | WPModel._compute_predicted_percentages(target_col.values, predicted_probabilities))
270 |
271 | #Compute the maximal deviation from a perfect prediction as well as the area under the
272 | #curve of the residual between |predicted - perfect|:
273 | max_deviation, residual_area = self._compute_prediction_statistics(self.sample_probabilities,
274 | self.predicted_win_percents)
275 | return max_deviation, residual_area
276 |
277 | #Compute p-values for each where null hypothesis is that distributions are same, then combine
278 | #them all to make sure data is not inconsistent with accurate predictions.
279 | # combined_pvalue = self._test_distribution(self.sample_probabilities,
280 | # self.predicted_win_percents,
281 | # self.num_plays_used)
282 |
283 | # return combined_pvalue
284 |
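    # A minimal usage sketch (illustrative; assumes ``wp_model`` was already trained as in
    # the ``train_model`` example above):
    #
    #   max_deviation, residual_area = wp_model.validate_model(validation_seasons=[2015])
    #   ax = wp_model.plot_validation()  # compare predicted vs. observed win percentages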
285 | @staticmethod
286 | def _compute_prediction_statistics(sample_probabilities, predicted_win_percents):
287 | """Take the KDE'd model estimates, then compute statistics.
288 |
289 | Returns
290 | -------
291 | A tuple of (``max_deviation``, ``residual_area``), where ``max_deviation``
292 | is the largest discrepancy between the model and expectation at any WP,
293 | and ``residual_area`` is the total area under the curve of |predicted WP - expected WP|.
294 | """
295 | abs_deviations = np.abs(predicted_win_percents - sample_probabilities)
296 | max_deviation = np.max(abs_deviations)
297 | residual_area = integrate.simps(abs_deviations,
298 | sample_probabilities)
299 | return (max_deviation, residual_area)
300 |
301 |
302 | def predict_wp(self, plays):
303 | """Estimate the win probability for a set of plays.
304 |
305 | Basically a simple wrapper around ``WPModel.model.predict_proba``,
306 | takes in a DataFrame and then spits out an array of predicted
307 | win probabilities.
308 |
309 | Parameters
310 | ----------
311 | plays : Pandas DataFrame
312 | The input data to use to make the predictions.
313 |
314 | Returns
315 | -------
316 | Numpy array, of length ``len(plays)``
317 | Predicted probability that the offensive team in each play
318 | will go on to win the game.
319 |
320 | Raises
321 | ------
322 | NotFittedError
323 | If the model hasn't been fit.
324 | """
325 | if self.training_seasons is None:
326 | raise NotFittedError("Must fit model before predicting WP.")
327 |
328 | return self.model.predict_proba(plays)[:,1]
329 |
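    # A minimal usage sketch (illustrative; ``current_plays`` is a hypothetical DataFrame
    # containing the columns listed in ``wp_model.column_descriptions``):
    #
    #   win_probabilities = wp_model.predict_wp(current_plays)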
330 |
331 | def plot_validation(self, axis=None, **kwargs):
332 | """Plot the validation data.
333 |
334 | Parameters
335 | ----------
336 | axis : matplotlib.pyplot.axis object or ``None`` (default=``None``)
337 | If provided, the validation line will be overlaid on ``axis``.
338 | Otherwise, a new figure and axis will be generated and plotted on.
339 | **kwargs
340 | Arguments to ``axis.plot``.
341 |
342 | Returns
343 | -------
344 |         matplotlib.pyplot.axis
345 | The axis the plot was made on.
346 |
347 | Raises
348 | ------
349 | NotFittedError
350 | If the model hasn't been fit **and** validated.
351 | """
352 |
353 | if self.sample_probabilities is None:
354 | raise NotFittedError("Must validate model before plotting.")
355 |
356 | import matplotlib.pyplot as plt
357 | if axis is None:
358 | axis = plt.figure().add_subplot(111)
359 | axis.plot([0, 100], [0, 100], ls="--", lw=2, color="black")
360 | axis.set_xlabel("Predicted WP")
361 | axis.set_ylabel("Actual WP")
362 | axis.plot(self.sample_probabilities,
363 | self.predicted_win_percents,
364 | **kwargs)
365 |
366 | return axis
367 |
368 |
369 | @staticmethod
370 | def _test_distribution(sample_probabilities, predicted_win_percents, num_plays_used):
371 | """Based off assuming the data at each probability is a Bernoulli distribution."""
372 |
373 | #Get the p-values:
374 | p_values = [stats.binom_test(np.round(predicted_win_percents[i] * num_plays_used[i]),
375 | np.round(num_plays_used[i]),
376 | p=sample_probabilities[i]) for i in range(len(sample_probabilities))]
377 | combined_p_value = stats.combine_pvalues(p_values)[1]
378 |         return combined_p_value
379 |
380 | @staticmethod
381 | def _compute_predicted_percentages(actual_results, predicted_win_probabilities):
382 | """Compute the sample percentages from a validation data set.
383 | """
384 | kde_offense_won = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(
385 | (predicted_win_probabilities[(actual_results == 1)])[:, np.newaxis])
386 | kde_total = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(
387 | predicted_win_probabilities[:, np.newaxis])
388 | sample_probabilities = np.linspace(0.01, 0.99, 99)
389 | number_density_offense_won = np.exp(kde_offense_won.score_samples(sample_probabilities[:, np.newaxis])) * np.sum((actual_results))
390 | number_density_total = np.exp(kde_total.score_samples(sample_probabilities[:, np.newaxis])) * len(actual_results)
391 | number_offense_won = number_density_offense_won * np.sum(actual_results) / np.sum(number_density_offense_won)
392 | number_total = number_density_total * len(actual_results) / np.sum(number_density_total)
393 | predicted_win_percents = number_offense_won / number_total
394 |
395 | return 100.*sample_probabilities, 100.*predicted_win_percents, number_total
396 |
397 | def create_default_pipeline(self):
398 | """Create the default win probability estimation pipeline.
399 |
400 |
401 | Returns
402 | -------
403 | Scikit-learn pipeline
404 | The default pipeline, suitable for computing win probabilities
405 | but by no means the best possible model.
406 |
407 | This can be run any time a new default pipeline is required,
408 | and either set to the ``model`` attribute or used independently.
409 | """
410 |
411 | steps = []
412 |
413 | offense_team_colname = "offense_team"
414 | home_team_colname = "home_team"
415 | home_score_colname = "curr_home_score"
416 | away_score_colname = "curr_away_score"
417 | down_colname = "down"
418 | quarter_colname = "quarter"
419 | time_colname = "seconds_elapsed"
420 | yardline_colname = "yardline"
421 | yards_to_go_colname="yards_to_go"
422 |
423 | self.column_descriptions = {
424 | offense_team_colname: "Abbreviation for the offensive team",
425 | home_team_colname: "Abbreviation for the home team",
426 |             away_score_colname: "Score of the visiting (away) team",
427 | down_colname: "The current down",
428 | yards_to_go_colname: "Yards to a first down (or the endzone)",
429 | quarter_colname: "The quarter",
430 | time_colname: "Seconds elapsed in the quarter",
431 | yardline_colname: ("The yardline, given by (yards from own goalline - 50). "
432 | "-49 is your own 1 while 49 is the opponent's 1.")
433 | }
434 |
435 | is_offense_home = preprocessing.ComputeIfOffenseIsHome(offense_team_colname,
436 | home_team_colname,
437 | copy=self.copy_data)
438 | steps.append(("compute_offense_home", is_offense_home))
439 | score_differential = preprocessing.CreateScoreDifferential(home_score_colname,
440 | away_score_colname,
441 | is_offense_home.offense_home_team_colname,
442 | copy=self.copy_data)
443 | steps.append(("create_score_differential", score_differential))
444 | steps.append(("map_downs_to_int", preprocessing.MapToInt(down_colname, copy=self.copy_data)))
445 | total_time_elapsed = preprocessing.ComputeElapsedTime(quarter_colname, time_colname, copy=self.copy_data)
446 | steps.append(("compute_total_time_elapsed", total_time_elapsed))
447 | steps.append(("remove_unnecessary_columns", preprocessing.CheckColumnNames(
448 | column_names=[is_offense_home.offense_home_team_colname,
449 | score_differential.score_differential_colname,
450 | total_time_elapsed.total_time_colname,
451 | yardline_colname,
452 | yards_to_go_colname,
453 | down_colname],
454 | copy=self.copy_data)))
455 | steps.append(("encode_categorical_columns", preprocessing.OneHotEncoderFromDataFrame(
456 | categorical_feature_names=[down_colname],
457 | copy=self.copy_data)))
458 |
459 | search_grid = {'base_estimator__penalty': ['l1', 'l2'],
460 | 'base_estimator__C': [0.01, 0.1, 1, 10, 100]
461 | }
462 | base_model = LogisticRegression()
463 | calibrated_model = CalibratedClassifierCV(base_model, cv=2, method="isotonic")
464 | #grid_search_model = GridSearchCV(calibrated_model, search_grid,
465 | # scoring=self._brier_loss_scorer)
466 | steps.append(("compute_model", calibrated_model))
467 |
468 | pipe = Pipeline(steps)
469 | return pipe
470 |
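    # A sketch of swapping in a different final estimator (illustrative, not the packaged
    # model; it reuses the default preprocessing steps but replaces the classifier):
    #
    #   wp_model = WPModel()
    #   pipe = wp_model.create_default_pipeline()
    #   pipe.steps[-1] = ("compute_model", RandomForestClassifier(n_estimators=100))
    #   wp_model.model = pipe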
471 | def save_model(self, filename=None):
472 | """Save the WPModel instance to disk.
473 |
474 | All models are saved to the same place, with the installed
475 | NFLWin library (given by ``WPModel.model_directory``).
476 |
477 | Parameters
478 | ----------
479 | filename : string (default=None):
480 | The filename to use for the saved model. If this parameter
481 | is not specified, save to the default filename. Note that if a model
482 |             already exists with this filename, it will be overwritten. Note also that
483 | this is a filename only, **not** a full path. If a full path is specified
484 | it is likely (albeit not guaranteed) to cause errors.
485 |
486 | Returns
487 | -------
488 | ``None``
489 | """
490 |
491 | if filename is None:
492 | filename = self._default_model_filename
493 | joblib.dump(self, os.path.join(self.model_directory, filename))
494 |
495 | @classmethod
496 | def load_model(cls, filename=None):
497 | """Load a saved WPModel.
498 |
499 | Parameters
500 | ----------
501 | Same as ``save_model``.
502 |
503 | Returns
504 | -------
505 | ``nflwin.WPModel`` instance.
506 | """
507 | if filename is None:
508 | filename = cls._default_model_filename
509 |
510 | return joblib.load(os.path.join(cls.model_directory, filename))
511 |
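    # A minimal usage sketch (illustrative; the filename is made up, and files are written
    # to and read from ``WPModel.model_directory`` inside the installed package):
    #
    #   wp_model.save_model("my_model.nflwin")
    #   restored_model = WPModel.load_model("my_model.nflwin")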
512 | @staticmethod
513 | def _brier_loss_scorer(estimator, X, y):
514 | """Use the Brier loss to estimate model score.
515 |
516 | For use in GridSearchCV, instead of accuracy.
517 | """
518 | predicted_positive_probabilities = estimator.predict_proba(X)[:, 1]
519 | return 1. - brier_score_loss(y, predicted_positive_probabilities)
520 |
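    # A sketch of how this scorer could plug into a grid search (illustrative; it mirrors
    # the commented-out GridSearchCV call in ``create_default_pipeline``, reusing the
    # ``search_grid`` and ``calibrated_model`` objects defined there):
    #
    #   grid_search_model = GridSearchCV(calibrated_model, search_grid,
    #                                    scoring=WPModel._brier_loss_scorer)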
--------------------------------------------------------------------------------
/nflwin/tests/test_preprocessing.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import pytest
6 | from sklearn.utils.validation import NotFittedError
7 | from sklearn.pipeline import Pipeline
8 |
9 | from nflwin import preprocessing
10 |
11 | class TestPipelines(object):
12 | """Testing if pipelining cleaning steps works."""
13 | def test_map_to_int_to_onehot(self):
14 | fit_df = pd.DataFrame({"quarter": ["Q1", "Q1", "Q1", "Q2", "Q2"]})
15 | transform_df = fit_df.copy()
16 |
17 | mti = preprocessing.MapToInt("quarter", copy=True)
18 | ohe = preprocessing.OneHotEncoderFromDataFrame(categorical_feature_names=["quarter"], copy=True)
19 | pipe = Pipeline(steps=[("one", mti), ("two", ohe)])
20 | pipe.fit(fit_df)
21 | output_df = pipe.transform(transform_df)
22 |
23 | expected_df = pd.DataFrame({"onehot_col1": [1.0, 1, 1, 0, 0], "onehot_col2": [0.0, 0, 0, 1, 1]})
24 | pd.util.testing.assert_frame_equal(output_df, expected_df)
25 |
26 | class TestComputeElapsedTime(object):
27 | """Testing if we can properly map quarters and time elapsed to a total time elapsed."""
28 |
29 | def test_bad_quarter_colname_produces_error(self):
30 | input_df = pd.DataFrame({"blahblahblah": ["Q1", "Q2", "Q3", "Q4", "OT"],
31 | "time_elapsed": [200, 0, 50, 850, 40]})
32 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed")
33 | cet.fit(input_df)
34 |
35 | with pytest.raises(KeyError):
36 | cet.transform(input_df)
37 |
38 | def test_bad_time_elapsed_colname_produces_error(self):
39 | input_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT"],
40 | "blahblahblah": [200, 0, 50, 850, 40]})
41 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed")
42 | cet.fit(input_df)
43 |
44 | with pytest.raises(KeyError):
45 | cet.transform(input_df)
46 |
47 | def test_preexisting_output_colname_produces_error(self):
48 | input_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT"],
49 | "time_elapsed": [200, 0, 50, 850, 40],
50 | "total_time_elapsed": [0, 0, 0, 0, 0]})
51 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed",
52 | total_time_colname="total_time_elapsed")
53 | cet.fit(input_df)
54 |
55 | with pytest.raises(KeyError):
56 | cet.transform(input_df)
57 |
58 | def test_incomplete_quarter_mapping(self):
59 | input_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT1"],
60 | "time_elapsed": [200, 0, 50, 850, 40]})
61 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed",
62 | quarter_to_second_mapping={
63 | "Q1": 0,
64 | "Q2": 900,
65 | "Q4": 2700,
66 | "OT1":3600} )
67 | cet.fit(input_df)
68 |
69 | with pytest.raises(TypeError):
70 | cet.transform(input_df)
71 |
72 | def test_simple_working_case(self):
73 | input_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT"],
74 | "time_elapsed": [200, 0, 50, 850, 40]})
75 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed")
76 | cet.fit(input_df)
77 |
78 | transformed_df = cet.transform(input_df)
79 | expected_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT"],
80 | "time_elapsed": [200, 0, 50, 850, 40],
81 | "total_elapsed_time": [200, 900, 1850, 3550, 3640]})
82 | pd.util.testing.assert_frame_equal(transformed_df, expected_df)
83 |
84 | def test_inplace_transform(self):
85 | input_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT"],
86 | "time_elapsed": [200, 0, 50, 850, 40]})
87 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed", copy=False)
88 | cet.fit(input_df)
89 |
90 | cet.transform(input_df)
91 | expected_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT"],
92 | "time_elapsed": [200, 0, 50, 850, 40],
93 | "total_elapsed_time": [200, 900, 1850, 3550, 3640]})
94 | pd.util.testing.assert_frame_equal(input_df, expected_df)
95 |
96 | def test_custom_mapping(self):
97 | input_df = pd.DataFrame({"quarter": ["quarter1", "Q2", "Q3", "Q4", "OT1"],
98 | "time_elapsed": [200, 0, 50, 850, 40]})
99 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed",
100 | quarter_to_second_mapping={
101 | "quarter1": 0,
102 | "Q2": 500,
103 | "Q3": 1800,
104 | "Q4": 2700,
105 | "OT1":3600})
106 | cet.fit(input_df)
107 |
108 | transformed_df = cet.transform(input_df)
109 | expected_df = pd.DataFrame({"quarter": ["quarter1", "Q2", "Q3", "Q4", "OT1"],
110 | "time_elapsed": [200, 0, 50, 850, 40],
111 | "total_elapsed_time": [200, 500, 1850, 3550, 3640]})
112 | pd.util.testing.assert_frame_equal(transformed_df, expected_df)
113 |
114 |
115 | class TestComputeIfOffenseIsHome(object):
116 | """Testing if we can correctly compute if the offense is the home team."""
117 |
118 | def test_bad_offense_colname_produces_error(self):
119 | input_df = pd.DataFrame({"home_team": ["a", "a", "a"],
120 | "blahblahblah": ["a", "b", "a"]})
121 | ciow = preprocessing.ComputeIfOffenseIsHome("offense_team", "home_team")
122 | ciow.fit(input_df)
123 |
124 | with pytest.raises(KeyError):
125 | ciow.transform(input_df)
126 |
127 | def test_bad_home_team_colname_produces_error(self):
128 | input_df = pd.DataFrame({"blahblahblah": ["a", "a", "a"],
129 | "offense_team": ["a", "b", "a"]})
130 | ciow = preprocessing.ComputeIfOffenseIsHome("offense_team", "home_team")
131 | ciow.fit(input_df)
132 |
133 | with pytest.raises(KeyError):
134 | ciow.transform(input_df)
135 |
136 | def test_existing_offense_home_team_colname_produces_error(self):
137 | input_df = pd.DataFrame({"home_team": ["a", "a", "a"],
138 | "offense_team": ["a", "b", "a"]})
139 | ciow = preprocessing.ComputeIfOffenseIsHome("offense_team", "home_team",
140 | offense_home_team_colname="home_team")
141 | ciow.fit(input_df)
142 |
143 | with pytest.raises(KeyError):
144 | ciow.transform(input_df)
145 |
146 | def test_correct_answer_with_copy(self):
147 | input_df = pd.DataFrame({"home_team": ["a", "a", "a"],
148 | "offense_team": ["a", "b", "a"]})
149 | expected_input_df = input_df.copy()
150 | expected_transformed_df = pd.DataFrame({"home_team": ["a", "a", "a"],
151 | "offense_team": ["a", "b", "a"],
152 | "offense_home_team": [True, False, True]})
153 | ciow = preprocessing.ComputeIfOffenseIsHome("offense_team", "home_team",
154 | offense_home_team_colname="offense_home_team",
155 | copy=True)
156 | transformed_df = ciow.transform(input_df)
157 | pd.util.testing.assert_frame_equal(input_df.sort_index(axis=1), expected_input_df.sort_index(axis=1))
158 | pd.util.testing.assert_frame_equal(transformed_df.sort_index(axis=1), expected_transformed_df.sort_index(axis=1))
159 |
160 | def test_correct_answer_without_copy(self):
161 | input_df = pd.DataFrame({"home_team": ["a", "a", "a"],
162 | "offense_team": ["a", "b", "a"]})
163 | expected_transformed_df = pd.DataFrame({"home_team": ["a", "a", "a"],
164 | "offense_team": ["a", "b", "a"],
165 | "offense_home_team": [True, False, True]})
166 | ciow = preprocessing.ComputeIfOffenseIsHome("offense_team", "home_team",
167 | offense_home_team_colname="offense_home_team",
168 | copy=False)
169 | ciow.transform(input_df)
170 | pd.util.testing.assert_frame_equal(input_df.sort_index(axis=1), expected_transformed_df.sort_index(axis=1))
171 |
172 |
173 | class TestMapToInt(object):
174 | """Testing if the integer mapper works."""
175 |
176 | def test_fit_bad_colname_produces_error(self):
177 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four",
178 | "six", "two", "one", "one"]})
179 | mti = preprocessing.MapToInt("blahblahblah")
180 |
181 | with pytest.raises(KeyError):
182 | mti.fit(input_df)
183 |
184 |
185 | def test_mapping_without_nans(self):
186 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four",
187 | "six", "two", "one", "one"]})
188 | mti = preprocessing.MapToInt("one")
189 | mti.fit(input_df)
190 | expected_output = {"one": 0, "two": 1, "four": 2, "six": 3}
191 | assert mti.mapping == expected_output
192 |
193 | def test_mapping_with_nans(self):
194 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four",
195 | "six", np.nan, "one", "one"]})
196 | mti = preprocessing.MapToInt("one")
197 | mti.fit(input_df)
198 | expected_output = {"one": 0, "two": 1, "four": 2, "six": 3}
199 | assert mti.mapping == expected_output
200 |
201 | def test_transform_before_fit_produces_error(self):
202 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four",
203 | "six", "two", "one", "one"]})
204 | mti = preprocessing.MapToInt("one")
205 |
206 | with pytest.raises(NotFittedError):
207 | mti.transform(input_df)
208 |
209 | def test_transform_bad_colname_produces_error(self):
210 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four",
211 | "six", "two", "one", "one"]})
212 | mti = preprocessing.MapToInt("one")
213 | mti.fit(input_df)
214 | transform_df = pd.DataFrame({"blahblahblah": ["one", "two", "one", "four",
215 | "six", "two", "one", "one"]})
216 | with pytest.raises(KeyError):
217 | mti.transform(transform_df)
218 |
219 | def test_transform_without_nans(self):
220 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four",
221 | "six", "two", "one", "one"]})
222 | mti = preprocessing.MapToInt("one")
223 | mti.fit(input_df)
224 | transformed_df = mti.transform(input_df)
225 | expected_df = pd.DataFrame({"one": [0, 1, 0, 2, 3, 1, 0, 0]})
226 | pd.util.testing.assert_frame_equal(transformed_df, expected_df)
227 |
228 | def test_transform_with_nans(self):
229 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four",
230 | "six", "two", np.nan, "one"]})
231 | mti = preprocessing.MapToInt("one")
232 | mti.fit(input_df)
233 | transformed_df = mti.transform(input_df)
234 | expected_df = pd.DataFrame({"one": [0, 1, 0, 2, 3, 1, np.nan, 0]})
235 | pd.util.testing.assert_frame_equal(transformed_df, expected_df)
236 |
237 | def test_transform_inplace(self):
238 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four",
239 | "six", "two", "one", "one"]})
240 | mti = preprocessing.MapToInt("one", copy=False)
241 | mti.fit(input_df)
242 | mti.transform(input_df)
243 | expected_df = pd.DataFrame({"one": [0, 1, 0, 2, 3, 1, 0, 0]})
244 | pd.util.testing.assert_frame_equal(input_df, expected_df)
245 |
246 | def test_transform_copy(self):
247 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four",
248 | "six", "two", "one", "one"]})
249 | expected_df = input_df.copy()
250 | mti = preprocessing.MapToInt("one", copy=True)
251 | mti.fit(input_df)
252 | transformed_data = mti.transform(input_df)
253 | pd.util.testing.assert_frame_equal(input_df, expected_df)
254 |
255 |
256 |
257 |
258 | class TestOneHotEncoderFromDataFrame(object):
259 | """Testing if the one-hot encoder wrapper works."""
260 |
261 | def setup_method(self, method):
262 | self.data = pd.DataFrame({"one": [1, 2, 3, 1],
263 | "two": [2, 2, 2, 5],
264 | "three": [0, 5, 0, 5]})
265 | self.data = self.data[["one", "two", "three"]]
266 |
267 | def test_correct_dtype_passed(self):
268 | ohe = preprocessing.OneHotEncoderFromDataFrame(dtype=np.int)
269 | assert ohe.dtype == np.int
270 |
271 | def test_correct_handle_unknown_string_passed(self):
272 | ohe = preprocessing.OneHotEncoderFromDataFrame(handle_unknown="ignore")
273 | assert ohe.handle_unknown == "ignore"
274 |
275 | def test_encode_all_columns(self):
276 | ohe = preprocessing.OneHotEncoderFromDataFrame(categorical_feature_names="all")
277 | ohe.fit(self.data)
278 | transformed_data = ohe.transform(self.data)
279 | expected_data = pd.DataFrame({"onehot_col1": [1., 0, 0, 1],
280 | "onehot_col2": [0., 1, 0, 0],
281 | "onehot_col3": [0., 0, 1, 0],
282 | "onehot_col4": [1., 1, 1, 0],
283 | "onehot_col5": [0., 0, 0, 1],
284 | "onehot_col6": [1., 0, 1, 0],
285 | "onehot_col7": [0., 1, 0, 1]})
286 |
287 | pd.util.testing.assert_frame_equal(transformed_data.sort_index(axis=1),
288 | expected_data.sort_index(axis=1))
289 |
290 | def test_encode_some_columns(self):
291 | ohe = preprocessing.OneHotEncoderFromDataFrame(categorical_feature_names=["one", "three"])
292 | ohe.fit(self.data)
293 | transformed_data = ohe.transform(self.data)
294 | expected_data = pd.DataFrame({"two": [2, 2, 2, 5],
295 | "onehot_col1": [1., 0, 0, 1],
296 | "onehot_col2": [0., 1, 0, 0],
297 | "onehot_col3": [0., 0, 1, 0],
298 | "onehot_col4": [1., 0, 1, 0],
299 | "onehot_col5": [0., 1, 0, 1]})
300 |
301 | pd.util.testing.assert_frame_equal(transformed_data.sort_index(axis=1),
302 | expected_data.sort_index(axis=1))
303 |
304 | def test_copy_data_works(self):
305 | ohe = preprocessing.OneHotEncoderFromDataFrame(categorical_feature_names=["one", "three"],
306 | copy=True)
307 | ohe.fit(self.data)
308 | transformed_data = ohe.transform(self.data)
309 | expected_data = pd.DataFrame({"one": [1, 2, 3, 1],
310 | "two": [2, 2, 2, 5],
311 | "three": [0, 5, 0, 5]})
312 |
313 | pd.util.testing.assert_frame_equal(self.data.sort_index(axis=1),
314 | expected_data.sort_index(axis=1))
315 |
316 |
317 | def test_inplace_transform_works(self):
318 | ohe = preprocessing.OneHotEncoderFromDataFrame(categorical_feature_names=["one", "three"],
319 | copy=False)
320 | data = self.data.copy()
321 | ohe.fit(self.data)
322 | ohe.transform(self.data)
323 | expected_data = pd.DataFrame({"two": [2, 2, 2, 5],
324 | "onehot_col1": [1., 0, 0, 1],
325 | "onehot_col2": [0., 1, 0, 0],
326 | "onehot_col3": [0., 0, 1, 0],
327 | "onehot_col4": [1., 0, 1, 0],
328 | "onehot_col5": [0., 1, 0, 1]})
329 |
330 | pd.util.testing.assert_frame_equal(self.data.sort_index(axis=1),
331 | expected_data.sort_index(axis=1))
332 |
333 | def test_encoding_subset_columns(self):
334 | ohe = preprocessing.OneHotEncoderFromDataFrame(categorical_feature_names=["one", "three"],
335 | copy=True)
336 | shifted_data = self.data[2:]
337 | ohe.fit(shifted_data)
338 | transformed_data = ohe.transform(shifted_data)
339 | self.data = pd.DataFrame({"one": [1, 2, 3, 1],
340 | "two": [2, 2, 2, 5],
341 | "three": [0, 5, 0, 5]})
342 | expected_data = pd.DataFrame({"two": [2, 5],
343 | "onehot_col1": [0., 1],
344 | "onehot_col2": [1., 0],
345 | "onehot_col3": [1., 0],
346 | "onehot_col4": [0., 1]},
347 | index=[2, 3])
348 | print(transformed_data)
349 | print(expected_data)
350 | pd.util.testing.assert_frame_equal(transformed_data.sort_index(axis=1),
351 | expected_data.sort_index(axis=1))
352 |
353 |
354 |
355 |
356 | class TestCreateScoreDifferential(object):
357 | """Testing if score differentials are properly created."""
358 |
359 | def test_bad_home_score_colname(self):
360 | csd = preprocessing.CreateScoreDifferential("badcol", "away_score", "offense_home")
361 | data = pd.DataFrame({"home_score": [1, 2, 3, 4],
362 | "away_score": [10, 0, 5, 15],
363 | "offense_home": [True, True, True, True]})
364 | with pytest.raises(KeyError):
365 | csd.transform(data)
366 |
367 | def test_bad_away_score_colname(self):
368 | csd = preprocessing.CreateScoreDifferential("home_score", "badcol", "offense_home")
369 | data = pd.DataFrame({"home_score": [1, 2, 3, 4],
370 | "away_score": [10, 0, 5, 15],
371 | "offense_home": [True, True, True, True]})
372 | with pytest.raises(KeyError):
373 | csd.fit(data)
374 | csd.transform(data)
375 |
376 | def test_bad_offense_home_colname(self):
377 | csd = preprocessing.CreateScoreDifferential("home_score", "away_score", "badcol")
378 | data = pd.DataFrame({"home_score": [1, 2, 3, 4],
379 | "away_score": [10, 0, 5, 15],
380 | "offense_home": [True, True, True, True]})
381 | with pytest.raises(KeyError):
382 | csd.fit(data)
383 | csd.transform(data)
384 |
385 | def test_differential_column_already_exists(self):
386 | csd = preprocessing.CreateScoreDifferential("home_score",
387 | "away_score",
388 | "offense_home",
389 | score_differential_colname="used_col")
390 | data = pd.DataFrame({"home_score": [1, 2, 3, 4],
391 | "away_score": [10, 0, 5, 15],
392 | "offense_home": [True, True, True, True],
393 | "used_col": [0, 0, 0, 0]})
394 | with pytest.raises(KeyError):
395 | csd.fit(data)
396 | csd.transform(data)
397 |
398 | def test_differential_works_offense_is_home(self):
399 | csd = preprocessing.CreateScoreDifferential("home_score",
400 | "away_score",
401 | "offense_home",
402 | score_differential_colname="score_diff")
403 | input_data = pd.DataFrame({"home_score": [1, 2, 3, 4],
404 | "away_score": [10, 0, 5, 15],
405 | "offense_home": [True, True, True, True]})
406 | expected_data = pd.DataFrame({"home_score": [1, 2, 3, 4],
407 | "away_score": [10, 0, 5, 15],
408 | "offense_home": [True, True, True, True],
409 | "score_diff": [-9, 2, -2, -11]})
410 |
411 | csd.fit(input_data)
412 | transformed_data = csd.transform(input_data)
413 | pd.util.testing.assert_frame_equal(expected_data.sort_index(axis=1),
414 | transformed_data.sort_index(axis=1))
415 |
416 | def test_differential_works_offense_is_away(self):
417 | csd = preprocessing.CreateScoreDifferential("home_score",
418 | "away_score",
419 | "offense_home",
420 | score_differential_colname="score_diff")
421 | input_data = pd.DataFrame({"home_score": [1, 2, 3, 4],
422 | "away_score": [10, 0, 5, 15],
423 | "offense_home": [False, False, False, False]})
424 | expected_data = pd.DataFrame({"home_score": [1, 2, 3, 4],
425 | "away_score": [10, 0, 5, 15],
426 | "offense_home": [False, False, False, False],
427 | "score_diff": [9, -2, 2, 11]})
428 |
429 | csd.fit(input_data)
430 | transformed_data = csd.transform(input_data)
431 | pd.util.testing.assert_frame_equal(expected_data.sort_index(axis=1),
432 | transformed_data.sort_index(axis=1))
433 |
434 | def test_differential_works_offense_is_mix(self):
435 | csd = preprocessing.CreateScoreDifferential("home_score",
436 | "away_score",
437 | "offense_home",
438 | score_differential_colname="score_diff")
439 | input_data = pd.DataFrame({"home_score": [1, 2, 3, 4],
440 | "away_score": [10, 0, 5, 15],
441 | "offense_home": [True, True, False, False]})
442 | expected_data = pd.DataFrame({"home_score": [1, 2, 3, 4],
443 | "away_score": [10, 0, 5, 15],
444 | "offense_home": [True, True, False, False],
445 | "score_diff": [-9, 2, 2, 11]})
446 |
447 | csd.fit(input_data)
448 | transformed_data = csd.transform(input_data)
449 | pd.util.testing.assert_frame_equal(expected_data.sort_index(axis=1),
450 | transformed_data.sort_index(axis=1))
451 |
452 | def test_differential_with_copied_data(self):
453 | csd = preprocessing.CreateScoreDifferential("home_score",
454 | "away_score",
455 | "offense_home",
456 | score_differential_colname="score_diff",
457 | copy=True)
458 | input_data = pd.DataFrame({"home_score": [1, 2, 3, 4],
459 | "away_score": [10, 0, 5, 15],
460 | "offense_home": [True, True, True, True]})
461 | expected_input_data = pd.DataFrame({"home_score": [1, 2, 3, 4],
462 | "away_score": [10, 0, 5, 15],
463 | "offense_home": [True, True, True, True]})
464 | expected_transformed_data = pd.DataFrame({"home_score": [1, 2, 3, 4],
465 | "away_score": [10, 0, 5, 15],
466 | "offense_home": [True, True, True, True],
467 | "score_diff": [-9, 2, -2, -11]})
468 |
469 | csd.fit(input_data)
470 | transformed_data = csd.transform(input_data)
471 | pd.util.testing.assert_frame_equal(expected_input_data.sort_index(axis=1),
472 | input_data.sort_index(axis=1))
473 | pd.util.testing.assert_frame_equal(expected_transformed_data.sort_index(axis=1),
474 | transformed_data.sort_index(axis=1))
475 |
476 | def test_differential_with_inplace_data(self):
477 | csd = preprocessing.CreateScoreDifferential("home_score",
478 | "away_score",
479 | "offense_home",
480 | score_differential_colname="score_diff",
481 | copy=False)
482 | input_data = pd.DataFrame({"home_score": [1, 2, 3, 4],
483 | "away_score": [10, 0, 5, 15],
484 | "offense_home": [True, True, True, True]})
485 | expected_data = pd.DataFrame({"home_score": [1, 2, 3, 4],
486 | "away_score": [10, 0, 5, 15],
487 | "offense_home": [True, True, True, True],
488 | "score_diff": [-9, 2, -2, -11]})
489 | csd.fit(input_data)
490 | csd.transform(input_data)
491 | pd.util.testing.assert_frame_equal(expected_data.sort_index(axis=1),
492 | input_data.sort_index(axis=1))
493 |
494 |
495 |
496 |
497 | class TestCheckColumnNames(object):
498 | """Testing whether column names are properly checked."""
499 |
500 | def test_transform_called_before_fit(self):
501 | ccn = preprocessing.CheckColumnNames()
502 | data = pd.DataFrame()
503 |
504 | with pytest.raises(NotFittedError):
505 | ccn.transform(data)
506 |
507 | def test_transform_data_has_wrong_columns(self):
508 | ccn = preprocessing.CheckColumnNames()
509 | input_data = pd.DataFrame({"one": [1, 2],
510 | "two": [3, 4]})
511 | ccn.fit(input_data)
512 | test_data = pd.DataFrame({"one": [1, 2],
513 | "three": [3, 4]})
514 |
515 | with pytest.raises(KeyError):
516 | ccn.transform(test_data)
517 |
518 | def test_transform_reorders_columns(self):
519 | ccn = preprocessing.CheckColumnNames()
520 | input_data = pd.DataFrame({"one": [1, 2],
521 | "two": [3, 4],
522 | "three": [5, 6]})
523 | test_data = pd.DataFrame({"one": [7, 8],
524 | "two": [9, 10],
525 | "three": [11, 12]})
526 | expected_data = test_data.copy()
527 | #Ensure columns are in a particular order:
528 | input_data = input_data[["one", "two", "three"]]
529 | test_data = test_data[["two", "one", "three"]]
530 | expected_data = expected_data[["one", "two", "three"]]
531 |
532 | with pytest.raises(AssertionError):
533 | pd.util.testing.assert_frame_equal(test_data, expected_data)
534 |
535 | ccn.fit(input_data)
536 | pd.util.testing.assert_frame_equal(ccn.transform(test_data), expected_data)
537 |
538 |
539 | def test_transform_drops_unnecessary_columns(self):
540 | ccn = preprocessing.CheckColumnNames()
541 | input_data = pd.DataFrame({"one": [1, 2],
542 | "two": [3, 4],
543 | "three": [5, 6]})
544 | test_data = pd.DataFrame({"one": [7, 8],
545 | "two": [9, 10],
546 | "three": [11, 12],
547 | "four": [13, 14]})
548 | expected_data = pd.DataFrame({"one": [7, 8],
549 | "two": [9, 10],
550 | "three": [11, 12]})
551 | #Ensure columns are in a particular order:
552 | input_data = input_data[["one", "two", "three"]]
553 | expected_data = expected_data[["one", "two", "three"]]
554 |
555 | ccn.fit(input_data)
556 | pd.util.testing.assert_frame_equal(ccn.transform(test_data), expected_data)
557 |
558 |
559 | def test_transform_with_user_specified_colums(self):
560 | ccn = preprocessing.CheckColumnNames(column_names=["c", "b", "a"])
561 | input_data = pd.DataFrame({"e": [-2, -1, 0],
562 | "a": [1, 2, 3],
563 | "b": [4, 5, 6],
564 | "c": [7, 8, 9],
565 | "d": [10, 11, 12]})
566 | expected_data = pd.DataFrame({"c": [7, 8, 9],
567 | "b": [4, 5, 6],
568 | "a": [1, 2, 3]})
569 | expected_data = expected_data[["c", "b", "a"]]
570 | transformed_data = ccn.transform(input_data)
571 | pd.util.testing.assert_frame_equal(expected_data, transformed_data)
572 |
--------------------------------------------------------------------------------