├── NBApredict
    ├── run
    │   ├── __init__.py
    │   ├── all.py
    │   └── daily.py
    ├── database
    │   ├── __init__.py
    │   ├── reconcile.py
    │   ├── getters.py
    │   ├── manipulator.py
    │   └── dbinterface.py
    ├── br_web_scraper
    │   ├── __init__.py
    │   ├── parsers
    │   │   ├── __init__.py
    │   │   ├── players_season_totals.py
    │   │   ├── box_scores.py
    │   │   └── schedule.py
    │   ├── errors.py
    │   ├── json_encoders.py
    │   ├── client.py
    │   ├── http_client.py
    │   ├── data.py
    │   └── output.py
    ├── management
    │   ├── tables
    │   │   ├── __init__.py
    │   │   ├── results.py
    │   │   ├── predictions.py
    │   │   ├── teams.py
    │   │   ├── team_stats.py
    │   │   ├── odds.py
    │   │   └── schedule.py
    │   ├── __init__.py
    │   ├── etl.py
    │   └── conversion.py
    ├── __init__.py
    ├── settings.yaml
    ├── scrapers
    │   ├── scraper.py
    │   ├── team_scraper.py
    │   ├── season_scraper.py
    │   └── line_scraper.py
    ├── predict
    │   ├── games.py
    │   ├── get.py
    │   └── bets.py
    ├── helpers
    │   ├── json.py
    │   ├── type.py
    │   ├── classes.py
    │   └── br_references.py
    ├── models
    │   ├── graphing.py
    │   └── four_factor_regression.py
    └── configuration.py
├── .gitignore
├── LICENSE.txt
├── README.md
└── project_notebook.ipynb


/NBApredict/run/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/NBApredict/database/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/NBApredict/br_web_scraper/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/NBApredict/management/tables/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/NBApredict/management/tables/results.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/NBApredict/br_web_scraper/parsers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/NBApredict/management/__init__.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy.orm import sessionmaker
2 | Session = sessionmaker()
3 | 


--------------------------------------------------------------------------------
/NBApredict/__init__.py:
--------------------------------------------------------------------------------
1 | # __init__.py signals to python that the folder contains relevant
2 | # packages and information


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | #Ignore the following file extensions:
 2 | *.exe
 3 | *.log
 4 | *.txt
 5 | *.pyc
 6 | *.json
 7 | *.idea
 8 | *.csv
 9 | *.db
10 | *.sqlite
11 | *xlsx
12 | graphs
13 | scratch*
14 | .ipynb_checkpoints
15 | */.ipynb_checkpoints/*
16 | *test_regression
17 | 


--------------------------------------------------------------------------------
/NBApredict/management/tables/predictions.py:
--------------------------------------------------------------------------------
 1 | """Functions for prediction table creation and operations."""
 2 | 
 3 | 
 4 | def format_data():
 5 |     pass
 6 | 
 7 | 
 8 | def create_table(db, prediction_data):
 9 |     pass
10 | 
11 | 
12 | def insert():
13 |     pass
14 | 


--------------------------------------------------------------------------------
/NBApredict/br_web_scraper/errors.py:
--------------------------------------------------------------------------------
 1 | class InvalidDate(Exception):
 2 |     def __init__(self, day, month, year):
 3 |         message = "Date with year set to {year}, month set to {month}, and day set to {day} is invalid"\
 4 |             .format(
 5 |                 year=year,
 6 |                 month=month,
 7 |                 day=day,
 8 |             )
 9 |         super().__init__(message)
10 | 


--------------------------------------------------------------------------------
/NBApredict/br_web_scraper/json_encoders.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from json import JSONEncoder
 3 | from enum import Enum
 4 | 
 5 | 
 6 | class BasketballReferenceJSONEncoder(JSONEncoder):
 7 |     def default(self, obj):
 8 |         if isinstance(obj, datetime):
 9 |             return obj.isoformat()
10 | 
11 |         if isinstance(obj, Enum):
12 |             return obj.value
13 | 
14 |         return JSONEncoder.default(self, obj)
15 | 
16 | 


--------------------------------------------------------------------------------
/NBApredict/settings.yaml:
--------------------------------------------------------------------------------
 1 | paths:
 2 |     directory: NBA
 3 |     database: db_path
 4 |     graph_dir: graph_path
 5 |     settings: settings path
 6 | 
 7 | Bovada:
 8 |     regularURL: https://www.bovada.lv/services/sports/event/v2/events/A/description/basketball/nba
 9 |     playoffURL: https://www.bovada.lv/services/sports/event/v2/events/A/description/basketball/nba-playoffs
10 | 
11 | prediction:
12 |     predict_lines: False
13 | 
14 | models:
15 |     four_factor_regression:
16 |         options:
17 |             graph: True
18 |             console_out: True
19 |     Bayesian_model:
20 |         settings:
21 |     ML_model:
22 |         settings:
23 | 
24 | league_year: 2020
25 | 


--------------------------------------------------------------------------------
/NBApredict/management/tables/teams.py:
--------------------------------------------------------------------------------
 1 | """Teams.py contains (a) function(s) to create the teams table in the database"""
 2 | 
 3 | 
 4 | def create_team_table(db, teams_data, tbl_name):
 5 |     """Create a table in DB named tbl_name with the columns in teams_data
 6 | 
 7 |     Args:
 8 |         db: a datotable.database.Database object connected to a database
 9 |         teams_data: A datatotable.data.DataOperator object with data on NBA teams
10 |         tbl_name: The desired name of the table
11 |     """
12 |     columns = teams_data.columns
13 |     columns["team_name"].append({"unique": True})
14 |     db.map_table(tbl_name=tbl_name, columns=columns)
15 |     db.create_tables()
16 |     db.clear_mappers()
17 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) [2019] [Spencer Weson]
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/NBApredict/run/all.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module runs the entire NBA_bet project.
 3 | 
 4 | This module wraps the entire project into a single script with run_all() as the function which drives the script. First,
 5 | it sets up the database and session connections. Then, it scrapes all new data. Finally, it predicts all games for which
 6 | data is available. Most session.commit() calls in the project are performed here. However, note predict_all() requires
 7 | a commit during the process in order to function correctly.
 8 | """
 9 | from sqlalchemy.orm import Session
10 | 
11 | # Local Imports
12 | from nbapredict.database.dbinterface import DBInterface
13 | from nbapredict.predict import bets
14 | from nbapredict.scrapers import scraper
15 | from nbapredict.configuration import Config
16 | 
17 | 
18 | def run_all():
19 |     """Run the entire NBA_bet project."""
20 |     db = DBInterface()
21 |     year = Config.get_property("league_year")
22 |     session = Session(bind=db.engine)
23 | 
24 |     scraper.scrape_all(db, session, year)
25 |     session.commit()
26 | 
27 |     bets.predict_all(db, session)
28 |     session.commit()
29 |     session.close()
30 | 
31 | 
32 | if __name__ == "__main__":
33 |     run_all()
34 | 


--------------------------------------------------------------------------------
/NBApredict/database/reconcile.py:
--------------------------------------------------------------------------------
 1 | """
 2 | At the moment, reconcile contains one function which "reconciles" primary and reference tables for a specific column.
 3 | ToDo: Remove
 4 | """
 5 | 
 6 | 
 7 | def reconcile(ref_tbl, change_tbl, column, ref_key, change_key, session):
 8 |     """Compare the specified column over the two tables and change change_tbl values to ref_tbl values
 9 | 
10 |     Note that the change and reference tables must be related by a foreign key.
11 | 
12 |     Args:
13 |         ref_tbl: The reference table which contains the values to be changed in change_tbl
14 |         change_tbl: The table to be changed with values from reference table
15 |         column: The column to evaluate for changes. Column must be present in both tables.
16 |         ref_key: The key in the reference table to join the tables by
17 |         change_key: The key in the change table to join the tables by
18 |         session: An instance of a sqlalchemy Session class bound to the database's engine
19 | 
20 |     To-do:
21 |         Figure out how to run with multiple columns
22 |     """
23 |     join_objs = session.query(ref_tbl, change_tbl).join().\
24 |         filter(getattr(ref_tbl, ref_key) == getattr(change_tbl, change_key)).all()
25 | 
26 |     changed_objs = []
27 |     for obj in join_objs:
28 |         ref_obj = obj[0]
29 |         change_obj = obj[1]
30 |         ref_val = getattr(ref_obj, column)
31 |         change_val = getattr(change_obj, column)
32 |         if ref_val != change_val:
33 |             setattr(change_obj, column, ref_val)
34 |             session.add(change_obj)
35 | 
36 |     return
37 | 


--------------------------------------------------------------------------------
/NBApredict/br_web_scraper/client.py:
--------------------------------------------------------------------------------
 1 | from nbapredict.br_web_scraper import http_client
 2 | 
 3 | from nbapredict.br_web_scraper.output import box_scores_to_csv, schedule_to_csv
 4 | from nbapredict.br_web_scraper.output import output
 5 | from nbapredict.br_web_scraper.json_encoders import BasketballReferenceJSONEncoder
 6 | 
 7 | 
 8 | def player_box_scores(day, month, year, output_type=None, output_file_path=None, output_write_option=None, json_options=None):
 9 |     values = http_client.player_box_scores(day=day, month=month, year=year)
10 |     return output(
11 |         values=values,
12 |         output_type=output_type,
13 |         output_file_path=output_file_path,
14 |         output_write_option=output_write_option,
15 |         csv_writer=box_scores_to_csv,
16 |         encoder=BasketballReferenceJSONEncoder,
17 |         json_options=json_options,
18 |     )
19 | 
20 | 
21 | def season_schedule(season_end_year, output_type=None, output_file_path=None, output_write_option=None, json_options=None):
22 |     values = http_client.season_schedule(season_end_year)
23 |     return output(
24 |         values=values,
25 |         output_type=output_type,
26 |         output_file_path=output_file_path,
27 |         output_write_option=output_write_option,
28 |         csv_writer=schedule_to_csv,
29 |         encoder=BasketballReferenceJSONEncoder,
30 |         json_options=json_options,
31 |     )
32 | 
33 | 
34 | def players_season_totals(season_end_year, output_type=None, output_file_path=None, output_write_option=None, json_options=None):
35 |     values = http_client.players_season_totals(season_end_year)
36 |     return output(
37 |         values=values,
38 |         output_type=output_type,
39 |         output_file_path=output_file_path,
40 |         output_write_option=output_write_option,
41 |         csv_writer=schedule_to_csv,
42 |         encoder=BasketballReferenceJSONEncoder,
43 |         json_options=json_options,
44 |     )
45 | 
46 | 


--------------------------------------------------------------------------------
/NBApredict/management/tables/team_stats.py:
--------------------------------------------------------------------------------
 1 | """Team_stats.py contains functions to create and populate the team_stats table in the database"""
 2 | 
 3 | from datetime import datetime
 4 | from nbapredict.configuration import Config
 5 | from sqlalchemy import ForeignKey, UniqueConstraint
 6 | 
 7 | 
 8 | def create_table(db, team_stats_data, tbl_name):
 9 |     """Create a table of team stats in a database with appropriate foreign keys and constraints.
10 | 
11 |     Args:
12 |         db: a datotable.database.Database object connected to a database
13 |         team_stats_data: A datatotable.data.DataOperator object with data on NBA team stats
14 |         tbl_name: The desired table name
15 |     ToDo: Currently allows duplicate rows if those values are on different days. Solve with a constraint
16 |     """
17 |     columns = team_stats_data.columns
18 |     columns['team_id'].append(ForeignKey("teams_{}.id".format(Config.get_property('league_year'))))
19 |     constraints = [UniqueConstraint("team_id", "scrape_time")]
20 |     db.map_table(tbl_name=tbl_name, columns=columns, constraints=constraints)
21 |     db.create_tables()
22 |     db.clear_mappers()
23 | 
24 | 
25 | def insert(session, team_stats_tbl, team_stats_data):
26 |     """Insert new data into the team_stats_tbl.
27 | 
28 |     Args:
29 |         session: An instantiated SQLalchemy session object
30 |         team_stats_tbl: A mapped team stats table object
31 |         team_stats_data: A datatotable.data.DataOperator object with data on NBA team stats
32 |     """
33 |     last_insert_scrape_time = session.query(team_stats_tbl.scrape_time). \
34 |         order_by(team_stats_tbl.scrape_time.desc()).first().scrape_time
35 |     last_insert_date = datetime.date(last_insert_scrape_time)
36 |     current_scrape_date = datetime.date(datetime.now())
37 |     if last_insert_date < current_scrape_date:
38 |         session.add_all([team_stats_tbl(**row) for row in team_stats_data.rows])
39 |         session.commit()
40 | 


--------------------------------------------------------------------------------
/NBApredict/scrapers/scraper.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module wraps the team stats, schedule, and betting line scrapers together and stores their data in the database.
 3 | 
 4 | If the script is called, it instantiates a DBInterface object for database interactions and creates a SQLalchemy session
 5 | object from the DBInterface's information. Otherwise, the scrape_all() function is called with database, session, and
 6 | league year arguments specified.
 7 | """
 8 | import os
 9 | from sqlalchemy.orm import Session
10 | 
11 | # Local Imports
12 | from nbapredict.database.dbinterface import DBInterface
13 | from nbapredict.scrapers import team_scraper, season_scraper, line_scraper
14 | import nbapredict.configuration as configuration
15 | 
16 | 
17 | def scrape_all(database, session, league_year):
18 |     """Scrape and store team stats, schedule information, and betting lines in the database.
19 | 
20 |     Note, this only adds data to the session. Changes must be committed to be saved.
21 | 
22 |     Args:
23 |         database: An instantiated DBInterface object from database.database for database interactions
24 |         session: An instance of a sqlalchemy Session class bound to the database's engine
25 |         league_year: The league year to scrape data from (i.e. 2018-2019 season is 2019)
26 |     """
27 |     # Insure the database folder exists
28 |     if not os.path.isdir(configuration.output_directory()):
29 |         os.mkdir(configuration.output_directory())
30 | 
31 |     team_scrape = team_scraper.scrape(database=database)
32 |     season_scrape = season_scraper.scrape(database=database, session=session)
33 |     line_scrape = line_scraper.scrape(database=database, session=session)
34 | 
35 | 
36 | if __name__ == "__main__":
37 |     db_path = configuration.database_file(os.path.dirname(__file__))
38 |     db = DBInterface(db_path)
39 |     league_year = 2019
40 |     session = Session(bind=db.engine)
41 |     scrape_all(database=db, session=session, league_year=league_year)
42 | 


--------------------------------------------------------------------------------
/NBApredict/br_web_scraper/http_client.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | from nbapredict.br_web_scraper.errors import InvalidDate
 4 | from nbapredict.br_web_scraper.parsers.box_scores import parse_player_box_scores
 5 | from nbapredict.br_web_scraper.parsers.schedule import parse_schedule, parse_schedule_for_month_url_paths
 6 | from nbapredict.br_web_scraper.parsers.players_season_totals import parse_players_season_totals
 7 | 
 8 | BASE_URL = 'https://www.basketball-reference.com'
 9 | 
10 | 
11 | def player_box_scores(day, month, year):
12 |     url = '{BASE_URL}/friv/dailyleaders.cgi?month={month}&day={day}&year={year}'.format(
13 |         BASE_URL=BASE_URL,
14 |         day=day,
15 |         month=month,
16 |         year=year
17 |     )
18 | 
19 |     response = requests.get(url=url, allow_redirects=False)
20 | 
21 |     if 200 <= response.status_code < 300:
22 |         return parse_player_box_scores(response.content)
23 | 
24 |     raise InvalidDate(day=day, month=month, year=year)
25 | 
26 | 
27 | def schedule_for_month(url):
28 |     response = requests.get(url=url)
29 | 
30 |     response.raise_for_status()
31 | 
32 |     return parse_schedule(response.content)
33 | 
34 | 
35 | def season_schedule(season_end_year):
36 |     url = '{BASE_URL}/leagues/NBA_{season_end_year}_games.html'.format(
37 |         BASE_URL=BASE_URL,
38 |         season_end_year=season_end_year
39 |     )
40 | 
41 |     response = requests.get(url=url)
42 | 
43 |     response.raise_for_status()
44 | 
45 |     season_schedule_values = parse_schedule(response.content)
46 |     other_month_url_paths = parse_schedule_for_month_url_paths(response.content)
47 | 
48 |     for month_url_path in other_month_url_paths:
49 |         url = '{BASE_URL}{month_url_path}'.format(BASE_URL=BASE_URL, month_url_path=month_url_path)
50 |         monthly_schedule = schedule_for_month(url=url)
51 |         season_schedule_values.extend(monthly_schedule)
52 | 
53 |     return season_schedule_values
54 | 
55 | 
56 | def players_season_totals(season_end_year):
57 |     url = '{BASE_URL}/leagues/NBA_{season_end_year}_totals.html'.format(
58 |         BASE_URL=BASE_URL,
59 |         season_end_year=season_end_year,
60 |     )
61 | 
62 |     response = requests.get(url=url)
63 | 
64 |     response.raise_for_status()
65 | 
66 |     return parse_players_season_totals(response.content)
67 | 


--------------------------------------------------------------------------------
/NBApredict/br_web_scraper/parsers/players_season_totals.py:
--------------------------------------------------------------------------------
 1 | from lxml import html
 2 | 
 3 | from nbapredict.helpers.br_references import TEAM_ABBREVIATIONS_TO_TEAM, POSITION_ABBREVIATIONS_TO_POSITION
 4 | 
 5 | 
 6 | def parse_player_season_totals(row):
 7 |     return {
 8 |         "name": str(row[1].text_content()),
 9 |         "position": POSITION_ABBREVIATIONS_TO_POSITION[row[2].text_content()],
10 |         "age": int(row[3].text_content()),
11 |         "team": TEAM_ABBREVIATIONS_TO_TEAM[row[4].text_content()],
12 |         "games_played": int(row[5].text_content()),
13 |         "games_started": int(row[6].text_content()),
14 |         "minutes_played": int(row[7].text_content()),
15 |         "made_field_goals": int(row[8].text_content()),
16 |         "attempted_field_goals": int(row[9].text_content()),
17 |         "made_three_point_field_goals": int(row[11].text_content()),
18 |         "attempted_three_point_field_goals": int(row[12].text_content()),
19 |         "made_free_throws": int(row[18].text_content()),
20 |         "attempted_free_throws": int(row[19].text_content()),
21 |         "offensive_rebounds": int(row[21].text_content()),
22 |         "defensive_rebounds": int(row[22].text_content()),
23 |         "assists": int(row[24].text_content()),
24 |         "steals": int(row[25].text_content()),
25 |         "blocks": int(row[26].text_content()),
26 |         "turnovers": int(row[27].text_content()),
27 |         "personal_fouls": int(row[28].text_content()),
28 |     }
29 | 
30 | 
31 | def parse_players_season_totals(page):
32 |     tree = html.fromstring(page)
33 |     # Basketball Reference includes individual rows for players that played for multiple teams in a season
34 |     # These rows have a separate class ("italic_text partial_table") than the players that played for a single team
35 |     # across a season.
36 |     rows = tree.xpath('//table[@id="totals_stats"]/tbody/tr[contains(@class, "full_table") or contains(@class, "italic_text partial_table") and not(contains(@class, "rowSum"))]')
37 |     totals = []
38 |     for row in rows:
39 |         # Basketball Reference includes a "total" row for players that got traded
40 |         # which is essentially a sum of all player team rows
41 |         # I want to avoid including those, so I check the "team" field value for "TOT"
42 |         if row[4].text_content() != "TOT":
43 |             totals.append(parse_player_season_totals(row))
44 |     return totals
45 | 


--------------------------------------------------------------------------------
/NBApredict/predict/games.py:
--------------------------------------------------------------------------------
 1 | """Predict.games contains functions oriented around predicting games"""
 2 | 
 3 | from sqlalchemy import Integer, ForeignKey, String, UniqueConstraint
 4 | from sqlalchemy.orm import Session, relationship
 5 | 
 6 | # Local Imports
 7 | import nbapredict.predict.get as get
 8 | from nbapredict.configuration import Config
 9 | import nbapredict.models.four_factor_regression as lm
10 | import nbapredict.database.dbinterface as dbinterface
11 | 
12 | 
13 | def create_prediction_table(database, data, tbl_name):
14 |     """Create a prediction table from the data and with the table name in the database.
15 | 
16 |     ToDo: This will need a big overhaul
17 | 
18 |     Args:
19 |         database: An initialized DBInterface class from database.dbinterface.py
20 |         data: An initialized DataOperator object, from database.manipulator, with prediction data
21 |         tbl_name: The desired table name (with year as the last four characters)
22 |     """
23 |     # Create columns from data
24 |     sql_types = data.get_sql_type()
25 |     # Add new columns
26 |     year = tbl_name[-4:]
27 |     schedule_name = "sched_{}".format(year)
28 |     additional_cols = [{'game_id': [Integer, ForeignKey(schedule_name + ".id")]}, {"MOV": Integer}]
29 |     for col in additional_cols:
30 |         sql_types.update(col)
31 |     constraint = {UniqueConstraint: ["start_time", "home_team", "away_team"]}
32 |     # Map prediction table
33 |     database.map_table(tbl_name, sql_types, constraint)
34 | 
35 |     # Get tables for relationships
36 |     sched_tbl = database.get_table_mappings(schedule_name)
37 | 
38 |     # Create Relationships
39 |     if "game_preds_{}".format(year) not in sched_tbl.__mapper__.relationships.keys():
40 |         sched_tbl.predictions = relationship(database.Template)
41 | 
42 |     database.create_tables()
43 |     database.clear_mappers()
44 | 
45 | 
46 | def main():
47 |     db = dbinterface.DBInterface()
48 |     session = Session(bind=db.engine)
49 |     league_year = Config.get_property("league_year")
50 | 
51 |     regression = lm.main(db, session)
52 |     sched_tbl = db.get_table_mappings("sched_{}".format(league_year))
53 | 
54 |     if not db.table_exists("pred"):
55 |         # Returns a data manipulator class
56 |         sample = get.sample_prediction(db, session, ref_tbl=sched_tbl, model=regression)
57 |         create_prediction_table(db, sample, "game_pred_{}".format(league_year))
58 | 
59 | 
60 | 
61 | if __name__ == "__main__":
62 |     main()


--------------------------------------------------------------------------------
/NBApredict/database/getters.py:
--------------------------------------------------------------------------------
 1 | """
 2 | getters contains functions which may be commonly used to get certain subsets of data, data transformations, or data
 3 | summaries
 4 | """
 5 | 
 6 | from datetime import timedelta
 7 | import pandas as pd
 8 | 
 9 | 
10 | def get_games_on_day(schedule, session, date):
11 |     """Return the games from schedule on the specified date
12 | 
13 |     Args:
14 |         schedule: A mapped table object containing a schedule of games
15 |         session: An instantiated session object
16 |         date: The date to check for games
17 |     """
18 |     next_day = date + timedelta(days=1)
19 |     return session.query(schedule).filter(schedule.start_time > date, schedule.start_time < next_day)
20 | 
21 | 
22 | def get_first_game_time_on_day(schedule, session, date):
23 |     """Return the first game game time on the specified date
24 | 
25 |     Args:
26 |         schedule: A mapped table object containing a schedule of games
27 |         session: An instantiated session object
28 |         date: The date to check for games
29 |     """
30 |     games_on_day = get_games_on_day(schedule, session, date).subquery()
31 |     first_game = session.query(games_on_day).order_by(games_on_day.c.start_time).first()
32 |     if first_game:
33 |         first_game_time = first_game[1]
34 |         return first_game_time
35 |     else:
36 |         return None
37 | 
38 | 
39 | def get_spreads_for_date(odds_table, session, date):
40 |     """Return the spreads from the odds_table that correspond to the games
41 | 
42 |     Args:
43 |         odds_table: Sqlalchemy table object that contains odds
44 |         session: Sqlalchemy session object
45 |         date: Date to extract odds for
46 |     """
47 |     next_day = date + timedelta(days=1)
48 |     query = session.query(odds_table.start_time, odds_table.home_team, odds_table.away_team, odds_table.spread). \
49 |                 filter(odds_table.start_time > date, odds_table.start_time < next_day)
50 | 
51 |     return query
52 | 
53 | 
54 | def get_pandas_df_from_table(database, session, tbl_name, qualifiers=False):
55 |     """Convert the specified table into a pandas dataframe, modify it according to qualifiers, and return the result
56 | 
57 |     Args:
58 |         database: An instantiated DBInterface class from dbinterface.py
59 |         session: SQLalchemy session object
60 |         tbl_name: name of the desired table
61 |         qualifiers: A list of columns or a function to filter rows by
62 |     """
63 |     tbl = database.get_table_mappings(tbl_name)
64 |     query = session.query(tbl)
65 |     if qualifiers:
66 |         return pd.read_sql(query.statement, query.session.bind)[qualifiers]
67 |     else:
68 |         return pd.read_sql(query.statement, query.session.bind)
69 | 


--------------------------------------------------------------------------------
/NBApredict/br_web_scraper/parsers/box_scores.py:
--------------------------------------------------------------------------------
 1 | from lxml import html
 2 | 
 3 | from nbapredict.helpers.br_references import Location, Outcome, TEAM_ABBREVIATIONS_TO_TEAM
 4 | 
 5 | 
 6 | def parse_location(symbol):
 7 |     if symbol == "@":
 8 |         return Location.AWAY
 9 |     elif symbol == "":
10 |         return Location.HOME
11 |     raise ValueError("Unknown symbol: {symbol}".format(symbol=symbol))
12 | 
13 | 
14 | def parse_outcome(symbol):
15 |     if symbol == "W":
16 |         return Outcome.WIN
17 |     elif symbol == "L":
18 |         return Outcome.LOSS
19 |     raise ValueError("Unknown symbol: {symbol}".format(symbol=symbol))
20 | 
21 | 
22 | def parse_seconds_played(formatted_playing_time):
23 |     if formatted_playing_time == "":
24 |         return 0
25 | 
26 |     # It seems like basketball reference formats everything in MM:SS
27 |     # even when the playing time is greater than 59 minutes, 59 seconds.
28 |     #
29 |     # Because of this, we can't use strptime / %M as valid values are 0-59.
30 |     # So have to parse time by splitting on ":" and assuming that
31 |     # the first part is the minute part and the second part is the seconds part
32 |     time_parts = formatted_playing_time.split(":")
33 |     minutes_played = time_parts[0]
34 |     seconds_played = time_parts[1]
35 |     return 60 * int(minutes_played) + int(seconds_played)
36 | 
37 | 
38 | def parse_player_box_score(row):
39 |     return {
40 |         "name": str(row[1].text_content()),
41 |         "team": TEAM_ABBREVIATIONS_TO_TEAM[row[2].text_content()],
42 |         "location": parse_location(row[3].text_content()),
43 |         "opponent": TEAM_ABBREVIATIONS_TO_TEAM[row[4].text_content()],
44 |         "outcome": parse_outcome(row[5].text_content()),
45 |         "seconds_played": int(parse_seconds_played(row[6].text_content())),
46 |         "made_field_goals": int(row[7].text_content()),
47 |         "attempted_field_goals": int(row[8].text_content()),
48 |         "made_three_point_field_goals": int(row[10].text_content()),
49 |         "attempted_three_point_field_goals": int(row[11].text_content()),
50 |         "made_free_throws": int(row[13].text_content()),
51 |         "attempted_free_throws": int(row[14].text_content()),
52 |         "offensive_rebounds": int(row[16].text_content()),
53 |         "defensive_rebounds": int(row[17].text_content()),
54 |         "assists": int(row[19].text_content()),
55 |         "steals": int(row[20].text_content()),
56 |         "blocks": int(row[21].text_content()),
57 |         "turnovers": int(row[22].text_content()),
58 |         "personal_fouls": int(row[23].text_content()),
59 |         "game_score": float(row[25].text_content()),
60 |     }
61 | 
62 | 
def parse_player_box_scores(page):
    """Parse every non-header row of the "stats" table on the page into a box score dictionary.

    Args:
        page: The HTML of a box scores page

    Returns:
        A list with one dictionary from parse_player_box_score() per selected row
    """
    document = html.fromstring(page)
    # Rows whose class contains "thead" are excluded by the xpath
    stat_rows = document.xpath('//table[@id="stats"]//tbody/tr[not(contains(@class, "thead"))]')
    return [parse_player_box_score(stat_row) for stat_row in stat_rows]
67 | 


--------------------------------------------------------------------------------
/NBApredict/helpers/json.py:
--------------------------------------------------------------------------------
 1 | """
 2 | JSON interaction class and functions.
 3 | 
 4 | Created for a use-case which is no longer needed. This module is not used in the project.
 5 | """
 6 | import copy
 7 | import json
 8 | import os
 9 | import yaml
10 | 
11 | # Local imports
12 | from nbapredict.helpers import type
13 | 
14 | 
class JsonFile:
    """A class to handle JSON functionality such as load, create, add, and drop

    The file at self.path is read and rewritten in full by every operation.
    """
    def __init__(self, json_file):
        """Store the path and create a blank JSON file if nothing exists there yet.

        Args:
            json_file: Path of the JSON file to manage
        """
        self.path = json_file
        if not os.path.isfile(self.path):  # Create a blank JSON if the file does not already exist
            self.create_json()

    def add_objects(self, objects_dict):
        """Adds a new object or objects to an existing json file

        Args:
            objects_dict: A dictionary of key/value pairs to merge into the stored JSON object. Values that are
            python types (as judged by type.is_python_type) are stored as their yaml representation

        Raises:
            Exception: If the merged data cannot be written. The file is first rewritten with its previous content

        To-do:
            Currently rewrites the entire file which could be a performance issue. To change, make so that the json file
            endings are removed, a comma inserted, and then re-insert the ending (or something like that)"""

        data = self.load_json()
        modified_data = copy.deepcopy(data)
        try:
            for key, value in objects_dict.items():
                if type.is_python_type(value):  # Creates a yaml representation of python types
                    value = yaml.dump(value)
                modified_data[key] = value
            self.create_json(modified_data)
        except (TypeError, json.decoder.JSONDecodeError) as error:  # Rewrite the initial JSON on failure
            self.create_json(data)
            raise Exception("Could not add object to JSON. Json restored to previous format") from error

    def remove_objects(self, keys):
        """Removes the specified object or objects from the json_file as specified by keys

        Args:
            keys: A single key as a string, or an iterable of keys

        Raises:
            KeyError: If a key is not present. Nothing is written to the file in that case
        """
        data = self.load_json()

        # Work on a copy so the untouched original is still available if the rewrite fails
        changed_data = copy.deepcopy(data)
        keys_to_drop = [keys] if isinstance(keys, str) else keys
        for key in keys_to_drop:
            del changed_data[key]
        try:
            self.create_json(changed_data)
        except TypeError:
            self.create_json(data)

    def create_json(self, object_dict=None):
        """Creates a json to store the specified objects

        Args:
            object_dict: The object to write. When it is None or empty, an empty JSON object is written
        """
        content = object_dict if object_dict else {}
        with open(self.path, 'w', encoding='utf-8') as fp:
            json.dump(content, fp, sort_keys=True, indent=4)

    def check_for_object(self, object_key):
        """Return True if object_key is a top-level key of the stored JSON object. Otherwise, return False."""
        return object_key in self.load_json().keys()

    def load_json(self):
        """Read the file and return its content as python objects"""
        with open(self.path, "r", encoding='utf-8') as file:
            python_object = json.load(file)
        return python_object
79 | 


--------------------------------------------------------------------------------
/NBApredict/br_web_scraper/data.py:
--------------------------------------------------------------------------------
  1 | from enum import Enum
  2 | 
  3 | 
class Location(Enum):
    """The two possible game locations. Each member's value is its own name as a string."""
    HOME = "HOME"
    AWAY = "AWAY"
  7 | 
  8 | 
class Outcome(Enum):
    """The two possible game outcomes. Each member's value is its own name as a string."""
    WIN = "WIN"
    LOSS = "LOSS"
 12 | 
 13 | 
class Team(Enum):
    """Teams keyed by name. Each member's value is the full team name in upper case.

    Teams listed under "DEPRECATED TEAMS" are kept alongside the rest of the members.
    """
    ATLANTA_HAWKS = "ATLANTA HAWKS"
    BOSTON_CELTICS = "BOSTON CELTICS"
    BROOKLYN_NETS = "BROOKLYN NETS"
    CHARLOTTE_HORNETS = "CHARLOTTE HORNETS"
    CHICAGO_BULLS = "CHICAGO BULLS"
    CLEVELAND_CAVALIERS = "CLEVELAND CAVALIERS"
    DALLAS_MAVERICKS = "DALLAS MAVERICKS"
    DENVER_NUGGETS = "DENVER NUGGETS"
    DETROIT_PISTONS = "DETROIT PISTONS"
    GOLDEN_STATE_WARRIORS = "GOLDEN STATE WARRIORS"
    HOUSTON_ROCKETS = "HOUSTON ROCKETS"
    INDIANA_PACERS = "INDIANA PACERS"
    LOS_ANGELES_CLIPPERS = "LOS ANGELES CLIPPERS"
    LOS_ANGELES_LAKERS = "LOS ANGELES LAKERS"
    MEMPHIS_GRIZZLIES = "MEMPHIS GRIZZLIES"
    MIAMI_HEAT = "MIAMI HEAT"
    MILWAUKEE_BUCKS = "MILWAUKEE BUCKS"
    MINNESOTA_TIMBERWOLVES = "MINNESOTA TIMBERWOLVES"
    NEW_ORLEANS_PELICANS = "NEW ORLEANS PELICANS"
    NEW_YORK_KNICKS = "NEW YORK KNICKS"
    OKLAHOMA_CITY_THUNDER = "OKLAHOMA CITY THUNDER"
    ORLANDO_MAGIC = "ORLANDO MAGIC"
    PHILADELPHIA_76ERS = "PHILADELPHIA 76ERS"
    PHOENIX_SUNS = "PHOENIX SUNS"
    PORTLAND_TRAIL_BLAZERS = "PORTLAND TRAIL BLAZERS"
    SACRAMENTO_KINGS = "SACRAMENTO KINGS"
    SAN_ANTONIO_SPURS = "SAN ANTONIO SPURS"
    TORONTO_RAPTORS = "TORONTO RAPTORS"
    UTAH_JAZZ = "UTAH JAZZ"
    WASHINGTON_WIZARDS = "WASHINGTON WIZARDS"

    # DEPRECATED TEAMS
    CHARLOTTE_BOBCATS = "CHARLOTTE BOBCATS"
    NEW_JERSEY_NETS = "NEW JERSEY NETS"
    NEW_ORLEANS_HORNETS = "NEW ORLEANS HORNETS"
    NEW_ORLEANS_OKLAHOMA_CITY_HORNETS = "NEW ORLEANS/OKLAHOMA CITY HORNETS"
    SEATTLE_SUPERSONICS = "SEATTLE SUPERSONICS"
    VANCOUVER_GRIZZLIES = "VANCOUVER GRIZZLIES"
 53 | 
 54 | 
class OutputType(Enum):
    """The supported output formats."""
    JSON = "JSON"
    CSV = "CSV"
 58 | 
 59 | 
class OutputWriteOption(Enum):
    """Write options for output files.

    NOTE(review): the values look like file mode strings for open() -- confirm against the code that consumes them.
    """
    WRITE = "w"
    CREATE_AND_WRITE = "w+"
    APPEND = "a"
    APPEND_AND_WRITE = "a+"
 65 | 
 66 | 
class Position(Enum):
    """Player positions. Each member's value is the position name in upper case."""
    POINT_GUARD = "POINT GUARD"
    SHOOTING_GUARD = "SHOOTING GUARD"
    SMALL_FORWARD = "SMALL FORWARD"
    POWER_FORWARD = "POWER FORWARD"
    CENTER = "CENTER"
 73 | 
 74 | 
# Lookup from a three-letter team abbreviation to its Team member.
# Several abbreviations may point at the same member (e.g. 'CHO' and 'CHH' both map to Team.CHARLOTTE_HORNETS).
TEAM_ABBREVIATIONS_TO_TEAM = {
    'ATL': Team.ATLANTA_HAWKS,
    'BOS': Team.BOSTON_CELTICS,
    'BRK': Team.BROOKLYN_NETS,
    'CHI': Team.CHICAGO_BULLS,
    'CHO': Team.CHARLOTTE_HORNETS,
    'CLE': Team.CLEVELAND_CAVALIERS,
    'DAL': Team.DALLAS_MAVERICKS,
    'DEN': Team.DENVER_NUGGETS,
    'DET': Team.DETROIT_PISTONS,
    'GSW': Team.GOLDEN_STATE_WARRIORS,
    'HOU': Team.HOUSTON_ROCKETS,
    'IND': Team.INDIANA_PACERS,
    'LAC': Team.LOS_ANGELES_CLIPPERS,
    'LAL': Team.LOS_ANGELES_LAKERS,
    'MEM': Team.MEMPHIS_GRIZZLIES,
    'MIA': Team.MIAMI_HEAT,
    'MIL': Team.MILWAUKEE_BUCKS,
    'MIN': Team.MINNESOTA_TIMBERWOLVES,
    'NOP': Team.NEW_ORLEANS_PELICANS,
    'NYK': Team.NEW_YORK_KNICKS,
    'OKC': Team.OKLAHOMA_CITY_THUNDER,
    'ORL': Team.ORLANDO_MAGIC,
    'PHI': Team.PHILADELPHIA_76ERS,
    'PHO': Team.PHOENIX_SUNS,
    'POR': Team.PORTLAND_TRAIL_BLAZERS,
    'SAC': Team.SACRAMENTO_KINGS,
    'SAS': Team.SAN_ANTONIO_SPURS,
    'TOR': Team.TORONTO_RAPTORS,
    'UTA': Team.UTAH_JAZZ,
    'WAS': Team.WASHINGTON_WIZARDS,

    # DEPRECATED TEAMS
    'NJN': Team.NEW_JERSEY_NETS,
    'NOH': Team.NEW_ORLEANS_HORNETS,
    'NOK': Team.NEW_ORLEANS_OKLAHOMA_CITY_HORNETS,
    'CHA': Team.CHARLOTTE_BOBCATS,
    'CHH': Team.CHARLOTTE_HORNETS,
    'SEA': Team.SEATTLE_SUPERSONICS,
    'VAN': Team.VANCOUVER_GRIZZLIES,
}
116 | 
# Lookup from a position abbreviation to its Position member
POSITION_ABBREVIATIONS_TO_POSITION = {
    "PG": Position.POINT_GUARD,
    "SG": Position.SHOOTING_GUARD,
    "SF": Position.SMALL_FORWARD,
    "PF": Position.POWER_FORWARD,
    "C": Position.CENTER,
}
124 | 


--------------------------------------------------------------------------------
/NBApredict/run/daily.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module runs the entire NBA_bet project process daily one hour before the first game time.
  3 | 
  4 | It runs one hour before game times in order to capture the most up-to-date betting information. The project is meant to
  5 | be run from the command line. Once running, debug information from the scheduler will be printed as well as notifying
  6 | the user if a job has been successfully run. Terminate the process via a keyboard interrupt. For more details on what
  7 | happens during a scheduled job, refer to run/all.py
  8 | 
  9 | Example:
 10 |     From the project directory, run 'python -m run.daily'
 11 | """
 12 | from apscheduler.schedulers.background import BackgroundScheduler
 13 | from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR, EVENT_JOB_MISSED
 14 | from datetime import datetime, timedelta
 15 | import logging
 16 | from sqlalchemy.orm import Session
 17 | import time
 18 | 
 19 | # Local Imports
 20 | from nbapredict.database import getters
 21 | from nbapredict.database.dbinterface import DBInterface
 22 | from nbapredict.run.all import run_all
 23 | 
 24 | 
 25 | def datetime_to_dict(d_time):
 26 |     """Take a datetime and convert it to a dictionary.
 27 | 
 28 |     The output is to be used as arguments for an apshceduler cron trigger."""
 29 |     time_dict = {"year": d_time.year, "month": d_time.month, "day": d_time.day, "hour": d_time.hour,
 30 |                  "minute": d_time.minute}
 31 |     return time_dict
 32 | 
 33 | 
 34 | def job_runs(event):
 35 |     """Attached to a Scheduler as a listener that prints job status on job completion."""
 36 |     if event.exception:
 37 |         print('The job did not run')
 38 |     else:
 39 |         print('The job completed @ {}'.format(datetime.now()))
 40 | 
 41 | 
 42 | def missed_job(event):
 43 |     print('The job was missed. Scheduling a new one to run in one minute')
 44 |     run_time = datetime_to_dict(datetime.now() + timedelta(minutes=1))
 45 |     scheduler.add_job(run_all, "cron", **run_time)
 46 |     scheduler.print_jobs()
 47 | 
 48 | 
 49 | if __name__ == "__main__":
 50 |     # DBInterface setup
 51 |     database = DBInterface()
 52 |     year = 2019
 53 |     session = Session(bind=database.engine)
 54 |     sched_tbl = database.get_table_mappings("sched_{}".format(year))
 55 | 
 56 |     # Get today and the last day of the season so jobs can be scheduled from today through end of season
 57 |     start_date = datetime.date(datetime.now())
 58 |     end_date = session.query(sched_tbl.start_time).order_by(sched_tbl.start_time.desc()).first()[0]
 59 |     end_date = datetime.date(end_date)
 60 | 
 61 |     # Get every date between now and the last day of the season
 62 |     date = start_date
 63 |     game_dates = [date]
 64 |     while date <= end_date:
 65 |         date = date + timedelta(days=1)
 66 |         game_dates.append(date)
 67 | 
 68 |     # Get start times for every day in date if there are games on that day
 69 |     start_times = []
 70 |     for date in game_dates:
 71 |         first_game_time = getters.get_first_game_time_on_day(sched_tbl, session, date)
 72 |         if first_game_time:
 73 |             start_times.append(first_game_time - timedelta(hours=1))
 74 | 
 75 |     # Transform start times into chron arguments for triggers
 76 |     cron_args = [datetime_to_dict(s_time) for s_time in start_times]
 77 |     # cron_args = [datetime.now() + timedelta(minutes=i*5) for i in range(1, 2)]  # TEST
 78 |     # cron_args = [datetime_to_dict(d_time) for d_time in cron_args]  # TEST
 79 | 
 80 |     # Setup scheduler, add jobs and listeners, and start the scheduler
 81 |     scheduler = BackgroundScheduler()
 82 |     scheduler.add_listener(job_runs, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
 83 |     scheduler.add_listener(missed_job, EVENT_JOB_MISSED)
 84 |     for kwargs in cron_args:
 85 |         scheduler.add_job(run_all, "cron", **kwargs, misfire_grace_time=60)
 86 |     scheduler.start()
 87 |     scheduler.print_jobs()
 88 | 
 89 |     logging.basicConfig()
 90 |     logging.getLogger('apscheduler').setLevel(logging.DEBUG)
 91 | 
 92 |     try:
 93 |         sleep_time = 0
 94 |         while True:
 95 |             time.sleep(1)
 96 |             sleep_time += 1
 97 |             if sleep_time >= 600:
 98 |                 scheduler.wakeup()
 99 |                 sleep_time = 0
100 |     except (KeyboardInterrupt, SystemExit):
101 |         scheduler.shutdown()
102 | 


--------------------------------------------------------------------------------
/NBApredict/helpers/type.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Contains type checks and type conversion functions
  3 | """
  4 | 
  5 | from datetime import datetime
  6 | from enum import Enum
  7 | import os
  8 | 
  9 | 
 10 | def set_type(values):
 11 |     """Convert string values to integers or floats if applicable. Otherwise, return strings.
 12 | 
 13 |     If the string value has zero length, none is returned
 14 | 
 15 |     Args:
 16 |         values: A list of values
 17 | 
 18 |     Returns:
 19 |         The input list of values modified to match their type. String is the default return value. If the values are
 20 |         ints or floats, returns the list formatted as a list of ints or floats. Empty values will be replaced with none.
 21 | 
 22 |     To-Do:
 23 |         1. Add functionality to coerce elements of lists and not just lists
 24 |     """
 25 |     test_val = values[0]  # Is there a better method than taking a test val?
 26 |     if is_int(test_val):
 27 |         return _set_type(values, int)
 28 |     elif is_float(test_val):
 29 |         return _set_type(values, float)
 30 |     else:
 31 |         values = [x if len(x) > 0 else None for x in values]  # Set empty strings to None
 32 |         return values
 33 | 
 34 | 
 35 | def _set_type(values, new_type):
 36 |     """Transforms a list of values into the specified new type. If the value has zero length, returns none
 37 | 
 38 |     Args:
 39 |         values: A list of values
 40 |         new_type: A type class to modify the list to
 41 | 
 42 |     Returns:
 43 |         The values list modified to the new_type. If an element is empty, the element is set to None.
 44 |         """
 45 | 
 46 |     new_vals = []
 47 |     for i in values:
 48 |         if len(i) > 0:  # Some values may have len(0); we convert them to None to put into sql db
 49 |             new_vals.append(new_type(i))
 50 |         else:
 51 |             new_vals.append(None)
 52 |     return new_vals
 53 | 
 54 | 
 55 | def get_type(values):
 56 |     """Return the type of the values where type is defined as the modal type in the list.
 57 | 
 58 |     Args:
 59 |         values: A list or value to get the type for.
 60 | 
 61 |     Returns:
 62 |         The modal type of a list or the type of the element. Can be integer, float, string, datetime, or none
 63 | 
 64 |     To-Do:
 65 |         Modal type isn't a full proof method. Need to determine a better method.
 66 |     """
 67 |     if hasattr(values, "__len__") and (type(values) != type):  # Checks if the object is iterable
 68 |         val_types = []
 69 |         for i in values:
 70 |             val_types.append(_get_type(i))
 71 |         return max(set(val_types), key=val_types.count)  # The max, set, and key combo returns the modal type
 72 |     elif isinstance(values, Enum):  # For enum objects, pass the value to the get_type function (right choice? IDK)
 73 |         return _get_type(values.value)
 74 |     else:
 75 |         return _get_type(values)
 76 | 
 77 | 
 78 | def _get_type(val):
 79 |     """Return the type of the value if it is a int, float, or datetime. Otherwise, return a string.
 80 | 
 81 |     Args:
 82 |         val: A value to get the type of
 83 |     Returns:
 84 |         The type of the value passed into the function if it is an int, float, datetime, or string
 85 |     Raise:
 86 |         Exception: An exception raised if the val is not int, float, datetime, or string.
 87 |     """
 88 |     if isinstance(val, int):
 89 |         return "integer"
 90 |     elif isinstance(val, float):
 91 |         return "float"
 92 |     elif isinstance(val, datetime):
 93 |         return "datetime"
 94 |     elif isinstance(val, str):
 95 |         return "string"
 96 |     elif isinstance(val, bool):
 97 |         return "bool"
 98 |     elif val is None:
 99 |         return None
100 |     elif is_python_type(val):  # Handles types that are passed explicitly
101 |         return val
102 |     else:
103 |         raise Exception("Val is not an int, float, datetime, string, Bool, or None")
104 | 
105 | 
def is_int(x):
    """Return true if X can be coerced to a integer. Otherwise, return false.

    Values that int() rejects with a TypeError (e.g. None) or an OverflowError (e.g. infinity) also return false.
    """
    try:
        int(x)  # Will raise ValueError if '.2'; will not raise error if .2
    except (TypeError, ValueError, OverflowError):
        return False
    return True
113 | 
114 | 
def is_float(x):
    """Return true if X can be coerced to a float. Otherwise, return false.

    Values that float() rejects with a TypeError (e.g. None) or an OverflowError (e.g. a huge integer) also return
    false.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
122 | 
123 | 
def is_python_type(x):
    """Return true if X is one of int, float, datetime, str, bool, or None. Otherwise, return false."""
    return x in (int, float, datetime, str, bool, None)
129 | 


--------------------------------------------------------------------------------
/NBApredict/helpers/classes.py:
--------------------------------------------------------------------------------
  1 | """Generic classes used throughout the project"""
  2 | 
  3 | 
  4 | class NestedDict:
  5 |     """NestedDict allows multi-level dictionaries which """
  6 | 
  7 |     def __init__(self, *args, **kwargs):
  8 |         """Creates a standard dictionary as a class property"""
  9 |         self.dict = dict(*args, **kwargs)
 10 | 
 11 |     def __getitem__(self, keys):
 12 |         """Returns the value for key and accepts iterables as keys to reach lower level branches of the dict."""
 13 |         # Allows getting top-level branch when a single key was provided
 14 |         if not isinstance(keys, tuple):
 15 |             if isinstance(keys, str) or isinstance(keys, int):  # Handles single item lists or strings
 16 |                 keys = (keys,)
 17 |             else:
 18 |                 keys = tuple(keys)
 19 | 
 20 |         branch = self.dict
 21 |         for key in keys:
 22 |             branch = branch[key]
 23 | 
 24 |         # If we return a branch, and not a leaf value, we wrap it into a NestedDict
 25 |         return NestedDict(branch).dict if isinstance(branch, dict) else branch
 26 | 
 27 |     def __setitem__(self, keys, value):
 28 |         # Allows setting top-level item when a single key was provided
 29 |         if not isinstance(keys, tuple):
 30 |             if len(keys) < 2:
 31 |                 keys = (*keys,)
 32 |             else:
 33 |                 keys = tuple(keys)
 34 | 
 35 |         branch = self.dict
 36 |         for key in keys[:-1]:
 37 |             if key not in branch:
 38 |                 branch[key] = {}
 39 |             branch = branch[key]
 40 |         branch[keys[-1]] = value
 41 | 
 42 |     def __keys__(self, depth=0):
 43 |         """Does not yet function
 44 | 
 45 |         Notes on next steps in the __recurse_keys__ function
 46 |         """
 47 |         keys = [[k] for k in self.dict.keys()]
 48 |         for k in keys:
 49 |             branch = self[k[0]]
 50 |             self.__recurse_keys__(key=k[0], branch=branch, key_list=k)
 51 |             branch = self[k[0]]
 52 |             if isinstance(branch, dict):
 53 |                 if len(branch.keys()) > 1:
 54 |                     b_keys = list(branch.keys())
 55 |                     length = len(b_keys)
 56 |                     # Copies of k to append the keys in the last layer to. When multiple keys are in the last layer,
 57 |                     # we need new tress to capture all key paths
 58 |                     new_trees = [k for _ in range(length)]
 59 |                     k.append(b_keys[0])  # Add the first key to the original tree
 60 |                     for i in range(1, length):
 61 |                         tree = new_trees[i]
 62 |                         tree.append(b_keys[i])
 63 |                         keys.append(tree)
 64 |                 else:
 65 |                     k.append(list(branch.keys())[0])
 66 |             while not isinstance(branch, dict):
 67 |                 branch_keys = [[bk] for bk in branch.dict.keys()]
 68 | 
 69 |         return keys
 70 | 
 71 |     @staticmethod
 72 |     def __recurse_keys__(branch, key_list, depth=0):
 73 |         """Not Functional
 74 | 
 75 |         Waiting to finish this up. There's several issues.
 76 |         1. You almost have to recurse through the tree of keys which can be a heavy computation
 77 |         2. The leaf of a branch needs to be handled in a different manner than branches along the way. The leaf will
 78 |         be a list itself, and it needs to be reformatted when finished.
 79 |         3. Finally, it may need a completely different implementation. Look at it with fresh eyes when you next work on
 80 |         it. """
 81 |         b_keys = list(branch.keys())
 82 |         if isinstance(branch, dict):
 83 |             if len(b_keys) > 1:
 84 |                 length = len(b_keys)
 85 |                 # Copies of k to append the keys in the last layer to. When multiple keys are in the last layer,
 86 |                 # we need new tress to capture all key paths
 87 |                 new_trees = [key_list for _ in range(1, length)]
 88 |                 key_list.append(b_keys[0])  # Add the first key to the original tree
 89 |                 print(key_list)
 90 |                 combined_k_lists = [key_list]
 91 |                 for i in range(length-1):
 92 |                     tree = new_trees[i]
 93 |                     print(tree)
 94 |                     print(b_keys)
 95 |                     tree.append(b_keys[i])
 96 |                     combined_k_lists.append(tree)
 97 |                 return combined_k_lists
 98 |             else:
 99 |                 return key_list.append(list(branch.keys())[0])
100 |         else:  # we have a nested dict
101 |             pass


--------------------------------------------------------------------------------
/NBApredict/br_web_scraper/output.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | import json
  3 | 
  4 | from nbapredict.helpers.br_references import OutputType, OutputWriteOption
  5 | 
# Column names, in order, of the box score CSV written by box_scores_to_csv()
box_score_fieldname = [
    "name",
    "team",
    "location",
    "opponent",
    "outcome",
    "seconds_played",
    "made_field_goals",
    "attempted_field_goals",
    "made_three_point_field_goals",
    "attempted_three_point_field_goals",
    "made_free_throws",
    "attempted_free_throws",
    "offensive_rebounds",
    "defensive_rebounds",
    "assists",
    "steals",
    "blocks",
    "turnovers",
    "personal_fouls",
    "game_score",
]

# Column names, in order, of the schedule CSV written by schedule_to_csv()
game_fieldname = [
    "start_time",
    "away_team",
    "away_team_score",
    "home_team",
    "home_team_score",
]

# Keyword arguments passed to json.dump / json.dumps by output() unless overridden through json_options
default_json_options = {
    "sort_keys": True,
    "indent": 4,
}
 41 | 
 42 | 
 43 | def merge_two_dicts(first, second):
 44 |     combined = first.copy()
 45 |     combined.update(second)
 46 |     return combined
 47 | 
 48 | 
 49 | def output(values, output_type, output_file_path, encoder, csv_writer, output_write_option=None, json_options=None):
 50 |     if output_type is None:
 51 |         return values
 52 | 
 53 |     write_option = OutputWriteOption.WRITE if output_write_option is None else output_write_option
 54 | 
 55 |     if output_type == OutputType.JSON:
 56 |         options = default_json_options if json_options is None else merge_two_dicts(first=default_json_options, second=json_options)
 57 |         if output_file_path is None:
 58 |             return json.dumps(values, cls=encoder, **options)
 59 |         else:
 60 |             with open(output_file_path, write_option.value, newline="") as json_file:
 61 |                 return json.dump(values, json_file, cls=encoder, **options)
 62 | 
 63 |     if output_type == OutputType.CSV:
 64 |         if output_file_path is None:
 65 |             raise ValueError("CSV output must contain a file path")
 66 |         else:
 67 |             return csv_writer(rows=values, output_file_path=output_file_path, write_option=write_option)
 68 | 
 69 |     raise ValueError("Unknown output type: {output_type}".format(output_type=output_type))
 70 | 
 71 | # I wrote the explicit mapping of CSV values because there didn't seem to be a way of outputting the values of enums
 72 | # without doing it this way
 73 | 
 74 | 
 75 | def box_scores_to_csv(rows, output_file_path, write_option):
 76 |     with open(output_file_path, write_option.value, newline="") as csv_file:
 77 |         writer = csv.DictWriter(csv_file, fieldnames=box_score_fieldname)
 78 |         writer.writeheader()
 79 |         writer.writerows(
 80 |             {
 81 |                 "name": row["name"],
 82 |                 "team": row["team"].value,
 83 |                 "location": row["location"].value,
 84 |                 "opponent": row["opponent"].value,
 85 |                 "outcome": row["outcome"].value,
 86 |                 "seconds_played": row["seconds_played"],
 87 |                 "made_field_goals": row["made_field_goals"],
 88 |                 "attempted_field_goals": row["attempted_field_goals"],
 89 |                 "made_three_point_field_goals": row["made_three_point_field_goals"],
 90 |                 "attempted_three_point_field_goals": row["attempted_three_point_field_goals"],
 91 |                 "made_free_throws": row["made_free_throws"],
 92 |                 "attempted_free_throws": row["attempted_free_throws"],
 93 |                 "offensive_rebounds": row["offensive_rebounds"],
 94 |                 "defensive_rebounds": row["defensive_rebounds"],
 95 |                 "assists": row["assists"],
 96 |                 "steals": row["steals"],
 97 |                 "blocks": row["blocks"],
 98 |                 "turnovers": row["turnovers"],
 99 |                 "personal_fouls": row["personal_fouls"],
100 |                 "game_score": row["game_score"],
101 |             } for row in rows
102 |         )
103 | 
104 | 
def schedule_to_csv(rows, output_file_path, write_option):
    """Write game dictionaries to a CSV file with a header row.

    Args:
        rows: An iterable of game dictionaries containing every key in game_fieldname
        output_file_path: The path of the CSV file
        write_option: An object whose value is the mode the file is opened with
    """
    # Fields stored as enum members; their .value is written instead of the member itself
    enum_fields = ("away_team", "home_team")

    def to_csv_row(game):
        return {
            field: game[field].value if field in enum_fields else game[field]
            for field in game_fieldname
        }

    with open(output_file_path, write_option.value, newline="") as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=game_fieldname)
        csv_writer.writeheader()
        csv_writer.writerows(to_csv_row(game) for game in rows)
118 | 


--------------------------------------------------------------------------------
/NBApredict/br_web_scraper/parsers/schedule.py:
--------------------------------------------------------------------------------
  1 | from lxml import html
  2 | import datetime
  3 | import pytz
  4 | 
  5 | from nbapredict.helpers.br_references import Team
  6 | 
# Lookup from a Team member's value to the member itself, built from every member of Team
TEAM_NAME_TO_TEAM = {
    member.value: member
    for (_, member) in Team.__members__.items()
}

# Explicit entry for the combined New Orleans/Oklahoma City name.
# NOTE(review): presumably already produced by the comprehension above if Team defines this member with the same
# value -- confirm before removing
TEAM_NAME_TO_TEAM["NEW ORLEANS/OKLAHOMA CITY HORNETS"] = Team.NEW_ORLEANS_OKLAHOMA_CITY_HORNETS
 13 | 
 14 | 
 15 | def parse_start_time(formatted_date, formatted_time_of_day):
 16 |     if formatted_time_of_day is not None and formatted_time_of_day not in ["", " "]:
 17 |         # Starting in 2018, the start times had a "p" or "a" appended to the end
 18 |         # Between 2001 and 2017, the start times had a "pm" or "am"
 19 |         #
 20 |         # https://www.basketball-reference.com/leagues/NBA_2018_games.html
 21 |         # vs.
 22 |         # https://www.basketball-reference.com/leagues/NBA_2001_games.html
 23 |         is_prior_format = formatted_time_of_day[-2:] == "am" or formatted_time_of_day[-2:] == "pm"
 24 | 
 25 |         # If format contains only "p" or "a" add an "m" so it can be parsed by datetime module
 26 |         if is_prior_format:
 27 |             combined_formatted_time = formatted_date + " " + formatted_time_of_day
 28 |         else:
 29 |             combined_formatted_time = formatted_date + " " + formatted_time_of_day + "m"
 30 | 
 31 |         if is_prior_format:
 32 |             start_time = datetime.datetime.strptime(combined_formatted_time, "%a, %b %d, %Y %I:%M %p")
 33 |         else:
 34 |             start_time = datetime.datetime.strptime(combined_formatted_time, "%a, %b %d, %Y %I:%M%p")
 35 |     else:
 36 |         start_time = datetime.datetime.strptime(formatted_date, "%a, %b %d, %Y")
 37 | 
 38 |     # All basketball reference times seem to be in Eastern
 39 |     est = pytz.timezone("US/Eastern")
 40 |     localized_start_time = est.localize(start_time)
 41 | 
 42 |     # When localized_start_time calls and returns astimezone(pytz.utc), the values are converted to UTC.
 43 |     # In this call, the day of the game can be changed. For example, an 10pm game on October 16th may be converted to a
 44 |     # 2am game in on October 17th in UTC. To avoid this effect, return localized_start_time
 45 |     return localized_start_time
 46 |     # return localized_start_time.astimezone(pytz.utc)
 47 | 
 48 | 
 49 | def current_time():
 50 |     now = datetime.datetime.now()
 51 |     est = pytz.timezone("US/Eastern")
 52 |     localized_now_time = est.localize(now)
 53 |     return localized_now_time.astimezone(pytz.utc)
 54 | 
 55 | 
 56 | def parse_game(row):
 57 |     start_time = parse_start_time(formatted_date=row[0].text_content(), formatted_time_of_day=row[1].text_content())
 58 | 
 59 |     # Test existed to check for games that haven't been played. Replaced to default unplayed games to 0-0 score
 60 |     #try:
 61 |     #    test = int(row[3].text_content())
 62 |     #except:
 63 |     #    print("invalid test")
 64 | 
 65 |     try:
 66 |         away_team_score = int(row[3].text_content())
 67 |         home_team_score = int(row[5].text_content())
 68 |     except:
 69 |         away_team_score = 0
 70 |         home_team_score = 0
 71 |     return {
 72 |         "start_time": start_time,
 73 |         "away_team": TEAM_NAME_TO_TEAM[row[2].text_content().upper()],
 74 |         "away_team_score": away_team_score,
 75 |         "home_team": TEAM_NAME_TO_TEAM[row[4].text_content().upper()],
 76 |         "home_team_score": home_team_score,
 77 |     }
 78 | 
 79 | 
 80 | def parse_schedule(page):
 81 |     tree = html.fromstring(page)
 82 |     rows = tree.xpath('//table[@id="schedule"]//tbody/tr')
 83 |     schedule = []
 84 |     for row in rows:
 85 |         if row.text_content() != "Playoffs":
 86 |             start_time = parse_start_time(formatted_date=row[0].text_content(),
 87 |                                           formatted_time_of_day=row[1].text_content())
 88 |             # now = current_time()
 89 |             # Scrape all data up to 'yesterday'; Don't scrape for today as in progress games create errors
 90 |             # if (start_time.month == now.month) and (start_time.day > (now.day - 1)):
 91 |             #    break
 92 |             # elif start_time > now:
 93 |             #    break
 94 |             schedule.append(parse_game(row))
 95 |         if row.text_content() == "Playoffs":
 96 |             pass  # An extraneous text_content() that arises when games switch from regular season to playoffs
 97 |     return schedule
 98 | 
 99 | 
def parse_schedule_for_month_url_paths(page):
    """Collect the link target of every month in the page's filter bar except the one marked as current.

    Args:
        page: The HTML content of a schedule page

    Returns:
        A list of href values, one per month link, in document order
    """
    document = html.fromstring(page)
    month_links = document.xpath('//div[@id="content"]/div[@class="filter"]/div[not(contains(@class, "current"))]/a')
    return [link.attrib['href'] for link in month_links]
104 | 


--------------------------------------------------------------------------------
/NBApredict/scrapers/team_scraper.py:
--------------------------------------------------------------------------------
  1 | """
  2 | team_scraper scrapes and stores team stats from basketball reference.
  3 | 
  4 | By default, it scrapes miscellaneous stats from 2019. Alternate years and tables may be scraped though functionality is
  5 | not yet guaranteed. The scraped tables are written to the specified database.
  6 | 
  7 | ToDo:
  8 |     1. Create a method for stripping extraneous characters from team-names. If querying a historical season (<2001),
  9 |     the teams that made the playoffs have a '*' appended that we want to strip from the team-name
 10 | """
 11 | 
 12 | from bs4 import BeautifulSoup  # Requires lxml to be installed as well
 13 | from datetime import datetime
 14 | import re
 15 | import requests
 16 | 
 17 | # Local imports.
 18 | from nbapredict.configuration import Config
 19 | from nbapredict.helpers.br_references import BASE_URL
 20 | from nbapredict.helpers.br_references import data_stat_headers as headers
 21 | from nbapredict.helpers import type
 22 | 
 23 | 
 24 | def team_statistics(tbl_name):
 25 |     """Build a URL for the specified year and return team statistics for the specified table on that page.
 26 | 
 27 |     Performance not guaranteed for tables that are not "misc_stats"
 28 | 
 29 |     Args:
 30 |         tbl_name: The name of the table to be returned
 31 | 
 32 |     Returns:
 33 |         A dictionary version of the specified table. Keys are column titles that return lists ordered by team.
 34 |     """
 35 | 
 36 |     url = '{BASE_URL}/leagues/NBA_{year}.html'.format(
 37 |         BASE_URL=BASE_URL,  # imported from br_references.py
 38 |         year=Config.get_property("league_year")
 39 |     )
 40 | 
 41 |     response = requests.get(url=url, allow_redirects=False)
 42 |     if 200 <= response.status_code < 300:
 43 |         scrape_time = datetime.now()
 44 |         return parse_table(response.content, tbl_name, scrape_time)  # Note that this uses the .content attribute
 45 | 
 46 |     raise Exception("Could not connect to URL")
 47 | 
 48 | 
 49 | def parse_table(page, tbl_name, scrape_time):
 50 |     """Parse the specified table on the specified page and return the data as a dictionary
 51 | 
 52 |      Args:
 53 |          page: The contents from a url response
 54 |          tbl_name: the desired table to be parsed
 55 | 
 56 |      Returns:
 57 |          A dictionary version of the specified table. Keys are column titles that return lists ordered by team.
 58 |      """
 59 | 
 60 |     cleaned_soup = BeautifulSoup(re.sub('<!--|-->', "", str(page)), features="lxml")  # Strips comments from page
 61 |     table = cleaned_soup.find('table', {'id': '{}'.format(tbl_name)})
 62 |     data_dict = get_data_dict_from_tbl(table)
 63 |     keys = data_dict.keys()
 64 |     for key in keys:
 65 |         data_dict[key] = type.set_type(data_dict[key])
 66 |     # Add a scrape time for each row in the dictionary
 67 |     data_dict['scrape_time'] = [scrape_time for i in range(len(data_dict[key]))]
 68 |     return data_dict
 69 | 
 70 | 
 71 | def get_data_dict_from_tbl(table):
 72 |     """Return a dictionary from a BeautifulSoup table with column names as keys and a list of values
 73 | 
 74 |     Args:
 75 |         table: a table as returned by the find method on a BeautifulSoup object
 76 |     """
 77 |     rows = table.find_all("tr")
 78 |     data_dict = dict()
 79 | 
 80 |     for row in rows:
 81 |         if row.find('th', {"scope": "row"}) is not None:
 82 |             for head in headers:
 83 |                 cell = row.find("td", {"data-stat": head})
 84 |                 a = cell.text.strip().encode()
 85 |                 cell_data = a.decode("utf-8")
 86 | 
 87 |                 if head in data_dict:
 88 |                     data_dict[head].append(cell_data)
 89 |                 else:
 90 |                     data_dict[head] = [cell_data]
 91 | 
 92 |     return data_dict
 93 | 
 94 | 
 95 | def clean_team_name(team_names):
 96 |     """Take a list of team_names, modify the names to match the format specified in br_references, and return a new list
 97 | 
 98 |     Args:
 99 |         team_names: a list of team_names to be checked for validity, and if needed, modified
100 |     """
101 |     new_team_names = []
102 |     for team in team_names:
103 |         new_team_names.append(''.join(a for a in team if a.isalpha() or a.isspace() or a.isdigit()).upper())
104 |     return new_team_names
105 | 
106 | 
def scrape(tbl_name="misc_stats"):
    """Fetch a basketball reference table of team stats and return it with cleaned team names

    Args:
        tbl_name: The name of the table to scrape on basketballreference.com

    Returns:
        The table dictionary from team_statistics with its "team_name" column passed through clean_team_name
    """
    stats = team_statistics(tbl_name)
    cleaned_names = clean_team_name(stats["team_name"])
    stats["team_name"] = cleaned_names
    return stats
118 | 
119 | 
# When executed as a script, run a scrape of the default table
if __name__ == "__main__":
    scrape()
122 | 


--------------------------------------------------------------------------------
/NBApredict/management/etl.py:
--------------------------------------------------------------------------------
  1 | """ ETL (Extract Transform Load) manages data scraping, modification, table creation, and data loading.
  2 | 
  3 | Main() calls the necessary ETL functions from scrapers and management.tables for all tables.
  4 | 
  5 | Tables:
  6 |     teams
  7 |     schedule
  8 |     odds
  9 |     team_stats
 10 | """
 11 | 
 12 | from datetime import datetime
 13 | from datatotable.database import Database
 14 | from datatotable.data import DataOperator
 15 | from nbapredict.configuration import Config
 16 | import nbapredict.management
 17 | import nbapredict.management.conversion as convert
 18 | from nbapredict.management.tables import teams, team_stats, odds, schedule
 19 | from nbapredict.scrapers import team_scraper, line_scraper, season_scraper
 20 | 
 21 | 
 22 | def main(db):
 23 |     year = Config.get_property("league_year")
 24 |     session = nbapredict.management.Session(bind=db.engine)
 25 | 
 26 |     # ~~~~~~~~~~~~~
 27 |     # Teams
 28 |     # ~~~~~~~~~~~~~
 29 |     team_dict = team_scraper.scrape()
 30 |     teams_data = DataOperator({"team_name": team_dict["team_name"]})
 31 |     teams_tbl_name = "teams_{}".format(year)
 32 |     if not db.table_exists(teams_tbl_name):
 33 |         teams.create_team_table(db=db, teams_data=teams_data, tbl_name=teams_tbl_name)
 34 |         teams_tbl = db.table_mappings[teams_tbl_name]
 35 |         session.add_all([teams_tbl(**row) for row in teams_data.rows])
 36 |         session.commit()
 37 |         del teams_tbl
 38 | 
 39 |     # ~~~~~~~~~~~~~
 40 |     # Team Stats
 41 |     # ~~~~~~~~~~~~~
 42 |     team_stats_tbl_name = "team_stats_{}".format(year)
 43 |     teams_tbl = db.table_mappings[teams_tbl_name]
 44 |     team_dict['team_id'] = team_dict.pop('team_name')
 45 |     team_dict['team_id'] = convert.values_to_foreign_key(session=session, foreign_tbl=teams_tbl, foreign_key="id",
 46 |                                                          foreign_value="team_name", child_data=team_dict['team_id'])
 47 |     # When team_stats_tbl is created, the teams_tbl automap object is changed. The changed format does not follow
 48 |     # the expected behavior of an automapped table. I suspect this is because a relationship is established.
 49 |     # If we reloaded, teams_tbl works fine. Therefore, delete the variable here for now
 50 |     del teams_tbl
 51 |     team_dict['scrape_date'] = [datetime.date(s_time) for s_time in team_dict['scrape_time']]
 52 |     team_stats_data = DataOperator(team_dict)
 53 |     if not db.table_exists(team_stats_tbl_name):
 54 |         team_stats.create_table(db=db, team_stats_data=team_stats_data, tbl_name=team_stats_tbl_name)
 55 |         team_stats_tbl = db.table_mappings[team_stats_tbl_name]
 56 |         session.add_all([team_stats_tbl(**row) for row in team_stats_data.rows])
 57 |         session.commit()
 58 |     else:
 59 |         team_stats_tbl = db.table_mappings[team_stats_tbl_name]
 60 |         team_stats.insert(session, team_stats_tbl, team_stats_data)
 61 | 
 62 |     # ~~~~~~~~~~~~~
 63 |     # Schedule
 64 |     # ~~~~~~~~~~~~~
 65 |     schedule_dict = season_scraper.scrape()
 66 |     schedule_data = DataOperator(schedule_dict)
 67 |     teams_tbl = db.table_mappings['teams_{}'.format(year)]
 68 |     schedule_data = schedule.format_data(session=session, schedule_data=schedule_data,
 69 |                                          team_tbl=teams_tbl, team_stats_tbl=team_stats_tbl)
 70 |     schedule_tbl_name = "schedule_{}".format(year)
 71 |     if not db.table_exists(schedule_tbl_name):
 72 |         schedule.create_table(db, schedule_data, schedule_tbl_name, teams_tbl, team_stats_tbl)
 73 |         schedule_tbl = db.table_mappings[schedule_tbl_name]
 74 |         session.add_all([schedule_tbl(**row) for row in schedule_data.rows])
 75 |         session.commit()
 76 |     else:
 77 |         schedule_tbl = db.table_mappings[schedule_tbl_name]
 78 |         update_rows = schedule.update_table(session, schedule_data, schedule_tbl, team_stats_tbl)
 79 |         session.add_all(update_rows)
 80 |         session.commit()
 81 | 
 82 |     # ~~~~~~~~~~~~~
 83 |     # Odds
 84 |     # ~~~~~~~~~~~~~
 85 |     odds_dict = line_scraper.scrape()
 86 |     odds_data = None
 87 |     if odds_dict:
 88 |         odds_dict = odds.format_data(session, odds_dict, teams_tbl, schedule_tbl)
 89 |         odds_data = DataOperator(odds_dict)
 90 |     # Evaluate if you have the correct columns in odds_data (i.e. home\away team id's)
 91 |     odds_tbl_name = "odds_{}".format(year)
 92 |     if not db.table_exists(odds_tbl_name) and odds_data:
 93 |         odds.create_table(db, odds_tbl_name, odds_data, schedule_tbl)
 94 |         odds_tbl = db.table_mappings[odds_tbl_name]
 95 |         session.add_all(odds_tbl(**row) for row in odds_data.rows)
 96 |         session.commit()
 97 |     elif odds_data:
 98 |         odds_tbl = db.table_mappings[odds_tbl_name]
 99 |         session.add_all(odds_tbl(**row) for row in odds_data.rows)
100 |         session.commit()
101 |         odds.update_table(session, odds_tbl, odds_data)
102 |         session.commit()
103 |         odds.delete(session, odds_tbl)
104 | 
105 |     session.close()
106 | 
# When executed as a script, run the full ETL against a Database built from the name "test" and the configured
# "outputs" property
if __name__ == "__main__":
    db = Database("test", Config.get_property("outputs"))
    main(db)
110 | 


--------------------------------------------------------------------------------
/NBApredict/management/conversion.py:
--------------------------------------------------------------------------------
 1 | """Conversion contains functions to grease interoperability between tables. At the moment, this consists of the
 2 | values_to_foreign_key function."""
 3 | 
 4 | from nbapredict.helpers.classes import NestedDict
 5 | import pandas as pd
 6 | import sqlalchemy
 7 | 
 8 | 
 9 | def values_to_foreign_key(session, foreign_tbl, foreign_key, foreign_value, child_data):
10 |     """Return values from child data that exist in the foreign_tbl transformed into foreign key values
11 | 
12 |     Args:
13 |         session: A sqlalchemy session
14 |         foreign_tbl: The foreign table mapping child data references
15 |         foreign_key: The name of the column containing foreign key values
16 |         foreign_value: The name of the column containing values to match with child data
17 |         child_data: A list of data with values contained in foreign value
18 | 
19 |     Returns:
20 |          A list of values from the foreign key column that correspond to child data's relationship to the foreign values
21 |     """
22 |     # past 999 the SQLite backend raises a "too many variables warning". Here, we presume we don't have >999 unique
23 |     # values in child_data. Rather, presume we have < 999 unique values and take a set of the data.
24 |     set_data = set()
25 |     if len(child_data) > 999:
26 |         set_data = set(child_data)
27 |     if type(foreign_tbl) == sqlalchemy.sql.selectable.Alias:
28 |         conversion_dict = _values_to_foreign_key(session, foreign_tbl, foreign_key, foreign_value,
29 |                                                  set_data or child_data)
30 |         return [conversion_dict[i] for i in child_data]
31 |     else:
32 |         key_column = [getattr(foreign_tbl, foreign_key)]
33 |         if isinstance(child_data, dict):
34 |             composite_fd = True  # Composite functional dependency, two+ columns required to identify unique key
35 |             value_columns = [getattr(foreign_tbl, val) for val in child_data.keys()]
36 |             keys = list(child_data.keys())
37 |             filters = [value_columns[i].in_(child_data[keys[i]]) for i in range(len(keys))]
38 |         else:
39 |             composite_fd = False
40 |             value_columns = [getattr(foreign_tbl, foreign_value)]
41 |             filters = [value_columns[0].in_(set_data or child_data)]
42 | 
43 |         rows = session.query(*key_column, *value_columns).distinct().filter(*filters).all()
44 | 
45 |         if composite_fd:
46 |             nested_conversion_dict = NestedDict()
47 |             for r in rows:
48 |                 # multi-valued key with the foreign key as the value
49 |                 nested_conversion_dict[[col for col in r[1:]]] = r[0]
50 | 
51 |             # Generate a list of lists with the values in each row of child data
52 |             # These values form keys for the foreign keys stored in the nested_conversion_dict which is returned
53 |             conversion_keys = []
54 |             length = len(child_data[list(child_data.keys())[0]])
55 |             for i in range(length):
56 |                 conversion_keys.append([child_data[k][i] for k in child_data.keys()])
57 |             return [nested_conversion_dict[k] for k in conversion_keys]
58 |         else:
59 |             conversion_dict = {getattr(row, foreign_value): getattr(row, foreign_key) for row in rows}
60 |             return [conversion_dict[i] for i in child_data]
61 | 
62 | 
def _values_to_foreign_key(session, foreign_subquery, foreign_key, foreign_value, child_data):
    """Build a mapping from foreign values to foreign keys for the child data values found in a subquery

    This is the subquery counterpart of values_to_foreign_key(): columns of a subquery are reached through its
    .c collection rather than as attributes. The function presumes child_data has already been modified if
    necessary. NOTE: this does not support multi-column conversions of child_data to foreign key.

    Args:
        session: A sqlalchemy session
        foreign_subquery: A subquery which is an Alias class in sqlalchemy. These classes are created when
        subquery() is appended to a sqlalchemy query statement
        foreign_key: The name of the column containing foreign key values
        foreign_value: The name of the column containing values to match with child data
        child_data: A list of data with values contained in foreign value

    Returns:
         A conversion dict that maps child_data to foreign keys
    """
    key_col = getattr(foreign_subquery.c, foreign_key)
    value_col = getattr(foreign_subquery.c, foreign_value)
    matches = session.query(key_col, value_col).filter(value_col.in_(child_data)).all()

    mapping = {}
    for match in matches:
        mapping[getattr(match, foreign_value)] = getattr(match, foreign_key)
    return mapping
84 | 
85 | 
def convert_sql_statement_to_table(session, sql_statement, qualifiers=False):
    """Run a sql statement, load the result into a pandas dataframe, and return it, optionally narrowed by qualifiers

    Args:
        session: SQLalchemy session object; its bind is used as the connection
        sql_statement: A sql_statement. Typically, this is the statement property of an object returned by a query
        such as session.query(tbl).statement
        qualifiers: A list of columns or a function used to index the dataframe. When falsy, the whole dataframe
        is returned.
    """
    frame = pd.read_sql(sql_statement, session.bind)
    return frame[qualifiers] if qualifiers else frame
99 | 


--------------------------------------------------------------------------------
/NBApredict/models/graphing.py:
--------------------------------------------------------------------------------
  1 | """
  2 | graphing contains functions for creating evaluative graphs for regressions
  3 | """
  4 | 
  5 | import math
  6 | import matplotlib.pyplot as plt
  7 | import numpy as np
  8 | import scipy.stats as sci_stats
  9 | from sklearn.linear_model import LinearRegression
 10 | import statistics as stats
 11 | import statsmodels.api as sm
 12 | from statsmodels.compat import lzip
 13 | from yellowbrick.regressor import ResidualsPlot
 14 | 
 15 | 
 16 | def pred_vs_actual(predictions, target, r_squared, out_path=None):
 17 |     """Create and returnsa scatter plot of a model's predictions versus target variables
 18 | 
 19 |     Args:
 20 |         predictions: The predictions from a regression
 21 |         target: The target variable of a regression
 22 |         r_squared: The r_squared of a regression
 23 |         out_path: An optional path to save the graph to
 24 |     Returns:
 25 |         The predicted vs. actual graph
 26 |     """
 27 | 
 28 |     # Generate coordinates for a 1:1 line
 29 |     minimum = int(predictions.min()) - 1
 30 |     maximum = int(predictions.max()) + 1
 31 |     diag_line_x = [i for i in range(minimum, maximum)]
 32 |     diag_line_y = [i for i in diag_line_x]
 33 | 
 34 |     # Build Scatterplot
 35 |     fig, ax = plt.subplots()
 36 |     ax.scatter(predictions, target)
 37 |     ax.set_title("Predicted vs. Actual")
 38 |     ax.set_xlabel("Predicted")
 39 |     ax.set_ylabel("Actual")
 40 |     ax.axhline(0, c="k", linewidth=0.25)
 41 |     ax.plot(diag_line_x, diag_line_y, c="r")
 42 |     ax.text(0.1, 0.9, "R^2 = {}".format(r_squared), transform=ax.transAxes, bbox=dict(fill=False))
 43 | 
 44 |     if out_path:
 45 |         fig.savefig(fname=out_path)
 46 |     return fig
 47 | 
 48 | 
 49 | def residuals_vs_fitted(predictions, residuals, out_path=None):
 50 |     """Create and return a scatter plot of a model's fitted values (predictions) versus the residuals
 51 | 
 52 |     Args:
 53 |         predictions: The predictions from a regression
 54 |         residuals: The residuals from a regression
 55 |         out_path: An optional path to save the graph to
 56 | 
 57 |     Returns:
 58 |         The residuals vs. fitted graph
 59 |     """
 60 |     # Get Jarque-bera test of normality
 61 |     name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
 62 |     test = sm.stats.jarque_bera(residuals)
 63 |     jarque_bera = lzip(name, test)
 64 |     p_value = jarque_bera[1][1]
 65 | 
 66 |     mu = 0
 67 |     variance = stats.variance(residuals)
 68 |     sigma = math.sqrt(variance)
 69 |     x = np.linspace(mu-4*sigma, mu+4*sigma, 100)
 70 | 
 71 |     # Build Scatterplot
 72 |     fig, ax = plt.subplots(nrows=1, ncols=2, gridspec_kw={'width_ratios': [3, 1]})
 73 |     ax[0].scatter(predictions, residuals)
 74 |     ax[0].set_title("Residuals vs. Fitted Values")
 75 |     ax[0].set_xlabel("Fitted Values")
 76 |     ax[0].set_ylabel("Residuals")
 77 |     ax[0].axhline(0, c="k", linewidth=0.5)
 78 |     ax[1].hist(residuals, bins=30, orientation="horizontal")
 79 |     # ax[1].set_xticks(np.linspace(0, round(ax[1].get_xbound()[1]), 3))
 80 |     ax2 = ax[1].twiny()
 81 |     # ax2.set_xticks(np.linspace(0, round(ax2.get_xbound()[1], 2), 3))
 82 |     ax2.plot(sci_stats.norm.pdf(x, mu, sigma), x, color="red")
 83 |     ax[1].set_xlabel("Frequency")
 84 |     ax[1].set_title("Residual Distribution")
 85 |     fig.tight_layout()
 86 |     align_xaxis(ax[1], 0, ax2, 0)
 87 |     if out_path:
 88 |         fig.savefig(out_path)
 89 |     return fig
 90 | 
 91 | 
 92 | def cooks_distance(cooks_d, out_path=None):
 93 |     """Create and return a cook's distance graph
 94 | 
 95 |     Args:
 96 |         cooks_d: Cook's distance from a regression
 97 |         out_path: optional path to save the figure to
 98 |     Returns:
 99 |         The cook's distance graph
100 |     """
101 |     fig, ax = plt.subplots()
102 |     ax.stem(np.arange(len(cooks_d)), cooks_d)
103 |     ax.set_title("Cook's Distance")
104 |     ax.set_xlabel("Residuals")
105 |     ax.set_ylabel("Cook's Distance")
106 |     if out_path:
107 |         fig.savefig(out_path)
108 |     return fig
109 | 
110 | 
def residual_independence(residuals):
    """Create a stem plot of the residuals in row order to check for independence.

    Row number on X-axis, Residual on Y-axis

    Args:
        residuals: Pandas series holding residuals

    Returns:
        The residual independence graph
    """
    row_numbers = list(range(len(residuals)))
    figure, axes = plt.subplots()
    axes.stem(row_numbers, residuals)
    axes.set_title("Residual Independence")
    axes.set_xlabel("Row Number")
    axes.set_ylabel("Residual")
    return figure
126 | 
127 | 
def align_xaxis(ax1, v1, ax2, v2):
    """Shift the x limits of ax2 so that x value v2 in ax2 is aligned with x value v1 in ax1

    Both limits of ax2 are moved by the same amount, so the width of its x range is unchanged.

    Args:
        ax1: The reference axes
        v1: The x value in ax1 to align to
        ax2: The axes whose x limits are adjusted
        v2: The x value in ax2 to bring into alignment with v1
    """
    # Horizontal display coordinates of the two values. The x component is the one that matters here; measuring
    # the y component gives an offset unrelated to where the values sit along the x-axis.
    x1, _ = ax1.transData.transform((v1, 0))
    x2, _ = ax2.transData.transform((v2, 0))
    # Convert the display-space gap back into ax2 data units and shift both limits by it
    inv = ax2.transData.inverted()
    dx, _ = inv.transform((0, 0)) - inv.transform((x1 - x2, 0))
    minx, maxx = ax2.get_xlim()
    ax2.set_xlim(minx + dx, maxx + dx)
136 | 
137 | 
def residuals_yellowbrick(predictors, target):
    """Fit a yellowbrick ResidualsPlot that wraps a linear regression and return the visualizer

    Args:
        predictors: The predictor variables passed to the visualizer's fit method
        target: The target variable passed to the visualizer's fit method

    Returns:
        The fitted ResidualsPlot visualizer
    """
    # The visualizer needs an estimator instance. Passing the LinearRegression class itself makes the later fit
    # call fail with a missing-argument TypeError.
    lm = LinearRegression()
    visualizer = ResidualsPlot(lm)
    visualizer.fit(predictors, target)
    return visualizer


--------------------------------------------------------------------------------
/NBApredict/scrapers/season_scraper.py:
--------------------------------------------------------------------------------
  1 | """
  2 | season_scraper scrapes data from a specified season and writes it to the specified database.
  3 | 
  4 | The basketball_reference_web_scraper package is used to scrape the data. The data is then formatted and written to the
  5 | database. The table is automatically named 'sched' for schedule with the year appended as in 'sched_2019'.
  6 | """
  7 | 
  8 | from datetime import datetime
  9 | from datatotable.data import DataOperator
 10 | import pandas
 11 | from sqlalchemy import UniqueConstraint, func
 12 | 
 13 | # Local Imports
 14 | from nbapredict.br_web_scraper import client
 15 | from nbapredict.configuration import Config
 16 | 
 17 | 
 18 | def br_enum_to_string(season):
 19 |     """Substitute the value of each enum for an enum in season and return a modified season
 20 | 
 21 |     Args:
 22 |         season: A season as defined by basketball_reference_web_scraper
 23 | 
 24 |     Returns:
 25 |         A season modified so that any enums in the season are replaced by their values
 26 |     """
 27 |     new_season = []
 28 |     for game in season:
 29 |         game_dict = dict()
 30 |         keys = game.keys()
 31 |         for key in keys:
 32 |             if type(game[key]) not in [str, int, float, datetime]:
 33 |                 game_dict[key] = game[key].value  # Extract value from enum here
 34 |             else:
 35 |                 game_dict[key] = game[key]
 36 |         new_season.append(game_dict)
 37 |     return new_season
 38 | 
 39 | 
 40 | def create_season_table(database, data, tbl_name):
 41 |     """Creates the season table in the specified database, inserts the data, and clears mappers
 42 | 
 43 |     Use only if the table does not already exist
 44 | 
 45 |     Args:
 46 |         database: An instantiated DBInterface object from database.database for database interactions.
 47 |         data: A DataOperator object from database.manipulator that holds the data to add.
 48 |         tbl_name: The name of the table to create.
 49 |     """
 50 |     sql_types = data.get_sql_type()
 51 |     constraint = {UniqueConstraint: ["start_time", "home_team", "away_team"]}
 52 |     database.map_table(tbl_name, sql_types, constraint)
 53 |     database.create_tables()
 54 |     database.insert_rows(tbl_name, data.data)
 55 |     database.clear_mappers()  # if mappers aren't cleared, others scripts won't be able to use DBInterface.Template
 56 | 
 57 | 
 58 | def update_season_table(session, sched_tbl, season_df):
 59 |     """Updates the schedule table in the database with new data stored in the season_df
 60 | 
 61 |     Changes are added to the session and need to be committed later.
 62 |     During the playoffs, some games are removed from the sched_df as described in line.
 63 | 
 64 |     Args:
 65 |         session: A SQLalchemy session object
 66 |         sched_tbl: A mapped table that holds the schedule
 67 |         season_df: A pandas Dataframe version of the season as returned from br_web_scraper
 68 |     """
 69 |     date = datetime.date(datetime.now())
 70 |     update_query = session.query(sched_tbl).filter(sched_tbl.start_time < date,
 71 |                                                   sched_tbl.home_team_score == 0).order_by(sched_tbl.start_time)
 72 |     if update_query.count() == 0:
 73 |         # print("Season is up to date; Returning without performing an update.") Test/logging statement
 74 |         return
 75 | 
 76 |     all_update_rows = update_query.all()
 77 |     first_game_time = all_update_rows[0].start_time
 78 |     last_game_time = all_update_rows[len(all_update_rows) - 1].start_time
 79 | 
 80 |     # Reduce season to games between first and last game time
 81 |     season_df["start_time"] = season_df["start_time"].dt.tz_localize(None)
 82 |     update_df = season_df.loc[(season_df.start_time >= first_game_time) & (season_df.start_time <= last_game_time)]
 83 | 
 84 |     for row in all_update_rows:
 85 |         game = update_df.loc[(update_df.home_team == row.home_team) & (update_df.away_team == row.away_team) &
 86 |                              (update_df.start_time.dt.date == datetime.date(row.start_time))]
 87 |         if len(game) == 0:
 88 |             # This catches playoff games which do not end up happening (i.e. a game 7 in a series a team sweeps), and
 89 |             # removes it from the database
 90 |             session.delete(row)
 91 |         else:
 92 |             row.home_team_score = int(game.home_team_score)
 93 |             row.away_team_score = int(game.away_team_score)
 94 |             row.start_time = game.start_time.dt.to_pydatetime()[0]  # Convert Pandas TimeStamp to datetime
 95 |             session.add(row)
 96 | 
 97 | 
 98 | def add_rows(session, schedule, rows):
 99 |     """Add rows into the schedule if they contain games past the most recent game in schedule.
100 | 
101 |     Args:
102 |         session: An instantiated sqlalchemy session
103 |         schedule: A mapped schedule table
104 |         rows: rows compatible with schedule
105 |     """
106 |     most_recent_game = session.query(func.max(schedule.start_time)).one()[0]  # The most recent game in the database
107 |     most_recent_game = most_recent_game.replace(tzinfo=rows[0]["start_time"].tzinfo)  # Unify timezones
108 |     new_rows = [row for row in rows if row["start_time"] > most_recent_game]
109 |     new_row_objects = []
110 |     for row in new_rows:
111 |         new_row_objects.append(schedule(**row))
112 |     session.add_all(new_row_objects)
113 | 
114 | 
def scrape():
    """Scrape basketball reference for the games of the configured league year and return them

    The season schedule is requested through the br_web_scraper client, and any enums in it are replaced by their
    values with br_enum_to_string.

    Returns:
        A list of game dictionaries for the season
    """
    year = Config.get_property("league_year")
    return br_enum_to_string(client.season_schedule(year))
131 | 
132 | 
# When executed as a script, scrape the configured season; the returned data is not used here
if __name__ == '__main__':
    scrape()
135 | 


--------------------------------------------------------------------------------
/NBApredict/database/manipulator.py:
--------------------------------------------------------------------------------
  1 | """
  2 | manipulator holds the DataOperator class which coerces raw_data into SQLalchemy compatible formats.
  3 | ToDo: Remove
  4 | """
  5 | from datetime import datetime
  6 | from nbapredict.helpers import type
  7 | from sqlalchemy import Integer, Float, String, DateTime, Boolean
  8 | 
  9 | 
 10 | class DataOperator:
 11 |     """DataOperator takes scraped data in init, and uses its member functions to return manipulations of that data"""
 12 | 
 13 |     def __init__(self, data):
 14 |         """Stores the data dictionary passed to it
 15 | 
 16 |         Args:
 17 |             data: A dictionary of data which will, usually, reflect data scraped from a website. Two dictionary
 18 |             formats are accepted. First, data may hold column names with data values formatted as:
 19 |             data[col1] = [val1, val2, ...]
 20 |             data[col2] = [val1, val2, ...]
 21 |             Second, data may be a list of rows formatted as:
 22 |             data[0] = {col1: val0, col2: val0, colx: val0}
 23 |             data[x] = {col1: valx, col2: valx, colx: valx}
 24 |         """
 25 |         self.data = data
 26 |         self.rows = None
 27 | 
 28 |     def get_sql_type(self):
 29 |         """Take the object's data and return a dictionary formatted as {key: SQLtype}.
 30 | 
 31 |         Returns:
 32 |             A dictionary with the same keys as tbl_dict. The dictionary's values are the sql_types of each key:value
 33 |             pair in tbl_dict. The sql_types are defined to function with SQLalchemy as column definitions.
 34 |         """
 35 |         py_types = self._get_py_type()  # py_types is a dict
 36 |         sql_types = self._py_type_to_sql_type(py_types)
 37 |         return sql_types
 38 | 
 39 |     def _get_py_type(self):
 40 |         """Take the classes data values and return a dictionary that holds the python type for the values.
 41 | 
 42 |         Returns:
 43 |             A dictionary formatted as key:py_type where the type can be integer, float, string, datetime, or none
 44 |         """
 45 |         py_types_dict = {}
 46 |         if isinstance(self.data, dict):
 47 |             tbl_keys = list(self.data.keys())
 48 |             py_types = [type.get_type(self.data[key]) for key in tbl_keys]
 49 |             py_types_dict = dict(zip(tbl_keys, py_types))
 50 |         elif isinstance(self.data, list):
 51 |             if isinstance(self.data[0], dict):
 52 |                 data = self.data[0]
 53 |                 tbl_keys = list(data.keys())
 54 |                 py_types = [type.get_type(data[key]) for key in tbl_keys]
 55 |                 py_types_dict = dict(zip(tbl_keys, py_types))
 56 |             else:
 57 |                 raise Exception("The data structure ({}) is not handled by _get_py_type".format(type(self.data)))
 58 |         return py_types_dict
 59 | 
 60 |     @staticmethod
 61 |     def _py_type_to_sql_type(py_types):
 62 |         """Convert and return a dictionary of python types to a dictionary of sql types.
 63 | 
 64 |         Raises:
 65 |             An exception if a py_type is not an integer, float, string, datetime, bool, or none
 66 | 
 67 |         To-do:
 68 |             * Change the logic into a switch statement
 69 |         """
 70 | 
 71 |         sql_types = dict()
 72 |         for key in py_types:
 73 |             py_type = py_types[key]
 74 |             if py_type == "integer" or py_type is int:
 75 |                 sql_types[key] = Integer
 76 |             elif py_type == "float" or py_type is float:
 77 |                 sql_types[key] = Float
 78 |             elif py_type == "string" or py_type is str:
 79 |                 sql_types[key] = String
 80 |             elif py_type == "datetime" or py_type is datetime:
 81 |                 sql_types[key] = DateTime
 82 |             elif py_type == "bool" or py_type is bool:
 83 |                 sql_types[key] = Boolean
 84 |             elif py_type is None:
 85 |                 continue  # We continue here so as to not create a column for null values
 86 |             else:
 87 |                 raise Exception("Error: py_type {} is not an integer, float, datetime,"
 88 |                                 " none, or string".format(py_types[key]))
 89 |         return sql_types
 90 | 
 91 |     # Table modification functions
 92 |     def dict_to_rows(self):
 93 |         """Convert and return class data into rows compatible with sqlalchemy's insert function
 94 | 
 95 |         Currently presumes each dictionary object is a list of equivalent length. Calls _dict_to_rows() to do primary
 96 |         processing. Does not yet function with lists.
 97 | 
 98 |         Returns:
 99 |             a list of rows compatible with SQLalchemy's
100 | 
101 |         Raise:
102 |             Exception: If the input is neither a list nor dictionary, an exception is raised
103 |         """
104 |         if isinstance(self.data, dict):
105 |             self.rows = self._dict_to_rows()
106 |             return self.rows
107 |         elif isinstance(self.data, list):
108 |             self.rows = self._list_to_rows()
109 |             return self.rows
110 |         else:
111 |             raise Exception("tbl is neither a list or dictionary, and cannot be handled")
112 | 
113 |     def _dict_to_rows(self):
114 |         """Convert and return an input dictionary into rows compatible with SQLalchemy"""
115 | 
116 |         rows = []
117 |         keys = list(self.data.keys())
118 |         # The length of the data should be checked outside the function to ensure each value is an equal length object
119 |         length = len(self.data[keys[0]])
120 |         for i in range(length):
121 |             row_dict = dict()
122 |             for key in keys:
123 |                 row_dict[key] = self.data[key][i]
124 |             rows.append(row_dict)
125 |         return rows
126 | 
127 |     def _list_to_rows(self):
128 |         """Not yet functional
129 | 
130 |         To-do:
131 |             Implement functionality for transforming lists into database rows"""
132 | 
133 |         raise Exception("tbl is a list. Function to convert lists into database rows is not implemented")
134 | 
135 |     def validate_data_length(self):
136 |         """Given a dictionary where keys references lists, check that all lists are the same length, and return T or F
137 | 
138 |         Returns:
139 |              True: if all the lists in the dictionary have the same length
140 |              False: if the dictionary's lists are of different lengths
141 |         """
142 |         keys = self.data.keys()
143 |         lengths = []
144 |         for key in keys:
145 |             lengths.append(len(self.data[key]))
146 |         length_set = set(lengths)
147 |         if len(length_set) == 1:
148 |             return True
149 |         else:
150 |             return False
151 | 


--------------------------------------------------------------------------------
/NBApredict/management/tables/odds.py:
--------------------------------------------------------------------------------
  1 | """odds.py contains functions to format odds data and to create, update, and de-duplicate the odds table."""
  2 | 
  3 | import nbapredict.management.conversion as convert
  4 | from sqlalchemy import ForeignKey, or_, func
  5 | from sqlalchemy.orm import aliased
  6 | from datetime import timedelta
  7 | import math
  8 | 
  9 | 
 10 | def format_data(session, odds_dict, team_tbl, schedule_tbl):
 11 |     """From the odds_dict, strip extraneous dictionary keys, add a 'game_id' FK, and return the odds_dict
 12 | 
 13 |     Args:
 14 |         session: A SQLalchemy session bound to the db
 15 |         odds_dict: A dictionary of data returned by line_scraper
 16 |         team_tbl: A mapped team table
 17 |         schedule_tbl: A mapped schedule table
 18 | 
 19 |     Returns:
 20 |         odds_dict formatted with foreign keys (mainly a FK for games in the schedule tbl)
 21 |     """
 22 |     odds_dict['home_team_id'] = convert.values_to_foreign_key(session, team_tbl, "id", 'team_name',
 23 |                                                               odds_dict.pop('home_team'))
 24 |     odds_dict = check_gametimes(session, schedule_tbl, odds_dict)
 25 | 
 26 |     # the columns that uniquely identify a game in the schedule table
 27 |     val_cols = ['home_team_id', 'start_time']
 28 |     uID = {k: odds_dict[k] for k in val_cols}  # Home team + start_time form a unique identifier for a game in schedule
 29 |     odds_dict['game_id'] = convert.values_to_foreign_key(session, schedule_tbl, "id", val_cols, uID)
 30 | 
 31 |     # Each of these columns is held in the schedule table
 32 |     del odds_dict['start_time']
 33 |     del odds_dict['away_team']
 34 |     del odds_dict['home_team_id']
 35 | 
 36 |     return odds_dict
 37 | 
 38 | 
 39 | def check_gametimes(session, schedule_tbl, odds_dict):
 40 |     """Check and, if necessary, change game times in the odds_dict
 41 | 
 42 |     Some games in Bovada do not have the same time as those in the official schedule. For example a Bovada game may
 43 |     start at 9:05 whereas the official game time is 9:00. """
 44 |     first_gametime = min(odds_dict['start_time']) - timedelta(hours=12)
 45 |     last_gametime = max(odds_dict['start_time']) + timedelta(days=1)
 46 |     sched_times = session.query(schedule_tbl.start_time).filter(
 47 |         schedule_tbl.home_team_id.in_(odds_dict['home_team_id']),
 48 |         schedule_tbl.start_time >= first_gametime,
 49 |         schedule_tbl.start_time <= last_gametime).all()
 50 |     sched_times = [t.start_time for t in sched_times]
 51 | 
 52 |     s_times = odds_dict['start_time']
 53 |     # List of tuples where the first element is the index to replace in odds_dict and the second element is the
 54 |     # unmatched time
 55 |     unmatched_times = [(t, s_times[t]) for t in range(len(s_times)) if s_times[t] not in sched_times]
 56 |     offsets = [timedelta(minutes=5)]  # Append more offsets here if they arise in the future
 57 | 
 58 |     # Check if the unmatched times +/- an offset exists in the schedule times
 59 |     for i in unmatched_times:
 60 |         for j in offsets:
 61 |             if (i[1] + j) in sched_times:
 62 |                 odds_dict['start_time'][i[0]] = i[1] + j
 63 |                 break
 64 |             elif (i[1] - j) in sched_times:
 65 |                 odds_dict['start_time'][i[0]] = i[1] - j
 66 |                 break
 67 | 
 68 |     return odds_dict
 69 | 
 70 | 
 71 | def create_table(db, tbl_name, odds_data, schedule_tbl):
 72 |     """Create a table of odds in the database"""
 73 |     columns = odds_data.columns
 74 |     schedule_tbl_name = schedule_tbl.__table__.fullname
 75 |     columns['game_id'].append(ForeignKey("{}.id".format(schedule_tbl_name)))
 76 |     db.map_table(tbl_name=tbl_name, columns=columns)
 77 |     db.create_tables()
 78 |     db.clear_mappers()
 79 | 
 80 | 
 81 | def update_table(session, odds_tbl, odds_data):
 82 |     """Return a list of rows to update in the odds table.
 83 | 
 84 |     This function wraps updated rows from any number of functions that perform updates on different criteria."""
 85 |     line_updates = update_lines(session, odds_tbl, odds_data)
 86 |     return line_updates
 87 | 
 88 | 
 89 | def update_lines(session, odds_tbl, odds_data):
 90 |     """Update odds_tbl rows that are missing betting data present in the odds_data"""
 91 | 
 92 |     game_ids = odds_data.data['game_id']
 93 |     rows = session.query(odds_tbl).filter(or_(odds_tbl.home_spread_price == None, odds_tbl.away_spread_price == None,
 94 |                                               odds_tbl.home_moneyline == None, odds_tbl.away_moneyline == None) &
 95 |                                           odds_tbl.game_id.in_(game_ids))
 96 |     if rows.count() > 0:
 97 |         rows = rows.all()
 98 |         data_df = odds_data.dataframe
 99 |         update_rows = []
100 |         bet_cols = ['home_spread_price', 'away_spread_price', 'home_moneyline', 'away_moneyline', 'spread']
101 |         for r in rows:
102 |             data_row = data_df[data_df['game_id'] == r.game_id]
103 |             updated = False
104 |             for c in bet_cols:
105 |                 data_val = data_row[c].to_numpy()[0]
106 |                 if math.isnan(data_val):
107 |                     data_val = None
108 |                 if data_val != getattr(r, c):
109 |                     setattr(r, c, data_val)
110 |                     updated = True
111 |             if updated:
112 |                 update_rows.append(r)
113 |     else:
114 |         update_rows = []
115 |     return update_rows
116 | 
117 | 
def delete(session, odds_tbl):
    """Run every odds function that deletes rows; currently only duplicate removal.

    Args:
        session: A SQLalchemy session bound to the db
        odds_tbl: A mapped odds table
    """
    delete_duplicates(session, odds_tbl)
121 | 
122 | 
def delete_duplicates(session, odds_tbl):
    """Delete odds rows where multiple copies exist for a game but the betting information does not change

    For every game, the duplicated row with the lowest id is kept and the other matching rows are marked for
    deletion on the session. No commit is issued here.

    Args:
        session: A SQLalchemy session bound to the db
        odds_tbl: A mapped odds table
    """
    left = aliased(odds_tbl)
    right = aliased(odds_tbl)

    # Rows that share a game and betting information with at least one other row
    # NOTE(review): away_spread_price is not part of the comparison - confirm this is intended
    same_info = (
        session.query(left)
        .join(right, left.game_id == right.game_id, isouter=True)
        .filter(left.id != right.id, left.spread == right.spread,
                left.home_spread_price == right.home_spread_price,
                left.home_moneyline == right.home_moneyline, left.away_moneyline == right.away_moneyline)
        .distinct()
        .subquery()
    )

    # The lowest id per game is the copy to keep
    keep_ids = (
        session.query(func.min(same_info.c['id']).label('id'))
        .group_by(same_info.c['game_id'])
        .order_by(same_info.c['id'])
        .subquery()
    )

    stale = session.query(same_info).filter(same_info.c['id'].notin_(keep_ids)).subquery()
    stale_rows = aliased(odds_tbl, stale)  # Map the subquery back onto the table so the rows can be deleted
    for duplicate in session.query(stale_rows).all():
        session.delete(duplicate)
145 | 


--------------------------------------------------------------------------------
/NBApredict/helpers/br_references.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Contains constants from basketballreference.com or for interaction with basketballreference based tables.
  3 | 
  4 | For example, it contains all team names and abbreviations as well as headers for different tables that are queried.
  5 | All of the classes come from the br_web_scraper repo available on github. I am not clear on why the classes use enums
  6 | which I've largely worked around in the package. Other lists and dictionaries were generated specifically for this
  7 | project.
  8 | """
  9 | 
 10 | from enum import Enum
 11 | import os
 12 | 
 13 | 
class Location(Enum):
    """Game location options; each member's value is its own upper-case name."""
    HOME = "HOME"
    AWAY = "AWAY"
 17 | 
 18 | 
class Outcome(Enum):
    """Game outcome options; each member's value is its own upper-case name."""
    WIN = "WIN"
    LOSS = "LOSS"
 22 | 
 23 | 
class Team(Enum):
    """Team names, where each member's value is the team's full name in upper case.

    Deprecated team names are kept in a separate group at the end of the class.
    """
    ATLANTA_HAWKS = "ATLANTA HAWKS"
    BOSTON_CELTICS = "BOSTON CELTICS"
    BROOKLYN_NETS = "BROOKLYN NETS"
    CHARLOTTE_HORNETS = "CHARLOTTE HORNETS"
    CHICAGO_BULLS = "CHICAGO BULLS"
    CLEVELAND_CAVALIERS = "CLEVELAND CAVALIERS"
    DALLAS_MAVERICKS = "DALLAS MAVERICKS"
    DENVER_NUGGETS = "DENVER NUGGETS"
    DETROIT_PISTONS = "DETROIT PISTONS"
    GOLDEN_STATE_WARRIORS = "GOLDEN STATE WARRIORS"
    HOUSTON_ROCKETS = "HOUSTON ROCKETS"
    INDIANA_PACERS = "INDIANA PACERS"
    LOS_ANGELES_CLIPPERS = "LOS ANGELES CLIPPERS"
    LOS_ANGELES_LAKERS = "LOS ANGELES LAKERS"
    MEMPHIS_GRIZZLIES = "MEMPHIS GRIZZLIES"
    MIAMI_HEAT = "MIAMI HEAT"
    MILWAUKEE_BUCKS = "MILWAUKEE BUCKS"
    MINNESOTA_TIMBERWOLVES = "MINNESOTA TIMBERWOLVES"
    NEW_ORLEANS_PELICANS = "NEW ORLEANS PELICANS"
    NEW_YORK_KNICKS = "NEW YORK KNICKS"
    OKLAHOMA_CITY_THUNDER = "OKLAHOMA CITY THUNDER"
    ORLANDO_MAGIC = "ORLANDO MAGIC"
    PHILADELPHIA_76ERS = "PHILADELPHIA 76ERS"
    PHOENIX_SUNS = "PHOENIX SUNS"
    PORTLAND_TRAIL_BLAZERS = "PORTLAND TRAIL BLAZERS"
    SACRAMENTO_KINGS = "SACRAMENTO KINGS"
    SAN_ANTONIO_SPURS = "SAN ANTONIO SPURS"
    TORONTO_RAPTORS = "TORONTO RAPTORS"
    UTAH_JAZZ = "UTAH JAZZ"
    WASHINGTON_WIZARDS = "WASHINGTON WIZARDS"

    # DEPRECATED TEAMS
    CHARLOTTE_BOBCATS = "CHARLOTTE BOBCATS"
    NEW_JERSEY_NETS = "NEW JERSEY NETS"
    NEW_ORLEANS_HORNETS = "NEW ORLEANS HORNETS"
    NEW_ORLEANS_OKLAHOMA_CITY_HORNETS = "NEW ORLEANS/OKLAHOMA CITY HORNETS"
    SEATTLE_SUPERSONICS = "SEATTLE SUPERSONICS"
    VANCOUVER_GRIZZLIES = "VANCOUVER GRIZZLIES"
 63 | 
 64 | 
class OutputType(Enum):
    """Output format options; each member's value is its own upper-case name."""
    JSON = "JSON"
    CSV = "CSV"
 68 | 
 69 | 
class OutputWriteOption(Enum):
    """Write options for output.

    NOTE(review): the values look like file-open mode strings ("w", "w+", "a", "a+") - confirm against the
    code that consumes them.
    """
    WRITE = "w"
    CREATE_AND_WRITE = "w+"
    APPEND = "a"
    APPEND_AND_WRITE = "a+"
 75 | 
 76 | 
class Position(Enum):
    """Player positions, where each member's value is the position's full name in upper case."""
    POINT_GUARD = "POINT GUARD"
    SHOOTING_GUARD = "SHOOTING GUARD"
    SMALL_FORWARD = "SMALL FORWARD"
    POWER_FORWARD = "POWER FORWARD"
    CENTER = "CENTER"
 83 | 
 84 | 
class Tables(Enum):
    """Names of the queried tables; presumably basketball reference table ids - confirm against the scrapers."""
    misc_stats = "misc_stats"
 87 | 
 88 | 
# Maps three letter team abbreviations to Team enum members.
# Two abbreviations, 'CHO' and the deprecated 'CHH', both map to Team.CHARLOTTE_HORNETS.
TEAM_ABBREVIATIONS_TO_TEAM = {
    'ATL': Team.ATLANTA_HAWKS,
    'BOS': Team.BOSTON_CELTICS,
    'BRK': Team.BROOKLYN_NETS,
    'CHI': Team.CHICAGO_BULLS,
    'CHO': Team.CHARLOTTE_HORNETS,
    'CLE': Team.CLEVELAND_CAVALIERS,
    'DAL': Team.DALLAS_MAVERICKS,
    'DEN': Team.DENVER_NUGGETS,
    'DET': Team.DETROIT_PISTONS,
    'GSW': Team.GOLDEN_STATE_WARRIORS,
    'HOU': Team.HOUSTON_ROCKETS,
    'IND': Team.INDIANA_PACERS,
    'LAC': Team.LOS_ANGELES_CLIPPERS,
    'LAL': Team.LOS_ANGELES_LAKERS,
    'MEM': Team.MEMPHIS_GRIZZLIES,
    'MIA': Team.MIAMI_HEAT,
    'MIL': Team.MILWAUKEE_BUCKS,
    'MIN': Team.MINNESOTA_TIMBERWOLVES,
    'NOP': Team.NEW_ORLEANS_PELICANS,
    'NYK': Team.NEW_YORK_KNICKS,
    'OKC': Team.OKLAHOMA_CITY_THUNDER,
    'ORL': Team.ORLANDO_MAGIC,
    'PHI': Team.PHILADELPHIA_76ERS,
    'PHO': Team.PHOENIX_SUNS,
    'POR': Team.PORTLAND_TRAIL_BLAZERS,
    'SAC': Team.SACRAMENTO_KINGS,
    'SAS': Team.SAN_ANTONIO_SPURS,
    'TOR': Team.TORONTO_RAPTORS,
    'UTA': Team.UTAH_JAZZ,
    'WAS': Team.WASHINGTON_WIZARDS,

    # DEPRECATED TEAMS
    'NJN': Team.NEW_JERSEY_NETS,
    'NOH': Team.NEW_ORLEANS_HORNETS,
    'NOK': Team.NEW_ORLEANS_OKLAHOMA_CITY_HORNETS,
    'CHA': Team.CHARLOTTE_BOBCATS,
    'CHH': Team.CHARLOTTE_HORNETS,
    'SEA': Team.SEATTLE_SUPERSONICS,
    'VAN': Team.VANCOUVER_GRIZZLIES,
}
130 | 
# Maps upper-case full team names (plain strings, not Team members) to three letter abbreviations.
# "CHARLOTTE HORNETS" maps to "CHO" only; the deprecated 'CHH' abbreviation has no entry here.
team_to_team_abbreviation = {
    "ATLANTA HAWKS": "ATL",
    "BOSTON CELTICS": "BOS",
    "BROOKLYN NETS": "BRK",
    "CHARLOTTE HORNETS": "CHO",
    "CHICAGO BULLS": "CHI",
    "CLEVELAND CAVALIERS": "CLE",
    "DALLAS MAVERICKS": "DAL",
    "DENVER NUGGETS": "DEN",
    "DETROIT PISTONS": "DET",
    "GOLDEN STATE WARRIORS": "GSW",
    "HOUSTON ROCKETS": "HOU",
    "INDIANA PACERS": "IND",
    "LOS ANGELES CLIPPERS": "LAC",
    "LOS ANGELES LAKERS": "LAL",
    "MEMPHIS GRIZZLIES": "MEM",
    "MIAMI HEAT": "MIA",
    "MILWAUKEE BUCKS": "MIL",
    "MINNESOTA TIMBERWOLVES": "MIN",
    "NEW ORLEANS PELICANS": "NOP",
    "NEW YORK KNICKS": "NYK",
    "OKLAHOMA CITY THUNDER": "OKC",
    "ORLANDO MAGIC": "ORL",
    "PHILADELPHIA 76ERS": "PHI",
    "PHOENIX SUNS": "PHO",
    "PORTLAND TRAIL BLAZERS": "POR",
    "SACRAMENTO KINGS": "SAC",
    "SAN ANTONIO SPURS": "SAS",
    "TORONTO RAPTORS": "TOR",
    "UTAH JAZZ": "UTA",
    "WASHINGTON WIZARDS": "WAS",

    # DEPRECATED TEAMS
    "CHARLOTTE BOBCATS": "CHA",
    "NEW JERSEY NETS": "NJN",
    "NEW ORLEANS HORNETS": "NOH",
    "NEW ORLEANS/OKLAHOMA CITY HORNETS": "NOK",
    "SEATTLE SUPERSONICS": "SEA",
    "VANCOUVER GRIZZLIES": "VAN"
}
171 | 
# Maps position abbreviations to Position enum members
POSITION_ABBREVIATIONS_TO_POSITION = {
    "PG": Position.POINT_GUARD,
    "SG": Position.SHOOTING_GUARD,
    "SF": Position.SMALL_FORWARD,
    "PF": Position.POWER_FORWARD,
    "C": Position.CENTER,
}

# Maps table name strings to Tables enum members
bball_ref_tbl_names = {
    "misc_stats": Tables.misc_stats
}

data_stat_headers = [  # Column headers for the misc_stats table
    "team_name", "age", "wins",
    "losses", "wins_pyth", "losses_pyth",
    "mov", "sos", "srs", "off_rtg",
    "def_rtg", "pace", "fta_per_fga_pct",
    "fg3a_per_fga_pct", "ts_pct",
    "efg_pct", "tov_pct", "orb_pct",
    "ft_rate", "opp_efg_pct", "opp_tov_pct",
    "drb_pct", "opp_ft_rate", "arena_name",
    "attendance", "attendance_per_g"
]

# Every entry below also appears in data_stat_headers
four_factors = [  # The offensive and defensive four factors
    "efg_pct", "tov_pct", "orb_pct",
    "ft_rate", "opp_efg_pct", "opp_tov_pct",
    "drb_pct", "opp_ft_rate"
]

# Root URL of the basketball reference website
BASE_URL = 'https://www.basketball-reference.com'
# JSON_REFERENCE_PATH = os.path.join(test.project_directory(), "data", "references.json")
204 | 


--------------------------------------------------------------------------------
/NBApredict/predict/get.py:
--------------------------------------------------------------------------------
  1 | """get.py contains functions for generating game predictions, along with their helper functions."""
  2 | import pandas as pd
  3 | 
  4 | import nbapredict.models.four_factor_regression as lm
  5 | import nbapredict.helpers.br_references as br_references
  6 | from nbapredict.database.manipulator import DataOperator
  7 | import nbapredict.database.getters as getters
  8 | 
  9 | 
 10 | def sample_prediction(database, session, ref_tbl, model):
 11 |     """Generate and return a one row sample prediction created from the first row of the reference table.
 12 | 
 13 |     ToDo: Change function to take any model
 14 |     ToDo: Change docstring to reference new classes, perhaps drop DB arg
 15 | 
 16 |     Args:
 17 |         database: An initialized DBInterface class from database.dbinterface.py
 18 |         session: A SQLalchemy session object
 19 |         ref_tbl: A mapped odds table
 20 |         model: A regression object from four_factor_regression.py
 21 | 
 22 |     Returns:
 23 |         A DataOperator object initialized with a prediction from regression
 24 |     """
 25 |     first_game_odds = session.query(ref_tbl).order_by(ref_tbl.start_time).first()
 26 | 
 27 |     home_tm = first_game_odds.home_team
 28 |     away_tm = first_game_odds.away_team
 29 |     start_time = first_game_odds.start_time
 30 | 
 31 |     sample_prediction = game_prediction(database, session, model, home_tm, away_tm, start_time)
 32 |     data = DataOperator(sample_prediction)
 33 |     return data
 34 | 
 35 | 
 36 | def game_prediction(database, session, regression, home_tm, away_tm, start_time, year=2019, console_out=False):
 37 |     """Predict a game versus the line, and return the information in a dictionary.
 38 | 
 39 |     Use console out for human readable output if desired. Cdf is a cumulative density function. SF is a survival
 40 |     function. CDF is calculated when the betting line's prediction is below the model's prediction. SF is calculated
 41 |     when the betting line's prediction is above the model's prediction.
 42 | 
 43 |     ToDO: Modify to use new database
 44 | 
 45 |     Args:
 46 |         database: an instantiated DBInterface class from database.dbinterface.py
 47 |         session: A SQLalchemy session object
 48 |         regression: A regression object
 49 |         start_time: Date.datetime with the date and start time of the game
 50 |         home_tm: The home team
 51 |         away_tm: The away team
 52 |         line: The betting line
 53 |         year: The year to use stats from in predicting the game
 54 |         console_out: If true, print the prediction results. Ignore otherwise
 55 |     """
 56 |     home_tm = team_name(home_tm)
 57 |     away_tm = team_name(away_tm)
 58 | 
 59 |     # Get Misc stats for year
 60 |     ff_list = lm.four_factors_list()
 61 |     ff_df = getters.get_pandas_df_from_table(database, session, "misc_stats_{}".format(year), ff_list)
 62 | 
 63 |     pred_df = prediction_df(home_tm, away_tm, ff_df)
 64 |     pred = prediction(regression, pred_df)
 65 |     # probability, function = line_probability(prediction, line, np.std(regression.residuals))
 66 | 
 67 |     #if console_out:
 68 |     #    prediction_result_console_output(home_tm, away_tm, line, prediction, probability)
 69 | 
 70 |     return {"start_time": start_time, "home_team": home_tm, "away_team": away_tm, "prediction": pred}
 71 | 
 72 | 
 73 | def prediction(reg, pred_df):
 74 |     """Generate and return a prediction for the observations in the pred_df.
 75 | 
 76 |     Args:
 77 |         reg: LinearRegression class from four_factors_regression.py
 78 |         pred_df: A dataframe of observations, with home and away statistics, from which to generate a prediction
 79 | 
 80 |     Returns:
 81 |         The predicted value generated from the regression object and the predictors"""
 82 |     return reg.results.predict(pred_df).values[0]
 83 | 
 84 | 
 85 | def console_output(home_tm, away_tm, line, prediction, probability):
 86 |     """Generate human readable printout comparing the model's predictions, the line, and the p_value of the line.
 87 | 
 88 |     Args:
 89 |         home_tm: The home team
 90 |         away_tm: The away team
 91 |         line: The betting line
 92 |         prediction: A prediction of the home team's margin of victory
 93 |         probability: The probability of the betting line as determined by a CDF or SF
 94 |     """
 95 |     if prediction > 0:
 96 |         print("The {} are projected to beat the {} by {} points".format(home_tm, away_tm, prediction))
 97 |         if (-1 * line) < prediction:
 98 |             print("If the model were true, the betting line's ({}) CDF, in relation to the prediction, would "
 99 |                   "be realized {}% of the time".format(line, probability))
100 |         else:
101 |             print("If the model were true, the betting line's ({}) SF, in relation to the prediction, would "
102 |                   "be realized {}% of the time".format(line, probability))
103 |     if prediction < 0:
104 |         print("The {} are projected to lose to the {} by {} points".format(home_tm, away_tm, prediction))
105 |         if (-1 * line) < prediction:
106 |             print("If the model were true, the betting line's ({}) CDF, in relation to the prediction, would "
107 |                   "be realized {}% of the time".format(line, probability))
108 |         else:
109 |             print("If the model were true, the betting line's ({}) SF, in relation to the prediction, would "
110 |                   "be realized {}% of the time".format(line, probability))
111 | 
112 | 
def prediction_df(home_tm, away_tm, ff_df):
    """Create and return a dataframe that merges the four factors for the home and away team.

    Args:
        home_tm: The home team
        away_tm: The away team
        ff_df: Dataframe of the four factors for all teams

    Returns:
        A four factors data frame of the home and away team's four factors, plus a "const" column of 1.0, with
        columns sorted by name
    """
    # A shared "key" column lets the two frames be joined side by side. The constant is added by hand
    # because sm.add_const does not add a constant for whatever reason
    home_side = team_ff(home_tm, ff_df, home=True).assign(key=1, const=1.0)
    away_side = team_ff(away_tm, ff_df, home=False).assign(key=1)

    matchup = pd.merge(home_side, away_side, on="key", sort=True)
    return matchup.drop(["key"], axis=1).sort_index(axis=1)
133 | 
134 | 
def team_ff(team, ff_df, home):
    """Create and return a data frame of the four factors for the specified team.

    The team is matched against the team_name column without regard to letter case.

    Args:
        team: The team to extract the four factors for
        ff_df: A dataframe of the four factors which includes a team_name column
        home: Boolean which dictates if the home (lm.append_h) or away (lm.append_a) renaming is applied to the
        team's stat columns

    Returns:
        The four factors, with home or away column names, for a team are returned as a data frame
    """
    is_team = ff_df.team_name.str.lower() == team.lower()
    team_stats = ff_df[is_team][br_references.four_factors]
    rename_col = lm.append_h if home else lm.append_a
    return team_stats.rename(rename_col, axis='columns')
153 | 
154 | 
def team_name(team):
    """Match team to a standard team name (not cap-sensitive) and return the br_references standard team name.

    Returns:
        The matching br_references.Team value, or None when no team matches
    """
    wanted = team.lower()
    for member in br_references.Team:
        if member.value.lower() == wanted:
            return member.value
    return None
160 | 


--------------------------------------------------------------------------------
/NBApredict/database/dbinterface.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This file contains a DBInterface class which dictates table creation, deletion, and access.
  3 | ToDo: Remove
  4 | """
  5 | 
  6 | import os
  7 | from sqlalchemy import Column, Integer, Table
  8 | from sqlalchemy import create_engine, MetaData, event
  9 | from sqlalchemy.engine import Engine
 10 | from sqlalchemy.ext.declarative import declarative_base
 11 | from sqlalchemy.orm import mapper, clear_mappers
 12 | from sqlalchemy.ext.automap import automap_base
 13 | 
 14 | # Local Imports
 15 | from nbapredict.configuration import Config
 16 | 
 17 | 
 18 | class DBInterface:
 19 |     """DBInterface contains high level information about the desired database and creation, deletion, and access functions
 20 | 
 21 |     Attributes:
 22 |          path: The path to the database
 23 |          engine: SQLalchemy engine for accessing the database
 24 |          metadata: Metadata for the engine, used mostly for table access / reflection
 25 |          Base: SQLalchemy declarative_base() used for table creation
 26 |     """
 27 | 
 28 |     class Template(object):
 29 |         """Blank template to map tables to with the sqlalchemy mapper function
 30 | 
 31 |         Note:
 32 |             Template can only be mapped to one table at a time. Use clear_mappers to free the template for new tables
 33 |         """
 34 |         pass
 35 | 
 36 |     def __init__(self, url=None):
 37 |         """Initialize macro-level SQLalchemy objects as class attributes (engine, metadata, base).
 38 | 
 39 |         A session will allow interaction with the DB."""
 40 |         if not url:
 41 |             file_path = os.getcwd()
 42 |             self.path = Config.get_property("database")
 43 |         else:
 44 |             self.path = url
 45 |         self.engine = create_engine(self.path, pool_pre_ping=True)
 46 |         self.metadata = MetaData(self.engine)
 47 |         self.Base = declarative_base()
 48 | 
 49 |     def get_tables(self, table_names=False):
 50 |         """Find and return the specified tables or return all tables.
 51 | 
 52 |         Primary use is to check if table exists in database. Use get_table_mappings() for ORM style table interactions
 53 |         """
 54 |         meta = MetaData(bind=self.engine)
 55 |         meta.reflect(bind=self.engine)
 56 |         if table_names:
 57 |             return meta.tables[table_names]
 58 |         else:
 59 |             return meta.tables
 60 | 
 61 |     def get_table_mappings(self, table_names):
 62 |         """Find and return the specified table mappings or return all table mappings
 63 | 
 64 |         Args:
 65 |          table_names: The table names for which mappings are desired. Either a string or list
 66 |         """
 67 |         if isinstance(table_names, str):  # Allows a string, rather than list, to be passed to function
 68 |             holder = table_names
 69 |             table_names = [holder]
 70 | 
 71 |         self.metadata.reflect(self.engine, only=table_names)
 72 |         Base = automap_base(metadata=self.metadata)
 73 |         Base.prepare()
 74 | 
 75 |         mapped_tables = [Base.classes[name] for name in table_names]
 76 |         if len(mapped_tables) == 1:
 77 |             return mapped_tables[0]
 78 |         else:
 79 |             return mapped_tables
 80 | 
 81 |     def table_exists(self, tbl_name):
 82 |         """Check if a table exists in the database; Return True if it exists and False otherwise."""
 83 |         self.metadata.reflect(bind=self.engine)
 84 |         if tbl_name in self.metadata.tables:
 85 |             return True
 86 |         else:
 87 |             return False
 88 | 
    def create_tables(self):
        """Create every table registered on this interface's metadata by calling metadata.create_all(engine).

        NOTE(review): the original docstring stated that existing tables which have been modified (for example by
        adding a relationship) are updated by this call. SQLAlchemy documents ``MetaData.create_all`` as skipping
        tables that are already present in the database, so that claim looks inaccurate -- confirm before relying
        on it.
        """
        self.metadata.create_all(self.engine)
 95 | 
 96 |     def map_table(self, tbl_name, column_types, constraints=None):
 97 |         """Map a table named tbl_name and with column_types to Template, add constraints if specified.
 98 | 
 99 |         Note: Foreign key constraints should likely be added to the mapped table explicitly rather than in this function
100 | 
101 |         Args:
102 |             tbl_name: The name of the table to be mapped
103 |             column_types: A dictionary with column names as keys and sql types as values
104 |             constraints: A dictionary of desired constraints where the constraints (Such as UniqueConstraint) are keys
105 |             and the columns to be constrained is a list of string column names
106 |         """
107 |         columns = self._generate_columns(column_types)
108 |         if constraints:
109 |             t = Table(tbl_name, self.metadata, Column('id', Integer, primary_key=True),
110 |                       *columns,
111 |                       *(constraint(*columns) for constraint, columns in constraints.items()),
112 |                       )
113 |         else:
114 |             t = Table(tbl_name, self.metadata, Column('id', Integer, primary_key=True),
115 |                       *columns
116 |                       )
117 | 
118 |         mapper(self.Template, t)
119 | 
120 |     @staticmethod
121 |     def _generate_columns(columns):
122 |         """Take columns where key is the column name and value is the column type into SQLlachemy columns.
123 | 
124 |         To use additional arguments, such as constraints, specify column values as a list where the constraints are
125 |         elements of the list"""
126 |         column_list = []
127 |         for key, value in columns.items():
128 |             try:
129 |                 column_list.append(Column(key, *value))  # Unpacks additional column arguments
130 |             except TypeError:  # if no additional arguments, just make a standard name and type column
131 |                 column_list.append(Column(key, value))
132 |         return column_list
133 | 
    @staticmethod
    def clear_mappers():
        """Call the module-level clear_mappers() function so callers can reach it through the interface.

        Inside the method body the bare name resolves to the module-level ``clear_mappers`` (presumably imported
        from sqlalchemy.orm -- the import is outside this view), not to this method, so the call does not recurse.
        """
        clear_mappers()
137 | 
138 |     def insert_row(self, table, row):
139 |         """Insert a single row into the specified table in the engine"""
140 |         conn = self.engine.connect()
141 |         table = self.get_tables(table)
142 |         conn.execute(table.insert(), row)
143 |         conn.close()
144 |         # Rows formatted as
145 |         #   [{'l_name': 'Jones', 'f_name': 'bob'},
146 |         #   {'l_name': 'Welker', 'f_name': 'alice'}])
147 | 
148 |     def insert_rows(self, table, rows):
149 |         """Insert rows into the specified table.
150 | 
151 |         Uses sqlalchemy's "Classic" method. ORM database interactions are mediated by sessions.
152 |         """
153 |         table = self.get_tables(table)
154 |         conn = self.engine.connect()
155 |         for row in rows:
156 |             conn.execute(table.insert(), row)
157 |         conn.close()
158 | 
159 |     def drop_table(self, drop_tbl):
160 |         """Drops the specified table from the database"""
161 |         self.metadata.reflect(bind=self.engine)
162 |         drop_tbls = self.metadata.tables[drop_tbl]
163 |         drop_tbls.drop()
164 |         self.metadata = MetaData(bind=self.engine)  # Updates the metadata to reflect changes
165 | 
166 | 
@event.listens_for(Engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """SQLalchemy listener function to allow foreign keys in SQLite.

    Registered for the "connect" event on Engine, so it runs for each new DBAPI connection and issues
    ``PRAGMA foreign_keys=ON`` on it.

    Args:
        dbapi_connection: The raw DBAPI connection that was just opened
        connection_record: Supplied by the event hook; not used here
    """
    cursor = dbapi_connection.cursor()
    cursor.execute("PRAGMA foreign_keys=ON")
    cursor.close()
173 | 


--------------------------------------------------------------------------------
/NBApredict/configuration.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Path contains function which return file and folder paths for the project
  3 | ToDo: Research os.environ for setting variables. These may still be stored in Config, but they may offer a default
  4 | ToDo: or a different way to set variables, particularly for file pathes.
  5 | """
  6 | import os
  7 | import yaml
  8 | from nbapredict.helpers.classes import NestedDict
  9 | 
 10 | 
 11 | def project_directory():
 12 |     """Returns the project directory so long as configuration.py is in the top-level of the project"""
 13 |     return os.path.abspath(os.path.dirname(__file__))
 14 | 
 15 | 
 16 | def settings_file():
 17 |     """Returns the file path of settings.yaml"""
 18 |     return os.path.join(project_directory(), "settings.yaml")
 19 | 
 20 | 
 21 | def output_directory():
 22 |     """Returns the path to the output folder which holds the database and graphs"""
 23 |     return os.path.join(project_directory(), "outputs")
 24 | 
 25 | 
 26 | def rreplace(string, old, new, count):
 27 |     """Replace old with new in a string in reverse order.
 28 |     Args:
 29 |         string: String to modify
 30 |         old: Sub-string to replace
 31 |         new: Sub-string to replace old
 32 |         count: The number old sub-strings to be replaced"""
 33 |     li = string.rsplit(old, count)
 34 |     return new.join(li)
 35 | 
 36 | 
 37 | def database_file(calling_file_path):
 38 |     """Return the database file path with the path modified in relation to the path the function is called from.
 39 | 
 40 |     The base path is r"sqlite:///outputs//nba_db.db". This function modifies that path in relation to the calling file
 41 |     path by inserting ..// to the front of the base path. So a file nested one level below the root directory becomes
 42 |     r"sqlite:///..//outputs//nba_db.db"
 43 |     """
 44 |     head_path = project_directory()
 45 |     head_folder = os.path.split(head_path)[1]
 46 | 
 47 |     if os.path.realpath(calling_file_path) in head_path:
 48 |         # If NBApredict is imported from outside the project, replace calling_file_path with head_path
 49 |         calling_file_path = head_path
 50 | 
 51 |     calling_file_path = calling_file_path.replace("\\", "/")
 52 |     #print("Calling_file_path:", calling_file_path)
 53 |     sub_dirs = []
 54 |     split_path = os.path.split(calling_file_path)
 55 |     path = split_path[0]
 56 |     folder = split_path[1]
 57 |     while folder != head_folder:
 58 |         sub_dirs.append(folder)
 59 |         split_path = os.path.split(path)
 60 |         path = split_path[0]
 61 |         folder = split_path[1]
 62 | 
 63 |     if len(sub_dirs) > 0:
 64 |         modified_path = calling_file_path
 65 |         for folder in sub_dirs:
 66 |             modified_path = rreplace(modified_path, folder, "..", 1)
 67 | 
 68 |         path_addin = modified_path.split(head_folder)[1]
 69 |         path_addin = path_addin.replace("/", "//")
 70 |         while path_addin[0] == "/":
 71 |             path_addin = path_addin[1:]
 72 |         db_path = r"sqlite:///{}//outputs//nba_db.db".format(path_addin)
 73 |         return db_path
 74 |     else:
 75 |         return r"sqlite:///outputs//nba_db.db"
 76 | 
 77 | 
 78 | def graphs_directory():
 79 |     """Return the folder which holds graphs for the project."""
 80 |     return os.path.join(output_directory(), "graphs")
 81 | 
 82 | 
 83 | class Configuration:
 84 |     """Read and write configuration settings from settings.yaml
 85 | 
 86 |     Warning:
 87 |         Configuration cannot handle duplicate keys even if keys are of a different depth
 88 | 
 89 |     Attributes:
 90 |         _file: the source file of the Configuration instance
 91 |         _config: a dictionary of settings
 92 |         _key_order: each key in _config with values listing keys above the specified key
 93 |     """
 94 | 
 95 |     def __init__(self, file, settings):
 96 |         """sets _config to the settings dictionary and stores the _key_order for accessing each element in _config"""
 97 |         self._file = file
 98 |         self._key_order = self._generate_config_keys(settings)
 99 |         self._config = NestedDict(settings)
100 | 
101 |     def _generate_config_keys(self, config_dict, path=None, result=None, depth=0, ):
102 |         """Return a dictionary with each key, of any depth, in self._config.
103 | 
104 |         Each key's value is an ordered list of the nodes above the key and the key itself in self._config. A key in the
105 |         fourth level of config will be: {key: [node1, node2, node3, key]}.
106 | 
107 |         Args:
108 |             config_dict: A dictionary of configuration options
109 |             path: A list of keys above the current key in the dictionary
110 |             result: A dictionary which stores results
111 |             depth: The current depth of the recursion
112 |         """
113 |         # Initialize path and result. We avoid defaults so path and result are reset on each call
114 |         if path is None:
115 |             path = []
116 |         if result is None:
117 |             result = {}
118 |         for key, value in config_dict.items():
119 |             if depth == 0:  # Reset path each time the function reaches a top-level key in the dictionary
120 |                 path = [key]
121 |             if type(value) is dict:
122 |                 if key not in path:
123 |                     path.append(key)
124 |                 if key not in result.keys():
125 |                     result.update({key: path[:]})  # Create a new list to store path's current state
126 |                 result = self._generate_config_keys(value, path, result, depth=depth + 1)
127 |             else:
128 |                 result.update({key: path[:]})
129 |                 result[key].append(key)
130 | 
131 |         return result
132 | 
133 |     def get_property(self, property_key):
134 |         """Return the property associated with the property key from _config.
135 | 
136 |         Args:
137 |             property_key: The key, of any depth, of the desired property
138 |         """
139 |         if property_key not in self._key_order.keys():
140 |             return None
141 |         elif property_key in self._config.dict.keys():  # Checks if named property is in the top level of _config
142 |             return self._config[property_key]
143 |         else:
144 |             return self._config[self._key_order[property_key]]
145 | 
146 |     def _set_property(self, property_key, value):
147 |         """Private function for modifying key:value pairs in self._config.
148 | 
149 |         Additionally, rewrites self._key_order in order to store changes."""
150 |         if property_key not in self._key_order.keys():
151 |             raise KeyError("'{}' not in Config. Manually modify the settings.yaml file if you wish to add new"
152 |                            " settings.".format(property_key))
153 |         keys = [i for i in self._key_order[property_key]]
154 |         self._config[keys] = value
155 |         self._key_order = self._generate_config_keys(self._config.dict)
156 | 
157 |     def _write(self):
158 |         """Private function for over-writing self._config to the settings file"""
159 | 
160 | 
def create_configuration(file, config_settings):
    """Build a Configuration for the settings file `file` from the already-loaded `config_settings` dictionary."""
    configuration = Configuration(file, config_settings)
    return configuration
164 | 
165 | 
def check_paths(config, comp_dict):
    """Return the entries of comp_dict whose values differ from what config currently holds.

    Args:
        config: An object with a get_property(key) method, such as a Configuration
        comp_dict: A dictionary of setting names and the values they are expected to have

    Returns:
        A dictionary with the key:value pairs of comp_dict for which config.get_property(key) is not equal to value.
    """
    return {key: expected for key, expected in comp_dict.items() if config.get_property(key) != expected}
172 | 
173 | 
def set_paths(config, change_dict):
    """Store every key:value pair of change_dict in config through config._set_property and return config."""
    for setting in change_dict:
        config._set_property(setting, change_dict[setting])
    return config
178 | 
179 | 
# Load the settings once, at import time, and wrap them in the module-level Config object
with open(settings_file(), "r") as file:
    config_settings = yaml.safe_load(file)

Config = create_configuration(settings_file(), config_settings)

# The project directory and database URL depend on where the project lives and where it is run from, so any stored
# value that differs from the current one is replaced in Config
paths = {"directory": project_directory(), "database": database_file(os.getcwd())}
change_paths = check_paths(Config, paths)
set_paths(Config, change_paths)
196 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # NBA_predict
  2 | 
  3 | NBApredict is a package for predicting NBA games against betting lines. It has two main behaviors: 
  4 | 1. Scrape and store team statistics, game results, and betting lines in a SQLite database.
  5 | 2. Generate predictions for each NBA game, compare the prediction to the betting line, and store the results.
  6 | 
  7 | 
  8 | ## Status
  9 | I have effectively archived this project. Given that, I thought it would be relevant to update the code and README to reflect the state of the project where it left off. Work on the project was previously sponsored by a benefactor, but monetary support dried up with the pause of the NBA season due to COVID-19 in March 2020. That pause occurred in the middle of a significant reorganization which was perhaps 90% finished. Those changes are now stored in "main" as people continued to clone the master branch, and the code in that branch was trash. Hence why I started the reorganization. The old master branch is stored as "archive". (Note: "main" is now the equivalent of "master" if that's not clear above.)
 10 | 
 11 | This project will not run if you just clone it. However, there's hundreds of hours of work here! So, if you want to get this project to run for you, get in contact, and I'd be happy to iron out the few remaining kinks. Otherwise, I feel no real incentive to work on this for myself at the moment. 
 12 | 
 13 | In the rest of this README, I'll try to describe the state of the project so that you may have some idea of what utility you may derive from the work I've done. Hopefully this will be of some help in your own pursuits. 
 14 | 
 15 | ## Project Overview
 16 | ### Directories
 17 | This section overviews the main components of the project. Details for other sections of the project are available in the documentation. 
 18 | 
 19 | * br_web_scraper - This is just a clone of the same package listed in [the credits section](#credits) with some changes to fit NBApredict.
 20 | * database - Can be ignored. This folder contains various modules intended to automate database operations. However, I found these modules useful for other projects, so I created the [DatatoTable](https://github.com/Spencer-Weston/DatatoTable) repo. DatatoTable is available as a package on PyPi, and the rest of NBApredict *should* use that package.
 21 | * helpers - Miscellaneous. Most of this would get removed were the reorganization finished. 
 22 | * management - Data management package 
 23 |     * tables - Each module in this directory has a corresponding table in the database. Each module will tend to have format_data(), create_table(), and update_table() functions, among others, which perform the necessary work for that table.  
 24 |     * conversion - This module has been replaced by [DatatoTable.convert](https://github.com/Spencer-Weston/DatatoTable) functions. However, the functionality, either from this module or [DatatoTable](https://github.com/Spencer-Weston/DatatoTable), is essential for configuring foreign keys. An example is in tables.odds.format_data(). 
 25 |     * etl - Extract, Transform, Load. This module runs all processes which involve external data. 
 26 | * models - This directory is to hold whatever models get incorporated into the project. At the moment, it just holds the four_factor_regression module (explained below) and a graphing script which generates graphs for regression evaluation. 
 27 | * outputs - This directory is generated by the project. It holds the SQLite database generated by the project. It also holds a graphs directory which stores any saved graphs.
 28 | * predict - As it's named, this package is used to generate predictions. This is where work on the reorganization stopped, so these scripts are the least polished. The ToDo at the top of predict.bets describes the vision for this package. 
 29 | * run - The run directory holds two scripts, daily.py and all.py. The daily script will set the project to run daily while the all script runs the project when called. Neither will work unless work on upstream components is finished. 
 30 | * scrapers - The scrapers folder holds modules for scraping data. scraper.py's scrape_all() function will scrape all season, team, and betting line data. To just scrape one type of data, call the desired data's scrape function. For example, line_scraper.scrape() will scrape betting lines.
 31 | 
 32 | ## The Model
 33 | As of now, the model uses a linear regression based on the [Four Factors of Basketball Success](https://www.basketball-reference.com/about/factors.html) which encapsulates shooting, turnovers, rebounding, and free throws. Further, we include the opposing four factors, which are how a team's opponents perform on the four factors in aggregate. Thus, each team has eight variables, and the model uses sixteen variables (eight for each team) for each prediction. The target, Y, or dependent variable is home Margin of Victory (MOV). Away MOV is simply the negative of home MOV. 
 34 | 
 35 | ### What are betting lines? 
 36 | MOV is targeted because it provides an easy comparison with two types of betting lines, the spread and moneyline. Here's what the spread and moneyline might look like for a matchup between the Milwaukee Bucks and Atlanta Hawks:
 37 | 
 38 | Milwaukee Bucks (Home):
 39 | 1. Spread: -8
 40 | 2. Moneyline: -350
 41 | 
 42 | Atlanta Hawks (Away):
 43 | 1. Spread: 8
 44 | 2. Moneyline: 270
 45 | 
 46 | First, the spread attempts to guess the MOV between two teams. The Milwaukee Bucks spread of -8 indicates the betting line expects the Bucks to beat the Hawks by eight points. Or, the Bucks are "given" eight points. If one thinks the Bucks will beat the Hawks by more than eight points, they bet the Bucks. If one believes the Bucks will either win by less than eight points or lose, they bet the Hawks. Typically, spreads have symmetric, or near-symmetric, returns where picking the Bucks or the Hawks provides an equal return on a correct bet.
 47 | 
 48 | In comparison, the moneyline states the likelihood of a team winning or losing in terms of a monetary return. A negative moneyline, such as the Bucks' -350, means one must put up $350 in order to win $100. A positive moneyline, such as the Hawks' 270, means a bet of $100 will return $270 if it is correct. 
 49 | 
 50 | ### Generating Predictions
 51 | 
 52 | Before comparing predictions to betting lines, we need to ensure the model meets the assumptions of regression. For now, assume assumptions are met, and refer to [Additional Reading](#additional-reading) for further model discussion. To compare the model's predictions to betting lines, we look at the prediction's distance from the betting line. In the model, the prediction is the expected value, or the mean, of the matchup. All possible outcomes of the game are normally distributed around this mean with a standard deviation, which as of March 2020, is approximately thirteen. 
 53 | 
 54 | Continuing the Bucks-Hawks example, let's say the model predicts the Bucks to win by 6 in comparison to the betting line of 8. To compare the betting line to the prediction, we want to evaluate the likelihood of a Bucks win by 8 or more given a normal distribution with a mean of 6 and standard deviation of 13. Thus, we calculate the survival function* of 8 based on the distribution. The result is approximately 0.44 which means we'd expect the home MOV to be greater than or equal to 8 44% of the time. Inversely, we expect the home MOV to be less than 8 approximately 56% of the time. 
 55 | 
 56 | To compare moneylines instead of spreads, simply set the spread to 0, and the output will be the likelihood of a win or loss. 
 57 |  
 58 | 
 59 | *The model uses a cumulative distribution function when the predicted MOV is greater than the betting line
 60 | 
 61 | ## Usage
 62 | (Outdated: This usage hasn't been recreated in the reorganization yet.) 
 63 | 
 64 | Clone this repo to your local machine: https://github.com/Spencer-Weston/NBA_bet.git
 65 | 
 66 | To set the project to run daily:
 67 | ```~\NBApredict>python -m run.daily```
 68 | 
 69 | run.daily sets the project to run 1 hour before the first game of each day. This time is chosen because betting lines are not always available until later in the day. 
 70 | 
 71 | Or to run the project once:
 72 | ```~\NBApredict>python -m run.all```
 73 | 
 74 | 
 75 | ## Version: V0.2 - Reorganization
 76 | 
 77 | This version isn't finished as described in the [Status](#status) section. Still, here is a rough approximation of V0.2:
 78 | 
 79 | ### Why the Reorganization?
 80 | In short, the project sucked before this point (check the archive branch). The project structure was not pythonic, so namespaces were messy. Modules were more agglomerations of random behaviors than coherent units of related functionality. The project structure now follows standard python package design principles. The initial database design did not incorporate normalized databases. Given that many tables stored the same data, such as game times, keeping tables in sync required adding unique update functions for every table. The new tables are normalized with cascades to avoid this. Various other quality of life improvements have been implemented or will be if I ever pick this project back up.
 81 | 
 82 | ### Finished
 83 | * Project organized into a pythonic package structure
 84 | * All tables are normalized
 85 | * Database operations exported to [DatatoTable](https://github.com/Spencer-Weston/DatatoTable) with much improved usability
 86 | * All scrapers (schedule, betting lines, team stats, and teams) and their associated table modules use the normalized format 
 87 | 
 88 | ### Unfinished
 89 | * Predictions and the associated table (The models still work; they're just not threaded into the full workflow)
 90 | * Predictions and data interfaces. When completed, these functions would allow some degree of analysis for individual games or stats from the command line
 91 | * "Run All" functionality. Once the above is finished, the project will be left to run daily to keep up to date data and predictions.
 92 | 
 93 | ## Author
 94 | Spencer Weston
 95 | 
 96 | personal website: [Crockpot Thoughts](https://crockpotthoughts.wordpress.com/)
 97 | 
 98 | ## Additional Reading
 99 | * [How and Why](https://crockpotthoughts.wordpress.com/2019/07/23/an-nba-prediction-model-part-2-the-how-and-why/) 
100 | * [Model Evaluation and Explanation](https://crockpotthoughts.wordpress.com/2019/08/05/predicting-nba-games-part-3-the-model/)
101 | 
102 | ## Credits
103 | Jae Bradley: https://github.com/jaebradley/basketball_reference_web_scraper
104 |     - Used to scrape games and game results
105 | 
106 | ## License
107 | MIT
108 | 


--------------------------------------------------------------------------------
/NBApredict/scrapers/line_scraper.py:
--------------------------------------------------------------------------------
  1 | """
  2 | line_scraper scrapes NBA betting odds from Bovada and stores them in the database.
  3 | """
  4 | 
  5 | from datetime import datetime, timedelta
  6 | import re
  7 | import requests
  8 | from sqlalchemy import UniqueConstraint, ForeignKey, Integer
  9 | from sqlalchemy.exc import IntegrityError
 10 | from sqlalchemy.orm import Session, relationship
 11 | 
 12 | # Local Imports
 13 | from nbapredict.configuration import Config
 14 | from nbapredict.database.manipulator import DataOperator
 15 | from nbapredict.database import getters
 16 | from nbapredict.database.reconcile import reconcile
 17 | 
 18 | 
 19 | def bovada_json_request(url):
 20 |     response = requests.get(url, allow_redirects=False).json()
 21 |     if not len(response):
 22 |         return None
 23 |     return response
 24 | 
 25 | 
 26 | def odds_for_today():
 27 |     """Match betting odds from Bovada to the games_query and return the odds
 28 | 
 29 |     Args:
 30 |         date to reflect the current games on Bovada.
 31 | 
 32 |     Returns:
 33 |         A dictionary where the column keys lists of values
 34 |     """
 35 |     scrape_time = datetime.now()
 36 | 
 37 |     # Check for response from Bovada
 38 |     url = Config.get_property("regularURL")
 39 |     response = bovada_json_request(url)
 40 |     if not response:
 41 |         url = Config.get_property("playoffURL")
 42 |         response = bovada_json_request(url)
 43 |         if not response:
 44 |             return None
 45 | 
 46 |     # Move down tree towards games
 47 |     events = response[0]["events"]
 48 | 
 49 |     # Strip games from the 'event's object (which holds a bunch of random information)
 50 |     bovada_games = [e for e in events if e['description'].count('@') > 0 and e['type'] == 'GAMEEVENT']
 51 |     if not bovada_games:
 52 |         return None
 53 | 
 54 |     # Set-up the line dictionary which stores data in the correct table format
 55 |     lines = {"home_team": [], "away_team": [], 'start_time': [], "spread": [], "home_spread_price": [],
 56 |              "away_spread_price": [], "home_moneyline": [], "away_moneyline": [], "scrape_time": []}
 57 | 
 58 |     # Iterate through each game returned by bovada and store its information
 59 |     for game in bovada_games:
 60 |         link = game['link'].split('-')
 61 |         link = link[len(link)-1]
 62 |         str_time = re.findall('[0-9]', link)
 63 |         start_time = ''.join(str_time)
 64 |         start_time = datetime.strptime(start_time, "%Y%m%d%H%M")
 65 |         if datetime.now() > start_time:
 66 |             # An ongoing game will not have the correct betting data. We don't want to store this information
 67 |             print("This game ({}) is either ongoing or completed. Not scraping".format(game['description']))
 68 |             continue
 69 | 
 70 |         home_team, away_team = parse_teams(game["competitors"])
 71 | 
 72 |         # Get only the full match betting information from the game object
 73 |         betting_info = game["displayGroups"][0]["markets"]
 74 |         full_match_bets = [bet for bet in betting_info if bet["period"]["description"] == "Match"]
 75 | 
 76 |         # Extract the betting data associated with the game
 77 |         money_lines = False
 78 |         for bet in full_match_bets:
 79 |             if bet["description"] == "Moneyline":
 80 |                 home_moneyline, away_moneyline = parse_moneyline(bet)
 81 |                 if home_moneyline == "":
 82 |                     home_moneyline = None
 83 |                 if away_moneyline == "":
 84 |                     away_moneyline = None
 85 |                 money_lines = True
 86 |             elif bet["description"] == "Point Spread":
 87 |                 spread, home_spread_price, away_spread_price = parse_spread(bet)
 88 |                 if spread == "":
 89 |                     spread = None
 90 |                 if home_spread_price == "":
 91 |                     home_spread_price = None
 92 |                 if away_spread_price == "":
 93 |                     away_spread_price = None
 94 |         if not money_lines:
 95 |             home_moneyline = None
 96 |             away_moneyline = None
 97 | 
 98 |         game_lines = [home_team, away_team, start_time, spread, home_spread_price, away_spread_price, home_moneyline,
 99 |                       away_moneyline, scrape_time]
100 | 
101 |         # This section depends on python 3.7+ to preserve the order of dict keys in lines
102 |         i = 0
103 |         for key in lines:
104 |             lines[key].append(game_lines[i])
105 |             i += 1
106 |     return lines
107 | 
108 | 
def parse_teams(competitors):
    """Parse a competitors object from Bovada and return the home and away teams, respectively.

    Args:
        competitors: A list of at most two dictionaries, each with a "home" flag and a "name"

    Returns:
        A (home_team, away_team) tuple with both names upper-cased.

    Raises:
        Exception: If competitors holds more than two entries.
        ValueError: If either the home or the away team is missing. The earlier check,
        ``not home_team == "" or away_team == ""``, only caught a missing home team when an away team was present
        and let blank names through otherwise.
    """
    if len(competitors) > 2:
        raise Exception("Unexpected objects in competitors")
    home_team = ""
    away_team = ""
    for team in competitors:
        if team["home"]:
            home_team = team["name"]
        else:
            away_team = team["name"]
    if home_team == "" or away_team == "":
        raise ValueError("Competitors was not properly parsed. Missing data.")
    return home_team.upper(), away_team.upper()
124 | 
125 | 
def parse_moneyline(moneyline_bet):
    """Parse a moneyline bet object from Bovada and return, in order, the home and away moneyline.

    Prices are converted to integers, with the text "EVEN" read as 100. A side that is not listed among the
    outcomes is returned as the empty string.

    Raises:
        Exception: If the bet holds more than two outcomes, or if an away price was found without a home price.

    NOTE(review): a home price without an away price, and a bet without any outcome, are returned rather than
    raised, exactly as before. The original test, ``not home_moneyline == "" or away_moneyline == ""``, reads like
    an operator-precedence slip, but odds_for_today() converts "" results to None, so the behavior is kept as is
    until the intent is confirmed.
    """
    outcomes = moneyline_bet["outcomes"]
    if len(outcomes) > 2:
        raise Exception("Unexpected objects in moneyline bet")

    home_moneyline = away_moneyline = ""
    for outcome in outcomes:
        american = outcome["price"]["american"]
        price = 100 if american == "EVEN" else int(american)
        side = outcome["type"]
        if side == "H":
            home_moneyline = price
        elif side == "A":
            away_moneyline = price

    if home_moneyline == "" and away_moneyline != "":
        raise Exception("Moneyline was not properly parsed. Missing data.")
    return home_moneyline, away_moneyline
147 | 
148 | 
def _american_odds_to_int(american):
    """Convert an American odds string from Bovada to an integer, reading the text "EVEN" as 100."""
    return 100 if american == "EVEN" else int(american)


def parse_spread(spread_bet):
    """Parse a spread bet object from Bovada and return, in order, the spread and the home and away spread prices.

    The spread is the home outcome's handicap as a float. Prices are integers, with "EVEN" read as 100 in the same
    way parse_moneyline() reads it; previously an "EVEN" spread price failed the integer conversion.

    Returns:
        A (spread, home_spread_price, away_spread_price) tuple. A value whose outcome is not listed is returned as
        the empty string, which odds_for_today() turns into None.

    Raises:
        Exception: If the bet holds more than two outcomes.

    NOTE(review): the original ended with a "Missing data" error that could never be reached, because the spread
    and the home price are always set together. It has been dropped; confirm whether partially listed spreads
    should instead be rejected.
    """
    outcomes = spread_bet["outcomes"]
    if len(outcomes) > 2:
        raise Exception("Unexpected objects in spread bet")

    spread = ""
    home_spread_price = ""
    away_spread_price = ""
    for outcome in outcomes:
        if outcome["type"] == "H":
            spread = float(outcome["price"]["handicap"])
            home_spread_price = _american_odds_to_int(outcome["price"]["american"])
        elif outcome["type"] == "A":
            away_spread_price = _american_odds_to_int(outcome["price"]["american"])
    return spread, home_spread_price, away_spread_price
167 | 
168 | 
def create_odds_table(database, data, tbl_name, sched_tbl):
    """Creates an odds_table in the database based on the data with foreign key based on the schedule

    The table gets the column types reported by data, an extra 'game_id' foreign key column pointing at the
    schedule table's id, and a unique constraint over home_team, away_team and start_time.

    Args:
        database: An instance of the DBInterface class from database/DBInterface.py
        data: A DataOperator object from database/manipulator; only its get_sql_type() result is used here
        (presumably a dictionary of column names and sql types -- confirm against DataOperator)
        tbl_name: The name to give the odds table
        sched_tbl: The schedule table which will contain the game_id for the odds_table and which will be given a
        relationship to the odds table
    """
    # Set columns and constraints
    sql_types = data.get_sql_type()
    # NOTE(review): sched_tbl is read here through .classes (as an automap base would be) but below through
    # __mapper__ (as a mapped class would be) -- confirm which kind of object callers pass in
    sched_tbl_name = sched_tbl.classes.items()[0][0]
    sql_types.update({'game_id': [Integer, ForeignKey(sched_tbl_name + ".id")]})
    constraint = {UniqueConstraint: ["home_team", "away_team", "start_time"]}

    database.map_table(tbl_name, sql_types, constraint)  # Maps the odds table

    # Establish relationship if it does not exist
    if "odds" not in sched_tbl.__mapper__.relationships.keys():
        sched_tbl.odds = relationship(database.Template)

    # Create the table in the database, then clear the mappers set up above
    database.create_tables()
    database.clear_mappers()
193 | 
194 | 
def update_odds_table(odds_table, sched_tbl, rows, session):
    """Update the odds_table with the information in rows

    Any existing odds row with the same home team, away team, and start time is deleted so the new row overwrites it.
    Each new row is given the id of its matching game in the schedule as game_id before being added to the session.

    Args:
        odds_table: A mapped odds table object from the database
        sched_tbl: A mapped schedule table object from the database
        rows: A sequence of row dictionaries with column names as keys
        session: A SQLalchemy session object

    Raises:
        IndexError: If no game in the schedule matches a row
        Exception: If more than one game in the schedule matches a row
    """
    if len(rows) == 0:  # Avoid messing with things if no rows exist
        print("No new odds available. Returning without updating odds table")
        return

    row_objects = []
    for row in rows:
        # Delete the row in the table if it exists to allow overwrite (the lookup is executed once)
        existing_rows = session.query(odds_table).filter(odds_table.home_team == row["home_team"],
                                                         odds_table.away_team == row["away_team"],
                                                         odds_table.start_time == row["start_time"]).all()
        for exist_row in existing_rows:
            session.delete(exist_row)

        # Adds all of the normal betting data
        row_object = odds_table(**row)

        # Finds and adds the foreign key from the schedule
        games = session.query(sched_tbl).filter(sched_tbl.home_team == row_object.home_team,
                                                sched_tbl.away_team == row_object.away_team,
                                                sched_tbl.start_time == row_object.start_time).all()
        if len(games) > 1:
            raise Exception("More than one game matches the row")
        if not games:
            # Same exception type as indexing the empty result, but with a message that identifies the row
            raise IndexError("No game in the schedule matches the row: home_team={}, away_team={}, "
                             "start_time={}".format(row["home_team"], row["away_team"], row["start_time"]))
        row_object.game_id = games[0].id

        row_objects.append(row_object)

    # NOTE(review): add_all()/add() normally only stage objects, and an IntegrityError usually surfaces when the
    # session is flushed or committed -- confirm this fallback is reachable
    try:
        session.add_all(row_objects)
    except IntegrityError:  # If all objects cannot be added, try to add each one individually
        for row in row_objects:
            try:
                session.add(row)
            except IntegrityError:
                continue
238 | 
239 | 
def scrape():
    """Scrapes betting line information from bovada and returns it

    This function only scrapes: it does not create or update the odds table.

    Returns:
        The lines returned by odds_for_today(), or False when there are none
    """
    lines = odds_for_today()
    if not lines:
        return False
    return lines
269 | 
270 | 
if __name__ == "__main__":
    # scrape() takes no arguments; show what it returns (the scraped lines, or False) for a quick manual check
    print(scrape())
277 | 


--------------------------------------------------------------------------------
/NBApredict/management/tables/schedule.py:
--------------------------------------------------------------------------------
  1 | """schedule.py contains function to create the schedule table in the database"""
  2 | 
  3 | from datetime import datetime, timedelta
  4 | import math
  5 | import nbapredict.management.conversion as convert
  6 | from sqlalchemy import ForeignKey, func, tuple_
  7 | from sqlalchemy.orm import aliased
  8 | import pandas as pd
  9 | 
 10 | 
 11 | def format_data(session, schedule_data, team_tbl, team_stats_tbl):
 12 |     """Format and return schedule data to match the database schema.
 13 | 
 14 |     Adds a Margin of Victory column and adds/modifies foreign key columns
 15 | 
 16 |     Args:
 17 |         schedule_data: A DataOperator object with schedule data
 18 |         team_tbl: A mapped instance of the team_tbl
 19 |         team_stats_tbl: A mapped instance of the team_stats_tbl
 20 |     """
 21 |     h_score = schedule_data.data['home_team_score']
 22 |     a_score = schedule_data.data['away_team_score']
 23 |     schedule_data.data['MOV'] = [h_score[i] - a_score[i] for i in range(schedule_data.num_rows())]
 24 |     schedule_data.data['playoffs'] = ['']
 25 |     schedule_data.data['game_date'] = [datetime.date(t) for t in schedule_data.data['start_time']]
 26 |     schedule_data.fill('playoffs', None)
 27 |     schedule_data.data["home_team_id"] = convert.values_to_foreign_key(session, foreign_tbl=team_tbl, foreign_key="id",
 28 |                                                                        foreign_value="team_name",
 29 |                                                                        child_data=schedule_data.data.pop("home_team"))
 30 |     schedule_data.data["away_team_id"] = convert.values_to_foreign_key(session, foreign_tbl=team_tbl, foreign_key="id",
 31 |                                                                        foreign_value="team_name",
 32 |                                                                        child_data=schedule_data.data.pop("away_team"))
 33 | 
 34 |     today = datetime.date(datetime.now())
 35 |     tomorrow = today + timedelta(days=1)
 36 |     tmrw_idx = 0
 37 |     for idx in range(len(schedule_data.data['start_time'])):
 38 |         if schedule_data.data['start_time'][idx].date() >= tomorrow:
 39 |             tmrw_idx = idx
 40 |             break
 41 |     if not tmrw_idx:
 42 |         raise ValueError("tmrw_idx was not found")
 43 |     subquery = session.query(team_stats_tbl.id, team_stats_tbl.team_id, func.max(team_stats_tbl.scrape_time)). \
 44 |         filter(team_stats_tbl.scrape_date <= today).group_by(team_stats_tbl.team_id).subquery()
 45 |     schedule_data.data['home_stats_id'] = convert.values_to_foreign_key(session, subquery, 'id', 'team_id',
 46 |                                                                         schedule_data.data['home_team_id'][:tmrw_idx])
 47 |     schedule_data.data['away_stats_id'] = convert.values_to_foreign_key(session, subquery, 'id', 'team_id',
 48 |                                                                         schedule_data.data['away_team_id'][:tmrw_idx])
 49 |     schedule_data.fill('home_stats_id', None)
 50 |     schedule_data.fill('away_stats_id', None)
 51 | 
 52 |     return schedule_data
 53 | 
 54 | 
 55 | def create_table(db, schedule_data, tbl_name, team_tbl, team_stats_tbl):
 56 |     """Create a table of the NBA schedule in the database.
 57 |     Args:
 58 |         db: a datotable.database.Database object connected to a database
 59 |         schedule_data: A datatotable.data.DataOperator object with schedule data
 60 |         tbl_name: The desired name of the table
 61 |         team_tbl: A mapped team table to set foreign keys on
 62 |         team_stats_tbl: A mapped team stats table to set foreign keys on
 63 |     """
 64 |     columns = schedule_data.columns
 65 |     team_tbl_name = team_tbl.__table__.fullname
 66 |     team_stats_tbl_name = team_stats_tbl.__table__.fullname
 67 |     columns['home_team_id'].append(ForeignKey("{}.id".format(team_tbl_name)))
 68 |     columns['away_team_id'].append(ForeignKey("{}.id".format(team_tbl_name)))
 69 |     columns['home_stats_id'].append(ForeignKey("{}.id".format(team_stats_tbl_name)))
 70 |     columns['away_stats_id'].append(ForeignKey("{}.id".format(team_stats_tbl_name)))
 71 |     db.map_table(tbl_name=tbl_name, columns=columns)
 72 |     db.create_tables()
 73 |     db.clear_mappers()
 74 | 
 75 | 
 76 | def update_table(session, schedule_data, schedule_tbl, team_stats_tbl):
 77 |     """Wrap and run update functions for the schedule_tbl."""
 78 | 
 79 |     update_games(session, schedule_tbl, schedule_data)
 80 |     score_updates = update_scores(session, schedule_tbl, schedule_data)
 81 |     stats_updates = update_stats(session, schedule_tbl, team_stats_tbl)
 82 |     time_updates = update_start_time(session, schedule_tbl, schedule_data)
 83 | 
 84 |     # Some rows may be updated in different functions. Use a set to remove duplicates
 85 |     return set(score_updates + stats_updates + time_updates)
 86 | 
 87 | 
 88 | def update_scores(session, schedule_tbl, schedule_data) -> list:
 89 |     date = datetime.date(datetime.now())
 90 |     update_query = session.query(schedule_tbl).filter(schedule_tbl.start_time < date,
 91 |                                                       schedule_tbl.home_team_score == 0). \
 92 |         order_by(schedule_tbl.start_time)
 93 |     # if update_query.count() == 0:
 94 |     #     return
 95 |     rows = update_query.all()
 96 |     if len(rows) == 0:
 97 |         return []
 98 |     first_game_time = rows[0].start_time
 99 |     last_game_time = rows[len(rows) - 1].start_time
100 | 
101 |     sched_df = schedule_data.dataframe
102 |     sched_df["start_time"] = sched_df["start_time"].dt.tz_localize(None)
103 |     update_df = sched_df.loc[(sched_df.start_time >= first_game_time) & (sched_df.start_time <= last_game_time)]
104 | 
105 |     update_rows = []
106 |     for row in rows:
107 |         game = update_df.loc[(update_df.home_team_id == row.home_team_id) & (update_df.away_team_id == row.away_team_id)
108 |                              & (update_df.start_time.dt.date == datetime.date(row.start_time))]
109 |         row.home_team_score = int(game.home_team_score)
110 |         row.away_team_score = int(game.away_team_score)
111 |         row.MOV = row.home_team_score - row.away_team_score
112 |         row.start_time = game.start_time.dt.to_pydatetime()[0]  # Convert Pandas TimeStamp to datetime
113 |         update_rows.append(row)
114 |     return update_rows
115 | 
116 | 
def update_stats(session, schedule_tbl, team_stats_tbl) -> list:
    """Return schedule rows after giving them the ids of their home and away team stats.

    Works day by day, from the date of the earliest game without a home_stats_id through today. For each day, games
    that still lack a home_stats_id are joined to the per-team stats rows selected by the grouped
    max(scrape_time) query limited to scrape times before the end of that day.

    Args:
        session: A SQLalchemy session object
        schedule_tbl: A mapped schedule table object
        team_stats_tbl: A mapped team stats table object

    Returns:
        A list of the updated schedule rows; an empty list when every game already has a home_stats_id
    """
    tomorrow = datetime.now().date() + timedelta(days=1)

    # "== None" is intentional: SQLAlchemy turns it into an IS NULL comparison
    d_time = session.query(func.min(schedule_tbl.start_time)).filter(schedule_tbl.home_stats_id == None).all()[0][0]
    if d_time is None:
        # min() over no rows is NULL: no game is missing its stats
        return []

    update_rows = []
    day = d_time.date()
    while day < tomorrow:
        next_day = day + timedelta(days=1)

        # Get the team stats with the greatest scrape_time before the end date of the range (31 obs, all teams + L. AVG)
        stats_q = session.query(team_stats_tbl.id, team_stats_tbl.team_id,
                                func.max(team_stats_tbl.scrape_time).label('s_time')). \
            filter(team_stats_tbl.scrape_time < next_day).group_by(team_stats_tbl.team_id).subquery()
        home_stats = aliased(stats_q, 'home_stats')
        away_stats = aliased(stats_q, 'away_stats')

        sched_rows = session.query(schedule_tbl, home_stats.c.id.label('h_s_id'), away_stats.c.id.label('a_s_id')). \
            filter(schedule_tbl.home_stats_id == None, schedule_tbl.start_time > day,
                   schedule_tbl.start_time < next_day). \
            join(home_stats, schedule_tbl.home_team_id == home_stats.c.team_id). \
            join(away_stats, schedule_tbl.away_team_id == away_stats.c.team_id).all()

        # Each result row is (schedule row, h_s_id, a_s_id); unpacking by position avoids depending on the table's name
        for game, home_stats_id, away_stats_id in sched_rows:
            # h_s_id was joined on the home team and a_s_id on the away team
            game.home_stats_id = home_stats_id
            game.away_stats_id = away_stats_id
            update_rows.append(game)

        day = next_day
    return update_rows
148 | 
149 | 
def update_start_time(session, schedule_tbl, schedule_data) -> list:
    """Return updated rows for any games where the start_time has changed.

    Only games in schedule_tbl with a game_date from today through seven days from now are checked. A game whose
    start_time and home team are not found together in schedule_data is looked up again by home team, away team, and
    game_date, and its start_time is replaced with the one found.

    Note this will not check if the date of a game has changed.

    Args:
        session: A SQLalchemy session object
        schedule_tbl: A mapped schedule table object
        schedule_data: An object whose dataframe attribute holds the scraped schedule with start_time, game_date,
        home_team_id, and away_team_id columns

    Returns:
        A list of the rows whose start_time was changed

    Raises:
        ValueError: If a changed game has no replacement, or more than one, in schedule_data
    """
    today = datetime.now().date()
    end_week = today + timedelta(days=7)

    games = session.query(schedule_tbl).filter(schedule_tbl.game_date >= today,
                                               schedule_tbl.game_date <= end_week).all()

    # Work on an explicit copy so assigning the timezone-naive start times never targets a slice of the source frame
    df = schedule_data.dataframe[['start_time', 'game_date', 'home_team_id', 'away_team_id']].copy()
    df['start_time'] = df['start_time'].dt.tz_localize(None)
    df = df[(df.start_time >= pd.Timestamp(today)) & (df.game_date <= end_week)]

    update_rows = []
    for game in games:
        unchanged = df[(df.start_time == game.start_time) & (df.home_team_id == game.home_team_id)]
        if not unchanged.empty:
            continue

        changed_game = df[(df.home_team_id == game.home_team_id) & (df.away_team_id == game.away_team_id) &
                          (df.game_date == game.game_date)]
        if changed_game.empty:
            raise ValueError('Game time for {} @ {} has changed,'
                             ' but cannot find the new game time'.format(game.home_team_id,
                                                                         game.away_team_id))
        if len(changed_game) > 1:
            raise ValueError('Game time for {} @ {} has changed,'
                             ' but there are multiple replacement values available'.format(game.home_team_id,
                                                                                           game.away_team_id))
        game.start_time = pd.to_datetime(changed_game.start_time.values[0])
        update_rows.append(game)
    return update_rows
184 | 
185 | 
def update_games(session, schedule_tbl, schedule_data):
    """Check if any games have been removed from the schedule and delete them from the database.

    Only runs when schedule_data holds fewer games than the schedule table. Games are compared on home_team_id and
    game_date; games found in the table but not in schedule_data are deleted through the session.

    Args:
        session: A SQLalchemy session object
        schedule_tbl: A mapped schedule table object
        schedule_data: An object whose data attribute maps column names to lists of values

    ToDo: Add check for new games (i.e. when Clippers-Lakers gets rescheduled)
    ToDo: This should work for playoff games too, right?
    ToDo: Iterating through indices potentially slow, though great alternatives don't seem to exist
    """
    data_len = len(schedule_data.data['start_time'])
    tbl_len = session.query(schedule_tbl).count()
    if data_len >= tbl_len:
        return

    data_df = pd.DataFrame({'home_team_id': schedule_data.data['home_team_id'],
                            'game_date': schedule_data.data['game_date']})

    tbl_id_dates = session.query(schedule_tbl.home_team_id, schedule_tbl.game_date).all()
    tbl_df = pd.DataFrame({'home_team_id': [r.home_team_id for r in tbl_id_dates],
                           'game_date': [r.game_date for r in tbl_id_dates]})

    # Outer join for all rows, indicator for diff column
    comp = data_df.merge(tbl_df, how='outer', indicator=True)
    tbl_only = comp[comp['_merge'] == 'right_only']
    cancelled_games = list(zip(tbl_only['home_team_id'].values.tolist(), tbl_only['game_date'].values.tolist()))
    if not cancelled_games:
        # Nothing exists only in the table, so there is nothing to delete (and no empty IN clause to build)
        return

    delete_rows = session.query(schedule_tbl).filter(tuple_(schedule_tbl.home_team_id, schedule_tbl.game_date).
                                                     in_(cancelled_games))
    # Iterating an empty result is a no-op, so no separate count query is needed
    for row in delete_rows:
        session.delete(row)
217 | 
218 | 
219 | 


--------------------------------------------------------------------------------
/NBApredict/models/four_factor_regression.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Author: Spencer Weston
  3 | 
  4 | Purpose: Four Factor Regression performs a regression on the Margin of Victory (mov) between NBA teams with each teams
  5 | four factors(offensive and defensive) as predictors. The regression object is returned from the module.
  6 | 
  7 | Args (default):
  8 |     year (2019): The year of the season desired
  9 |     db_url ('sqlite:///database//nba_db.db'): Path to the database where data should be written
 10 | 
 11 | Returns:
 12 |     Returns a LinearRegression class
 13 | """
 14 | from datetime import datetime
 15 | import matplotlib.pyplot as plt
 16 | import numpy as np
 17 | import pandas as pd
 18 | import os
 19 | import scipy.stats as stats
 20 | from sqlalchemy.orm import Session
 21 | from sqlalchemy import func, alias
 22 | 
 23 | import statsmodels.api as sm
 24 | from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
 25 | 
 26 | # Local Packages
 27 | from datatotable.database import Database
 28 | from nbapredict.database import getters
 29 | from nbapredict.helpers import br_references as br
 30 | from nbapredict.management import conversion
 31 | from nbapredict.models import graphing
 32 | from nbapredict.configuration import Config
 33 | 
 34 | 
 35 | class LinearRegression:
 36 |     """A class that creates and holds linear regression information and functions for regression evaluation.
 37 | 
 38 |     LinearRegression is initialized with a target variable and the desired predictors. Then, a regression is run and
 39 |     necessary regression stats are stored as class parameters. Member functions generate evaluative graphs and/or
 40 |     stats for the regression.
 41 | 
 42 |     Attributes:
 43 |         target: The target variable
 44 |         predictors: The predictive variables
 45 |         results: statsmodels results wrapper
 46 |         predictions: predicted results from the regression
 47 |         r_squared: r_squared of the regression
 48 |         adj_r_squared: adj_r_squared of the regression
 49 |         r_squared_rnd: r_squared rounded to three decimal places
 50 |         residuals: residuals of the gression
 51 |         p_values: p_values of the coefficients
 52 |         coefs: values of the coefficients
 53 |         output: data frame of coefficients with their values and p_values"""
 54 | 
 55 |     def __init__(self, target, predictors):
 56 |         """Performs a linear regression and stores pertinent regression outputs as class variables
 57 | 
 58 |         Args:
 59 |             target: The target variable
 60 |             predictors: The prediction variables"""
 61 |         self.target = target
 62 |         self.predictors = sm.add_constant(predictors)
 63 |         self.results = sm.OLS(target, self.predictors).fit()
 64 |         self.predictions = self.results.predict(self.predictors)
 65 |         self.r_squared = self.results.rsquared
 66 |         self.adj_r_squared = self.results.rsquared_adj
 67 |         self.r_squared_rnd = np.around(self.r_squared, 3)
 68 |         self.residuals = self.results.resid
 69 |         self.p_values = self.results.pvalues
 70 |         self.coefs = self.results.params
 71 |         self.output = pd.concat([self.coefs, self.p_values], axis=1)
 72 |         self.output.columns = ["coefficient", "p_value"]
 73 | 
 74 |     def predicted_vs_actual(self, out_path=None):
 75 |         """Generate a predicted vs. actual graph, save to out_path if it exists, and return the graph."""
 76 |         graph = graphing.pred_vs_actual(self.predictions, self.target, self.r_squared_rnd, out_path=out_path)
 77 |         return graph
 78 | 
 79 |     def residuals_vs_fitted(self, out_path=None):
 80 |         """Generate a residuals vs. fitted graph, save to out_path if it exists, and return the graph."""
 81 |         graph = graphing.residuals_vs_fitted(self.predictions, self.residuals, out_path)
 82 |         return graph
 83 | 
 84 |     def qqplot(self, out_path=None):
 85 |         """Generate a qq plot, save to out_path if it exists, and return the graph."""
 86 |         fig = sm.qqplot(self.residuals, dist=stats.t, fit=True, line="45")
 87 |         if out_path:
 88 |             fig.savefig(out_path)
 89 |         return fig
 90 | 
 91 |     def influence_plot(self, out_path=None):
 92 |         """Generate an influence plot, save to out_path if it exists, and return the graph."""
 93 |         fig, ax = plt.subplots(figsize=(12, 8))
 94 |         fig = sm.graphics.influence_plot(self.results, alpha=0, ax=ax, criterion="cooks")
 95 |         if out_path:
 96 |             fig.savefig(out_path)
 97 |         return fig
 98 | 
 99 |     def cooks_distance(self, out_path=None):
100 |         """Generate a cook's distance graph, save to out_path if it exists, and return the graph."""
101 |         influence = self.results.get_influence()
102 |         # c is the distance and p is p-value
103 |         (c, p) = influence.cooks_distance
104 |         graph = graphing.cooks_distance(c, out_path)
105 |         return graph
106 | 
107 |     def residual_independence(self, out_path=None):
108 |         """Generate a residual independence plot, save to out_path if it exists, and return the graph."""
109 |         residuals = self.residuals
110 |         plot = graphing.residual_independence(residuals)
111 |         if out_path:
112 |             plot.savefig(out_path)
113 |         return plot
114 | 
115 |     def vif(self):
116 |         """Determine the Variance Inflation Factor (vif) of the coefficients and return a dataframe of the vif's."""
117 |         vif_out = pd.DataFrame()
118 |         predictors = np.array(self.predictors)
119 |         vif_out["VIF Factor"] = [vif(predictors, i) for i in range(predictors.shape[1])]
120 |         vif_out["features"] = self.predictors.columns
121 |         return vif_out
122 | 
123 |     def residual_distribution(self):
124 |         """Calculate the normal curve of the residuals and return the distribution"""
125 |         norm = stats.norm
126 |         mu, std = norm.fit(self.residuals)
127 |         # mu = 0  # By definition, mu of resids = 0, but the fit provides approximately 0. It's perhaps best to just
128 |         # set mu=0?
129 |         return norm(loc=mu, scale=std)
130 | 
131 | 
def create_ff_regression_df(session, team_stats_tbl, sched_tbl, ff_list):
    """Build and return the regression data frame of four factors (ff) for both teams in each played game.

    Games with a home_team_score above 0 are joined to team stats through the schedule's home_stats_id and
    away_stats_id columns.

    Args:
        session: Sqlalchemy session object
        team_stats_tbl: mapped team stats table object
        sched_tbl: mapped schedule table object
        ff_list: List of the four factors variable

    Returns:
         The data frame that conversion.convert_sql_statement_to_table builds from the schedule columns joined to
         the home and away four factors
    """
    home_stats = alias(team_stats_tbl, name='home')
    away_stats = alias(team_stats_tbl, name='away')
    sched = alias(sched_tbl, name='sched')

    # Keep only the four factors that exist as columns in the stats table
    home_stat_ff = []
    for col in ff_list:
        if col in home_stats.c.keys():
            home_stat_ff.append(getattr(home_stats.c, col))
    away_stat_ff = []
    for col in ff_list:
        if col in away_stats.c.keys():
            away_stat_ff.append(getattr(away_stats.c, col))

    played_games = session.query(sched, *home_stat_ff, *away_stat_ff).filter(sched.c['home_team_score'] > 0)
    with_home = played_games.join(home_stats, home_stats.c['id'] == sched.c['home_stats_id'])
    with_both = with_home.join(away_stats, away_stats.c['id'] == sched.c['away_stats_id'])
    sched_stats_query = with_both.subquery(with_labels=True)
    sched_stats = session.query(sched_stats_query)

    return conversion.convert_sql_statement_to_table(session, sched_stats.statement)
157 | 
158 | 
def alt_regression_df(session, team_stats_tbl, sched_tbl, ff_list, qualifiers=None):
    """Alternate regression df where the latest team_stats are applied to all games in schedule.

    Games with a home_team_score above 0 are joined to team stats on team id rather than on the schedule's stats ids.

    Args:
        session: A sqlalchemy session object
        team_stats_tbl: A mapped team stats table object
        sched_tbl: a mapped schedule table object
        ff_list: List of the four factors variable
        qualifiers: Optional qualifiers to apply to the returned regression dataframe. Can be columns to subset from the
        regression dataframe or a function

    Returns:
        A regression dataframe, modified by qualifiers if specified, with the four factors
    """
    per_team = session.query(team_stats_tbl).group_by(team_stats_tbl.team_id)
    team_stats = per_team.having(func.max(team_stats_tbl.id)).subquery()
    home_stats = alias(team_stats, name='home')
    away_stats = alias(team_stats, name='away')
    sched = alias(sched_tbl, name='sched')

    # Keep only the four factors that exist as columns in the stats subquery
    home_stat_ff = []
    for col in ff_list:
        if col in home_stats.c.keys():
            home_stat_ff.append(getattr(home_stats.c, col))
    away_stat_ff = []
    for col in ff_list:
        if col in away_stats.c.keys():
            away_stat_ff.append(getattr(away_stats.c, col))

    played_games = session.query(sched, *home_stat_ff, *away_stat_ff).filter(sched.c['home_team_score'] > 0)
    with_home = played_games.join(home_stats, home_stats.c['team_id'] == sched.c['home_team_id'])
    with_both = with_home.join(away_stats, away_stats.c['team_id'] == sched.c['away_team_id'])
    sched_stats_query = with_both.subquery(with_labels=True)
    sched_stats = session.query(sched_stats_query)

    # qualifiers is only passed along when it was supplied
    if qualifiers:
        return conversion.convert_sql_statement_to_table(session, sched_stats.statement, qualifiers)
    return conversion.convert_sql_statement_to_table(session, sched_stats.statement)
191 | 
192 | 
def get_team_ff(ff_df, team, ff_list, home):
    """Return the four factors of one team from ff_df with every column name marked as home or away.

    The team is matched on team_name without regard to case. "_h" is appended to each column name when home is True
    and "_a" when it is False.

    Args:
        ff_df: four factors Pandas data frame (read from SQL table)
        team: A team name
        ff_list: List of the four factors variable
        home: Boolean. True if the team is home; False if the team is away
    """
    is_team = ff_df.team_name.str.lower() == team.lower()
    team_ff = ff_df[is_team][ff_list]
    add_suffix = append_h if home else append_a
    return team_ff.rename(add_suffix, axis='columns')
211 | 
212 | 
def append_h(string):
    """Return string with "_h" added to the end"""
    return '{}{}'.format(string, '_h')
217 | 
218 | 
def append_a(string):
    """Return string with "_a" added to the end"""
    return '{}{}'.format(string, '_a')
223 | 
224 | 
def ensure_unique_index(index, indices, i=1):  # Indexed to 1 so +1 == 2nd, 3rd, 4th, etc. game
    """Return index if it is not in indices; otherwise return index with the first unused numerical suffix

    A unique index is returned as is. Otherwise suffixes are tried in increasing order, starting at i + 1, until the
    index plus suffix does not exist in indices. Used to create unique identifiers for multiple matchups between the
    same teams.

    Args:
        index: A string index to check for in indices
        indices: A list of indices to check the index against
        i: The number just below the first suffix to try
    Returns:
        index, or a modified form of index, that does not exist in indices
    """
    if index not in indices:
        return index

    suffix = i + 1
    candidate = "{}{}".format(index, suffix)
    while candidate in indices:
        suffix += 1
        candidate = "{}{}".format(index, suffix)
    return candidate
248 | 
249 | 
def four_factors_list():
    """Return a copy of the four factors (ff) list held in br_references."""
    # A copy is handed out so callers do not share the br.four_factors object itself
    return br.four_factors.copy()
255 | 
256 | 
def main(session, team_stats_tbl, sched_tbl, graph=False):
    """Build the regression data frame, run it through the LinearRegression class, and return the class

    Args:
        session: An instantiated Session object from sqlalchemy
        team_stats_tbl: A mapped team stats table class
        sched_tbl: A mapped schedule table class
        graph: A boolean that creates graphs if true

    Returns:
        A LinearRegression class
    """
    league_year = Config.get_property("league_year")
    graph_dir = Config.get_property("graph_dir")
    graph_dir_missing = not os.path.exists(graph_dir)
    if graph_dir_missing and graph:
        os.mkdir(graph_dir)

    # The four factors to extract from the database
    ff_list = four_factors_list()

    # The alternative data frame is in use; create_ff_regression_df takes the same arguments
    regression_df = alt_regression_df(session, team_stats_tbl, sched_tbl, ff_list)
    print('using alternative/old regression_df')

    # X (predictors) is every column that does not come from the schedule; y (target) is the margin of victory
    sched_columns = list(regression_df.filter(regex='sched'))
    predictor_columns = regression_df.columns.drop(sched_columns)
    predictors = regression_df[predictor_columns]
    target = regression_df["sched_MOV"]

    ff_reg = LinearRegression(target, predictors)

    # Note: On Windows, graphs will not appear to update
    # To change that, go to folder properties -> customize -> optimize for: Documents
    if graph:
        graph_makers = (
            (ff_reg.predicted_vs_actual, "pred_vs_actual_{}.png"),
            (ff_reg.residuals_vs_fitted, "residuals_vs_fitted_{}.png"),
            (ff_reg.qqplot, "qqplot_{}.png"),
            (ff_reg.influence_plot, "influence_{}.png"),
            (ff_reg.cooks_distance, "cooks_distance_{}.png"),
            (ff_reg.residual_independence, "resid_independence_{}.png"),
        )
        for make_graph, file_name in graph_makers:
            make_graph(out_path=os.path.join(graph_dir, file_name.format(league_year)))

    return ff_reg
302 | 
303 | 
if __name__ == "__main__":
    # Ad hoc entry point: fit the four factors regression against the 'test' database and write the graphs.
    db = Database('test', "../management")
    session = Session(db.engine)
    year = Config.get_property('league_year')
    # NOTE(review): main() references the names sched_tbl and team_stats_tbl in its body; they are kept as
    # module-level names here in case main() resolves them as globals -- confirm against main()'s signature.
    sched_tbl = db.table_mappings['schedule_{}'.format(year)]
    team_stats_tbl = db.table_mappings['team_stats_{}'.format(year)]
    test = main(session, graph=True)
312 | 


--------------------------------------------------------------------------------
/project_notebook.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "Author: Spencer Weston\n",
 10 |     "\n",
 11 |     "Website: [Crockpot Thoughts](https://crockpotthoughts.wordpress.com/)\n",
 12 |     "\n",
 13 |     "Last Update: 08/10/2019"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "markdown",
 18 |    "metadata": {},
 19 |    "source": [
 20 |     "# Guide to NBA Bet\n",
 21 |     "In this notebook, we'll overview the usage of the NBA bet project. First, we'll run the project from the top level. And afterwards, we'll dive into some of the details of the project's implementation. You can work through this notebook in two ways. If you view it on Github, you can download a clone of the project and follow along. Or, if you have jupyter installed, you can run the notebook yourself. However, since it is the NBA offseason, the project will have limited functionality."
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "## Running the Project\n",
 29 |     "The run directory holds the scripts to run the entire project. There are two modules: all and daily. Daily calls run.all.run_all() one hour before the next game in the schedule. All contains run_all() which runs the entire project.  "
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "markdown",
 34 |    "metadata": {},
 35 |    "source": [
 36 |     "Daily is called from the command line and runs on main. To call it at the command line, enter: ~\\NBA_bet>python -m run.daily"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "markdown",
 41 |    "metadata": {},
 42 |    "source": [
 43 |     "To run the project on an ad hoc basis, we use the run_all() function:"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 1,
 49 |    "metadata": {},
 50 |    "outputs": [
 51 |     {
 52 |      "ename": "ModuleNotFoundError",
 53 |      "evalue": "No module named 'database'",
 54 |      "traceback": [
 55 |       "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
 56 |       "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
 57 |       "\u001b[1;32m<ipython-input-1-86f7964a33f5>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mNBApredict\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mall\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mrun_all\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
 58 |       "\u001b[1;32m~\\Documents\\Projects\\NBApredict\\NBApredict\\NBApredict\\run\\all.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     10\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     11\u001b[0m \u001b[1;31m# Local Imports\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 12\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mdatabase\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdatabase\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mDatabase\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     13\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpredict\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mpredict\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     14\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mscrapers\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mscraper\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
 59 |       "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'database'"
 60 |      ],
 61 |      "output_type": "error"
 62 |     }
 63 |    ],
 64 |    "source": [
 65 |     "from NBApredict.run.all import run_all"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 2,
 71 |    "metadata": {},
 72 |    "outputs": [
 73 |     {
 74 |      "name": "stderr",
 75 |      "output_type": "stream",
 76 |      "text": [
 77 |       "c:\\users\\spencer\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\numpy\\core\\fromnumeric.py:2389: FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.\n",
 78 |       "  return ptp(axis=axis, out=out, **kwargs)\n"
 79 |      ]
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "run_all()"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "markdown",
 88 |    "metadata": {},
 89 |    "source": [
 90 |     "Console output will raise a deprecation warning, an issue I've scheduled to fix. \n",
 91 |     "\n",
 92 |     "It's really that simple to run the project. If you've got this far, you've run the whole project. Now let's look at what's going on under the hood."
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "markdown",
 97 |    "metadata": {},
 98 |    "source": [
 99 |     "## Scraping Data\n",
100 |     "We scrape data from two sources: Basketballreference.com for basketball statistics and Bovada.com for betting lines. We're concerned with three types of data:\n",
101 |     "1. Team Stats - These statistics are our explanatory variables. They describe Dean Oliver's four factors as described in the [third post](https://crockpotthoughts.wordpress.com/2019/08/05/predicting-nba-games-part-3-the-model/) in my series overviewing the project. \n",
102 |     "2. The Schedule - We need the schedule to know when games are played.\n",
103 |     "3. Betting Lines - Lines to compare our predictions to.\n",
104 |     "\n",
105 |     "All the data scrapers are held in the \"scrapers\" folder. To scrape all data, scrapers.scraper.scrape_all() will scrape and store all data in a local SQLite database. You can find the database in the \"outputs\" folder. When the project is run, it will create this folder for you. The scrape_all function takes three arguments: a database, session, and the league year. Let's set these arguments up:"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 3,
111 |    "metadata": {},
112 |    "outputs": [],
113 |    "source": [
114 |     "from scrapers import scraper\n",
115 |     "from database.database import Database\n",
116 |     "from sqlalchemy.orm import Session"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": 4,
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": [
125 |     "db = Database()\n",
126 |     "session = Session(bind=db.engine)\n",
127 |     "league_year = 2019"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "markdown",
132 |    "metadata": {},
133 |    "source": [
134 |     "On initialization, the Database class will initialize within the working directory. Alternatively, init accepts a file path to the desired database location. We then initialize the Session with the engine, part of the sqlalchemy package, from db. Now, we can pass these arguments to the scrape_all function."
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": 5,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "scraper.scrape_all(database=db, session=session, league_year=league_year)"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "Now, check the outputs folder. You'll find nba_db.db within. It can be viewed with your database software of choice, or you can interact with it through the instantiated Database class. We can see our tables through the db.get_tables() function. Note: If you didn't call run_all() earlier in the notebook, you will not have the 'predictions_2019' table. "
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": 6,
156 |    "metadata": {},
157 |    "outputs": [
158 |     {
159 |      "data": {
160 |       "text/plain": [
161 |        "dict_keys(['misc_stats_2019', 'odds_2019', 'sched_2019', 'predictions_2019'])"
162 |       ]
163 |      },
164 |      "execution_count": 6,
165 |      "metadata": {},
166 |      "output_type": "execute_result"
167 |     }
168 |    ],
169 |    "source": [
170 |     "db.get_tables().keys()"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "markdown",
175 |    "metadata": {},
176 |    "source": [
177 |     "If you'd like, you can only scrape one type of data at a time. Just call the individual scraper's scrape() function."
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": 7,
183 |    "metadata": {
184 |     "scrolled": true
185 |    },
186 |    "outputs": [
187 |     {
188 |      "data": {
189 |       "text/plain": [
190 |        "True"
191 |       ]
192 |      },
193 |      "execution_count": 7,
194 |      "metadata": {},
195 |      "output_type": "execute_result"
196 |     }
197 |    ],
198 |    "source": [
199 |     "from scrapers import line_scraper, team_scraper, season_scraper\n",
200 |     "\n",
201 |     "team_scraper.scrape(db, league_year)\n",
202 |     "line_scraper.scrape(db, session, league_year)\n",
203 |     "season_scraper.scrape(db, session, league_year)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "markdown",
208 |    "metadata": {},
209 |    "source": [
210 |     "## Generating Predictions\n",
211 |     "Now, let's look at predicting games. We can predict all games we have betting lines for, all games on a specific day, or individual games. Predicting all games will store all results in the database while games on a specific day or individual games will return a dictionary with results. This portion of the project suffers from entangled functions because too many specifications and data operations are managed within functions. However, predict_all() can be run with a simple call:"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 8,
217 |    "metadata": {},
218 |    "outputs": [],
219 |    "source": [
220 |     "from predict import predict\n",
221 |     "\n",
222 |     "predict.predict_all(db, session, league_year)"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "markdown",
227 |    "metadata": {},
228 |    "source": [
229 |     "Uhh IDK placeholder"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": 10,
235 |    "metadata": {},
236 |    "outputs": [
237 |     {
238 |      "ename": "NameError",
239 |      "evalue": "name 'year' is not defined",
240 |      "traceback": [
241 |       "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
242 |       "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
243 |       "\u001b[1;32m<ipython-input-10-8dd288c0c5c2>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[0mdate\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2019\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m3\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m26\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mpredict\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict_games_on_date\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdb\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mleague_year\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdate\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconsole_out\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
244 |       "\u001b[1;32m~\\Documents\\Projects\\test\\NBA_bet\\predict\\predict.py\u001b[0m in \u001b[0;36mpredict_games_on_date\u001b[1;34m(database, session, league_year, date, console_out)\u001b[0m\n\u001b[0;32m    479\u001b[0m     \u001b[0mgame_spreads\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mgame\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mgame\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mgames_query\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    480\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 481\u001b[1;33m     \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpredict_games_on_day\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdatabase\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgame_spreads\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconsole_out\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mconsole_out\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    482\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    483\u001b[0m     \u001b[0mprediction_tbl\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"predictions_{}\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mleague_year\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
245 |       "\u001b[1;32m~\\Documents\\Projects\\test\\NBA_bet\\predict\\predict.py\u001b[0m in \u001b[0;36mpredict_games_on_day\u001b[1;34m(database, session, games, console_out)\u001b[0m\n\u001b[0;32m    189\u001b[0m     \"\"\"\n\u001b[0;32m    190\u001b[0m     \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 191\u001b[1;33m     \u001b[0mregression\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlm\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdatabase\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdatabase\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msession\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0myear\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0myear\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    192\u001b[0m     \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    193\u001b[0m         \u001b[1;32mfor\u001b[0m \u001b[0mgame\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mgames\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
246 |       "\u001b[1;31mNameError\u001b[0m: name 'year' is not defined"
247 |      ],
248 |      "output_type": "error"
249 |     }
250 |    ],
251 |    "source": [
252 |     "from datetime import datetime\n",
253 |     "\n",
254 |     "date = datetime(2019, 3, 26)\n",
255 |     "predict.predict_games_on_date(db, session, league_year, date, console_out=True)"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": null,
261 |    "metadata": {},
262 |    "outputs": [],
263 |    "source": []
264 |   }
265 |  ],
266 |  "metadata": {
267 |   "kernelspec": {
268 |    "display_name": "Python 3",
269 |    "language": "python",
270 |    "name": "python3"
271 |   },
272 |   "language_info": {
273 |    "codemirror_mode": {
274 |     "name": "ipython",
275 |     "version": 3
276 |    },
277 |    "file_extension": ".py",
278 |    "mimetype": "text/x-python",
279 |    "name": "python",
280 |    "nbconvert_exporter": "python",
281 |    "pygments_lexer": "ipython3",
282 |    "version": "3.7.1"
283 |   }
284 |  },
285 |  "nbformat": 4,
286 |  "nbformat_minor": 2
287 | }
288 | 


--------------------------------------------------------------------------------
/NBApredict/predict/bets.py:
--------------------------------------------------------------------------------
  1 | """
  2 | predict.bets contains functions organized around comparing predictions to odds
  3 | 
  4 | ToDo:
  5 |     In theory, the module will allow multiple model inputs. Thus, we can pass it a linear, bayesian, ML, etc. model,
  6 |     generate results, and store them. That functionality does not exist. This should also have a class of some sort to
  7 |     manage predictions. It will add specificity and remove call complexity and name overlaps (i.e.
  8 |     predict_games_on_day() vs. predict_games_on_date())
  9 | """
 10 | 
 11 | from datetime import datetime
 12 | import numpy as np
 13 | import pandas as pd
 14 | import scipy.stats as stats
 15 | from sqlalchemy.orm import Session
 16 | from sqlalchemy import or_
 17 | from sqlalchemy.exc import IntegrityError
 18 | 
 19 | # Local imports
 20 | from nbapredict.configuration import Config
 21 | from nbapredict.helpers import br_references
 22 | from datatotable.database import Database
 23 | from datatotable.data import DataOperator
 24 | from nbapredict.database import getters
 25 | from nbapredict.management import conversion
 26 | from nbapredict.management.tables import predictions
 27 | from nbapredict.models import four_factor_regression as ff_reg
 28 | 
 29 | 
 30 | def get_prediction(reg, pred_df):
 31 |     """Generate and return a prediction for the observations in the pred_df.
 32 | 
 33 |     Args:
 34 |         reg: LinearRegression class from four_factors_regression.py
 35 |         pred_df: A dataframe of observations, with home and away statistics, from which to generate a prediction
 36 | 
 37 |     Returns:
 38 |         The predicted value generated from the regression object and the predictors"""
 39 |     return reg.results.predict(pred_df).values[0]
 40 | 
 41 | 
 42 | def get_team_name(team):
 43 |     """Match team to a standard team name and return the br_references standard team name."""
 44 |     for team_name in br_references.Team:
 45 |         if team.lower() == team_name.value.lower():
 46 |             return team_name.value
 47 | 
 48 | 
 49 | # def create_prediction_df(home_tm, away_tm, ff_df):
 50 | #     """Create and return a dataframe that merges the four factors for the home and away team.
 51 | #     TODO: Replace with ff_reg.alt_regression_df/getregression_df
 52 | #
 53 | #     Args:
 54 | #         home_tm: The home team
 55 | #         away_tm: The away team
 56 | #         ff_df: Dataframe of the four factors for all teams
 57 | #
 58 | #     Returns:
 59 | #         A single row four factors data frame of the home and away team's four factors
 60 | #     """
 61 | #     home_ff = get_team_ff(home_tm, ff_df, home=True)
 62 | #     away_ff = get_team_ff(away_tm, ff_df, home=False)
 63 | #     home_ff["key"] = 1
 64 | #     home_ff["const"] = 1.0  # sm.add_const does not add a constant for whatever reason
 65 | #     away_ff["key"] = 1
 66 | #     merged = pd.merge(home_ff, away_ff, on="key", sort=True)
 67 | #     merged = merged.drop(["key"], axis=1)
 68 | #     merged = merged.sort_index(axis=1)
 69 | #     return merged
 70 | 
 71 | 
 72 | def get_team_ff(team, ff_df, home):
 73 |     """Create and return a data frame of the four factors for the specified team.
 74 | 
 75 |     Args:
 76 |         team: The team to extract the four factors for
 77 |         ff_df: A dataframe of the four factors
 78 |         home: Boolean which dictates if an '_h or '_a' should be appended to the team's stats
 79 | 
 80 |     Returns:
 81 |         The four factors, with a home or away suffix, for a team are returned as a data frame
 82 |     """
 83 |     ff_list = br_references.four_factors
 84 |     team_ff = ff_df[ff_df.team_name.str.lower() == team.lower()][ff_list]
 85 |     if home:
 86 |         team_ff = team_ff.rename(ff_reg.append_h, axis='columns')
 87 |     else:
 88 |         team_ff = team_ff.rename(ff_reg.append_a, axis='columns')
 89 |     return team_ff
 90 | 
 91 | 
 92 | def line_probability(prediction, line, std):
 93 |     """Calculate and return the CDF or SF, as appropriate, of the line if the model were true.
 94 | 
 95 |     "if the model were true" means that if the assumption holds that the residuals are homoscedastic and follow a
 96 |     normal distribution
 97 | 
 98 |     Args:
 99 |         prediction: The prediction for a game
100 |         line: The line associated with the same game as the prediction
101 |         std: The standard deviation of the residuals for the model used to make the prediction
102 | 
103 |     Returns:
104 |         The survival function or cumulative density function for the line in relation to the prediction
105 |     """
106 |     # ToDo: T-Distribution?
107 |     dist = stats.norm(loc=prediction, scale=std)
108 |     line_prediction = -1 * line
109 | 
110 |     if prediction > line_prediction:
111 |         return dist.cdf(line_prediction), "cdf"
112 |     elif prediction < line_prediction:
113 |         return dist.sf(line_prediction), "sf"
114 |     elif prediction == line_prediction:
115 |         return 0.5  # If the predictions are equal, the cdf automatically equals 0.5
116 | 
117 | 
def prediction_result_console_output(home_tm, away_tm, line, prediction, probability):
    """Print a human readable comparison of the model's prediction and the betting line.

    Nothing is printed when the prediction is neither above nor below zero. The probability is printed as
    given, followed by a percent sign.

    Args:
        home_tm: The home team
        away_tm: The away team
        line: The betting line
        prediction: A prediction of the home team's margin of victory
        probability: The probability of the betting line as determined by a CDF or SF
    """
    if prediction > 0:
        headline = "The {} are projected to beat the {} by {} points"
    elif prediction < 0:
        headline = "The {} are projected to lose to the {} by {} points"
    else:
        return
    print(headline.format(home_tm, away_tm, prediction))

    # The negated line sits below the prediction -> CDF wording; otherwise SF wording.
    if (-1 * line) < prediction:
        detail = ("If the model were true, the betting line's ({}) CDF, in relation to the prediction, would "
                  "be realized {}% of the time")
    else:
        detail = ("If the model were true, the betting line's ({}) SF, in relation to the prediction, would "
                  "be realized {}% of the time")
    print(detail.format(line, probability))
144 | 
145 | 
def insert_predictions(rows, session, pred_tbl, sched_tbl):
    """Build prediction table objects from rows and add them to the session.

    Each row is unpacked into pred_tbl, the resulting objects are passed through update_schedule_attributes()
    together with sched_tbl, and whatever that call returns is added to the session. The session is not
    committed here.

    # ToDo: Will need equivalent function, but it won't look like this
    Args:
        rows: SQLalchemy compatible rows (mappings usable as keyword arguments for pred_tbl)
        session: A SQLalchemy session object
        pred_tbl: A mapped prediction table object
        sched_tbl: A mapped scheduled table object
    """
    pending = [pred_tbl(**row) for row in rows]
    pending = update_schedule_attributes(pending, session, sched_tbl)
    session.add_all(pending)
163 | 
164 | 
def insert_new_predictions(rows, session, pred_tbl, sched_tbl, odds_tbl):
    """Insert predictions from rows which do not already exist in the prediction table.

    A game is identified by its (home_team, away_team, start_time) triple. Rows whose triple is already stored
    in pred_tbl are skipped. The remaining rows are converted to pred_tbl objects, passed through
    update_odds_id() and update_schedule_attributes(), and added to the session. The session is not committed
    here, and nothing is added when every row already exists.

    # ToDo: Will need significant rewrite (Also note similarities between this function and insert_predictions)
    Args:
        rows: SQLalchemy compatible rows; each must provide "home_team", "away_team" and "start_time" keys
        session: a SQLalchemy session object
        pred_tbl: A mapped prediction table object
        sched_tbl: A mapped scheduled table object
        odds_tbl: A mapped odds_tbl object
    """
    stored_games = session.query(pred_tbl.home_team, pred_tbl.away_team, pred_tbl.start_time).all()
    # A set gives constant-time membership tests instead of scanning a list once per incoming row.
    known_games = {(game.home_team, game.away_team, game.start_time) for game in stored_games}

    row_objects = [
        pred_tbl(**row)
        for row in rows
        if (row["home_team"], row["away_team"], row["start_time"]) not in known_games
    ]
    if not row_objects:
        return

    row_objects = update_odds_id(row_objects, session, odds_tbl)
    row_objects = update_schedule_attributes(row_objects, session, sched_tbl)
    session.add_all(row_objects)
192 | 
193 | 
def update_prediction_table(session, pred_tbl, sched_tbl, odds_tbl):
    """Re-add prediction rows with zero scores and fill in bet results for scored rows.

    The session is not committed here.

    Args:
        session: A SQLalchemy session object
        pred_tbl: A mapped prediction table object
        sched_tbl: A mapped scheduled table object (not used by the current body)
        odds_tbl: A mapped odds_tbl object (not used by the current body)
    """
    # Rows where either team's score is still 0.
    # NOTE(review): these rows are added back to the session without being modified in this function.
    missing_score = or_(pred_tbl.home_team_score == 0, pred_tbl.away_team_score == 0)
    unscored_rows = session.query(pred_tbl).filter(missing_score).all()
    session.add_all(unscored_rows)

    # Rows that have a home score but no bet result yet get their result computed.
    unsettled_query = session.query(pred_tbl).filter(pred_tbl.bet_result.is_(None), pred_tbl.home_team_score > 0)
    settled_rows = update_bet_results(unsettled_query.all())
    session.add_all(settled_rows)
210 | 
211 | 
def update_bet_results(bet_update_objects):
    """Set bet_result on every row of bet_update_objects and return the same collection.

    The actual margin (home score minus away score) and the prediction are each compared to the negated line.
    A margin equal to the negated line is a "PUSH". The bet is a "WIN" when the margin and the prediction fall
    strictly on the same side of the negated line, and a "LOSS" otherwise.

    # ToDo: Will need this function, but will require a lot of modification
    Args:
        bet_update_objects: Objects from a query.all() from the prediction table. Objects should have a home and
        away team score, a line, and a prediction.

    Returns:
        bet_update_objects updated with the bet results (WIN, LOSS, or PUSH).
    """
    for bet in bet_update_objects:
        actual_margin = bet.home_team_score - bet.away_team_score
        target = bet.line * -1
        predicted_margin = bet.prediction

        if actual_margin == target:
            bet.bet_result = "PUSH"
            continue

        both_under = (actual_margin < target) and (predicted_margin < target)
        both_over = (actual_margin > target) and (predicted_margin > target)
        bet.bet_result = "WIN" if (both_under or both_over) else "LOSS"
    return bet_update_objects
236 | 
237 | 
def get_sample_prediction(session, regression):
    """Generate and return a sample prediction formatted specifically for table creation.

    The sample is predicted from the row labelled 0 of the regression's predictors.

    Args:
        session: A SQLalchemy session object
        regression: A regression object from four_factor_regression.py

    Returns:
        A DataOperator object initialized with a prediction from regression
    """
    # Double brackets keep the selection a one-row dataframe rather than a series.
    first_observation = regression.predictors.loc[[0]]
    return DataOperator(predict_game(session, regression, first_observation))
253 | 
254 | 
def predict_game(session, regression, x_df, console_out=False):
    """Predict a game and return the information in a dictionary.

    The line probability (CDF/SF) calculation and the console printout are currently disabled, so only the
    prediction itself is produced.

    Args:
        session: A SQLalchemy session object (not used by the current body)
        regression: A regression object
        x_df: A dataframe of predictors for the game, passed to get_prediction()
        console_out: If true, print the prediction results. Ignore otherwise (not used by the current body)

    Returns:
        A dictionary of the form {"prediction": value} holding the value returned by get_prediction()
    """
    # Disabled until a line is passed in again:
    # probability, function = line_probability(prediction, line, np.std(regression.residuals))
    # if console_out:
    #     prediction_result_console_output(home_tm, away_tm, prediction, probability)
    return {"prediction": get_prediction(regression, x_df)}
276 | 
277 | 
def predict_games_in_odds(session, regression, odds_tbl):
    """Generate and return predictions for all games with odds in the odds_tbl

    ToDo: Take tables as inputs vs. DB
    Args:
        session: A SQLalchemy session object
        regression: A linear regression object generated from four_factor_regression
        odds_tbl: Mapped sqlalchemy odds table

    Returns:
        A list with one predict_game() result per row of odds_tbl, in query order
    """
    # NOTE(review): predict_game() as defined in this module takes (session, regression, x_df, console_out);
    # the six positional arguments passed below do not match that signature -- confirm the intended call.
    return [
        predict_game(session, regression, odds.home_team, odds.away_team, odds.start_time, odds.spread)
        for odds in session.query(odds_tbl).all()
    ]
297 | 
298 | 
def predict_games_on_day(database, session, games, console_out=False):
    """Take a SQLalchemy query object of games, and return a prediction for each game.

    ToDO: On day versus on date?
    Args:
        database: an instantiated DBInterface class from database.dbinterface.py
        session: A SQLalchemy session object
        games: a SQLalchemy query object of games containing start_time, home_tm, away_tm, and the spread
        console_out: A bool. True to print prediction outputs

    Returns:
        A list with one predict_game() result per game, in iteration order
    """
    # `year` was previously referenced without being defined, which raised a NameError on every call.
    # Read the league year from the project configuration instead.
    year = Config.get_property("league_year")
    regression = ff_reg.main(database=database, session=session, year=year)

    results = []
    for game in games:
        # If a game doesn't carry a spread, pass a 0 line for that game. Resolving the fallback per game
        # (rather than restarting the loop after an AttributeError) avoids appending duplicate predictions.
        # If a game is missing other data, the function will still break.
        line = getattr(game, "spread", 0)
        # NOTE(review): predict_game() as defined in this module takes (session, regression, x_df,
        # console_out); the keyword arguments below do not match that signature -- confirm the intended call.
        prediction = predict_game(database=database, session=session, regression=regression,
                                  home_tm=game.home_team, away_tm=game.away_team,
                                  start_time=game.start_time, line=line, console_out=console_out)
        results.append(prediction)
    return results
326 | 
327 | 
def predict_games_on_date(database, session, league_year, date, console_out):
    """Predict games on the specified date and write the results to the database

    The session is always closed before returning.

    ToDO: On day versus on date?
    Args:
        database: An instantiated DBInterface class from dbinterface.py
        session: A sqlalchemy session object for queries and writes
        league_year: The league year to work with. For example, the league year of the 2018-19 season is 2019
        date: A datetime, a datetime.date, or a dictionary formatted as {"day": day, "month": month, "year": year}
        console_out: If true, prints prediction results to the console
    """
    # Normalize the date to a datetime. Plain date objects expose year/month/day as attributes, whereas the
    # dictionary form must be subscripted.
    if not isinstance(date, datetime):
        if hasattr(date, "year"):
            date = datetime(date.year, date.month, date.day)
        else:
            date = datetime(date["year"], date["month"], date["day"])

    # Get lines for the games
    # NOTE(review): the odds table name is passed in a list while the other lookups below pass a plain
    # string -- confirm which form get_table_mappings() expects.
    odds_tbl = database.get_table_mappings(["odds_{}".format(league_year)])
    games_query = getters.get_spreads_for_date(odds_tbl, session, date)
    game_spreads = list(games_query)

    results = predict_games_on_day(database, session, game_spreads, console_out=console_out)
    data = DataOperator(results)

    sched_tbl = database.get_table_mappings("sched_{}".format(league_year))
    pred_tbl = database.get_table_mappings("predictions_{}".format(league_year))

    # Results are sent to DataOperator in row format, so just pass data.data instead of data.dict_to_rows()
    try:
        # insert_predictions() accepts (rows, session, pred_tbl, sched_tbl); passing odds_tbl as a fifth
        # argument raised a TypeError before anything could be inserted.
        insert_predictions(data.data, session, pred_tbl, sched_tbl)
        session.commit()
    except IntegrityError:
        # Rows that already exist: undo the failed insert and refresh the stored predictions instead.
        session.rollback()
        update_prediction_table(session, pred_tbl, sched_tbl, odds_tbl)
        session.commit()
    finally:
        session.close()
364 | 
365 | 
def predict_all(db):
    """Generate and store predictions for all games available in the odds table.

    Checks if the table exists. If it doesn't, generate a table in the database.

    Args:
        db: A database object exposing engine, table_mappings and table_exists()
    """
    session = Session(bind=db.engine)
    try:
        league_year = Config.get_property("league_year")
        sched_tbl = db.table_mappings["schedule_{}".format(league_year)]
        team_stats_tbl = db.table_mappings['team_stats_{}'.format(league_year)]
        odds_tbl = db.table_mappings['odds_{}'.format(league_year)]

        regression = ff_reg.main(session, team_stats_tbl, sched_tbl)

        # Computed before branching: insert_new_predictions() below needs the results on both paths,
        # and they used to be bound only when the predictions table already existed.
        results = predict_games_in_odds(session, regression, odds_tbl)

        pred_tbl_name = "predictions_{}".format(league_year)

        if not db.table_exists(pred_tbl_name):
            # NOTE(review): the sample prediction was never consumed; the call is kept in case
            # it has side effects -- confirm whether predictions.format_data() should receive it
            get_sample_prediction(session, regression, sched_tbl)
            pred_data = predictions.format_data()
            predictions.create_table()
            pred_tbl = db.table_mappings[pred_tbl_name]
            session.add_all([pred_tbl(**row) for row in pred_data.rows])
            session.commit()
        else:
            # Data operator
            pred_tbl = db.table_mappings[pred_tbl_name]
            update_rows = predictions.insert(session, )
            session.add_all(update_rows)
            session.commit()

        insert_new_predictions(results, session, pred_tbl, sched_tbl, odds_tbl)

        session.commit()  # Commit here b/c update_prediction_tbl() needs the inserted values

        update_prediction_table(session, pred_tbl, sched_tbl, odds_tbl)
        # Commit the update as predict_games_on_date() does, so it is not dropped when the session closes
        session.commit()
    finally:
        # The session is created by this function, so release it here even when a step fails
        session.close()
402 | 
403 | 
if __name__ == "__main__":
    # Manual run against the 'test' database
    db = Database('test', "../management")
    predict_all(db)
    # NOTE(review): these arguments differ from the other predict_game() calls in this module; confirm the signature
    predict_game("Sacramento Kings", "Orlando Magic", line=-5.5, year=2019, console_out=True)
    date = datetime(2019, 3, 26)
    # `session` was never defined in this script block; predict_games_on_date() closes the one it receives
    session = Session(bind=db.engine)
    predict_games_on_date(db, session, league_year=2019, date=date, console_out=True)
410 | 


--------------------------------------------------------------------------------