├── NBApredict ├── run │ ├── __init__.py │ ├── all.py │ └── daily.py ├── database │ ├── __init__.py │ ├── reconcile.py │ ├── getters.py │ ├── manipulator.py │ └── dbinterface.py ├── br_web_scraper │ ├── __init__.py │ ├── parsers │ │ ├── __init__.py │ │ ├── players_season_totals.py │ │ ├── box_scores.py │ │ └── schedule.py │ ├── errors.py │ ├── json_encoders.py │ ├── client.py │ ├── http_client.py │ ├── data.py │ └── output.py ├── management │ ├── tables │ │ ├── __init__.py │ │ ├── results.py │ │ ├── predictions.py │ │ ├── teams.py │ │ ├── team_stats.py │ │ ├── odds.py │ │ └── schedule.py │ ├── __init__.py │ ├── etl.py │ └── conversion.py ├── __init__.py ├── settings.yaml ├── scrapers │ ├── scraper.py │ ├── team_scraper.py │ ├── season_scraper.py │ └── line_scraper.py ├── predict │ ├── games.py │ ├── get.py │ └── bets.py ├── helpers │ ├── json.py │ ├── type.py │ ├── classes.py │ └── br_references.py ├── models │ ├── graphing.py │ └── four_factor_regression.py └── configuration.py ├── .gitignore ├── LICENSE.txt ├── README.md └── project_notebook.ipynb /NBApredict/run/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NBApredict/database/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NBApredict/br_web_scraper/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NBApredict/management/tables/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NBApredict/management/tables/results.py: -------------------------------------------------------------------------------- 
1 | -------------------------------------------------------------------------------- /NBApredict/br_web_scraper/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NBApredict/management/__init__.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.orm import sessionmaker 2 | Session = sessionmaker() 3 | -------------------------------------------------------------------------------- /NBApredict/__init__.py: -------------------------------------------------------------------------------- 1 | # __init__.py signals to python that the folder contains relevant 2 | # packages and information -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #Ignore the following file extensions: 2 | *.exe 3 | *.log 4 | *.txt 5 | *.pyc 6 | *.json 7 | *.idea 8 | *.csv 9 | *.db 10 | *.sqlite 11 | *xlsx 12 | graphs 13 | scratch* 14 | .ipynb_checkpoints 15 | */.ipynb_checkpoints/* 16 | *test_regression 17 | -------------------------------------------------------------------------------- /NBApredict/management/tables/predictions.py: -------------------------------------------------------------------------------- 1 | """Functions for prediction table creation and operations.""" 2 | 3 | 4 | def format_data(): 5 | pass 6 | 7 | 8 | def create_table(db, prediction_data): 9 | pass 10 | 11 | 12 | def insert(): 13 | pass 14 | -------------------------------------------------------------------------------- /NBApredict/br_web_scraper/errors.py: -------------------------------------------------------------------------------- 1 | class InvalidDate(Exception): 2 | def __init__(self, day, month, year): 3 | message = "Date with year set to {year}, month set to {month}, and day set to {day} is 
invalid"\ 4 | .format( 5 | year=year, 6 | month=month, 7 | day=day, 8 | ) 9 | super().__init__(message) 10 | -------------------------------------------------------------------------------- /NBApredict/br_web_scraper/json_encoders.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from json import JSONEncoder 3 | from enum import Enum 4 | 5 | 6 | class BasketballReferenceJSONEncoder(JSONEncoder): 7 | def default(self, obj): 8 | if isinstance(obj, datetime): 9 | return obj.isoformat() 10 | 11 | if isinstance(obj, Enum): 12 | return obj.value 13 | 14 | return JSONEncoder.default(self, obj) 15 | 16 | -------------------------------------------------------------------------------- /NBApredict/settings.yaml: -------------------------------------------------------------------------------- 1 | paths: 2 | directory: NBA 3 | database: db_path 4 | graph_dir: graph_path 5 | settings: settings path 6 | 7 | Bovada: 8 | regularURL: https://www.bovada.lv/services/sports/event/v2/events/A/description/basketball/nba 9 | playoffURL: https://www.bovada.lv/services/sports/event/v2/events/A/description/basketball/nba-playoffs 10 | 11 | prediction: 12 | predict_lines: False 13 | 14 | models: 15 | four_factor_regression: 16 | options: 17 | graph: True 18 | console_out: True 19 | Bayesian_model: 20 | settings: 21 | ML_model: 22 | settings: 23 | 24 | league_year: 2020 25 | -------------------------------------------------------------------------------- /NBApredict/management/tables/teams.py: -------------------------------------------------------------------------------- 1 | """Teams.py contains (a) function(s) to create the teams table in the database""" 2 | 3 | 4 | def create_team_table(db, teams_data, tbl_name): 5 | """Create a table in DB named tbl_name with the columns in teams_data 6 | 7 | Args: 8 | db: a datotable.database.Database object connected to a database 9 | teams_data: A datatotable.data.DataOperator 
object with data on NBA teams 10 | tbl_name: The desired name of the table 11 | """ 12 | columns = teams_data.columns 13 | columns["team_name"].append({"unique": True}) 14 | db.map_table(tbl_name=tbl_name, columns=columns) 15 | db.create_tables() 16 | db.clear_mappers() 17 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2019] [Spencer Weson] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /NBApredict/run/all.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module runs the entire NBA_bet project. 3 | 4 | This module wraps the entire project into a single script with run_all() as the function which drives the script. 
First, 5 | it sets up the database and session connections. Then, it scrapes all new data. Finally, it predicts all games for which 6 | data is available. Most session.commit() calls in the project are performed here. However, note predict_all() requires 7 | a commit during the process in order to function correctly. 8 | """ 9 | from sqlalchemy.orm import Session 10 | 11 | # Local Imports 12 | from nbapredict.database.dbinterface import DBInterface 13 | from nbapredict.predict import bets 14 | from nbapredict.scrapers import scraper 15 | from nbapredict.configuration import Config 16 | 17 | 18 | def run_all(): 19 | """Run the entire NBA_bet project.""" 20 | db = DBInterface() 21 | year = Config.get_property("league_year") 22 | session = Session(bind=db.engine) 23 | 24 | scraper.scrape_all(db, session, year) 25 | session.commit() 26 | 27 | bets.predict_all(db, session) 28 | session.commit() 29 | session.close() 30 | 31 | 32 | if __name__ == "__main__": 33 | run_all() 34 | -------------------------------------------------------------------------------- /NBApredict/database/reconcile.py: -------------------------------------------------------------------------------- 1 | """ 2 | At the moment, reconcile contains one function which "reconciles" primary and reference tables for a specific column. 3 | ToDo: Remove 4 | """ 5 | 6 | 7 | def reconcile(ref_tbl, change_tbl, column, ref_key, change_key, session): 8 | """Compare the specified column over the two tables and change change_tbl values to ref_tbl values 9 | 10 | Note that the change and reference tables must be related by a foreign key. 11 | 12 | Args: 13 | ref_tbl: The reference table which contains the values to be changed in change_tbl 14 | change_tbl: The table to be changed with values from reference table 15 | column: The column to evaluate for changes. Column must be present in both tables. 
16 | ref_key: The key in the reference table to join the tables by 17 | change_key: The key in the change table to join the tables by 18 | session: An instance of a sqlalchemy Session class bound to the database's engine 19 | 20 | To-do: 21 | Figure out how to run with multiple columns 22 | """ 23 | join_objs = session.query(ref_tbl, change_tbl).join().\ 24 | filter(getattr(ref_tbl, ref_key) == getattr(change_tbl, change_key)).all() 25 | 26 | changed_objs = [] 27 | for obj in join_objs: 28 | ref_obj = obj[0] 29 | change_obj = obj[1] 30 | ref_val = getattr(ref_obj, column) 31 | change_val = getattr(change_obj, column) 32 | if ref_val != change_val: 33 | setattr(change_obj, column, ref_val) 34 | session.add(change_obj) 35 | 36 | return 37 | -------------------------------------------------------------------------------- /NBApredict/br_web_scraper/client.py: -------------------------------------------------------------------------------- 1 | from nbapredict.br_web_scraper import http_client 2 | 3 | from nbapredict.br_web_scraper.output import box_scores_to_csv, schedule_to_csv 4 | from nbapredict.br_web_scraper.output import output 5 | from nbapredict.br_web_scraper.json_encoders import BasketballReferenceJSONEncoder 6 | 7 | 8 | def player_box_scores(day, month, year, output_type=None, output_file_path=None, output_write_option=None, json_options=None): 9 | values = http_client.player_box_scores(day=day, month=month, year=year) 10 | return output( 11 | values=values, 12 | output_type=output_type, 13 | output_file_path=output_file_path, 14 | output_write_option=output_write_option, 15 | csv_writer=box_scores_to_csv, 16 | encoder=BasketballReferenceJSONEncoder, 17 | json_options=json_options, 18 | ) 19 | 20 | 21 | def season_schedule(season_end_year, output_type=None, output_file_path=None, output_write_option=None, json_options=None): 22 | values = http_client.season_schedule(season_end_year) 23 | return output( 24 | values=values, 25 | output_type=output_type, 26 | 
output_file_path=output_file_path, 27 | output_write_option=output_write_option, 28 | csv_writer=schedule_to_csv, 29 | encoder=BasketballReferenceJSONEncoder, 30 | json_options=json_options, 31 | ) 32 | 33 | 34 | def players_season_totals(season_end_year, output_type=None, output_file_path=None, output_write_option=None, json_options=None): 35 | values = http_client.players_season_totals(season_end_year) 36 | return output( 37 | values=values, 38 | output_type=output_type, 39 | output_file_path=output_file_path, 40 | output_write_option=output_write_option, 41 | csv_writer=schedule_to_csv, 42 | encoder=BasketballReferenceJSONEncoder, 43 | json_options=json_options, 44 | ) 45 | 46 | -------------------------------------------------------------------------------- /NBApredict/management/tables/team_stats.py: -------------------------------------------------------------------------------- 1 | """Team_stats.py contains function to create the team_stats table in the database""" 2 | 3 | from datetime import datetime 4 | from nbapredict.configuration import Config 5 | from sqlalchemy import ForeignKey, UniqueConstraint 6 | 7 | 8 | def create_table(db, team_stats_data, tbl_name): 9 | """Create a table of team stats in a database with appropriate foreign keys and constraints. 10 | 11 | Args: 12 | db: a datotable.database.Database object connected to a database 13 | team_stats_data: A datatotable.data.DataOperator object with data on NBA team stats 14 | tbl_name: The desired table name 15 | ToDo: Currently allows duplicate rows if those values are on different days. 
Solve with a constraint 16 | """ 17 | columns = team_stats_data.columns 18 | columns['team_id'].append(ForeignKey("teams_{}.id".format(Config.get_property('league_year')))) 19 | constraints = [UniqueConstraint("team_id", "scrape_time")] 20 | db.map_table(tbl_name=tbl_name, columns=columns, constraints=constraints) 21 | db.create_tables() 22 | db.clear_mappers() 23 | 24 | 25 | def insert(session, team_stats_tbl, team_stats_data): 26 | """Insert new data into the team_stats_tbl. 27 | 28 | Args: 29 | session: An instantiated SQLalchemy session object 30 | team_stats_tbl: A mapped team stats table object 31 | team_stats_data: A datatotable.data.DataOperator object with data on NBA team stats 32 | """ 33 | last_insert_scrape_time = session.query(team_stats_tbl.scrape_time). \ 34 | order_by(team_stats_tbl.scrape_time.desc()).first().scrape_time 35 | last_insert_date = datetime.date(last_insert_scrape_time) 36 | current_scrape_date = datetime.date(datetime.now()) 37 | if last_insert_date < current_scrape_date: 38 | session.add_all([team_stats_tbl(**row) for row in team_stats_data.rows]) 39 | session.commit() 40 | -------------------------------------------------------------------------------- /NBApredict/scrapers/scraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module wraps the team stats, schedule, and betting line scrapers together and stores their data in the database. 3 | 4 | If the script is called, it instantiates a DBInterface object for database interactions and creates a SQLalchemy session 5 | object from the DBInterface's information. Otherwise, the scape_all() function is called with database, session, and 6 | league year arguments specified. 
7 | """ 8 | import os 9 | from sqlalchemy.orm import Session 10 | 11 | # Local Imports 12 | from nbapredict.database.dbinterface import DBInterface 13 | from nbapredict.scrapers import team_scraper, season_scraper, line_scraper 14 | import nbapredict.configuration as configuration 15 | 16 | 17 | def scrape_all(database, session, league_year): 18 | """Scrape and store team stats, schedule information, and betting lines in the database. 19 | 20 | Note, this only adds data to the session. Changes must be committed to be saved. 21 | 22 | Args: 23 | database: An instantiated DBInterface object from database.database for database interactions 24 | session: An instance of a sqlalchemy Session class bound to the database's engine 25 | league_year: The league year to scrape data from (i.e. 2018-2019 season is 2019) 26 | """ 27 | # Insure the database folder exists 28 | if not os.path.isdir(configuration.output_directory()): 29 | os.mkdir(configuration.output_directory()) 30 | 31 | team_scrape = team_scraper.scrape(database=database) 32 | season_scrape = season_scraper.scrape(database=database, session=session) 33 | line_scrape = line_scraper.scrape(database=database, session=session) 34 | 35 | 36 | if __name__ == "__main__": 37 | db_path = configuration.database_file(os.path.dirname(__file__)) 38 | db = DBInterface(db_path) 39 | league_year = 2019 40 | session = Session(bind=db.engine) 41 | scrape_all(database=db, session=session, league_year=league_year) 42 | -------------------------------------------------------------------------------- /NBApredict/br_web_scraper/http_client.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from nbapredict.br_web_scraper.errors import InvalidDate 4 | from nbapredict.br_web_scraper.parsers.box_scores import parse_player_box_scores 5 | from nbapredict.br_web_scraper.parsers.schedule import parse_schedule, parse_schedule_for_month_url_paths 6 | from 
nbapredict.br_web_scraper.parsers.players_season_totals import parse_players_season_totals 7 | 8 | BASE_URL = 'https://www.basketball-reference.com' 9 | 10 | 11 | def player_box_scores(day, month, year): 12 | url = '{BASE_URL}/friv/dailyleaders.cgi?month={month}&day={day}&year={year}'.format( 13 | BASE_URL=BASE_URL, 14 | day=day, 15 | month=month, 16 | year=year 17 | ) 18 | 19 | response = requests.get(url=url, allow_redirects=False) 20 | 21 | if 200 <= response.status_code < 300: 22 | return parse_player_box_scores(response.content) 23 | 24 | raise InvalidDate(day=day, month=month, year=year) 25 | 26 | 27 | def schedule_for_month(url): 28 | response = requests.get(url=url) 29 | 30 | response.raise_for_status() 31 | 32 | return parse_schedule(response.content) 33 | 34 | 35 | def season_schedule(season_end_year): 36 | url = '{BASE_URL}/leagues/NBA_{season_end_year}_games.html'.format( 37 | BASE_URL=BASE_URL, 38 | season_end_year=season_end_year 39 | ) 40 | 41 | response = requests.get(url=url) 42 | 43 | response.raise_for_status() 44 | 45 | season_schedule_values = parse_schedule(response.content) 46 | other_month_url_paths = parse_schedule_for_month_url_paths(response.content) 47 | 48 | for month_url_path in other_month_url_paths: 49 | url = '{BASE_URL}{month_url_path}'.format(BASE_URL=BASE_URL, month_url_path=month_url_path) 50 | monthly_schedule = schedule_for_month(url=url) 51 | season_schedule_values.extend(monthly_schedule) 52 | 53 | return season_schedule_values 54 | 55 | 56 | def players_season_totals(season_end_year): 57 | url = '{BASE_URL}/leagues/NBA_{season_end_year}_totals.html'.format( 58 | BASE_URL=BASE_URL, 59 | season_end_year=season_end_year, 60 | ) 61 | 62 | response = requests.get(url=url) 63 | 64 | response.raise_for_status() 65 | 66 | return parse_players_season_totals(response.content) 67 | -------------------------------------------------------------------------------- /NBApredict/br_web_scraper/parsers/players_season_totals.py: 
-------------------------------------------------------------------------------- 1 | from lxml import html 2 | 3 | from nbapredict.helpers.br_references import TEAM_ABBREVIATIONS_TO_TEAM, POSITION_ABBREVIATIONS_TO_POSITION 4 | 5 | 6 | def parse_player_season_totals(row): 7 | return { 8 | "name": str(row[1].text_content()), 9 | "position": POSITION_ABBREVIATIONS_TO_POSITION[row[2].text_content()], 10 | "age": int(row[3].text_content()), 11 | "team": TEAM_ABBREVIATIONS_TO_TEAM[row[4].text_content()], 12 | "games_played": int(row[5].text_content()), 13 | "games_started": int(row[6].text_content()), 14 | "minutes_played": int(row[7].text_content()), 15 | "made_field_goals": int(row[8].text_content()), 16 | "attempted_field_goals": int(row[9].text_content()), 17 | "made_three_point_field_goals": int(row[11].text_content()), 18 | "attempted_three_point_field_goals": int(row[12].text_content()), 19 | "made_free_throws": int(row[18].text_content()), 20 | "attempted_free_throws": int(row[19].text_content()), 21 | "offensive_rebounds": int(row[21].text_content()), 22 | "defensive_rebounds": int(row[22].text_content()), 23 | "assists": int(row[24].text_content()), 24 | "steals": int(row[25].text_content()), 25 | "blocks": int(row[26].text_content()), 26 | "turnovers": int(row[27].text_content()), 27 | "personal_fouls": int(row[28].text_content()), 28 | } 29 | 30 | 31 | def parse_players_season_totals(page): 32 | tree = html.fromstring(page) 33 | # Basketball Reference includes individual rows for players that played for multiple teams in a season 34 | # These rows have a separate class ("italic_text partial_table") than the players that played for a single team 35 | # across a season. 
36 | rows = tree.xpath('//table[@id="totals_stats"]/tbody/tr[contains(@class, "full_table") or contains(@class, "italic_text partial_table") and not(contains(@class, "rowSum"))]') 37 | totals = [] 38 | for row in rows: 39 | # Basketball Reference includes a "total" row for players that got traded 40 | # which is essentially a sum of all player team rows 41 | # I want to avoid including those, so I check the "team" field value for "TOT" 42 | if row[4].text_content() != "TOT": 43 | totals.append(parse_player_season_totals(row)) 44 | return totals 45 | -------------------------------------------------------------------------------- /NBApredict/predict/games.py: -------------------------------------------------------------------------------- 1 | """Predict.games contains functions oriented around predicting games""" 2 | 3 | from sqlalchemy import Integer, ForeignKey, String, UniqueConstraint 4 | from sqlalchemy.orm import Session, relationship 5 | 6 | # Local Imports 7 | import nbapredict.predict.get as get 8 | from nbapredict.configuration import Config 9 | import nbapredict.models.four_factor_regression as lm 10 | import nbapredict.database.dbinterface as dbinterface 11 | 12 | 13 | def create_prediction_table(database, data, tbl_name): 14 | """Create a prediction table from the data and with the table name in the database. 
15 | 16 | ToDo: This will need a big overhaul 17 | 18 | Args: 19 | database: An initialized DBInterface class from database.dbinterface.py 20 | data: An initialized DataOperator object, from database.manipulator, with prediction data 21 | tbl_name: The desired table name (with year as the last four characters) 22 | """ 23 | # Create columns from data 24 | sql_types = data.get_sql_type() 25 | # Add new columns 26 | year = tbl_name[-4:] 27 | schedule_name = "sched_{}".format(year) 28 | additional_cols = [{'game_id': [Integer, ForeignKey(schedule_name + ".id")]}, {"MOV": Integer}] 29 | for col in additional_cols: 30 | sql_types.update(col) 31 | constraint = {UniqueConstraint: ["start_time", "home_team", "away_team"]} 32 | # Map prediction table 33 | database.map_table(tbl_name, sql_types, constraint) 34 | 35 | # Get tables for relationships 36 | sched_tbl = database.get_table_mappings(schedule_name) 37 | 38 | # Create Relationships 39 | if "game_preds_{}".format(year) not in sched_tbl.__mapper__.relationships.keys(): 40 | sched_tbl.predictions = relationship(database.Template) 41 | 42 | database.create_tables() 43 | database.clear_mappers() 44 | 45 | 46 | def main(): 47 | db = dbinterface.DBInterface() 48 | session = Session(bind=db.engine) 49 | league_year = Config.get_property("league_year") 50 | 51 | regression = lm.main(db, session) 52 | sched_tbl = db.get_table_mappings("sched_{}".format(league_year)) 53 | 54 | if not db.table_exists("pred"): 55 | # Returns a data manipulator class 56 | sample = get.sample_prediction(db, session, ref_tbl=sched_tbl, model=regression) 57 | create_prediction_table(db, sample, "game_pred_{}".format(league_year)) 58 | 59 | 60 | 61 | if __name__ == "__main__": 62 | main() -------------------------------------------------------------------------------- /NBApredict/database/getters.py: -------------------------------------------------------------------------------- 1 | """ 2 | getters contains functions which may be commonly used to get 
certain subsets of data, data transformations, or data 3 | summaries 4 | """ 5 | 6 | from datetime import timedelta 7 | import pandas as pd 8 | 9 | 10 | def get_games_on_day(schedule, session, date): 11 | """Return the games from schedule on the specified date 12 | 13 | Args: 14 | schedule: A mapped table object containing a schedule of games 15 | session: An instantiated session object 16 | date: The date to check for games 17 | """ 18 | next_day = date + timedelta(days=1) 19 | return session.query(schedule).filter(schedule.start_time > date, schedule.start_time < next_day) 20 | 21 | 22 | def get_first_game_time_on_day(schedule, session, date): 23 | """Return the first game game time on the specified date 24 | 25 | Args: 26 | schedule: A mapped table object containing a schedule of games 27 | session: An instantiated session object 28 | date: The date to check for games 29 | """ 30 | games_on_day = get_games_on_day(schedule, session, date).subquery() 31 | first_game = session.query(games_on_day).order_by(games_on_day.c.start_time).first() 32 | if first_game: 33 | first_game_time = first_game[1] 34 | return first_game_time 35 | else: 36 | return None 37 | 38 | 39 | def get_spreads_for_date(odds_table, session, date): 40 | """Return the spreads from the odds_table that correspond to the games 41 | 42 | Args: 43 | odds_table: Sqlalchemy table object that contains odds 44 | session: Sqlalchemy session object 45 | date: Date to extract odds for 46 | """ 47 | next_day = date + timedelta(days=1) 48 | query = session.query(odds_table.start_time, odds_table.home_team, odds_table.away_team, odds_table.spread). 
\ 49 | filter(odds_table.start_time > date, odds_table.start_time < next_day) 50 | 51 | return query 52 | 53 | 54 | def get_pandas_df_from_table(database, session, tbl_name, qualifiers=False): 55 | """Convert the specified table into a pandas dataframe, modify it according to qualifiers, and return the result 56 | 57 | Args: 58 | database: An instantiated DBInterface class from dbinterface.py 59 | session: SQLalchemy session object 60 | tbl_name: name of the desired table 61 | qualifiers: A list of columns or a function to filter rows by 62 | """ 63 | tbl = database.get_table_mappings(tbl_name) 64 | query = session.query(tbl) 65 | if qualifiers: 66 | return pd.read_sql(query.statement, query.session.bind)[qualifiers] 67 | else: 68 | return pd.read_sql(query.statement, query.session.bind) 69 | -------------------------------------------------------------------------------- /NBApredict/br_web_scraper/parsers/box_scores.py: -------------------------------------------------------------------------------- 1 | from lxml import html 2 | 3 | from nbapredict.helpers.br_references import Location, Outcome, TEAM_ABBREVIATIONS_TO_TEAM 4 | 5 | 6 | def parse_location(symbol): 7 | if symbol == "@": 8 | return Location.AWAY 9 | elif symbol == "": 10 | return Location.HOME 11 | raise ValueError("Unknown symbol: {symbol}".format(symbol=symbol)) 12 | 13 | 14 | def parse_outcome(symbol): 15 | if symbol == "W": 16 | return Outcome.WIN 17 | elif symbol == "L": 18 | return Outcome.LOSS 19 | raise ValueError("Unknown symbol: {symbol}".format(symbol=symbol)) 20 | 21 | 22 | def parse_seconds_played(formatted_playing_time): 23 | if formatted_playing_time == "": 24 | return 0 25 | 26 | # It seems like basketball reference formats everything in MM:SS 27 | # even when the playing time is greater than 59 minutes, 59 seconds. 28 | # 29 | # Because of this, we can't use strptime / %M as valid values are 0-59. 
30 | # So have to parse time by splitting on ":" and assuming that 31 | # the first part is the minute part and the second part is the seconds part 32 | time_parts = formatted_playing_time.split(":") 33 | minutes_played = time_parts[0] 34 | seconds_played = time_parts[1] 35 | return 60 * int(minutes_played) + int(seconds_played) 36 | 37 | 38 | def parse_player_box_score(row): 39 | return { 40 | "name": str(row[1].text_content()), 41 | "team": TEAM_ABBREVIATIONS_TO_TEAM[row[2].text_content()], 42 | "location": parse_location(row[3].text_content()), 43 | "opponent": TEAM_ABBREVIATIONS_TO_TEAM[row[4].text_content()], 44 | "outcome": parse_outcome(row[5].text_content()), 45 | "seconds_played": int(parse_seconds_played(row[6].text_content())), 46 | "made_field_goals": int(row[7].text_content()), 47 | "attempted_field_goals": int(row[8].text_content()), 48 | "made_three_point_field_goals": int(row[10].text_content()), 49 | "attempted_three_point_field_goals": int(row[11].text_content()), 50 | "made_free_throws": int(row[13].text_content()), 51 | "attempted_free_throws": int(row[14].text_content()), 52 | "offensive_rebounds": int(row[16].text_content()), 53 | "defensive_rebounds": int(row[17].text_content()), 54 | "assists": int(row[19].text_content()), 55 | "steals": int(row[20].text_content()), 56 | "blocks": int(row[21].text_content()), 57 | "turnovers": int(row[22].text_content()), 58 | "personal_fouls": int(row[23].text_content()), 59 | "game_score": float(row[25].text_content()), 60 | } 61 | 62 | 63 | def parse_player_box_scores(page): 64 | tree = html.fromstring(page) 65 | rows = tree.xpath('//table[@id="stats"]//tbody/tr[not(contains(@class, "thead"))]') 66 | return list(map(lambda row: parse_player_box_score(row), rows)) 67 | -------------------------------------------------------------------------------- /NBApredict/helpers/json.py: -------------------------------------------------------------------------------- 1 | """ 2 | JSON interaction class and functions. 
3 | 4 | Created for a use-case which is no longer needed. This module is not used in the project. 5 | """ 6 | import copy 7 | import json 8 | import os 9 | import yaml 10 | 11 | # Local imports 12 | from nbapredict.helpers import type 13 | 14 | 15 | class JsonFile: 16 | """A class to handle JSON functionality such as load, create, add, and drop""" 17 | def __init__(self, json_file): 18 | self.path = json_file 19 | if os.path.isfile(self.path): 20 | return 21 | else: # Create a blank JSON if the file does not already exist 22 | self.create_json() 23 | 24 | def add_objects(self, objects_dict): 25 | """Adds a new object or objects to an existing json file 26 | 27 | To-do: 28 | Currently rewrites the entire file which could be a performance issue. To change, make so that the json file 29 | endings are removed, a comma inserted, and then re-insert the ending (or something like that)""" 30 | 31 | data = self.load_json() 32 | modified_data = copy.deepcopy(data) 33 | try: 34 | for key, value in objects_dict.items(): 35 | if type.is_python_type(value): # Creates a yaml representation of python types 36 | value = yaml.dump(value) 37 | modified_data[key] = value 38 | self.create_json(modified_data) 39 | except (TypeError, json.decoder.JSONDecodeError): # Rewrite the initial JSON if an error is encountered 40 | self.create_json(data) 41 | raise Exception("Could not add object to JSON. 
Json restored to previous format") 42 | 43 | def remove_objects(self, keys): 44 | """Removes the specified object or objects from the json_file as specified by keys""" 45 | with open(self.path, encoding='utf-8') as data_file: 46 | data = json.loads(data_file.read()) 47 | 48 | changed_data = data 49 | if isinstance(keys, str): 50 | del changed_data[keys] 51 | else: 52 | for key in keys: 53 | del changed_data[key] 54 | try: 55 | self.create_json(changed_data) 56 | except TypeError: 57 | self.create_json(data) 58 | 59 | def create_json(self, object_dict=None): 60 | """Creates a json to store the specified objects""" 61 | if object_dict: 62 | with open(self.path, 'w') as fp: 63 | json.dump(object_dict, fp, sort_keys=True, indent=4) 64 | else: 65 | with open(self.path, 'w') as fp: 66 | json.dump({}, fp, sort_keys=True, indent=4) 67 | 68 | def check_for_object(self, object_key): 69 | json_keys = self.load_json().keys() 70 | if object_key in json_keys: 71 | return True 72 | else: 73 | return False 74 | 75 | def load_json(self): 76 | with open(self.path, "r") as file: 77 | python_object = json.load(file) 78 | return python_object 79 | -------------------------------------------------------------------------------- /NBApredict/br_web_scraper/data.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class Location(Enum): 5 | HOME = "HOME" 6 | AWAY = "AWAY" 7 | 8 | 9 | class Outcome(Enum): 10 | WIN = "WIN" 11 | LOSS = "LOSS" 12 | 13 | 14 | class Team(Enum): 15 | ATLANTA_HAWKS = "ATLANTA HAWKS" 16 | BOSTON_CELTICS = "BOSTON CELTICS" 17 | BROOKLYN_NETS = "BROOKLYN NETS" 18 | CHARLOTTE_HORNETS = "CHARLOTTE HORNETS" 19 | CHICAGO_BULLS = "CHICAGO BULLS" 20 | CLEVELAND_CAVALIERS = "CLEVELAND CAVALIERS" 21 | DALLAS_MAVERICKS = "DALLAS MAVERICKS" 22 | DENVER_NUGGETS = "DENVER NUGGETS" 23 | DETROIT_PISTONS = "DETROIT PISTONS" 24 | GOLDEN_STATE_WARRIORS = "GOLDEN STATE WARRIORS" 25 | HOUSTON_ROCKETS = "HOUSTON 
ROCKETS" 26 | INDIANA_PACERS = "INDIANA PACERS" 27 | LOS_ANGELES_CLIPPERS = "LOS ANGELES CLIPPERS" 28 | LOS_ANGELES_LAKERS = "LOS ANGELES LAKERS" 29 | MEMPHIS_GRIZZLIES = "MEMPHIS GRIZZLIES" 30 | MIAMI_HEAT = "MIAMI HEAT" 31 | MILWAUKEE_BUCKS = "MILWAUKEE BUCKS" 32 | MINNESOTA_TIMBERWOLVES = "MINNESOTA TIMBERWOLVES" 33 | NEW_ORLEANS_PELICANS = "NEW ORLEANS PELICANS" 34 | NEW_YORK_KNICKS = "NEW YORK KNICKS" 35 | OKLAHOMA_CITY_THUNDER = "OKLAHOMA CITY THUNDER" 36 | ORLANDO_MAGIC = "ORLANDO MAGIC" 37 | PHILADELPHIA_76ERS = "PHILADELPHIA 76ERS" 38 | PHOENIX_SUNS = "PHOENIX SUNS" 39 | PORTLAND_TRAIL_BLAZERS = "PORTLAND TRAIL BLAZERS" 40 | SACRAMENTO_KINGS = "SACRAMENTO KINGS" 41 | SAN_ANTONIO_SPURS = "SAN ANTONIO SPURS" 42 | TORONTO_RAPTORS = "TORONTO RAPTORS" 43 | UTAH_JAZZ = "UTAH JAZZ" 44 | WASHINGTON_WIZARDS = "WASHINGTON WIZARDS" 45 | 46 | # DEPRECATED TEAMS 47 | CHARLOTTE_BOBCATS = "CHARLOTTE BOBCATS" 48 | NEW_JERSEY_NETS = "NEW JERSEY NETS" 49 | NEW_ORLEANS_HORNETS = "NEW ORLEANS HORNETS" 50 | NEW_ORLEANS_OKLAHOMA_CITY_HORNETS = "NEW ORLEANS/OKLAHOMA CITY HORNETS" 51 | SEATTLE_SUPERSONICS = "SEATTLE SUPERSONICS" 52 | VANCOUVER_GRIZZLIES = "VANCOUVER GRIZZLIES" 53 | 54 | 55 | class OutputType(Enum): 56 | JSON = "JSON" 57 | CSV = "CSV" 58 | 59 | 60 | class OutputWriteOption(Enum): 61 | WRITE = "w" 62 | CREATE_AND_WRITE = "w+" 63 | APPEND = "a" 64 | APPEND_AND_WRITE = "a+" 65 | 66 | 67 | class Position(Enum): 68 | POINT_GUARD = "POINT GUARD" 69 | SHOOTING_GUARD = "SHOOTING GUARD" 70 | SMALL_FORWARD = "SMALL FORWARD" 71 | POWER_FORWARD = "POWER FORWARD" 72 | CENTER = "CENTER" 73 | 74 | 75 | TEAM_ABBREVIATIONS_TO_TEAM = { 76 | 'ATL': Team.ATLANTA_HAWKS, 77 | 'BOS': Team.BOSTON_CELTICS, 78 | 'BRK': Team.BROOKLYN_NETS, 79 | 'CHI': Team.CHICAGO_BULLS, 80 | 'CHO': Team.CHARLOTTE_HORNETS, 81 | 'CLE': Team.CLEVELAND_CAVALIERS, 82 | 'DAL': Team.DALLAS_MAVERICKS, 83 | 'DEN': Team.DENVER_NUGGETS, 84 | 'DET': Team.DETROIT_PISTONS, 85 | 'GSW': Team.GOLDEN_STATE_WARRIORS, 86 | 
'HOU': Team.HOUSTON_ROCKETS, 87 | 'IND': Team.INDIANA_PACERS, 88 | 'LAC': Team.LOS_ANGELES_CLIPPERS, 89 | 'LAL': Team.LOS_ANGELES_LAKERS, 90 | 'MEM': Team.MEMPHIS_GRIZZLIES, 91 | 'MIA': Team.MIAMI_HEAT, 92 | 'MIL': Team.MILWAUKEE_BUCKS, 93 | 'MIN': Team.MINNESOTA_TIMBERWOLVES, 94 | 'NOP': Team.NEW_ORLEANS_PELICANS, 95 | 'NYK': Team.NEW_YORK_KNICKS, 96 | 'OKC': Team.OKLAHOMA_CITY_THUNDER, 97 | 'ORL': Team.ORLANDO_MAGIC, 98 | 'PHI': Team.PHILADELPHIA_76ERS, 99 | 'PHO': Team.PHOENIX_SUNS, 100 | 'POR': Team.PORTLAND_TRAIL_BLAZERS, 101 | 'SAC': Team.SACRAMENTO_KINGS, 102 | 'SAS': Team.SAN_ANTONIO_SPURS, 103 | 'TOR': Team.TORONTO_RAPTORS, 104 | 'UTA': Team.UTAH_JAZZ, 105 | 'WAS': Team.WASHINGTON_WIZARDS, 106 | 107 | # DEPRECATED TEAMS 108 | 'NJN': Team.NEW_JERSEY_NETS, 109 | 'NOH': Team.NEW_ORLEANS_HORNETS, 110 | 'NOK': Team.NEW_ORLEANS_OKLAHOMA_CITY_HORNETS, 111 | 'CHA': Team.CHARLOTTE_BOBCATS, 112 | 'CHH': Team.CHARLOTTE_HORNETS, 113 | 'SEA': Team.SEATTLE_SUPERSONICS, 114 | 'VAN': Team.VANCOUVER_GRIZZLIES, 115 | } 116 | 117 | POSITION_ABBREVIATIONS_TO_POSITION = { 118 | "PG": Position.POINT_GUARD, 119 | "SG": Position.SHOOTING_GUARD, 120 | "SF": Position.SMALL_FORWARD, 121 | "PF": Position.POWER_FORWARD, 122 | "C": Position.CENTER, 123 | } 124 | -------------------------------------------------------------------------------- /NBApredict/run/daily.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module runs the entire NBA_bet project process daily one hour before the first game time. 3 | 4 | It runs one hour before game times in order to capture the most up-to-date betting information. The project is meant to 5 | be run from the command line. Once running, debug information from the scheduler will be printed as well as notifying 6 | the user if a job has been successfully run. Terminate the process via a keyboard interrupt. 
def missed_job(event):
    """Attached to a Scheduler as a listener that schedules a replacement run when a job is missed.

    The replacement is a one-off cron job set for the minute following the current time. It relies on the
    module-level `scheduler` created in the `__main__` section of this file, so it only works when the module is
    run as a script.

    Args:
        event: The event passed in by the scheduler. It is not inspected.
    """
    print('The job was missed. Scheduling a new one to run in one minute')
    run_time = datetime_to_dict(datetime.now() + timedelta(minutes=1))
    # Give the replacement the same 60 second grace period as the jobs added at start-up. Without it the
    # scheduler's default grace period applies, which makes the replacement easier to miss again.
    scheduler.add_job(run_all, "cron", **run_time, misfire_grace_time=60)
    scheduler.print_jobs()
def set_type(values):
    """Convert string values to integers or floats if applicable. Otherwise, return strings.

    The first value decides which numeric type is attempted. If a later value does not fit that type, the list
    falls back to floats and then to strings instead of raising. If a string value has zero length, None is
    returned in its place.

    Args:
        values: A list of string values

    Returns:
        The input list of values modified to match their type. String is the default return value. If the values are
        ints or floats, returns the list formatted as a list of ints or floats. Empty values will be replaced with
        None. An empty list is returned unchanged.

    To-Do:
        1. Add functionality to coerce elements of lists and not just lists
    """
    if len(values) == 0:
        return []

    test_val = values[0]
    filled = [x for x in values if len(x) > 0]  # Empty strings become None, so they never block a conversion
    if is_int(test_val) and all(is_int(x) for x in filled):
        return _set_type(values, int)
    if (is_int(test_val) or is_float(test_val)) and all(is_float(x) for x in filled):
        return _set_type(values, float)
    return [x if len(x) > 0 else None for x in values]  # Set empty strings to None


def _set_type(values, new_type):
    """Transforms a list of values into the specified new type. If the value has zero length, returns None

    Args:
        values: A list of values
        new_type: A type class to modify the list to

    Returns:
        The values list modified to the new_type. If an element is empty, the element is set to None.
    """
    # Some values may have len(0); we convert them to None to put into sql db
    return [new_type(i) if len(i) > 0 else None for i in values]


def get_type(values):
    """Return the type of the values where type is defined as the modal type in the list.

    Args:
        values: A list or value to get the type for.

    Returns:
        The modal type of a list or the type of the element, as reported by _get_type. When several types are
        equally common, the one encountered first wins. An empty list returns None.

    To-Do:
        Modal type isn't a full proof method. Need to determine a better method.
    """
    from collections import Counter  # Local import so the module's import block does not need to change

    if isinstance(values, str):  # A string is a single value, not a collection of characters
        return _get_type(values)
    if hasattr(values, "__len__") and not isinstance(values, type):  # Sized objects, excluding classes like str
        val_types = [_get_type(i) for i in values]
        if not val_types:
            return None
        return Counter(val_types).most_common(1)[0][0]
    if isinstance(values, Enum):  # For enum objects, pass the value to the _get_type function
        return _get_type(values.value)
    return _get_type(values)


def _get_type(val):
    """Return a name for the type of the value.

    Args:
        val: A value to get the type of

    Returns:
        "bool", "integer", "float", "datetime", or "string" for values of those types, None when val is None, and
        val itself when val is one of the type classes accepted by is_python_type.

    Raises:
        Exception: If val is none of the above.
    """
    # bool must be tested before int: bool is a subclass of int, so True and False would be reported as integers
    if isinstance(val, bool):
        return "bool"
    elif isinstance(val, int):
        return "integer"
    elif isinstance(val, float):
        return "float"
    elif isinstance(val, datetime):
        return "datetime"
    elif isinstance(val, str):
        return "string"
    elif val is None:
        return None
    elif is_python_type(val):  # Handles types that are passed explicitly
        return val
    else:
        raise Exception("Val is not an int, float, datetime, string, Bool, or None")


def is_int(x):
    """Return true if X can be coerced to a integer. Otherwise, return false."""
    try:
        int(x)  # Will raise ValueError if '.2'; will not raise error if .2
        return True
    except (ValueError, TypeError, OverflowError):  # TypeError for None, OverflowError for an infinite float
        return False


def is_float(x):
    """Return true if X can be coerced to a float. Otherwise, return false."""
    try:
        float(x)
        return True
    except (ValueError, TypeError):  # TypeError for values such as None
        return False


def is_python_type(x):
    """Return true if X is one of the classes int, float, datetime, str, or bool, or is None."""
    # Identity comparison keeps this safe for objects that overload ==
    return any(x is candidate for candidate in (int, float, datetime, str, bool, None))
tuple(keys) 19 | 20 | branch = self.dict 21 | for key in keys: 22 | branch = branch[key] 23 | 24 | # If we return a branch, and not a leaf value, we wrap it into a NestedDict 25 | return NestedDict(branch).dict if isinstance(branch, dict) else branch 26 | 27 | def __setitem__(self, keys, value): 28 | # Allows setting top-level item when a single key was provided 29 | if not isinstance(keys, tuple): 30 | if len(keys) < 2: 31 | keys = (*keys,) 32 | else: 33 | keys = tuple(keys) 34 | 35 | branch = self.dict 36 | for key in keys[:-1]: 37 | if key not in branch: 38 | branch[key] = {} 39 | branch = branch[key] 40 | branch[keys[-1]] = value 41 | 42 | def __keys__(self, depth=0): 43 | """Does not yet function 44 | 45 | Notes on next steps in the __recurse_keys__ function 46 | """ 47 | keys = [[k] for k in self.dict.keys()] 48 | for k in keys: 49 | branch = self[k[0]] 50 | self.__recurse_keys__(key=k[0], branch=branch, key_list=k) 51 | branch = self[k[0]] 52 | if isinstance(branch, dict): 53 | if len(branch.keys()) > 1: 54 | b_keys = list(branch.keys()) 55 | length = len(b_keys) 56 | # Copies of k to append the keys in the last layer to. When multiple keys are in the last layer, 57 | # we need new tress to capture all key paths 58 | new_trees = [k for _ in range(length)] 59 | k.append(b_keys[0]) # Add the first key to the original tree 60 | for i in range(1, length): 61 | tree = new_trees[i] 62 | tree.append(b_keys[i]) 63 | keys.append(tree) 64 | else: 65 | k.append(list(branch.keys())[0]) 66 | while not isinstance(branch, dict): 67 | branch_keys = [[bk] for bk in branch.dict.keys()] 68 | 69 | return keys 70 | 71 | @staticmethod 72 | def __recurse_keys__(branch, key_list, depth=0): 73 | """Not Functional 74 | 75 | Waiting to finish this up. There's several issues. 76 | 1. You almost have to recurse through the tree of keys which can be a heavy computation 77 | 2. The leaf of a branch needs to be handled in a different manner than branches along the way. 
The leaf will 78 | be a list itself, and it needs to be reformatted when finished. 79 | 3. Finally, it may need a completely different implementation. Look at it with fresh eyes when you next work on 80 | it. """ 81 | b_keys = list(branch.keys()) 82 | if isinstance(branch, dict): 83 | if len(b_keys) > 1: 84 | length = len(b_keys) 85 | # Copies of k to append the keys in the last layer to. When multiple keys are in the last layer, 86 | # we need new tress to capture all key paths 87 | new_trees = [key_list for _ in range(1, length)] 88 | key_list.append(b_keys[0]) # Add the first key to the original tree 89 | print(key_list) 90 | combined_k_lists = [key_list] 91 | for i in range(length-1): 92 | tree = new_trees[i] 93 | print(tree) 94 | print(b_keys) 95 | tree.append(b_keys[i]) 96 | combined_k_lists.append(tree) 97 | return combined_k_lists 98 | else: 99 | return key_list.append(list(branch.keys())[0]) 100 | else: # we have a nested dict 101 | pass -------------------------------------------------------------------------------- /NBApredict/br_web_scraper/output.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | 4 | from nbapredict.helpers.br_references import OutputType, OutputWriteOption 5 | 6 | box_score_fieldname = [ 7 | "name", 8 | "team", 9 | "location", 10 | "opponent", 11 | "outcome", 12 | "seconds_played", 13 | "made_field_goals", 14 | "attempted_field_goals", 15 | "made_three_point_field_goals", 16 | "attempted_three_point_field_goals", 17 | "made_free_throws", 18 | "attempted_free_throws", 19 | "offensive_rebounds", 20 | "defensive_rebounds", 21 | "assists", 22 | "steals", 23 | "blocks", 24 | "turnovers", 25 | "personal_fouls", 26 | "game_score", 27 | ] 28 | 29 | game_fieldname = [ 30 | "start_time", 31 | "away_team", 32 | "away_team_score", 33 | "home_team", 34 | "home_team_score", 35 | ] 36 | 37 | default_json_options = { 38 | "sort_keys": True, 39 | "indent": 4, 40 | } 41 | 42 | 43 | 
def merge_two_dicts(first, second):
    """Return a copy of first updated with second. Values from second win on shared keys.

    Neither input is modified.
    """
    merged = first.copy()
    merged.update(second)
    return merged
row["made_three_point_field_goals"], 90 | "attempted_three_point_field_goals": row["attempted_three_point_field_goals"], 91 | "made_free_throws": row["made_free_throws"], 92 | "attempted_free_throws": row["attempted_free_throws"], 93 | "offensive_rebounds": row["offensive_rebounds"], 94 | "defensive_rebounds": row["defensive_rebounds"], 95 | "assists": row["assists"], 96 | "steals": row["steals"], 97 | "blocks": row["blocks"], 98 | "turnovers": row["turnovers"], 99 | "personal_fouls": row["personal_fouls"], 100 | "game_score": row["game_score"], 101 | } for row in rows 102 | ) 103 | 104 | 105 | def schedule_to_csv(rows, output_file_path, write_option): 106 | with open(output_file_path, write_option.value, newline="") as csv_file: 107 | writer = csv.DictWriter(csv_file, fieldnames=game_fieldname) 108 | writer.writeheader() 109 | writer.writerows( 110 | { 111 | "start_time": row["start_time"], 112 | "away_team": row["away_team"].value, 113 | "away_team_score": row["away_team_score"], 114 | "home_team": row["home_team"].value, 115 | "home_team_score": row["home_team_score"], 116 | } for row in rows 117 | ) 118 | -------------------------------------------------------------------------------- /NBApredict/br_web_scraper/parsers/schedule.py: -------------------------------------------------------------------------------- 1 | from lxml import html 2 | import datetime 3 | import pytz 4 | 5 | from nbapredict.helpers.br_references import Team 6 | 7 | TEAM_NAME_TO_TEAM = { 8 | member.value: member 9 | for (_, member) in Team.__members__.items() 10 | } 11 | 12 | TEAM_NAME_TO_TEAM["NEW ORLEANS/OKLAHOMA CITY HORNETS"] = Team.NEW_ORLEANS_OKLAHOMA_CITY_HORNETS 13 | 14 | 15 | def parse_start_time(formatted_date, formatted_time_of_day): 16 | if formatted_time_of_day is not None and formatted_time_of_day not in ["", " "]: 17 | # Starting in 2018, the start times had a "p" or "a" appended to the end 18 | # Between 2001 and 2017, the start times had a "pm" or "am" 19 | # 20 | # 
def current_time():
    """Return the current time as a timezone-aware datetime in UTC."""
    # Ask for the time in UTC directly. Labelling the machine's naive local time as US/Eastern only gives the
    # right moment when the machine itself runs on Eastern time.
    return datetime.datetime.now(tz=pytz.utc)
def parse_schedule(page):
    """Parse every game in the schedule table of a page.

    Args:
        page: The HTML content of a schedule page

    Returns:
        A list with one dictionary per game, as built by parse_game, in the order the rows appear in the table
    """
    tree = html.fromstring(page)
    rows = tree.xpath('//table[@id="schedule"]//tbody/tr')
    # A row whose whole text is "Playoffs" arises when games switch from regular season to playoffs. It holds no
    # game, so it is skipped. parse_game works out the start time itself, so nothing else is parsed here.
    return [parse_game(row) for row in rows if row.text_content() != "Playoffs"]
3 | 4 | By default, it scrapes miscellaneous stats from 2019. Alternate years and tables may be scraped though functionality is 5 | not yet guaranteed. The scraped tables are written to the specified database. 6 | 7 | ToDo: 8 | 1. Create a method for stripping extraneous characters from team-names. If querying a historical season (<2001), 9 | the teams that made the playoffs have a '*' appended that we want to strip from the team-name 10 | """ 11 | 12 | from bs4 import BeautifulSoup # Requires lxml to be installed as well 13 | from datetime import datetime 14 | import re 15 | import requests 16 | 17 | # Local imports. 18 | from nbapredict.configuration import Config 19 | from nbapredict.helpers.br_references import BASE_URL 20 | from nbapredict.helpers.br_references import data_stat_headers as headers 21 | from nbapredict.helpers import type 22 | 23 | 24 | def team_statistics(tbl_name): 25 | """Build a URL for the specified year and return team statistics for the specified table on that page. 26 | 27 | Performance not guaranteed for tables that are not "misc_stats" 28 | 29 | Args: 30 | tbl_name: The name of the table to be returned 31 | 32 | Returns: 33 | A dictionary version of the specified table. Keys are column titles that return lists ordered by team. 
def parse_table(page, tbl_name, scrape_time):
    """Parse the specified table on the specified page and return the data as a dictionary

    Args:
        page: The contents from a url response, as bytes or as a string
        tbl_name: the desired table to be parsed
        scrape_time: The time the page was scraped. It is repeated once per row under the 'scrape_time' key

    Returns:
        A dictionary version of the specified table. Keys are column titles that return lists ordered by team.

    Raises:
        ValueError: If no table with the id tbl_name exists on the page
    """
    # Decode bytes rather than calling str() on them, which would parse the "b'...'" representation of the bytes
    text = page.decode("utf-8", errors="replace") if isinstance(page, bytes) else str(page)

    # Remove the comment delimiters but keep what they enclose, so tables placed inside HTML comments become
    # visible to the parser. An empty pattern would leave the page untouched.
    cleaned_soup = BeautifulSoup(re.sub(r"<!--|-->", "", text), features="lxml")
    table = cleaned_soup.find('table', {'id': '{}'.format(tbl_name)})
    if table is None:
        raise ValueError("Could not find a table with id '{}' on the page".format(tbl_name))

    data_dict = get_data_dict_from_tbl(table)
    for key in data_dict:
        data_dict[key] = type.set_type(data_dict[key])  # `type` is the project's helpers.type module

    # Add a scrape time for each row in the dictionary. Every column holds one entry per row, so any column gives
    # the row count; an empty table gives zero rows.
    row_count = len(next(iter(data_dict.values()), []))
    data_dict['scrape_time'] = [scrape_time for _ in range(row_count)]
    return data_dict
def clean_team_name(team_names):
    """Return a new list of team names in upper case with every character removed that is not a letter, a digit,
    or whitespace

    Args:
        team_names: a list of team names to clean
    """
    def _keep(char):
        return char.isalpha() or char.isspace() or char.isdigit()

    return ["".join(filter(_keep, name)).upper() for name in team_names]
def main(db):
    """Scrape teams, team stats, the schedule and odds, and load them into ``db``.

    Tables are suffixed with the configured league year (``teams_<year>``,
    ``team_stats_<year>``, ``schedule_<year>``, ``odds_<year>``). A table is
    created and filled on the first run; later runs insert or update rows.

    Args:
        db: A datatotable ``Database`` exposing ``engine``, ``table_exists``
            and ``table_mappings``.
    """
    year = Config.get_property("league_year")
    session = nbapredict.management.Session(bind=db.engine)
    # try/finally guarantees the session is released even if a scrape or a
    # database step raises part-way through the run.
    try:
        # ~~~~~~~~~~~~~
        # Teams
        # ~~~~~~~~~~~~~
        team_dict = team_scraper.scrape()
        teams_data = DataOperator({"team_name": team_dict["team_name"]})
        teams_tbl_name = "teams_{}".format(year)
        if not db.table_exists(teams_tbl_name):
            teams.create_team_table(db=db, teams_data=teams_data, tbl_name=teams_tbl_name)
            teams_tbl = db.table_mappings[teams_tbl_name]
            session.add_all([teams_tbl(**row) for row in teams_data.rows])
            session.commit()
            del teams_tbl

        # ~~~~~~~~~~~~~
        # Team Stats
        # ~~~~~~~~~~~~~
        team_stats_tbl_name = "team_stats_{}".format(year)
        teams_tbl = db.table_mappings[teams_tbl_name]
        team_dict['team_id'] = team_dict.pop('team_name')
        team_dict['team_id'] = convert.values_to_foreign_key(session=session, foreign_tbl=teams_tbl,
                                                             foreign_key="id", foreign_value="team_name",
                                                             child_data=team_dict['team_id'])
        # When team_stats_tbl is created, the teams_tbl automap object is changed. The changed format does
        # not follow the expected behavior of an automapped table. I suspect this is because a relationship
        # is established. If we reloaded, teams_tbl works fine. Therefore, delete the variable here for now.
        del teams_tbl
        team_dict['scrape_date'] = [datetime.date(s_time) for s_time in team_dict['scrape_time']]
        team_stats_data = DataOperator(team_dict)
        if not db.table_exists(team_stats_tbl_name):
            team_stats.create_table(db=db, team_stats_data=team_stats_data, tbl_name=team_stats_tbl_name)
            team_stats_tbl = db.table_mappings[team_stats_tbl_name]
            session.add_all([team_stats_tbl(**row) for row in team_stats_data.rows])
            session.commit()
        else:
            team_stats_tbl = db.table_mappings[team_stats_tbl_name]
            team_stats.insert(session, team_stats_tbl, team_stats_data)

        # ~~~~~~~~~~~~~
        # Schedule
        # ~~~~~~~~~~~~~
        schedule_dict = season_scraper.scrape()
        schedule_data = DataOperator(schedule_dict)
        teams_tbl = db.table_mappings['teams_{}'.format(year)]
        schedule_data = schedule.format_data(session=session, schedule_data=schedule_data,
                                             team_tbl=teams_tbl, team_stats_tbl=team_stats_tbl)
        schedule_tbl_name = "schedule_{}".format(year)
        if not db.table_exists(schedule_tbl_name):
            schedule.create_table(db, schedule_data, schedule_tbl_name, teams_tbl, team_stats_tbl)
            schedule_tbl = db.table_mappings[schedule_tbl_name]
            session.add_all([schedule_tbl(**row) for row in schedule_data.rows])
            session.commit()
        else:
            schedule_tbl = db.table_mappings[schedule_tbl_name]
            update_rows = schedule.update_table(session, schedule_data, schedule_tbl, team_stats_tbl)
            session.add_all(update_rows)
            session.commit()

        # ~~~~~~~~~~~~~
        # Odds
        # ~~~~~~~~~~~~~
        odds_dict = line_scraper.scrape()
        odds_data = None
        if odds_dict:
            odds_dict = odds.format_data(session, odds_dict, teams_tbl, schedule_tbl)
            odds_data = DataOperator(odds_dict)
        # Evaluate if you have the correct columns in odds_data (i.e. home/away team id's)
        odds_tbl_name = "odds_{}".format(year)
        if not db.table_exists(odds_tbl_name) and odds_data:
            odds.create_table(db, odds_tbl_name, odds_data, schedule_tbl)
            odds_tbl = db.table_mappings[odds_tbl_name]
            session.add_all(odds_tbl(**row) for row in odds_data.rows)
            session.commit()
        elif odds_data:
            odds_tbl = db.table_mappings[odds_tbl_name]
            session.add_all(odds_tbl(**row) for row in odds_data.rows)
            session.commit()
            odds.update_table(session, odds_tbl, odds_data)
            session.commit()
            odds.delete(session, odds_tbl)
            # The deletions are only staged on the session; without this commit
            # they are discarded when the session is closed.
            session.commit()
    finally:
        session.close()
Here, we presume we don't have >999 unique 23 | # values in child_data. Rather, presume we have < 999 unique values and take a set of the data. 24 | set_data = set() 25 | if len(child_data) > 999: 26 | set_data = set(child_data) 27 | if type(foreign_tbl) == sqlalchemy.sql.selectable.Alias: 28 | conversion_dict = _values_to_foreign_key(session, foreign_tbl, foreign_key, foreign_value, 29 | set_data or child_data) 30 | return [conversion_dict[i] for i in child_data] 31 | else: 32 | key_column = [getattr(foreign_tbl, foreign_key)] 33 | if isinstance(child_data, dict): 34 | composite_fd = True # Composite functional dependency, two+ columns required to identify unique key 35 | value_columns = [getattr(foreign_tbl, val) for val in child_data.keys()] 36 | keys = list(child_data.keys()) 37 | filters = [value_columns[i].in_(child_data[keys[i]]) for i in range(len(keys))] 38 | else: 39 | composite_fd = False 40 | value_columns = [getattr(foreign_tbl, foreign_value)] 41 | filters = [value_columns[0].in_(set_data or child_data)] 42 | 43 | rows = session.query(*key_column, *value_columns).distinct().filter(*filters).all() 44 | 45 | if composite_fd: 46 | nested_conversion_dict = NestedDict() 47 | for r in rows: 48 | # multi-valued key with the foreign key as the value 49 | nested_conversion_dict[[col for col in r[1:]]] = r[0] 50 | 51 | # Generate a list of lists with the values in each row of child data 52 | # These values form keys for the foreign keys stored in the nested_conversion_dict which is returned 53 | conversion_keys = [] 54 | length = len(child_data[list(child_data.keys())[0]]) 55 | for i in range(length): 56 | conversion_keys.append([child_data[k][i] for k in child_data.keys()]) 57 | return [nested_conversion_dict[k] for k in conversion_keys] 58 | else: 59 | conversion_dict = {getattr(row, foreign_value): getattr(row, foreign_key) for row in rows} 60 | return [conversion_dict[i] for i in child_data] 61 | 62 | 63 | def _values_to_foreign_key(session, 
foreign_subquery, foreign_key, foreign_value, child_data): 64 | """Return values from child data that exist in the foreign_subquery transformed into foreign key values 65 | 66 | This function performs the same query as values_to_foreign_key() except it can take a subquery, which has 67 | different syntax, as input rather than a table. The function presumes child_data has already been modified if 68 | necessary. NOTE: this does not support multi-column conversions of child_data to foreign key. 69 | 70 | Args: 71 | foreign_subquery: A subquery which is an Alias class in sqlalchemy. These classes are created when subquery() 72 | is appended to a sqlalchemy query statement 73 | foreign_key: The name of the column containing foreign key values 74 | foreign_value: The name of the column containing values to match with child data 75 | child_data: A list of data with values contained in foreign value 76 | 77 | Returns: 78 | A conversion dict that maps child_data to foreign keys 79 | """ 80 | rows = session.query(getattr(foreign_subquery.c, foreign_key), getattr(foreign_subquery.c, foreign_value)). \ 81 | filter(getattr(foreign_subquery.c, foreign_value).in_(child_data)).all() 82 | conversion_dict = {getattr(row, foreign_value): getattr(row, foreign_key) for row in rows} 83 | return conversion_dict 84 | 85 | 86 | def convert_sql_statement_to_table(session, sql_statement, qualifiers=False): 87 | """Convert the specified table into a pandas dataframe, modify it according to qualifiers, and return the result 88 | 89 | Args: 90 | session: SQLalchemy session object 91 | sql_statement: A sql_statement. 
def pred_vs_actual(predictions, target, r_squared, out_path=None):
    """Create and return a scatter plot of a model's predictions versus the target variable.

    A red 1:1 reference line is drawn across the integer span of the predictions,
    and the r_squared value is printed in a box on the axes.

    Args:
        predictions: The predictions from a regression
        target: The target variable of a regression
        r_squared: The r_squared of a regression
        out_path: An optional path to save the graph to
    Returns:
        The predicted vs. actual graph
    """
    # Integer coordinates for the 1:1 line, padded by one on each side
    lower = int(predictions.min()) - 1
    upper = int(predictions.max()) + 1
    diagonal_x = list(range(lower, upper))
    diagonal_y = list(diagonal_x)

    figure, axes = plt.subplots()
    axes.scatter(predictions, target)
    axes.set_title("Predicted vs. Actual")
    axes.set_xlabel("Predicted")
    axes.set_ylabel("Actual")
    axes.axhline(0, c="k", linewidth=0.25)
    axes.plot(diagonal_x, diagonal_y, c="r")
    label = "R^2 = {}".format(r_squared)
    axes.text(0.1, 0.9, label, transform=axes.transAxes, bbox=dict(fill=False))

    if out_path:
        figure.savefig(fname=out_path)
    return figure
def cooks_distance(cooks_d, out_path=None):
    """Create and return a stem plot of Cook's distance, one stem per value.

    Args:
        cooks_d: Cook's distance from a regression
        out_path: optional path to save the figure to
    Returns:
        The cook's distance graph
    """
    figure, axes = plt.subplots()
    positions = np.arange(len(cooks_d))
    axes.stem(positions, cooks_d)
    axes.set_title("Cook's Distance")
    axes.set_xlabel("Residuals")
    axes.set_ylabel("Cook's Distance")
    if out_path:
        figure.savefig(out_path)
    return figure
def align_xaxis(ax1, v1, ax2, v2):
    """Shift the x-limits of ax2 so that x=v2 on ax2 lines up with x=v1 on ax1.

    Both values are converted to display coordinates, the horizontal gap between
    them is converted back into ax2 data units, and ax2's x-limits are shifted
    by that amount. The width of ax2's x-range is left unchanged.

    Args:
        ax1: The reference axes
        v1: The x value on ax1 to align to
        ax2: The axes whose x-limits are adjusted
        v2: The x value on ax2 that should coincide with v1
    """
    # Use the x component of each transformed point; the previous version read
    # the y component, which made the shift zero on axes sharing a y-axis.
    x1, _ = ax1.transData.transform((v1, 0))
    x2, _ = ax2.transData.transform((v2, 0))
    inv = ax2.transData.inverted()
    dx, _ = inv.transform((0, 0)) - inv.transform((x1 - x2, 0))
    minx, maxx = ax2.get_xlim()
    ax2.set_xlim(minx + dx, maxx + dx)


def residuals_yellowbrick(predictors, target):
    """Return a fitted yellowbrick residuals vs. fitted plot with a histogram.

    The visualizer needs an estimator *instance*. Passing the LinearRegression
    class itself made the later ``fit`` call fail with a missing-argument
    TypeError, so the estimator is instantiated here.

    Args:
        predictors: The predictor variables of the regression
        target: The target variable of the regression

    Returns:
        The fitted ResidualsPlot visualizer
    """
    lm = LinearRegression()
    visualizer = ResidualsPlot(lm)
    visualizer.fit(predictors, target)
    return visualizer
def br_enum_to_string(season):
    """Return a copy of season in which every enum member is replaced by its value.

    Only values that are Enum members are unwrapped. Every other value (strings,
    numbers, datetimes, None, ...) is copied unchanged, so a missing field no
    longer raises AttributeError on ``.value``.

    Args:
        season: A season as defined by basketball_reference_web_scraper; an iterable of game dictionaries

    Returns:
        A new list of game dictionaries where any enums are replaced by their values
    """
    from enum import Enum  # local import keeps this fix self-contained

    return [
        {key: (value.value if isinstance(value, Enum) else value) for key, value in game.items()}
        for game in season
    ]
def update_season_table(session, sched_tbl, season_df):
    """Update schedule rows that started before today but still have a home score of 0.

    Changes are added to the session and need to be committed later.
    Rows with no matching game in season_df are deleted from the session; this
    removes scheduled playoff games that were never played.

    Note: the ``start_time`` column of season_df is replaced in place by its
    timezone-naive version.

    Args:
        session: A SQLalchemy session object
        sched_tbl: A mapped table that holds the schedule
        season_df: A pandas Dataframe version of the season as returned from br_web_scraper
    """
    today = datetime.date(datetime.now())
    pending_rows = session.query(sched_tbl) \
        .filter(sched_tbl.start_time < today, sched_tbl.home_team_score == 0) \
        .order_by(sched_tbl.start_time) \
        .all()
    if not pending_rows:
        # Season is up to date; nothing to update.
        return

    # The query is ordered by start_time, so the ends of the list bound the window
    first_game_time = pending_rows[0].start_time
    last_game_time = pending_rows[-1].start_time

    # Reduce season to games between first and last game time
    season_df["start_time"] = season_df["start_time"].dt.tz_localize(None)
    in_window = (season_df.start_time >= first_game_time) & (season_df.start_time <= last_game_time)
    update_df = season_df.loc[in_window]

    for row in pending_rows:
        game = update_df.loc[(update_df.home_team == row.home_team) & (update_df.away_team == row.away_team) &
                             (update_df.start_time.dt.date == row.start_time.date())]
        if len(game) == 0:
            # This catches playoff games which do not end up happening (i.e. a game 7 in a series a team
            # sweeps), and removes it from the database
            session.delete(row)
        else:
            # Read scalars with .iloc[0]: int() on a whole Series is deprecated
            # and fails outright if more than one row matches.
            row.home_team_score = int(game["home_team_score"].iloc[0])
            row.away_team_score = int(game["away_team_score"].iloc[0])
            row.start_time = game["start_time"].iloc[0].to_pydatetime()  # Pandas Timestamp -> datetime
            session.add(row)
class DataOperator:
    """DataOperator takes scraped data in init, and uses its member functions to return manipulations of that data"""

    def __init__(self, data):
        """Stores the data dictionary passed to it

        Args:
            data: A dictionary of data which will, usually, reflect data scraped from a website. Two dictionary
            formats are accepted. First, data may hold column names with data values formatted as:
                data[col1] = [val1, val2, ...]
                data[col2] = [val1, val2, ...]
            Second, data may be a list of rows formatted as:
                data[0] = {col1: val0, col2: val0, colx: val0}
                data[x] = {col1: valx, col2: valx, colx: valx}
        """
        self.data = data
        self.rows = None  # populated by dict_to_rows()

    def get_sql_type(self):
        """Take the object's data and return a dictionary formatted as {key: SQLtype}.

        Returns:
            A dictionary with the same keys as the data. The dictionary's values are the sql_types of each
            key:value pair. The sql_types are defined to function with SQLalchemy as column definitions.
        """
        py_types = self._get_py_type()  # py_types is a dict
        return self._py_type_to_sql_type(py_types)

    def _get_py_type(self):
        """Take the class's data values and return a dictionary that holds the python type for the values.

        For a list of row dictionaries only the first row is inspected. An empty list, or a list whose first
        element is not a dictionary, yields an empty dictionary.

        Returns:
            A dictionary formatted as key:py_type as reported by the type helper's get_type()

        Raises:
            Exception: If the data is neither a dictionary nor a list
        """
        if isinstance(self.data, dict):
            sample = self.data
        elif isinstance(self.data, list):
            if not self.data or not isinstance(self.data[0], dict):
                return {}
            sample = self.data[0]
        else:
            # `type` is the imported helper module in this file, not the builtin, so calling type(...) here
            # would itself fail. Read the class from the object instead.
            raise Exception("The data structure ({}) is not handled by _get_py_type".format(self.data.__class__))
        return {key: type.get_type(sample[key]) for key in sample}

    @staticmethod
    def _py_type_to_sql_type(py_types):
        """Convert and return a dictionary of python types to a dictionary of sql types.

        Keys whose python type is None are skipped so that no column is created for null values.

        Raises:
            An exception if a py_type is not an integer, float, string, datetime, bool, or none
        """
        sql_types = dict()
        for key, py_type in py_types.items():
            if py_type == "integer" or py_type is int:
                sql_types[key] = Integer
            elif py_type == "float" or py_type is float:
                sql_types[key] = Float
            elif py_type == "string" or py_type is str:
                sql_types[key] = String
            elif py_type == "datetime" or py_type is datetime:
                sql_types[key] = DateTime
            elif py_type == "bool" or py_type is bool:
                sql_types[key] = Boolean
            elif py_type is None:
                continue  # We continue here so as to not create a column for null values
            else:
                raise Exception("Error: py_type {} is not an integer, float, datetime,"
                                " none, or string".format(py_type))
        return sql_types

    # Table modification functions
    def dict_to_rows(self):
        """Convert and return class data into rows compatible with sqlalchemy's insert function

        Currently presumes each dictionary value is a list of equivalent length. The result is also stored on
        self.rows. Does not yet function with lists.

        Returns:
            a list of row dictionaries compatible with SQLalchemy's insert

        Raise:
            Exception: If the input is a list (not implemented) or is neither a list nor dictionary
        """
        if isinstance(self.data, dict):
            self.rows = self._dict_to_rows()
            return self.rows
        elif isinstance(self.data, list):
            self.rows = self._list_to_rows()
            return self.rows
        else:
            raise Exception("tbl is neither a list or dictionary, and cannot be handled")

    def _dict_to_rows(self):
        """Convert and return an input dictionary into rows compatible with SQLalchemy

        An empty dictionary produces an empty list of rows.
        """
        keys = list(self.data)
        if not keys:
            return []
        # The length of the data should be checked outside the function to ensure each value is an equal
        # length object
        length = len(self.data[keys[0]])
        return [{key: self.data[key][i] for key in keys} for i in range(length)]

    def _list_to_rows(self):
        """Not yet functional

        To-do:
            Implement functionality for transforming lists into database rows"""

        raise Exception("tbl is a list. Function to convert lists into database rows is not implemented")

    def validate_data_length(self):
        """Given a dictionary where keys references lists, check that all lists are the same length, and return T or F

        Returns:
            True: if all the lists in the dictionary have the same length
            False: if the dictionary's lists are of different lengths (or the dictionary is empty)
        """
        return len({len(values) for values in self.data.values()}) == 1
def check_gametimes(session, schedule_tbl, odds_dict):
    """Check and, if necessary, change game times in the odds_dict

    Some games in Bovada do not have the same time as those in the official schedule. For example a Bovada game may
    start at 9:05 whereas the official game time is 9:00. Each odds start time that is missing from the schedule
    is replaced by the schedule time found one known offset later or earlier. Times with no match are left
    unchanged.

    Args:
        session: A SQLalchemy session bound to the db
        schedule_tbl: A mapped schedule table
        odds_dict: A dictionary with 'start_time' and 'home_team_id' entries; 'start_time' is modified in place

    Returns:
        The odds_dict with corrected start times
    """
    s_times = odds_dict['start_time']
    if len(s_times) == 0:
        # Nothing to reconcile; min()/max() below would raise on an empty sequence
        return odds_dict

    # Only look at scheduled games in a window around the scraped start times
    first_gametime = min(s_times) - timedelta(hours=12)
    last_gametime = max(s_times) + timedelta(days=1)
    sched_rows = session.query(schedule_tbl.start_time).filter(
        schedule_tbl.home_team_id.in_(odds_dict['home_team_id']),
        schedule_tbl.start_time >= first_gametime,
        schedule_tbl.start_time <= last_gametime).all()
    sched_times = {row.start_time for row in sched_rows}  # set for constant-time membership tests

    offsets = [timedelta(minutes=5)]  # Append more offsets here if they arise in the future

    # Check if the unmatched times +/- an offset exist in the schedule times
    for idx, s_time in enumerate(s_times):
        if s_time in sched_times:
            continue
        for offset in offsets:
            if (s_time + offset) in sched_times:
                s_times[idx] = s_time + offset
                break
            if (s_time - offset) in sched_times:
                s_times[idx] = s_time - offset
                break

    return odds_dict
def update_lines(session, odds_tbl, odds_data):
    """Fill in betting data on odds_tbl rows that have a null price or moneyline.

    Only rows whose game_id appears in odds_data are considered. For each such row the bet columns are compared
    with odds_data's dataframe (NaN is treated as None), and differing values are written onto the row.

    Returns:
        A list of the rows that were modified (empty if nothing changed)
    """
    game_ids = odds_data.data['game_id']
    # `== None` is deliberate: on a mapped column it builds the SQL null comparison
    missing_bet_data = or_(odds_tbl.home_spread_price == None, odds_tbl.away_spread_price == None,
                           odds_tbl.home_moneyline == None, odds_tbl.away_moneyline == None)
    candidates = session.query(odds_tbl).filter(missing_bet_data & odds_tbl.game_id.in_(game_ids))
    if not candidates.count() > 0:
        return []

    stored_rows = candidates.all()
    scraped_df = odds_data.dataframe
    bet_cols = ['home_spread_price', 'away_spread_price', 'home_moneyline', 'away_moneyline', 'spread']
    changed_rows = []
    for stored in stored_rows:
        scraped = scraped_df[scraped_df['game_id'] == stored.game_id]
        changed = False
        for col in bet_cols:
            new_val = scraped[col].to_numpy()[0]
            if math.isnan(new_val):
                new_val = None
            if new_val != getattr(stored, col):
                setattr(stored, col, new_val)
                changed = True
        if changed:
            changed_rows.append(stored)
    return changed_rows
Rows that hold the same information 128 | join = session.query(l).join(r, l.game_id == r.game_id, isouter=True). \ 129 | filter(l.id != r.id, l.spread == r.spread, l.home_spread_price == r.home_spread_price, 130 | l.home_moneyline == r.home_moneyline, l.away_moneyline == r.away_moneyline).distinct().subquery() 131 | 132 | # join = session.query(l). \ 133 | # filter(l.id != r.id, l.spread == r.spread, l.home_spread_price == r.home_spread_price, 134 | # l.home_moneyline == r.home_moneyline, l.away_moneyline == r.away_moneyline).distinct().subquery() 135 | 136 | min_ids = session.query(func.min(join.c['id']).label('id')).group_by(join.c['game_id']).order_by(join.c['id']) \ 137 | .subquery() 138 | 139 | delete = session.query(join).filter(join.c['id'].notin_(min_ids)).subquery() 140 | delete_alias = aliased(odds_tbl, delete) 141 | delete_rows = session.query(delete_alias).all() 142 | if len(delete_rows) > 0: 143 | for i in delete_rows: 144 | session.delete(i) 145 | -------------------------------------------------------------------------------- /NBApredict/helpers/br_references.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains constants from basketballreference.com or for interaction with basketballreference based tables. 3 | 4 | For example, it contains all team names and abbreviations as well as headers for different tables that are queried. 5 | All of the classes come from the br_web_scraper repo available on github. I am not clear on why the classes use enums 6 | which I've largely worked around in the package. Other lists and dictionaries were generated specifically for this 7 | project. 
8 | """ 9 | 10 | from enum import Enum 11 | import os 12 | 13 | 14 | class Location(Enum): 15 | HOME = "HOME" 16 | AWAY = "AWAY" 17 | 18 | 19 | class Outcome(Enum): 20 | WIN = "WIN" 21 | LOSS = "LOSS" 22 | 23 | 24 | class Team(Enum): 25 | ATLANTA_HAWKS = "ATLANTA HAWKS" 26 | BOSTON_CELTICS = "BOSTON CELTICS" 27 | BROOKLYN_NETS = "BROOKLYN NETS" 28 | CHARLOTTE_HORNETS = "CHARLOTTE HORNETS" 29 | CHICAGO_BULLS = "CHICAGO BULLS" 30 | CLEVELAND_CAVALIERS = "CLEVELAND CAVALIERS" 31 | DALLAS_MAVERICKS = "DALLAS MAVERICKS" 32 | DENVER_NUGGETS = "DENVER NUGGETS" 33 | DETROIT_PISTONS = "DETROIT PISTONS" 34 | GOLDEN_STATE_WARRIORS = "GOLDEN STATE WARRIORS" 35 | HOUSTON_ROCKETS = "HOUSTON ROCKETS" 36 | INDIANA_PACERS = "INDIANA PACERS" 37 | LOS_ANGELES_CLIPPERS = "LOS ANGELES CLIPPERS" 38 | LOS_ANGELES_LAKERS = "LOS ANGELES LAKERS" 39 | MEMPHIS_GRIZZLIES = "MEMPHIS GRIZZLIES" 40 | MIAMI_HEAT = "MIAMI HEAT" 41 | MILWAUKEE_BUCKS = "MILWAUKEE BUCKS" 42 | MINNESOTA_TIMBERWOLVES = "MINNESOTA TIMBERWOLVES" 43 | NEW_ORLEANS_PELICANS = "NEW ORLEANS PELICANS" 44 | NEW_YORK_KNICKS = "NEW YORK KNICKS" 45 | OKLAHOMA_CITY_THUNDER = "OKLAHOMA CITY THUNDER" 46 | ORLANDO_MAGIC = "ORLANDO MAGIC" 47 | PHILADELPHIA_76ERS = "PHILADELPHIA 76ERS" 48 | PHOENIX_SUNS = "PHOENIX SUNS" 49 | PORTLAND_TRAIL_BLAZERS = "PORTLAND TRAIL BLAZERS" 50 | SACRAMENTO_KINGS = "SACRAMENTO KINGS" 51 | SAN_ANTONIO_SPURS = "SAN ANTONIO SPURS" 52 | TORONTO_RAPTORS = "TORONTO RAPTORS" 53 | UTAH_JAZZ = "UTAH JAZZ" 54 | WASHINGTON_WIZARDS = "WASHINGTON WIZARDS" 55 | 56 | # DEPRECATED TEAMS 57 | CHARLOTTE_BOBCATS = "CHARLOTTE BOBCATS" 58 | NEW_JERSEY_NETS = "NEW JERSEY NETS" 59 | NEW_ORLEANS_HORNETS = "NEW ORLEANS HORNETS" 60 | NEW_ORLEANS_OKLAHOMA_CITY_HORNETS = "NEW ORLEANS/OKLAHOMA CITY HORNETS" 61 | SEATTLE_SUPERSONICS = "SEATTLE SUPERSONICS" 62 | VANCOUVER_GRIZZLIES = "VANCOUVER GRIZZLIES" 63 | 64 | 65 | class OutputType(Enum): 66 | JSON = "JSON" 67 | CSV = "CSV" 68 | 69 | 70 | class OutputWriteOption(Enum): 71 | WRITE 
= "w" 72 | CREATE_AND_WRITE = "w+" 73 | APPEND = "a" 74 | APPEND_AND_WRITE = "a+" 75 | 76 | 77 | class Position(Enum): 78 | POINT_GUARD = "POINT GUARD" 79 | SHOOTING_GUARD = "SHOOTING GUARD" 80 | SMALL_FORWARD = "SMALL FORWARD" 81 | POWER_FORWARD = "POWER FORWARD" 82 | CENTER = "CENTER" 83 | 84 | 85 | class Tables(Enum): 86 | misc_stats = "misc_stats" 87 | 88 | 89 | TEAM_ABBREVIATIONS_TO_TEAM = { 90 | 'ATL': Team.ATLANTA_HAWKS, 91 | 'BOS': Team.BOSTON_CELTICS, 92 | 'BRK': Team.BROOKLYN_NETS, 93 | 'CHI': Team.CHICAGO_BULLS, 94 | 'CHO': Team.CHARLOTTE_HORNETS, 95 | 'CLE': Team.CLEVELAND_CAVALIERS, 96 | 'DAL': Team.DALLAS_MAVERICKS, 97 | 'DEN': Team.DENVER_NUGGETS, 98 | 'DET': Team.DETROIT_PISTONS, 99 | 'GSW': Team.GOLDEN_STATE_WARRIORS, 100 | 'HOU': Team.HOUSTON_ROCKETS, 101 | 'IND': Team.INDIANA_PACERS, 102 | 'LAC': Team.LOS_ANGELES_CLIPPERS, 103 | 'LAL': Team.LOS_ANGELES_LAKERS, 104 | 'MEM': Team.MEMPHIS_GRIZZLIES, 105 | 'MIA': Team.MIAMI_HEAT, 106 | 'MIL': Team.MILWAUKEE_BUCKS, 107 | 'MIN': Team.MINNESOTA_TIMBERWOLVES, 108 | 'NOP': Team.NEW_ORLEANS_PELICANS, 109 | 'NYK': Team.NEW_YORK_KNICKS, 110 | 'OKC': Team.OKLAHOMA_CITY_THUNDER, 111 | 'ORL': Team.ORLANDO_MAGIC, 112 | 'PHI': Team.PHILADELPHIA_76ERS, 113 | 'PHO': Team.PHOENIX_SUNS, 114 | 'POR': Team.PORTLAND_TRAIL_BLAZERS, 115 | 'SAC': Team.SACRAMENTO_KINGS, 116 | 'SAS': Team.SAN_ANTONIO_SPURS, 117 | 'TOR': Team.TORONTO_RAPTORS, 118 | 'UTA': Team.UTAH_JAZZ, 119 | 'WAS': Team.WASHINGTON_WIZARDS, 120 | 121 | # DEPRECATED TEAMS 122 | 'NJN': Team.NEW_JERSEY_NETS, 123 | 'NOH': Team.NEW_ORLEANS_HORNETS, 124 | 'NOK': Team.NEW_ORLEANS_OKLAHOMA_CITY_HORNETS, 125 | 'CHA': Team.CHARLOTTE_BOBCATS, 126 | 'CHH': Team.CHARLOTTE_HORNETS, 127 | 'SEA': Team.SEATTLE_SUPERSONICS, 128 | 'VAN': Team.VANCOUVER_GRIZZLIES, 129 | } 130 | 131 | team_to_team_abbreviation = { 132 | "ATLANTA HAWKS": "ATL", 133 | "BOSTON CELTICS": "BOS", 134 | "BROOKLYN NETS": "BRK", 135 | "CHARLOTTE HORNETS": "CHO", 136 | "CHICAGO BULLS": "CHI", 137 | 
"CLEVELAND CAVALIERS": "CLE", 138 | "DALLAS MAVERICKS": "DAL", 139 | "DENVER NUGGETS": "DEN", 140 | "DETROIT PISTONS": "DET", 141 | "GOLDEN STATE WARRIORS": "GSW", 142 | "HOUSTON ROCKETS": "HOU", 143 | "INDIANA PACERS": "IND", 144 | "LOS ANGELES CLIPPERS": "LAC", 145 | "LOS ANGELES LAKERS": "LAL", 146 | "MEMPHIS GRIZZLIES": "MEM", 147 | "MIAMI HEAT": "MIA", 148 | "MILWAUKEE BUCKS": "MIL", 149 | "MINNESOTA TIMBERWOLVES": "MIN", 150 | "NEW ORLEANS PELICANS": "NOP", 151 | "NEW YORK KNICKS": "NYK", 152 | "OKLAHOMA CITY THUNDER": "OKC", 153 | "ORLANDO MAGIC": "ORL", 154 | "PHILADELPHIA 76ERS": "PHI", 155 | "PHOENIX SUNS": "PHO", 156 | "PORTLAND TRAIL BLAZERS": "POR", 157 | "SACRAMENTO KINGS": "SAC", 158 | "SAN ANTONIO SPURS": "SAS", 159 | "TORONTO RAPTORS": "TOR", 160 | "UTAH JAZZ": "UTA", 161 | "WASHINGTON WIZARDS": "WAS", 162 | 163 | # DEPRECATED TEAMS 164 | "CHARLOTTE BOBCATS": "CHA", 165 | "NEW JERSEY NETS": "NJN", 166 | "NEW ORLEANS HORNETS": "NOH", 167 | "NEW ORLEANS/OKLAHOMA CITY HORNETS": "NOK", 168 | "SEATTLE SUPERSONICS": "SEA", 169 | "VANCOUVER GRIZZLIES": "VAN" 170 | } 171 | 172 | POSITION_ABBREVIATIONS_TO_POSITION = { 173 | "PG": Position.POINT_GUARD, 174 | "SG": Position.SHOOTING_GUARD, 175 | "SF": Position.SMALL_FORWARD, 176 | "PF": Position.POWER_FORWARD, 177 | "C": Position.CENTER, 178 | } 179 | 180 | bball_ref_tbl_names = { 181 | "misc_stats": Tables.misc_stats 182 | } 183 | 184 | data_stat_headers = [ # Column headers for the misc_stats table 185 | "team_name", "age", "wins", 186 | "losses", "wins_pyth", "losses_pyth", 187 | "mov", "sos", "srs", "off_rtg", 188 | "def_rtg", "pace", "fta_per_fga_pct", 189 | "fg3a_per_fga_pct", "ts_pct", 190 | "efg_pct", "tov_pct", "orb_pct", 191 | "ft_rate", "opp_efg_pct", "opp_tov_pct", 192 | "drb_pct", "opp_ft_rate", "arena_name", 193 | "attendance", "attendance_per_g" 194 | ] 195 | 196 | four_factors = [ # The offensive and defensive four factors 197 | "efg_pct", "tov_pct", "orb_pct", 198 | "ft_rate", "opp_efg_pct", 
"opp_tov_pct", 199 | "drb_pct", "opp_ft_rate" 200 | ] 201 | 202 | BASE_URL = 'https://www.basketball-reference.com' 203 | # JSON_REFERENCE_PATH = os.path.join(test.project_directory(), "data", "references.json") 204 | -------------------------------------------------------------------------------- /NBApredict/predict/get.py: -------------------------------------------------------------------------------- 1 | """Funcs contains functions for generating predictions and their helper functions.""" 2 | import pandas as pd 3 | 4 | import nbapredict.models.four_factor_regression as lm 5 | import nbapredict.helpers.br_references as br_references 6 | from nbapredict.database.manipulator import DataOperator 7 | import nbapredict.database.getters as getters 8 | 9 | 10 | def sample_prediction(database, session, ref_tbl, model): 11 | """Generate and return a one row sample prediction created from the first row of the reference table. 12 | 13 | ToDo: Change function to take any model 14 | ToDo: Change docstring to reference new classes, perhaps drop DB arg 15 | 16 | Args: 17 | database: An initialized DBInterface class from database.dbinterface.py 18 | session: A SQLalchemy session object 19 | ref_tbl: A mapped odds table 20 | model: A regression object from four_factor_regression.py 21 | 22 | Returns: 23 | A DataOperator object initialized with a prediction from regression 24 | """ 25 | first_game_odds = session.query(ref_tbl).order_by(ref_tbl.start_time).first() 26 | 27 | home_tm = first_game_odds.home_team 28 | away_tm = first_game_odds.away_team 29 | start_time = first_game_odds.start_time 30 | 31 | sample_prediction = game_prediction(database, session, model, home_tm, away_tm, start_time) 32 | data = DataOperator(sample_prediction) 33 | return data 34 | 35 | 36 | def game_prediction(database, session, regression, home_tm, away_tm, start_time, year=2019, console_out=False): 37 | """Predict a game versus the line, and return the information in a dictionary. 
38 | 39 | Use console out for human readable output if desired. Cdf is a cumulative density function. SF is a survival 40 | function. CDF is calculated when the betting line's prediction is below the model's prediction. SF is calculated 41 | when the betting line's prediction is above the model's prediction. 42 | 43 | ToDO: Modify to use new database 44 | 45 | Args: 46 | database: an instantiated DBInterface class from database.dbinterface.py 47 | session: A SQLalchemy session object 48 | regression: A regression object 49 | start_time: Date.datetime with the date and start time of the game 50 | home_tm: The home team 51 | away_tm: The away team 52 | line: The betting line 53 | year: The year to use stats from in predicting the game 54 | console_out: If true, print the prediction results. Ignore otherwise 55 | """ 56 | home_tm = team_name(home_tm) 57 | away_tm = team_name(away_tm) 58 | 59 | # Get Misc stats for year 60 | ff_list = lm.four_factors_list() 61 | ff_df = getters.get_pandas_df_from_table(database, session, "misc_stats_{}".format(year), ff_list) 62 | 63 | pred_df = prediction_df(home_tm, away_tm, ff_df) 64 | pred = prediction(regression, pred_df) 65 | # probability, function = line_probability(prediction, line, np.std(regression.residuals)) 66 | 67 | #if console_out: 68 | # prediction_result_console_output(home_tm, away_tm, line, prediction, probability) 69 | 70 | return {"start_time": start_time, "home_team": home_tm, "away_team": away_tm, "prediction": pred} 71 | 72 | 73 | def prediction(reg, pred_df): 74 | """Generate and return a prediction for the observations in the pred_df. 
75 | 76 | Args: 77 | reg: LinearRegression class from four_factors_regression.py 78 | pred_df: A dataframe of observations, with home and away statistics, from which to generate a prediction 79 | 80 | Returns: 81 | The predicted value generated from the regression object and the predictors""" 82 | return reg.results.predict(pred_df).values[0] 83 | 84 | 85 | def console_output(home_tm, away_tm, line, prediction, probability): 86 | """Generate human readable printout comparing the model's predictions, the line, and the p_value of the line. 87 | 88 | Args: 89 | home_tm: The home team 90 | away_tm: The away team 91 | line: The betting line 92 | prediction: A prediction of the home team's margin of victory 93 | probability: The probability of the betting line as determined by a CDF or SF 94 | """ 95 | if prediction > 0: 96 | print("The {} are projected to beat the {} by {} points".format(home_tm, away_tm, prediction)) 97 | if (-1 * line) < prediction: 98 | print("If the model were true, the betting line's ({}) CDF, in relation to the prediction, would " 99 | "be realized {}% of the time".format(line, probability)) 100 | else: 101 | print("If the model were true, the betting line's ({}) SF, in relation to the prediction, would " 102 | "be realized {}% of the time".format(line, probability)) 103 | if prediction < 0: 104 | print("The {} are projected to lose to the {} by {} points".format(home_tm, away_tm, prediction)) 105 | if (-1 * line) < prediction: 106 | print("If the model were true, the betting line's ({}) CDF, in relation to the prediction, would " 107 | "be realized {}% of the time".format(line, probability)) 108 | else: 109 | print("If the model were true, the betting line's ({}) SF, in relation to the prediction, would " 110 | "be realized {}% of the time".format(line, probability)) 111 | 112 | 113 | def prediction_df(home_tm, away_tm, ff_df): 114 | """Create and return a dataframe that merges the four factors for the home and away team. 
115 | 116 | Args: 117 | home_tm: The home team 118 | away_tm: The away team 119 | ff_df: Dataframe of the four factors for all teams 120 | 121 | Returns: 122 | A single row four factors data frame of the home and away team's four factors 123 | """ 124 | home_ff = team_ff(home_tm, ff_df, home=True) 125 | away_ff = team_ff(away_tm, ff_df, home=False) 126 | home_ff["key"] = 1 127 | home_ff["const"] = 1.0 # sm.add_const does not add a constant for whatever reason 128 | away_ff["key"] = 1 129 | merged = pd.merge(home_ff, away_ff, on="key", sort=True) 130 | merged = merged.drop(["key"], axis=1) 131 | merged = merged.sort_index(axis=1) 132 | return merged 133 | 134 | 135 | def team_ff(team, ff_df, home): 136 | """Create and return a data frame of the four factors for the specified team. 137 | 138 | Args: 139 | team: The team to extract the four factors for 140 | ff_df: A dataframe of the four factors 141 | home: Boolean which dictates if an '_h or '_a' should be appended to the team's stats 142 | 143 | Returns: 144 | The four factors, with a home or away suffix, for a team are returned as a data frame 145 | """ 146 | ff_list = br_references.four_factors 147 | team_ff = ff_df[ff_df.team_name.str.lower() == team.lower()][ff_list] 148 | if home: 149 | team_ff = team_ff.rename(lm.append_h, axis='columns') 150 | else: 151 | team_ff = team_ff.rename(lm.append_a, axis='columns') 152 | return team_ff 153 | 154 | 155 | def team_name(team): 156 | """Match team to a standard team name (not cap-sensitive) and return the br_references standard team name.""" 157 | for team_name in br_references.Team: 158 | if team.lower() == team_name.value.lower(): 159 | return team_name.value 160 | -------------------------------------------------------------------------------- /NBApredict/database/dbinterface.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains a DBInterface class which dictates table creation, deletion, and access. 
3 | ToDo: Remove 4 | """ 5 | 6 | import os 7 | from sqlalchemy import Column, Integer, Table 8 | from sqlalchemy import create_engine, MetaData, event 9 | from sqlalchemy.engine import Engine 10 | from sqlalchemy.ext.declarative import declarative_base 11 | from sqlalchemy.orm import mapper, clear_mappers 12 | from sqlalchemy.ext.automap import automap_base 13 | 14 | # Local Imports 15 | from nbapredict.configuration import Config 16 | 17 | 18 | class DBInterface: 19 | """DBInterface contains high level information about the desired database and creation, deletion, and access functions 20 | 21 | Attributes: 22 | path: The path to the database 23 | engine: SQLalchemy engine for accessing the database 24 | metadata: Metadata for the engine, used mostly for table access / reflection 25 | Base: SQLalchemy declarative_base() used for table creation 26 | """ 27 | 28 | class Template(object): 29 | """Blank template to map tables to with the sqlalchemy mapper function 30 | 31 | Note: 32 | Template can only be mapped to one table at a time. Use clear_mappers to free the template for new tables 33 | """ 34 | pass 35 | 36 | def __init__(self, url=None): 37 | """Initialize macro-level SQLalchemy objects as class attributes (engine, metadata, base). 38 | 39 | A session will allow interaction with the DB.""" 40 | if not url: 41 | file_path = os.getcwd() 42 | self.path = Config.get_property("database") 43 | else: 44 | self.path = url 45 | self.engine = create_engine(self.path, pool_pre_ping=True) 46 | self.metadata = MetaData(self.engine) 47 | self.Base = declarative_base() 48 | 49 | def get_tables(self, table_names=False): 50 | """Find and return the specified tables or return all tables. 51 | 52 | Primary use is to check if table exists in database. 
Use get_table_mappings() for ORM style table interactions 53 | """ 54 | meta = MetaData(bind=self.engine) 55 | meta.reflect(bind=self.engine) 56 | if table_names: 57 | return meta.tables[table_names] 58 | else: 59 | return meta.tables 60 | 61 | def get_table_mappings(self, table_names): 62 | """Find and return the specified table mappings or return all table mappings 63 | 64 | Args: 65 | table_names: The table names for which mappings are desired. Either a string or list 66 | """ 67 | if isinstance(table_names, str): # Allows a string, rather than list, to be passed to function 68 | holder = table_names 69 | table_names = [holder] 70 | 71 | self.metadata.reflect(self.engine, only=table_names) 72 | Base = automap_base(metadata=self.metadata) 73 | Base.prepare() 74 | 75 | mapped_tables = [Base.classes[name] for name in table_names] 76 | if len(mapped_tables) == 1: 77 | return mapped_tables[0] 78 | else: 79 | return mapped_tables 80 | 81 | def table_exists(self, tbl_name): 82 | """Check if a table exists in the database; Return True if it exists and False otherwise.""" 83 | self.metadata.reflect(bind=self.engine) 84 | if tbl_name in self.metadata.tables: 85 | return True 86 | else: 87 | return False 88 | 89 | def create_tables(self): 90 | """Creates all tables which have been made or modified with the Base class of the DBInterface 91 | 92 | Note that existing tables which have been modified, such as by adding a relationship, will be updated when 93 | create_tables() is called. """ 94 | self.metadata.create_all(self.engine) 95 | 96 | def map_table(self, tbl_name, column_types, constraints=None): 97 | """Map a table named tbl_name and with column_types to Template, add constraints if specified. 
98 | 99 | Note: Foreign key constraints should likely be added to the mapped table explicitly rather than in this function 100 | 101 | Args: 102 | tbl_name: The name of the table to be mapped 103 | column_types: A dictionary with column names as keys and sql types as values 104 | constraints: A dictionary of desired constraints where the constraints (Such as UniqueConstraint) are keys 105 | and the columns to be constrained is a list of string column names 106 | """ 107 | columns = self._generate_columns(column_types) 108 | if constraints: 109 | t = Table(tbl_name, self.metadata, Column('id', Integer, primary_key=True), 110 | *columns, 111 | *(constraint(*columns) for constraint, columns in constraints.items()), 112 | ) 113 | else: 114 | t = Table(tbl_name, self.metadata, Column('id', Integer, primary_key=True), 115 | *columns 116 | ) 117 | 118 | mapper(self.Template, t) 119 | 120 | @staticmethod 121 | def _generate_columns(columns): 122 | """Take columns where key is the column name and value is the column type into SQLlachemy columns. 
123 | 124 | To use additional arguments, such as constraints, specify column values as a list where the constraints are 125 | elements of the list""" 126 | column_list = [] 127 | for key, value in columns.items(): 128 | try: 129 | column_list.append(Column(key, *value)) # Unpacks additional column arguments 130 | except TypeError: # if no additional arguments, just make a standard name and type column 131 | column_list.append(Column(key, value)) 132 | return column_list 133 | 134 | @staticmethod 135 | def clear_mappers(): 136 | clear_mappers() 137 | 138 | def insert_row(self, table, row): 139 | """Insert a single row into the specified table in the engine""" 140 | conn = self.engine.connect() 141 | table = self.get_tables(table) 142 | conn.execute(table.insert(), row) 143 | conn.close() 144 | # Rows formatted as 145 | # [{'l_name': 'Jones', 'f_name': 'bob'}, 146 | # {'l_name': 'Welker', 'f_name': 'alice'}]) 147 | 148 | def insert_rows(self, table, rows): 149 | """Insert rows into the specified table. 150 | 151 | Uses sqlalchemy's "Classic" method. ORM database interactions are mediated by sessions. 
152 | """ 153 | table = self.get_tables(table) 154 | conn = self.engine.connect() 155 | for row in rows: 156 | conn.execute(table.insert(), row) 157 | conn.close() 158 | 159 | def drop_table(self, drop_tbl): 160 | """Drops the specified table from the database""" 161 | self.metadata.reflect(bind=self.engine) 162 | drop_tbls = self.metadata.tables[drop_tbl] 163 | drop_tbls.drop() 164 | self.metadata = MetaData(bind=self.engine) # Updates the metadata to reflect changes 165 | 166 | 167 | @event.listens_for(Engine, "connect") 168 | def set_sqlite_pragma(dbapi_connection, connection_record): 169 | """SQLalchemy listener function to allow foreign keys in SQLite""" 170 | cursor = dbapi_connection.cursor() 171 | cursor.execute("PRAGMA foreign_keys=ON") 172 | cursor.close() 173 | -------------------------------------------------------------------------------- /NBApredict/configuration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Path contains function which return file and folder paths for the project 3 | ToDo: Research os.environ for setting variables. These may still be stored in Config, but they may offer a default 4 | ToDo: or a different way to set variables, particularly for file pathes. 5 | """ 6 | import os 7 | import yaml 8 | from nbapredict.helpers.classes import NestedDict 9 | 10 | 11 | def project_directory(): 12 | """Returns the project directory so long as configuration.py is in the top-level of the project""" 13 | return os.path.abspath(os.path.dirname(__file__)) 14 | 15 | 16 | def settings_file(): 17 | """Returns the file path of settings.yaml""" 18 | return os.path.join(project_directory(), "settings.yaml") 19 | 20 | 21 | def output_directory(): 22 | """Returns the path to the output folder which holds the database and graphs""" 23 | return os.path.join(project_directory(), "outputs") 24 | 25 | 26 | def rreplace(string, old, new, count): 27 | """Replace old with new in a string in reverse order. 
28 | Args: 29 | string: String to modify 30 | old: Sub-string to replace 31 | new: Sub-string to replace old 32 | count: The number old sub-strings to be replaced""" 33 | li = string.rsplit(old, count) 34 | return new.join(li) 35 | 36 | 37 | def database_file(calling_file_path): 38 | """Return the database file path with the path modified in relation to the path the function is called from. 39 | 40 | The base path is r"sqlite:///outputs//nba_db.db". This function modifies that path in relation to the calling file 41 | path by inserting ..// to the front of the base path. So a file nested one level below the root directory becomes 42 | r"sqlite:///..//outputs//nba_db.db" 43 | """ 44 | head_path = project_directory() 45 | head_folder = os.path.split(head_path)[1] 46 | 47 | if os.path.realpath(calling_file_path) in head_path: 48 | # If NBApredict is imported from outside the project, replace calling_file_path with head_path 49 | calling_file_path = head_path 50 | 51 | calling_file_path = calling_file_path.replace("\\", "/") 52 | #print("Calling_file_path:", calling_file_path) 53 | sub_dirs = [] 54 | split_path = os.path.split(calling_file_path) 55 | path = split_path[0] 56 | folder = split_path[1] 57 | while folder != head_folder: 58 | sub_dirs.append(folder) 59 | split_path = os.path.split(path) 60 | path = split_path[0] 61 | folder = split_path[1] 62 | 63 | if len(sub_dirs) > 0: 64 | modified_path = calling_file_path 65 | for folder in sub_dirs: 66 | modified_path = rreplace(modified_path, folder, "..", 1) 67 | 68 | path_addin = modified_path.split(head_folder)[1] 69 | path_addin = path_addin.replace("/", "//") 70 | while path_addin[0] == "/": 71 | path_addin = path_addin[1:] 72 | db_path = r"sqlite:///{}//outputs//nba_db.db".format(path_addin) 73 | return db_path 74 | else: 75 | return r"sqlite:///outputs//nba_db.db" 76 | 77 | 78 | def graphs_directory(): 79 | """Return the folder which holds graphs for the project.""" 80 | return os.path.join(output_directory(), 
"graphs") 81 | 82 | 83 | class Configuration: 84 | """Read and write configuration settings from settings.yaml 85 | 86 | Warning: 87 | Configuration cannot handle duplicate keys even if keys are of a different depth 88 | 89 | Attributes: 90 | _file: the source file of the Configuration instance 91 | _config: a dictionary of settings 92 | _key_order: each key in _config with values listing keys above the specified key 93 | """ 94 | 95 | def __init__(self, file, settings): 96 | """sets _config to the settings dictionary and stores the _key_order for accessing each element in _config""" 97 | self._file = file 98 | self._key_order = self._generate_config_keys(settings) 99 | self._config = NestedDict(settings) 100 | 101 | def _generate_config_keys(self, config_dict, path=None, result=None, depth=0, ): 102 | """Return a dictionary with each key, of any depth, in self._config. 103 | 104 | Each key's value is an ordered list of the nodes above the key and the key itself in self._config. A key in the 105 | fourth level of config will be: {key: [node1, node2, node3, key]}. 106 | 107 | Args: 108 | config_dict: A dictionary of configuration options 109 | path: A list of keys above the current key in the dictionary 110 | result: A dictionary which stores results 111 | depth: The current depth of the recursion 112 | """ 113 | # Initialize path and result. 
We avoid defaults so path and result are reset on each call 114 | if path is None: 115 | path = [] 116 | if result is None: 117 | result = {} 118 | for key, value in config_dict.items(): 119 | if depth == 0: # Reset path each time the function reaches a top-level key in the dictionary 120 | path = [key] 121 | if type(value) is dict: 122 | if key not in path: 123 | path.append(key) 124 | if key not in result.keys(): 125 | result.update({key: path[:]}) # Create a new list to store path's current state 126 | result = self._generate_config_keys(value, path, result, depth=depth + 1) 127 | else: 128 | result.update({key: path[:]}) 129 | result[key].append(key) 130 | 131 | return result 132 | 133 | def get_property(self, property_key): 134 | """Return the property associated with the property key from _config. 135 | 136 | Args: 137 | property_key: The key, of any depth, of the desired property 138 | """ 139 | if property_key not in self._key_order.keys(): 140 | return None 141 | elif property_key in self._config.dict.keys(): # Checks if named property is in the top level of _config 142 | return self._config[property_key] 143 | else: 144 | return self._config[self._key_order[property_key]] 145 | 146 | def _set_property(self, property_key, value): 147 | """Private function for modifying key:value pairs in self._config. 148 | 149 | Additionally, rewrites self._key_order in order to store changes.""" 150 | if property_key not in self._key_order.keys(): 151 | raise KeyError("'{}' not in Config. 
Manually modify the settings.yaml file if you wish to add new" 152 | " settings.".format(property_key)) 153 | keys = [i for i in self._key_order[property_key]] 154 | self._config[keys] = value 155 | self._key_order = self._generate_config_keys(self._config.dict) 156 | 157 | def _write(self): 158 | """Private function for over-writing self._config to the settings file""" 159 | 160 | 161 | def create_configuration(file, config_settings): 162 | """Return an instantiated Configuration class.""" 163 | return Configuration(file, config_settings) 164 | 165 | 166 | def check_paths(config, comp_dict): 167 | no_match = {} 168 | for k, v in comp_dict.items(): 169 | if config.get_property(k) != v: 170 | no_match[k] = v 171 | return no_match 172 | 173 | 174 | def set_paths(config, change_dict): 175 | for k, v in change_dict.items(): 176 | config._set_property(k, v) 177 | return config 178 | 179 | 180 | with open(settings_file(), "r") as file: 181 | config_settings = yaml.safe_load(file) 182 | 183 | Config = create_configuration(settings_file(), config_settings) 184 | paths = {"directory": project_directory(), "database": database_file(os.getcwd()), "graph_dir": graphs_directory(), 185 | "settings": settings_file()} 186 | paths = {"directory": project_directory()} 187 | paths.update({"database": database_file(os.getcwd())}) 188 | change_paths = check_paths(Config, paths) 189 | set_paths(Config, change_paths) 190 | 191 | # noinspection PyProtectedMember 192 | # Config._set_property("four_factor_regression", "something_else") 193 | 194 | 195 | t = 2 196 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NBA_predict 2 | 3 | NBApredict is a package for predicting NBA games against betting lines. It has two main behaviors: 4 | 1. Scrape and store team statistics, game results, and betting lines in a SQLite database. 5 | 2. 
Generate predictions for each NBA game, compare the prediction to the betting line, and store the results. 6 | 7 | 8 | ## Status 9 | I have effectively archived this project. Given that, I thought it would be relevant to update the code and README to reflect the state of the project where it left off. Work on the project was previously sponsored by a benefactor, but monetary support dried up with the pause of the NBA season due to COVID-19 in March 2020. That pause occurred in the middle of a significant reorganization which was perhaps 90% finished. Those changes are now stored in "main" as people continued to clone the master branch, and the code in that branch was trash. That is why I started the reorganization. The old master branch is stored as "archive". (Note: "main" is now the equivalent of "master" if that's not clear above.) 10 | 11 | This project will not run if you just clone it. However, there are hundreds of hours of work here! So, if you want to get this project to run for you, get in contact, and I'd be happy to iron out the few remaining kinks. Otherwise, I feel no real incentive to work on this for myself at the moment. 12 | 13 | In the rest of this README, I'll try to describe the state of the project so that you may have some idea of what utility you may derive from the work I've done. Hopefully this will be of some help in your own pursuits. 14 | 15 | ## Project Overview 16 | ### Directories 17 | This section overviews the main components of the project. Details for other sections of the project are available in the documentation. 18 | 19 | * br_web_scraper - This is just a clone of the same package listed in [the credits section](#credits) with some changes to fit NBApredict. 20 | * database - Can be ignored. This folder contains various modules intended to automate database operations. However, I found these modules useful for other projects, so I created the [DatatoTable](https://github.com/Spencer-Weston/DatatoTable) repo.
DatatoTable is available as a package on PyPI, and the rest of NBApredict *should* use that package. 21 | * helpers - Miscellaneous. Most of this would get removed were the reorganization finished. 22 | * management - Data management package 23 | * tables - Each module in this directory has a corresponding table in the database. Each module will tend to have format_data(), create_table(), and update_table() functions, among others, which perform the necessary work for that table. 24 | * conversion - This module has been replaced by [DatatoTable.convert](https://github.com/Spencer-Weston/DatatoTable) functions. However, the functionality, either from this module or [DatatoTable](https://github.com/Spencer-Weston/DatatoTable), is essential for configuring foreign keys. An example is in tables.odds.format_data(). 25 | * etl - Extract, Transform, Load. This module runs all processes which involve external data. 26 | * models - This directory is to hold whatever models get incorporated into the project. At the moment, it just holds the four_factor_regression module (explained below) and a graphing script which generates graphs for regression evaluation. 27 | * outputs - This directory is generated by the project. It holds the SQLite database generated by the project. It also holds a graphs directory which stores any saved graphs. 28 | * predict - As it's named, this package is used to generate predictions. This is where work on the reorganization stopped, so these scripts are the least polished. The ToDo at the top of predict.bets describes the vision for this package. 29 | * run - The run directory holds two scripts, daily.py and all.py. The daily script will set the project to run daily while the all script runs the project when called. Neither will work unless work on upstream components is finished. 30 | * scrapers - The scrapers folder holds modules for scraping data. scraper.py's scrape_all() function will scrape all season, team, and betting line data.
To just scrape one type of data, call the desired data's scrape function. For example, line_scraper.scrape() will scrape betting lines. 31 | 32 | ## The Model 33 | As of now, the model uses a linear regression based on the [Four Factors of Basketball Success](https://www.basketball-reference.com/about/factors.html) which encapsulates shooting, turnovers, rebounding, and free throws. Further, we include the opposing four factors, which are how a team's opponents perform on the four factors in aggregate. Thus, each team has eight variables, and the model uses sixteen variables (eight for each team) for each prediction. The target, Y, or dependent variable is home Margin of Victory (MOV). Away MOV is simply the negative of home MOV. 34 | 35 | ### What are betting lines? 36 | MOV is targeted because it provides an easy comparison with two types of betting lines, the spread and moneyline. Here's what the spread and moneyline might look like for a matchup between the Milwaukee Bucks and Atlanta Hawks: 37 | 38 | Milwaukee Bucks (Home): 39 | 1. Spread: -8 40 | 2. Moneyline: -350 41 | 42 | Atlanta Hawks (Away): 43 | 1. Spread: 8 44 | 2. Moneyline: 270 45 | 46 | First, the spread attempts to guess the MOV between two teams. The Milwaukee Bucks spread of -8 indicates the betting line expects the Bucks to beat the Hawks by eight points. Or, the Bucks are "given" eight points. If one thinks the Bucks will beat the Hawks by more than eight points, they bet the Bucks. If one believes the Bucks will either win by fewer than eight points or lose, they bet the Hawks. Typically, spreads have symmetric, or near-symmetric, returns where picking the Bucks or the Hawks provides an equal return on a correct bet. 47 | 48 | In comparison, the moneyline states the likelihood of a team winning or losing in terms of a monetary return. A negative moneyline, such as the Bucks' -350, means one must put up $350 in order to win $100.
A positive moneyline, such as the Hawks' 270, means a bet of $100 will return $270 if it is correct. 49 | 50 | ### Generating Predictions 51 | 52 | Before comparing predictions to betting lines, we need to ensure the model meets the assumptions of regression. For now, assume assumptions are met, and refer to [Additional Reading](#additional-reading) for further model discussion. To compare the model's predictions to betting lines, we look at the prediction's distance from the betting line. In the model, the prediction is the expected value, or the mean, of the matchup. All possible outcomes of the game are normally distributed around this mean with a standard deviation which, as of March 2020, is approximately thirteen. 53 | 54 | Continuing the Bucks-Hawks example, let's say the model predicts the Bucks to win by 6 in comparison to the betting line of 8. To compare the betting line to the prediction, we want to evaluate the likelihood of a Bucks win by 8 or more given a normal distribution with a mean of 6 and standard deviation of 13. Thus, we calculate the survival function* of 8 based on the distribution. The result is approximately 0.44, which means we'd expect the home MOV to be greater than or equal to 8 about 44% of the time. Inversely, we expect the home MOV to be less than 8 approximately 56% of the time. 55 | 56 | To compare moneylines instead of spreads, simply set the spread to 0, and the output will be the likelihood of a win or loss. 57 | 58 | 59 | *The model uses a cumulative distribution function when the predicted MOV is greater than the betting line 60 | 61 | ## Usage 62 | (Outdated: This usage hasn't been recreated in the reorganization yet.) 63 | 64 | Clone this repo to your local machine: https://github.com/Spencer-Weston/NBA_bet.git 65 | 66 | To set the project to run daily: 67 | ```~\NBApredict>python -m run.daily``` 68 | 69 | run.daily sets the project to run 1 hour before the first game of each day.
This time is chosen because betting lines are not always available until later in the day. 70 | 71 | Or to run the project once: 72 | ```~\NBApredict>python -m run.all``` 73 | 74 | 75 | ## Version: V0.2 - Reorganization 76 | 77 | This version isn't finished, as described in the [Status](#status) section. Still, here is a rough approximation of V0.2: 78 | 79 | ### Why the Reorganization? 80 | In short, the project sucked before this point (check the archive branch). The project structure was not pythonic, so namespaces were messy. Modules were more agglomerations of random behaviors than coherent units of related functionality. The project structure now follows standard python package design principles. The initial database design did not incorporate normalized databases. Given that many tables stored the same data, such as game times, keeping tables in sync required adding unique update functions for every table. The new tables are normalized with cascades to avoid this. Various other quality-of-life improvements have been implemented or will be if I ever pick this project back up. 81 | 82 | ### Finished 83 | * Project organized into a pythonic package structure 84 | * All tables are normalized 85 | * Database operations exported to [DatatoTable](https://github.com/Spencer-Weston/DatatoTable) with much improved usability 86 | * All scrapers (schedule, betting lines, team stats, and teams) and their associated table modules use the normalized format 87 | 88 | ### Unfinished 89 | * Predictions and the associated table (The models still work; they're just not threaded into the full workflow) 90 | * Predictions and data interfaces. When completed, these functions would allow some degree of analysis for individual games or stats from the command line 91 | * "Run All" functionality. Once the above is finished, the project will be left to run daily to keep up-to-date data and predictions.
92 | 93 | ## Author 94 | Spencer Weston 95 | 96 | personal website: [Crockpot Thoughts](https://crockpotthoughts.wordpress.com/) 97 | 98 | ## Additional Reading 99 | * [How and Why](https://crockpotthoughts.wordpress.com/2019/07/23/an-nba-prediction-model-part-2-the-how-and-why/) 100 | * [Model Evaluation and Explanation](https://crockpotthoughts.wordpress.com/2019/08/05/predicting-nba-games-part-3-the-model/) 101 | 102 | ## Credits 103 | Jae Bradley: https://github.com/jaebradley/basketball_reference_web_scraper 104 | - Used to scrape games and game results 105 | 106 | ## License 107 | MIT 108 | -------------------------------------------------------------------------------- /NBApredict/scrapers/line_scraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | line_scraper scrapes NBA betting odds from Bovada and stores them in the database. 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | import re 7 | import requests 8 | from sqlalchemy import UniqueConstraint, ForeignKey, Integer 9 | from sqlalchemy.exc import IntegrityError 10 | from sqlalchemy.orm import Session, relationship 11 | 12 | # Local Imports 13 | from nbapredict.configuration import Config 14 | from nbapredict.database.manipulator import DataOperator 15 | from nbapredict.database import getters 16 | from nbapredict.database.reconcile import reconcile 17 | 18 | 19 | def bovada_json_request(url): 20 | response = requests.get(url, allow_redirects=False).json() 21 | if not len(response): 22 | return None 23 | return response 24 | 25 | 26 | def odds_for_today(): 27 | """Match betting odds from Bovada to the games_query and return the odds 28 | 29 | Args: 30 | date to reflect the current games on Bovada. 
31 | 32 | Returns: 33 | A dictionary where the column keys lists of values 34 | """ 35 | scrape_time = datetime.now() 36 | 37 | # Check for response from Bovada 38 | url = Config.get_property("regularURL") 39 | response = bovada_json_request(url) 40 | if not response: 41 | url = Config.get_property("playoffURL") 42 | response = bovada_json_request(url) 43 | if not response: 44 | return None 45 | 46 | # Move down tree towards games 47 | events = response[0]["events"] 48 | 49 | # Strip games from the 'event's object (which holds a bunch of random information) 50 | bovada_games = [e for e in events if e['description'].count('@') > 0 and e['type'] == 'GAMEEVENT'] 51 | if not bovada_games: 52 | return None 53 | 54 | # Set-up the line dictionary which stores data in the correct table format 55 | lines = {"home_team": [], "away_team": [], 'start_time': [], "spread": [], "home_spread_price": [], 56 | "away_spread_price": [], "home_moneyline": [], "away_moneyline": [], "scrape_time": []} 57 | 58 | # Iterate through each game returned by bovada and store its information 59 | for game in bovada_games: 60 | link = game['link'].split('-') 61 | link = link[len(link)-1] 62 | str_time = re.findall('[0-9]', link) 63 | start_time = ''.join(str_time) 64 | start_time = datetime.strptime(start_time, "%Y%m%d%H%M") 65 | if datetime.now() > start_time: 66 | # An ongoing game will not have the correct betting data. We don't want to store this information 67 | print("This game ({}) is either ongoing or completed. 
def parse_teams(competitors):
    """Parse a competitors object from Bovada and return the home and away teams, respectively.

    Args:
        competitors: An iterable of competitor dicts; each is read for its "home" flag and its "name".

    Returns:
        A (home_team, away_team) tuple of upper-cased team names.

    Raises:
        Exception: If more than two competitors are present, or if either team name is missing.
    """
    if len(competitors) > 2:
        raise Exception("Unexpected objects in competitors")
    home_team = ""
    away_team = ""
    for team in competitors:
        if team["home"]:
            home_team = team["name"]
        else:
            away_team = team["name"]
    # Both names are required. The earlier check, `not home_team == "" or away_team == ""`, parsed as
    # `(home_team != "") or (away_team == "")` and therefore returned incomplete data.
    if home_team == "" or away_team == "":
        raise Exception("Competitors was not properly parsed. Missing data.")
    return home_team.upper(), away_team.upper()


def _american_price_to_int(price):
    """Convert a Bovada american-odds string to an int. Bovada writes even money as "EVEN" (+100)."""
    if price == "EVEN":
        return 100
    return int(price)


def parse_moneyline(moneyline_bet):
    """Parse a moneyline bet object from Bovada and return, in order, the home and away moneyline.

    Args:
        moneyline_bet: A market dict whose "outcomes" list holds one entry per side. Each outcome is read for
            its "type" ("H" for home, "A" for away) and its price["american"] string.

    Returns:
        A (home_moneyline, away_moneyline) tuple of ints.

    Raises:
        Exception: If more than two outcomes are present, or if either side's moneyline is missing.
    """
    outcomes = moneyline_bet["outcomes"]
    home_moneyline = ""
    away_moneyline = ""
    if len(outcomes) > 2:
        raise Exception("Unexpected objects in moneyline bet")
    for o in outcomes:
        price = _american_price_to_int(o["price"]["american"])
        if o["type"] == "H":
            home_moneyline = price
        elif o["type"] == "A":
            away_moneyline = price
    # Compare against the "" sentinel explicitly; see parse_teams for the precedence bug this replaces.
    if home_moneyline == "" or away_moneyline == "":
        raise Exception("Moneyline was not properly parsed. Missing data.")
    return home_moneyline, away_moneyline


def parse_spread(spread_bet):
    """Parse a spread bet object from Bovada and return, in order, the spread and the home and away spread prices.

    Args:
        spread_bet: A market dict whose "outcomes" list holds one entry per side. The home ("H") outcome supplies
            the spread via price["handicap"]; both outcomes supply a price["american"] string.

    Returns:
        A (spread, home_spread_price, away_spread_price) tuple: a float followed by two ints.

    Raises:
        Exception: If more than two outcomes are present, or if any of the three values is missing.
    """
    outcomes = spread_bet["outcomes"]
    spread = ""
    home_spread_price = ""
    away_spread_price = ""
    if len(outcomes) > 2:
        raise Exception("Unexpected objects in spread bet")
    for o in outcomes:
        if o["type"] == "H":
            spread = float(o["price"]["handicap"])
            # "EVEN" is handled here as it is for moneylines; a bare int() raised ValueError on it
            home_spread_price = _american_price_to_int(o["price"]["american"])
        elif o["type"] == "A":
            away_spread_price = _american_price_to_int(o["price"]["american"])
    # Explicit "" comparisons (not truthiness) so that a legitimate spread of 0.0 is not treated as missing
    if spread == "" or home_spread_price == "" or away_spread_price == "":
        raise Exception("Spread was not properly parsed. Missing data.")
    return spread, home_spread_price, away_spread_price


def create_odds_table(database, data, tbl_name, sched_tbl):
    """Creates an odds_table in the database based on the data with foreign key based on the schedule

    Args:
        database: An instance of the DBInterface class from database/DBInterface.py
        data: A DataOperator object from database/manipulator which holds the data; its get_sql_type() result
            supplies the column types for the new table
        tbl_name: The name under which the odds table is mapped
        sched_tbl: The schedule table which will contain the game_id for the odds_table and which will be given a
            relationship to the odds table
    """
    # Set columns and constraints
    sql_types = data.get_sql_type()
    # NOTE(review): indexing the result of .items() only works if it returns a list; a plain dict view is not
    # subscriptable in Python 3 -- confirm what type sched_tbl.classes is.
    sched_tbl_name = sched_tbl.classes.items()[0][0]
    sql_types.update({'game_id': [Integer, ForeignKey(sched_tbl_name + ".id")]})
    constraint = {UniqueConstraint: ["home_team", "away_team", "start_time"]}

    database.map_table(tbl_name, sql_types, constraint)  # Maps the odds table

    # Establish relationship if it does not exist
    if "odds" not in sched_tbl.__mapper__.relationships.keys():
        sched_tbl.odds = relationship(database.Template)

    database.create_tables()
    database.clear_mappers()
Returning without updating odds table") 207 | return 208 | for row in rows: 209 | # Delete the row in the table if it exists to allow overwrite 210 | existing_rows = session.query(odds_table).filter(odds_table.home_team == row["home_team"], 211 | odds_table.away_team == row["away_team"], 212 | odds_table.start_time == row["start_time"]) 213 | if len(existing_rows.all()) > 0: 214 | for exist_row in existing_rows.all(): 215 | session.delete(exist_row) 216 | 217 | # Adds all of the normal betting data 218 | row_object = odds_table(**row) 219 | 220 | # Finds and adds the foreign key from the schedule 221 | game = session.query(sched_tbl).filter(sched_tbl.home_team == row_object.home_team, 222 | sched_tbl.away_team == row_object.away_team, 223 | sched_tbl.start_time == row_object.start_time).all() 224 | if len(game) > 1: 225 | raise Exception("More than one game matches the row") 226 | game = game[0] 227 | row_object.game_id = game.id 228 | 229 | row_objects.append(row_object) 230 | try: 231 | session.add_all(row_objects) 232 | except IntegrityError: # If all objects cannot be added, try to add each one individually 233 | for row in row_objects: 234 | try: 235 | session.add(row) 236 | except IntegrityError: 237 | continue 238 | 239 | 240 | def scrape(): 241 | """Scrapes betting line information from bovada and adds it to the session""" 242 | league_year = Config.get_property("league_year") 243 | lines = odds_for_today() 244 | if not lines: 245 | return False 246 | return lines 247 | 248 | line_data = DataOperator(lines) 249 | 250 | tbl_name = "odds_{}".format(league_year) 251 | tbl_exists = database.table_exists(tbl_name) 252 | if not tbl_exists: 253 | create_odds_table(database, line_data, tbl_name, schedule) 254 | tbl_exists = database.table_exists(tbl_name) 255 | 256 | if line_data.validate_data_length() and tbl_exists: 257 | # All values in line_data are expected to be be unique from values in the database. 
A possible place for errors 258 | # to occur 259 | odds_table = database.get_table_mappings([tbl_name]) 260 | 261 | # Reconcile ensures the odds_table has appropriate start_times; Add logic so its not called every run 262 | reconcile(schedule, odds_table, "start_time", "id", "game_id", session) 263 | 264 | update_odds_table(odds_table, schedule, line_data.dict_to_rows(), session) 265 | else: 266 | raise Exception("Something is wrong here (Not descriptive, but this point shouldn't be hit.)") 267 | 268 | return True 269 | 270 | 271 | if __name__ == "__main__": 272 | from datatotable.database import Database 273 | db = Database("test", Config.get_property("outputs")) 274 | year = 2019 275 | session = Session(bind=db.engine) 276 | scrape(db, session) 277 | -------------------------------------------------------------------------------- /NBApredict/management/tables/schedule.py: -------------------------------------------------------------------------------- 1 | """schedule.py contains function to create the schedule table in the database""" 2 | 3 | from datetime import datetime, timedelta 4 | import math 5 | import nbapredict.management.conversion as convert 6 | from sqlalchemy import ForeignKey, func, tuple_ 7 | from sqlalchemy.orm import aliased 8 | import pandas as pd 9 | 10 | 11 | def format_data(session, schedule_data, team_tbl, team_stats_tbl): 12 | """Format and return schedule data to match the database schema. 
13 | 14 | Adds a Margin of Victory column and adds/modifies foreign key columns 15 | 16 | Args: 17 | schedule_data: A DataOperator object with schedule data 18 | team_tbl: A mapped instance of the team_tbl 19 | team_stats_tbl: A mapped instance of the team_stats_tbl 20 | """ 21 | h_score = schedule_data.data['home_team_score'] 22 | a_score = schedule_data.data['away_team_score'] 23 | schedule_data.data['MOV'] = [h_score[i] - a_score[i] for i in range(schedule_data.num_rows())] 24 | schedule_data.data['playoffs'] = [''] 25 | schedule_data.data['game_date'] = [datetime.date(t) for t in schedule_data.data['start_time']] 26 | schedule_data.fill('playoffs', None) 27 | schedule_data.data["home_team_id"] = convert.values_to_foreign_key(session, foreign_tbl=team_tbl, foreign_key="id", 28 | foreign_value="team_name", 29 | child_data=schedule_data.data.pop("home_team")) 30 | schedule_data.data["away_team_id"] = convert.values_to_foreign_key(session, foreign_tbl=team_tbl, foreign_key="id", 31 | foreign_value="team_name", 32 | child_data=schedule_data.data.pop("away_team")) 33 | 34 | today = datetime.date(datetime.now()) 35 | tomorrow = today + timedelta(days=1) 36 | tmrw_idx = 0 37 | for idx in range(len(schedule_data.data['start_time'])): 38 | if schedule_data.data['start_time'][idx].date() >= tomorrow: 39 | tmrw_idx = idx 40 | break 41 | if not tmrw_idx: 42 | raise ValueError("tmrw_idx was not found") 43 | subquery = session.query(team_stats_tbl.id, team_stats_tbl.team_id, func.max(team_stats_tbl.scrape_time)). 
\ 44 | filter(team_stats_tbl.scrape_date <= today).group_by(team_stats_tbl.team_id).subquery() 45 | schedule_data.data['home_stats_id'] = convert.values_to_foreign_key(session, subquery, 'id', 'team_id', 46 | schedule_data.data['home_team_id'][:tmrw_idx]) 47 | schedule_data.data['away_stats_id'] = convert.values_to_foreign_key(session, subquery, 'id', 'team_id', 48 | schedule_data.data['away_team_id'][:tmrw_idx]) 49 | schedule_data.fill('home_stats_id', None) 50 | schedule_data.fill('away_stats_id', None) 51 | 52 | return schedule_data 53 | 54 | 55 | def create_table(db, schedule_data, tbl_name, team_tbl, team_stats_tbl): 56 | """Create a table of the NBA schedule in the database. 57 | Args: 58 | db: a datotable.database.Database object connected to a database 59 | schedule_data: A datatotable.data.DataOperator object with schedule data 60 | tbl_name: The desired name of the table 61 | team_tbl: A mapped team table to set foreign keys on 62 | team_stats_tbl: A mapped team stats table to set foreign keys on 63 | """ 64 | columns = schedule_data.columns 65 | team_tbl_name = team_tbl.__table__.fullname 66 | team_stats_tbl_name = team_stats_tbl.__table__.fullname 67 | columns['home_team_id'].append(ForeignKey("{}.id".format(team_tbl_name))) 68 | columns['away_team_id'].append(ForeignKey("{}.id".format(team_tbl_name))) 69 | columns['home_stats_id'].append(ForeignKey("{}.id".format(team_stats_tbl_name))) 70 | columns['away_stats_id'].append(ForeignKey("{}.id".format(team_stats_tbl_name))) 71 | db.map_table(tbl_name=tbl_name, columns=columns) 72 | db.create_tables() 73 | db.clear_mappers() 74 | 75 | 76 | def update_table(session, schedule_data, schedule_tbl, team_stats_tbl): 77 | """Wrap and run update functions for the schedule_tbl.""" 78 | 79 | update_games(session, schedule_tbl, schedule_data) 80 | score_updates = update_scores(session, schedule_tbl, schedule_data) 81 | stats_updates = update_stats(session, schedule_tbl, team_stats_tbl) 82 | time_updates = 
def update_scores(session, schedule_tbl, schedule_data) -> list:
    """Fill in scores for past games that are still stored with a home score of 0 and return the changed rows.

    Args:
        session: A SQLalchemy session object
        schedule_tbl: A mapped schedule table object
        schedule_data: An object whose `dataframe` attribute holds the freshly scraped schedule

    Returns:
        A list of schedule rows whose scores, MOV, and start_time were updated; empty if nothing needs updating.
    """
    date = datetime.date(datetime.now())
    update_query = session.query(schedule_tbl).filter(schedule_tbl.start_time < date,
                                                      schedule_tbl.home_team_score == 0). \
        order_by(schedule_tbl.start_time)
    # if update_query.count() == 0:
    #     return
    rows = update_query.all()
    if len(rows) == 0:
        return []
    first_game_time = rows[0].start_time
    last_game_time = rows[len(rows) - 1].start_time

    sched_df = schedule_data.dataframe
    # Drop timezone info so the dataframe times can be compared with the times read from the table
    sched_df["start_time"] = sched_df["start_time"].dt.tz_localize(None)
    update_df = sched_df.loc[(sched_df.start_time >= first_game_time) & (sched_df.start_time <= last_game_time)]

    update_rows = []
    for row in rows:
        # Match on both teams and the calendar date
        game = update_df.loc[(update_df.home_team_id == row.home_team_id) & (update_df.away_team_id == row.away_team_id)
                             & (update_df.start_time.dt.date == datetime.date(row.start_time))]
        # NOTE(review): int() on these selections presumably requires exactly one matching game -- confirm
        row.home_team_score = int(game.home_team_score)
        row.away_team_score = int(game.away_team_score)
        row.MOV = row.home_team_score - row.away_team_score
        row.start_time = game.start_time.dt.to_pydatetime()[0]  # Convert Pandas TimeStamp to datetime
        update_rows.append(row)
    return update_rows


def update_stats(session, schedule_tbl, team_stats_tbl) -> list:
    """Attach team-stats foreign keys to scheduled games that lack them and return the changed rows.

    Starting at the date of the earliest game without a home_stats_id and walking one day at a time through today,
    each game in that day without stats is given, for both teams, the id of the stats row selected by the
    max(scrape_time) query for scrape times before the end of that day.

    Args:
        session: A SQLalchemy session object
        schedule_tbl: A mapped schedule table object
        team_stats_tbl: A mapped team stats table object

    Returns:
        A list of schedule rows whose home_stats_id and away_stats_id were set; empty if no game lacks stats.
    """
    tomorrow = datetime.date(datetime.now()) + timedelta(days=1)

    # `== None` is intentional: SQLAlchemy renders it as IS NULL
    d_time = session.query(func.min(schedule_tbl.start_time)).filter(schedule_tbl.home_stats_id == None).all()[0][0]
    if d_time is None:
        # The aggregate returns a single NULL when every game already has stats; there is nothing to update
        return []
    date = datetime.date(d_time)
    date_ranges = []
    while date < tomorrow:
        next_day = date + timedelta(days=1)
        date_ranges.append((date, next_day))
        date = next_day

    update_rows = []
    for d in date_ranges:
        # Get the team stats with the greatest scrape_time before the end date of the range (31 obs, all teams + L. AVG)
        # NOTE(review): selecting `id` next to max(scrape_time) under GROUP BY relies on the backend returning the
        # id from the max row (documented SQLite behaviour) -- confirm if the database backend ever changes.
        stats_q = session.query(team_stats_tbl.id, team_stats_tbl.team_id,
                                func.max(team_stats_tbl.scrape_time).label('s_time')). \
            filter(team_stats_tbl.scrape_time < d[1]).group_by(team_stats_tbl.team_id).subquery()
        home_stats = aliased(stats_q, 'home_stats')
        away_stats = aliased(stats_q, 'away_stats')

        sched_rows = session.query(schedule_tbl, home_stats.c.id.label('h_s_id'), away_stats.c.id.label('a_s_id')). \
            filter(schedule_tbl.home_stats_id == None, schedule_tbl.start_time > d[0], schedule_tbl.start_time < d[1]).\
            join(home_stats, schedule_tbl.home_team_id == home_stats.c.team_id). \
            join(away_stats, schedule_tbl.away_team_id == away_stats.c.team_id).all()

        for row in sched_rows:
            # The schedule entity is the first element of each result row; reading it by position removes the
            # dependency on a table literally named schedule_2020.
            game = row[0]
            # h_s_id comes from the join on home_team_id and a_s_id from the join on away_team_id
            # (these two assignments were previously swapped)
            game.home_stats_id = row.h_s_id
            game.away_stats_id = row.a_s_id
            update_rows.append(game)
    return update_rows
152 | 153 | Note this will not check if the date of a game has changed.""" 154 | today = datetime.date(datetime.now()) 155 | end_week = datetime.date(datetime.now()) + timedelta(days=7) 156 | 157 | games = session.query(schedule_tbl).filter(schedule_tbl.game_date >= today, 158 | schedule_tbl.game_date <= end_week).all() 159 | 160 | df = schedule_data.dataframe[['start_time', 'game_date', 'home_team_id', 'away_team_id']] 161 | df.start_time = df.start_time.dt.tz_localize(None) 162 | df = df[(df.start_time >= pd.Timestamp(today)) & (df.game_date <= end_week)] 163 | 164 | update_rows = [] 165 | for game in games: 166 | if df[(df.start_time == game.start_time) & (df.home_team_id == game.home_team_id)].empty: 167 | date = game.game_date 168 | changed_game = df[(df.home_team_id == game.home_team_id) & (df.away_team_id == game.away_team_id) & 169 | (df.game_date == game.game_date)] 170 | if changed_game.empty: 171 | raise ValueError('Game time for {} @ {} has changed,' 172 | ' but cannot find the new game time'.format(game.home_team_id, 173 | game.away_team_id)) 174 | elif len(changed_game) == 1: 175 | new_time = changed_game.start_time 176 | new_time_timestamp = pd.to_datetime(new_time.values[0]) 177 | game.start_time = new_time_timestamp 178 | update_rows.append(game) 179 | else: 180 | raise ValueError('Game time for {} @ {} has changed,' 181 | 'but there are multiple replacement values available'.format(game.home_team_id, 182 | game.away_team_id)) 183 | return update_rows 184 | 185 | 186 | def update_games(session, schedule_tbl, schedule_data): 187 | """Check if any games have been removed or added from the schedule and add that change to the database. 188 | 189 | ToDo: Add check for new games (i.e. when Clippers-Lakers gets rescheduled) 190 | ToDo: This should work for playoff games too, right? 
191 | ToDo: Iterating through indices potentially slow, though great alternatives don't seem to exist 192 | """ 193 | 194 | data_len = len(schedule_data.data['start_time']) 195 | tbl_len = session.query(schedule_tbl).count() 196 | if data_len < tbl_len: 197 | data_df = pd.DataFrame({'home_team_id': schedule_data.data['home_team_id'], 198 | 'game_date': schedule_data.data['game_date']}) 199 | 200 | tbl_id_dates = session.query(schedule_tbl.home_team_id, schedule_tbl.game_date).all() 201 | id_dates_dict = {'home_team_id': [r.home_team_id for r in tbl_id_dates], 202 | 'game_date': [r.game_date for r in tbl_id_dates]} 203 | tbl_df = pd.DataFrame(id_dates_dict) 204 | 205 | # Outer join for all rows, indicator for diff column 206 | comp = data_df.merge(tbl_df, how='outer', indicator=True) 207 | tbl_only = comp[comp['_merge'] == 'right_only'] 208 | ids = tbl_only['home_team_id'].values.tolist() 209 | dates = tbl_only['game_date'].values.tolist() 210 | cancelled_games = [(ids[i], dates[i]) for i in range(len(ids))] 211 | 212 | delete_rows = session.query(schedule_tbl).filter(tuple_(schedule_tbl.home_team_id, schedule_tbl.game_date). 213 | in_(cancelled_games)) 214 | if delete_rows.count() > 0: 215 | for row in delete_rows: 216 | session.delete(row) 217 | 218 | 219 | -------------------------------------------------------------------------------- /NBApredict/models/four_factor_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Spencer Weston 3 | 4 | Purpose: Four Factor Regression performs a regression on the Margin of Victory (mov) between NBA teams with each teams 5 | four factors(offensive and defensive) as predictors. The regression object is returned from the module. 
6 | 7 | Args (default): 8 | year (2019): The year of the season desired 9 | db_url ('sqlite:///database//nba_db.db'): Path to the database where data should be written 10 | 11 | Returns: 12 | Returns a LinearRegression class 13 | """ 14 | from datetime import datetime 15 | import matplotlib.pyplot as plt 16 | import numpy as np 17 | import pandas as pd 18 | import os 19 | import scipy.stats as stats 20 | from sqlalchemy.orm import Session 21 | from sqlalchemy import func, alias 22 | 23 | import statsmodels.api as sm 24 | from statsmodels.stats.outliers_influence import variance_inflation_factor as vif 25 | 26 | # Local Packages 27 | from datatotable.database import Database 28 | from nbapredict.database import getters 29 | from nbapredict.helpers import br_references as br 30 | from nbapredict.management import conversion 31 | from nbapredict.models import graphing 32 | from nbapredict.configuration import Config 33 | 34 | 35 | class LinearRegression: 36 | """A class that creates and holds linear regression information and functions for regression evaluation. 37 | 38 | LinearRegression is initialized with a target variable and the desired predictors. Then, a regression is run and 39 | necessary regression stats are stored as class parameters. Member functions generate evaluative graphs and/or 40 | stats for the regression. 
41 | 42 | Attributes: 43 | target: The target variable 44 | predictors: The predictive variables 45 | results: statsmodels results wrapper 46 | predictions: predicted results from the regression 47 | r_squared: r_squared of the regression 48 | adj_r_squared: adj_r_squared of the regression 49 | r_squared_rnd: r_squared rounded to three decimal places 50 | residuals: residuals of the gression 51 | p_values: p_values of the coefficients 52 | coefs: values of the coefficients 53 | output: data frame of coefficients with their values and p_values""" 54 | 55 | def __init__(self, target, predictors): 56 | """Performs a linear regression and stores pertinent regression outputs as class variables 57 | 58 | Args: 59 | target: The target variable 60 | predictors: The prediction variables""" 61 | self.target = target 62 | self.predictors = sm.add_constant(predictors) 63 | self.results = sm.OLS(target, self.predictors).fit() 64 | self.predictions = self.results.predict(self.predictors) 65 | self.r_squared = self.results.rsquared 66 | self.adj_r_squared = self.results.rsquared_adj 67 | self.r_squared_rnd = np.around(self.r_squared, 3) 68 | self.residuals = self.results.resid 69 | self.p_values = self.results.pvalues 70 | self.coefs = self.results.params 71 | self.output = pd.concat([self.coefs, self.p_values], axis=1) 72 | self.output.columns = ["coefficient", "p_value"] 73 | 74 | def predicted_vs_actual(self, out_path=None): 75 | """Generate a predicted vs. actual graph, save to out_path if it exists, and return the graph.""" 76 | graph = graphing.pred_vs_actual(self.predictions, self.target, self.r_squared_rnd, out_path=out_path) 77 | return graph 78 | 79 | def residuals_vs_fitted(self, out_path=None): 80 | """Generate a residuals vs. 
fitted graph, save to out_path if it exists, and return the graph.""" 81 | graph = graphing.residuals_vs_fitted(self.predictions, self.residuals, out_path) 82 | return graph 83 | 84 | def qqplot(self, out_path=None): 85 | """Generate a qq plot, save to out_path if it exists, and return the graph.""" 86 | fig = sm.qqplot(self.residuals, dist=stats.t, fit=True, line="45") 87 | if out_path: 88 | fig.savefig(out_path) 89 | return fig 90 | 91 | def influence_plot(self, out_path=None): 92 | """Generate an influence plot, save to out_path if it exists, and return the graph.""" 93 | fig, ax = plt.subplots(figsize=(12, 8)) 94 | fig = sm.graphics.influence_plot(self.results, alpha=0, ax=ax, criterion="cooks") 95 | if out_path: 96 | fig.savefig(out_path) 97 | return fig 98 | 99 | def cooks_distance(self, out_path=None): 100 | """Generate a cook's distance graph, save to out_path if it exists, and return the graph.""" 101 | influence = self.results.get_influence() 102 | # c is the distance and p is p-value 103 | (c, p) = influence.cooks_distance 104 | graph = graphing.cooks_distance(c, out_path) 105 | return graph 106 | 107 | def residual_independence(self, out_path=None): 108 | """Generate a residual independence plot, save to out_path if it exists, and return the graph.""" 109 | residuals = self.residuals 110 | plot = graphing.residual_independence(residuals) 111 | if out_path: 112 | plot.savefig(out_path) 113 | return plot 114 | 115 | def vif(self): 116 | """Determine the Variance Inflation Factor (vif) of the coefficients and return a dataframe of the vif's.""" 117 | vif_out = pd.DataFrame() 118 | predictors = np.array(self.predictors) 119 | vif_out["VIF Factor"] = [vif(predictors, i) for i in range(predictors.shape[1])] 120 | vif_out["features"] = self.predictors.columns 121 | return vif_out 122 | 123 | def residual_distribution(self): 124 | """Calculate the normal curve of the residuals and return the distribution""" 125 | norm = stats.norm 126 | mu, std = 
norm.fit(self.residuals) 127 | # mu = 0 # By definition, mu of resids = 0, but the fit provides approximately 0. It's perhaps best to just 128 | # set mu=0? 129 | return norm(loc=mu, scale=std) 130 | 131 | 132 | def create_ff_regression_df(session, team_stats_tbl, sched_tbl, ff_list): 133 | """Create and return a regression data frame of the four factors (ff) for each team in a matchup. 134 | 135 | Args: 136 | session: Sqlalchemy session object 137 | team_stats_tbl: mapped team stats table object 138 | sched_tbl: mapped schedule table object 139 | ff_list: List of the four factors variable 140 | 141 | Returns: 142 | A data frame with home('_h') and away('_a') stats and the margin of victory (mov). The mov is the target 143 | for a regression. The '_h' and '_a" stats are the home and away four factors in a specific matchup. 144 | """ 145 | home_stats = alias(team_stats_tbl, name='home') 146 | away_stats = alias(team_stats_tbl, name='away') 147 | sched = alias(sched_tbl, name='sched') 148 | home_stat_ff = [getattr(home_stats.c, col) for col in ff_list if col in home_stats.c.keys()] 149 | away_stat_ff = [getattr(away_stats.c, col) for col in ff_list if col in away_stats.c.keys()] 150 | sched_stats_query = session.query(sched, *home_stat_ff, *away_stat_ff).filter(sched.c['home_team_score'] > 0).\ 151 | join(home_stats, home_stats.c['id'] == sched.c['home_stats_id']).\ 152 | join(away_stats, away_stats.c['id'] == sched.c['away_stats_id']).subquery(with_labels=True) 153 | sched_stats = session.query(sched_stats_query) 154 | 155 | df = conversion.convert_sql_statement_to_table(session, sched_stats.statement) 156 | return df 157 | 158 | 159 | def alt_regression_df(session, team_stats_tbl, sched_tbl, ff_list, qualifiers=None): 160 | """Alternate regression df where the latest team_stats are applied to all games in schedule. 
161 | 162 | Args: 163 | session: A sqlalchemy session object 164 | team_stats_tbl: A mapped team stats table object 165 | sched_tbl: a mapped schedule table object' 166 | qualifiers: Optional qualifiers to apply to the returned regression dataframe. Can be columns to subset from the 167 | regression dataframe or a function 168 | 169 | Returns: 170 | A regression dataframe, modified by qualifiers if specified, with the four factors 171 | """ 172 | team_stats = session.query(team_stats_tbl).group_by(team_stats_tbl.team_id).having(func.max(team_stats_tbl.id)).\ 173 | subquery() 174 | home_stats = alias(team_stats, name='home') 175 | away_stats = alias(team_stats, name='away') 176 | sched = alias(sched_tbl, name='sched') 177 | home_stat_ff = [getattr(home_stats.c, col) for col in ff_list if col in home_stats.c.keys()] 178 | away_stat_ff = [getattr(away_stats.c, col) for col in ff_list if col in away_stats.c.keys()] 179 | 180 | sched_stats_query = session.query(sched, *home_stat_ff, *away_stat_ff).filter(sched.c['home_team_score'] > 0).\ 181 | join(home_stats, home_stats.c['team_id'] == sched.c['home_team_id']).\ 182 | join(away_stats, away_stats.c['team_id'] == sched.c['away_team_id']).subquery(with_labels=True) 183 | 184 | sched_stats = session.query(sched_stats_query) 185 | 186 | if qualifiers: 187 | df = conversion.convert_sql_statement_to_table(session, sched_stats.statement, qualifiers) 188 | else: 189 | df = conversion.convert_sql_statement_to_table(session, sched_stats.statement) 190 | return(df) 191 | 192 | 193 | def get_team_ff(ff_df, team, ff_list, home): 194 | """Extract the four factors for a specific team from the ff_df and return the result. 195 | 196 | Further, if home is True, a "_h" is appended to each four factor for the team. And if False, "_a" is appended. 197 | This is to specify if the team is home or away. 
198 | 199 | Args: 200 | ff_df: four factors Pandas data frame (read from SQL table) 201 | team: A team name 202 | ff_list: List of the four factors variable 203 | home: Boolean. True if the team is home; False if the team is away 204 | """ 205 | team_ff = ff_df[ff_df.team_name.str.lower() == team.lower()][ff_list] 206 | if home: 207 | team_ff = team_ff.rename(append_h, axis='columns') 208 | else: 209 | team_ff = team_ff.rename(append_a, axis='columns') 210 | return team_ff 211 | 212 | 213 | def append_h(string): 214 | """Append "_h" to string and return the modified string""" 215 | string = '{}{}'.format(string, '_h') 216 | return string 217 | 218 | 219 | def append_a(string): 220 | """Append "_a" to string and return the modified string""" 221 | string = '{}{}'.format(string, '_a') 222 | return string 223 | 224 | 225 | def ensure_unique_index(index, indices, i=1): # Indexed to 1 so +1 == 2nd, 3rd, 4th, etc. game 226 | """Check if index is in indices, modify index until it's unique, and return the unique index 227 | 228 | If the index is unique, it's returned as is. Otherwise, the function calls itself and increments i. The recursion 229 | stops when the index and numerical suffix (i) are not in indices. Used to create unique identifiers for multiple 230 | matchups between the same teams. 
231 | 232 | Args: 233 | index: A string index to check for in indices 234 | indices: A list of indices to check the index against 235 | i: A numerical suffix used to modify index until it does not exist in indices 236 | Returns: 237 | index, or a modified form of index, that does not exist in indices 238 | """ 239 | if index in indices: 240 | i = i+1 241 | test_index = "{}{}".format(index, i) 242 | if test_index in indices: 243 | return ensure_unique_index(index, indices, i) 244 | else: 245 | return test_index 246 | else: 247 | return index 248 | 249 | 250 | def four_factors_list(): 251 | """Create a four factor(ff) list and identifying information and return it.""" 252 | # Import and specify a list of factors to extract from database 253 | ff_list = br.four_factors.copy() 254 | return ff_list 255 | 256 | 257 | def main(session, team_stats_tbl, sched_tbl, graph=False): 258 | """Create a regression data frame, run a regression through the LinearRegression class, and return the class 259 | 260 | Args: 261 | session: An instantiated Session object from sqlalchemy 262 | team_stats_tbl: A mapped team stats table class 263 | sched_tbl: A mapped schedule table class 264 | graph: A boolean that creates graphs if true 265 | 266 | Returns: 267 | A LinearRegression class 268 | """ 269 | league_year = Config.get_property("league_year") 270 | graph_dir = Config.get_property("graph_dir") 271 | if not os.path.exists(graph_dir) and graph: 272 | os.mkdir(graph_dir) 273 | 274 | # Import and specify a list of factors to extract from database 275 | ff_list = four_factors_list() 276 | 277 | # regression_df = create_ff_regression_df(session, team_stats_tbl, sched_tbl, ff_list) 278 | regression_df = alt_regression_df(session, team_stats_tbl, sched_tbl, ff_list) 279 | print('using alternative/old regression_df') 280 | 281 | # Separate DF's into them into X (predictors) and y (target) 282 | predictors = regression_df[regression_df.columns.drop(list(regression_df.filter(regex='sched')))] 
283 | target = regression_df["sched_MOV"] 284 | 285 | ff_reg = LinearRegression(target, predictors) 286 | 287 | # Note: On Windows, graphs will not appear to update 288 | # To change that, go to folder properties -> customize -> optimize for: Documents 289 | if graph: 290 | ff_reg.predicted_vs_actual(out_path=os.path.join(graph_dir, "pred_vs_actual_{}.png".format(league_year))) 291 | ff_reg.residuals_vs_fitted(out_path=os.path.join(graph_dir, "residuals_vs_fitted_{}.png".format(league_year))) 292 | ff_reg.qqplot(out_path=os.path.join(graph_dir, "qqplot_{}.png".format(league_year))) 293 | ff_reg.influence_plot(out_path=os.path.join(graph_dir, "influence_{}.png".format(league_year))) 294 | ff_reg.cooks_distance(out_path=os.path.join(graph_dir, "cooks_distance_{}.png".format(league_year))) 295 | ff_reg.residual_independence(out_path=os.path.join(graph_dir, "resid_independence_{}.png".format(league_year))) 296 | 297 | # Multicollinearity 298 | # vif_df = ff_reg.vif() 299 | # ff_reg.residual_distribution() 300 | 301 | return ff_reg 302 | 303 | 304 | if __name__ == "__main__": 305 | db = Database('test', "../management") 306 | session = Session(db.engine) 307 | year = Config.get_property('league_year') 308 | sched_tbl = db.table_mappings['schedule_{}'.format(year)] 309 | team_stats_tbl = db.table_mappings['team_stats_{}'.format(year)] 310 | test = main(session, graph=True) 311 | t=2 312 | -------------------------------------------------------------------------------- /project_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "Author: Spencer Weston\n", 10 | "\n", 11 | "Website: [Crockpot Thoughts](https://crockpotthoughts.wordpress.com/)\n", 12 | "\n", 13 | "Last Update: 08/10/2019" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Guide 
to NBA Bet\n", 21 | "In this notebook, we'll overview the usage of the NBA bet project. First, we'll run the project from the top level. And afterwards, we'll dive into some of the details of the project's implementation. You can work through this notebook in two ways. If you view it on GitHub, you can download a clone of the project and follow along. Or, if you have Jupyter installed, you can run the notebook yourself. However, since it is the NBA offseason, the project will have limited functionality." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Running the Project\n", 29 | "The run directory holds the scripts to run the entire project. There are two modules: all and daily. Daily calls run.all.run_all() one hour before the next game in the schedule. All contains run_all() which runs the entire project. " 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "Daily is called from the command line and runs on main. 
To call it at the command line, enter: ~\\NBA_bet>python -m run.daily" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "To run the project on an ad hoc basis, we use the run_all() function:" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 1, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "ename": "ModuleNotFoundError", 53 | "evalue": "No module named 'database'", 54 | "traceback": [ 55 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 56 | "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", 57 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mNBApredict\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mall\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mrun_all\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 58 | "\u001b[1;32m~\\Documents\\Projects\\NBApredict\\NBApredict\\NBApredict\\run\\all.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[1;31m# Local Imports\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 12\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mdatabase\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdatabase\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mDatabase\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 13\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpredict\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mpredict\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mscrapers\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mscraper\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 59 | 
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'database'" 60 | ], 61 | "output_type": "error" 62 | } 63 | ], 64 | "source": [ 65 | "from NBApredict.run.all import run_all" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 2, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stderr", 75 | "output_type": "stream", 76 | "text": [ 77 | "c:\\users\\spencer\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\numpy\\core\\fromnumeric.py:2389: FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.\n", 78 | " return ptp(axis=axis, out=out, **kwargs)\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "run_all()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "Console output will raise a deprecation warning, an issue I've scheduled to fix. \n", 91 | "\n", 92 | "It's really that simple to run the project. If you've got this far, you've run the whole project. Now let's look at what's going on under the hood." 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Scraping Data\n", 100 | "We scrape data from two sources: Basketballreference.com for basketball statistics and Bovada.com for betting lines. We're concerned with three types of data:\n", 101 | "1. Team Stats - These statistics are our explanatory variables. They describe Dean Oliver's four factors as described in the [third post](https://crockpotthoughts.wordpress.com/2019/08/05/predicting-nba-games-part-3-the-model/) in my series overviewing the project. \n", 102 | "2. The Schedule - We need the schedule to know when games are played.\n", 103 | "3. Betting Lines - Lines to compare our predictions to.\n", 104 | "\n", 105 | "All the data scrapers are held in the \"scrapers\" folder. To scrape all data, scrapers.scraper.scrape_all() will scrape and store all data in a local SQLite database. 
You can find the database in the \"outputs\" folder. When the project is run, it will create this folder for you. The scrape_all function takes three arguments: a database, a session, and the league year. Let's set these arguments up:" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 3, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "from scrapers import scraper\n", 115 | "from database.database import Database\n", 116 | "from sqlalchemy.orm import Session" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 4, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "db = Database()\n", 126 | "session = Session(bind=db.engine)\n", 127 | "league_year = 2019" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "On initialization, the Database class will initialize within the working directory. Alternatively, init accepts a file path to the desired database location. We then initialize the Session with the engine, part of the sqlalchemy package, from db. Now, we can pass these arguments to the scrape_all function." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 5, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "scraper.scrape_all(database=db, session=session, league_year=league_year)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "Now, check the outputs folder. You'll find nba_db.db within. It can be viewed with your database software of choice, or you can interact with it through the instantiated Database class. We can see our tables through the db.get_tables() function. Note: If you didn't call run_all() earlier in the notebook, you will not have the 'predictions_2019' table. 
" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 6, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "dict_keys(['misc_stats_2019', 'odds_2019', 'sched_2019', 'predictions_2019'])" 162 | ] 163 | }, 164 | "execution_count": 6, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "db.get_tables().keys()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "If you'd like, you can only scrape one type of data at a time. Just call the individual scraper's scrape() function." 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 7, 183 | "metadata": { 184 | "scrolled": true 185 | }, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "True" 191 | ] 192 | }, 193 | "execution_count": 7, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "from scrapers import line_scraper, team_scraper, season_scraper\n", 200 | "\n", 201 | "team_scraper.scrape(db, league_year)\n", 202 | "line_scraper.scrape(db, session, league_year)\n", 203 | "season_scraper.scrape(db, session, league_year)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "## Generating Predictions\n", 211 | "Now, let's look at predicting games. We can predict all games we have betting lines for, all games on a specific day, or individual games. Predicting all games will store all results in the database while games on a specific day or individual games will return a dictionary with results. This portion of the project suffers from entangled function because too many specifications and data operations are managed within functions. 
However, predict_all() can be run with a simple call:" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 8, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "from predict import predict\n", 221 | "\n", 222 | "predict.predict_all(db, session, league_year)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "Uhh IDK placeholder" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 10, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "ename": "NameError", 239 | "evalue": "name 'year' is not defined", 240 | "traceback": [ 241 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 242 | "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", 243 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mdate\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2019\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m3\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m26\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mpredict\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict_games_on_date\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdb\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mleague_year\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdate\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconsole_out\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 244 | "\u001b[1;32m~\\Documents\\Projects\\test\\NBA_bet\\predict\\predict.py\u001b[0m in \u001b[0;36mpredict_games_on_date\u001b[1;34m(database, session, league_year, date, console_out)\u001b[0m\n\u001b[0;32m 479\u001b[0m 
\u001b[0mgame_spreads\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mgame\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mgame\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mgames_query\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 480\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 481\u001b[1;33m \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpredict_games_on_day\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdatabase\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgame_spreads\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconsole_out\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mconsole_out\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 482\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 483\u001b[0m \u001b[0mprediction_tbl\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"predictions_{}\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mleague_year\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 245 | "\u001b[1;32m~\\Documents\\Projects\\test\\NBA_bet\\predict\\predict.py\u001b[0m in \u001b[0;36mpredict_games_on_day\u001b[1;34m(database, session, games, console_out)\u001b[0m\n\u001b[0;32m 189\u001b[0m \"\"\"\n\u001b[0;32m 190\u001b[0m \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 191\u001b[1;33m \u001b[0mregression\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlm\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdatabase\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdatabase\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msession\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0myear\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0myear\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 192\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 193\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mgame\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mgames\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 246 | "\u001b[1;31mNameError\u001b[0m: name 'year' is not defined" 247 | ], 248 | "output_type": "error" 249 | } 250 | ], 251 | "source": [ 252 | "from datetime import datetime\n", 253 | "\n", 254 | "date = datetime(2019, 3, 26)\n", 255 | "predict.predict_games_on_date(db, session, league_year, date, console_out=True)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [] 264 | } 265 | ], 266 | "metadata": { 267 | "kernelspec": { 268 | "display_name": "Python 3", 269 | "language": "python", 270 | "name": "python3" 271 | }, 272 | "language_info": { 273 | "codemirror_mode": { 274 | "name": "ipython", 275 | "version": 3 276 | }, 277 | "file_extension": ".py", 278 | "mimetype": "text/x-python", 279 | "name": "python", 280 | "nbconvert_exporter": "python", 281 | "pygments_lexer": "ipython3", 282 | "version": "3.7.1" 283 | } 284 | }, 285 | "nbformat": 4, 286 | "nbformat_minor": 2 287 | } 288 | -------------------------------------------------------------------------------- /NBApredict/predict/bets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Predict.odds contains functions organized around comparing predictions to odds 3 | 4 | ToDo: 5 | In theory, the module will allow multiple model inputs. Thus, we can pass it a linear, bayesian, ML, etc. model, 6 | generate results, and store them. That functionality does not exist. 
This should also have a class of some sort to 7 | manage predictions. It will add specificity and remove call complexity and name overlaps (i.e. 8 | predict_games_on_day() vs. predict_games_on_date()) 9 | """ 10 | 11 | from datetime import datetime 12 | import numpy as np 13 | import pandas as pd 14 | import scipy.stats as stats 15 | from sqlalchemy.orm import Session 16 | from sqlalchemy import or_ 17 | from sqlalchemy.exc import IntegrityError 18 | 19 | # Local imports 20 | from nbapredict.configuration import Config 21 | from nbapredict.helpers import br_references 22 | from datatotable.database import Database 23 | from datatotable.data import DataOperator 24 | from nbapredict.database import getters 25 | from nbapredict.management import conversion 26 | from nbapredict.management.tables import predictions 27 | from nbapredict.models import four_factor_regression as ff_reg 28 | 29 | 30 | def get_prediction(reg, pred_df): 31 | """Generate and return a prediction for the observations in the pred_df. 32 | 33 | Args: 34 | reg: LinearRegression class from four_factors_regression.py 35 | pred_df: A dataframe of observations, with home and away statistics, from which to generate a prediction 36 | 37 | Returns: 38 | The predicted value generated from the regression object and the predictors""" 39 | return reg.results.predict(pred_df).values[0] 40 | 41 | 42 | def get_team_name(team): 43 | """Match team to a standard team name and return the br_references standard team name.""" 44 | for team_name in br_references.Team: 45 | if team.lower() == team_name.value.lower(): 46 | return team_name.value 47 | 48 | 49 | # def create_prediction_df(home_tm, away_tm, ff_df): 50 | # """Create and return a dataframe that merges the four factors for the home and away team. 
#     TODO: Replace with ff_reg.alt_regression_df/getregression_df
#
#     Args:
#         home_tm: The home team
#         away_tm: The away team
#         ff_df: Dataframe of the four factors for all teams
#
#     Returns:
#         A single row four factors data frame of the home and away team's four factors
#     """
#     home_ff = get_team_ff(home_tm, ff_df, home=True)
#     away_ff = get_team_ff(away_tm, ff_df, home=False)
#     home_ff["key"] = 1
#     home_ff["const"] = 1.0  # sm.add_const does not add a constant for whatever reason
#     away_ff["key"] = 1
#     merged = pd.merge(home_ff, away_ff, on="key", sort=True)
#     merged = merged.drop(["key"], axis=1)
#     merged = merged.sort_index(axis=1)
#     return merged


def get_team_ff(team, ff_df, home):
    """Create and return a data frame of the four factors for the specified team.

    Args:
        team: The team to extract the four factors for (matched case-insensitively against ff_df.team_name)
        ff_df: A dataframe of the four factors
        home: Boolean which dictates if an '_h' or '_a' should be appended to the team's stats

    Returns:
        The four factors, with a home or away suffix, for a team are returned as a data frame
    """
    ff_list = br_references.four_factors
    team_ff = ff_df[ff_df.team_name.str.lower() == team.lower()][ff_list]
    add_suffix = ff_reg.append_h if home else ff_reg.append_a
    return team_ff.rename(add_suffix, axis='columns')


def line_probability(prediction, line, std):
    """Calculate and return the CDF or SF, as appropriate, of the line if the model were true.

    "if the model were true" means that if the assumption holds that the residuals are homoscedastic and follow a
    normal distribution

    Args:
        prediction: The prediction for a game
        line: The line associated with the same game as the prediction
        std: The standard deviation of the residuals for the model used to make the prediction

    Returns:
        A (probability, function) tuple. function is "cdf" when the prediction is at or above the line's implied
        margin (-1 * line) and "sf" when it is below.
    """
    # ToDo: T-Distribution?
    dist = stats.norm(loc=prediction, scale=std)
    line_prediction = -1 * line

    if prediction > line_prediction:
        return dist.cdf(line_prediction), "cdf"
    elif prediction < line_prediction:
        return dist.sf(line_prediction), "sf"
    elif prediction == line_prediction:
        # The CDF at the mean is exactly 0.5. Return the same (probability, function) shape as the other
        # branches so callers can always unpack two values.
        return 0.5, "cdf"


def prediction_result_console_output(home_tm, away_tm, line, prediction, probability):
    """Generate human readable printout comparing the model's predictions, the line, and the p_value of the line.

    Nothing is printed when the prediction is exactly 0 (or is not comparable to 0).

    Args:
        home_tm: The home team
        away_tm: The away team
        line: The betting line
        prediction: A prediction of the home team's margin of victory
        probability: The probability of the betting line as determined by a CDF or SF
    """
    if prediction > 0:
        outcome = "beat"
    elif prediction < 0:
        outcome = "lose to"
    else:
        return

    print("The {} are projected to {} the {} by {} points".format(home_tm, outcome, away_tm, prediction))
    # The probability message is the same for wins and losses; only the function name differs.
    function = "CDF" if (-1 * line) < prediction else "SF"
    print("If the model were true, the betting line's ({}) {}, in relation to the prediction, would "
          "be realized {}% of the time".format(line, function, probability))


def insert_predictions(rows, session, pred_tbl, sched_tbl):
    """Add rows into the prediction table in session with additional information from sched_tbl.

    # ToDo: Will need equivalent function, but it won't look like this
    Args:
        rows: SQLalchemy compatible rows
        session: A SQLalchemy session object
        pred_tbl: A mapped prediction table object
        sched_tbl: A mapped scheduled table object
    """
    row_objects = [pred_tbl(**row) for row in rows]
    # NOTE(review): update_schedule_attributes is neither defined nor imported in this module -- confirm where
    # it lives before relying on this function.
    row_objects = update_schedule_attributes(row_objects, session, sched_tbl)

    session.add_all(row_objects)


def insert_new_predictions(rows, session, pred_tbl, sched_tbl, odds_tbl):
    """Insert unique predictions in rows which do not already exist in the prediction table.

    Additional information from sched_tbl and odds_tbl is added to the rows as well.

    # ToDo: Will need significant rewrite (Also note similarities between this function and the one above)
    Args:
        rows: SQLalchemy compatible rows
        session: a SQLalchemy session object
        pred_tbl: A mapped prediction table object
        sched_tbl: A mapped scheduled table object
        odds_tbl: A mapped odds_tbl object
    """
    existing_predictions = session.query(pred_tbl.home_team, pred_tbl.away_team, pred_tbl.start_time).all()
    # A set gives constant-time membership checks instead of rescanning a list for every row.
    existing_games = {(game.home_team, game.away_team, game.start_time) for game in existing_predictions}

    row_objects = []
    for row in rows:
        game_identifier = (row["home_team"], row["away_team"], row["start_time"])
        if game_identifier in existing_games:
            continue
        row_objects.append(pred_tbl(**row))

    if row_objects:
        # NOTE(review): update_odds_id and update_schedule_attributes are neither defined nor imported in this
        # module -- confirm where they live.
        row_objects = update_odds_id(row_objects, session, odds_tbl)
        row_objects = update_schedule_attributes(row_objects, session, sched_tbl)
    session.add_all(row_objects)


def update_prediction_table(session, pred_tbl, sched_tbl, odds_tbl):
    """Find and update null or 0 values in the score, odds_id, or bet_result columns of the prediction table.

    Args:
        session: A SQLalchemy session object
        pred_tbl: A mapped prediction table object
        sched_tbl: A mapped scheduled table object
        odds_tbl: A mapped odds_tbl object
    """
    score_update_objs = session.query(pred_tbl).filter(or_(pred_tbl.home_team_score == 0,
                                                           pred_tbl.away_team_score == 0)).all()
    # NOTE(review): these rows are re-added unchanged and sched_tbl/odds_tbl are never read, so scores and odds
    # ids are not actually refreshed here -- it looks like an update step was dropped.
    session.add_all(score_update_objs)

    bet_update_objs = session.query(pred_tbl).filter(pred_tbl.bet_result.is_(None), pred_tbl.home_team_score > 0).all()
    bet_update_objs = update_bet_results(bet_update_objs)
    session.add_all(bet_update_objs)


def update_bet_results(bet_update_objects):
    """Take bet_update_objects, determine the prediction result, and add the result to each row in bet_update_objects.

    # ToDo: Will need this function, but will require a lot of modification
    Args:
        bet_update_objects: Objects from a query.all() from the prediction table. Objects should have a home and
        away team score.

    Returns:
        bet_update_objects updated with the bet results (WIN, LOSS, or PUSH).
    """
    for row in bet_update_objects:
        score_margin = row.home_team_score - row.away_team_score
        line_inverse = row.line * -1
        prediction = row.prediction
        if score_margin == line_inverse:
            row.bet_result = "PUSH"
        elif (score_margin < line_inverse) and (prediction < line_inverse):
            row.bet_result = "WIN"
        elif (score_margin > line_inverse) and (prediction > line_inverse):
            row.bet_result = "WIN"
        else:
            row.bet_result = "LOSS"
    return bet_update_objects


def get_sample_prediction(session, regression):
    """Generate and return a sample prediction formatted specifically for table creation.

    Args:
        session: A SQLalchemy session object
        regression: A regression object from four_factor_regression.py

    Returns:
        A DataOperator object initialized with a prediction from regression
    """
    # Double brackets keep the result a one-row DataFrame rather than a Series.
    one_row_dataframe = regression.predictors.loc[[0]]

    sample_prediction = predict_game(session, regression, one_row_dataframe)
    data = DataOperator(sample_prediction)
    return data


def predict_game(session, regression, x_df, console_out=False):
    """Predict a game and return the information in a dictionary.

    Use console out for human readable output if desired. CDF is a cumulative density function. SF is a survival
    function. CDF is calculated when the betting line's prediction is below the model's prediction. SF is calculated
    when the betting line's prediction is above the model's prediction.

    Args:
        session: A SQLalchemy session object (currently unused)
        regression: A regression object
        x_df: A dataframe of predictors for the game; only the first predicted value is returned
        console_out: If true, print the prediction results. Ignore otherwise (currently unused; the console
            output is commented out below)

    Returns:
        A dictionary of the form {"prediction": prediction}
    """

    prediction = get_prediction(regression, x_df)
    # probability, function = line_probability(prediction, line, np.std(regression.residuals))

    # if console_out:
    #     prediction_result_console_output(home_tm, away_tm, prediction, probability)

    return {"prediction": prediction}


def predict_games_in_odds(session, regression, odds_tbl):
    """Generate and return predictions for all games with odds in the odds_tbl

    ToDo: Take tables as inputs vs. DB
    Args:
        session: A SQLalchemy session object
        regression: A linear regression object generated from four_factor_regression
        odds_tbl: Mapped sqlalchemy odds table

    Returns:
        A list with one prediction per row of odds_tbl
    """
    all_odds = session.query(odds_tbl).all()
    predictions = []
    for odds in all_odds:
        home_team = odds.home_team
        away_team = odds.away_team
        start_time = odds.start_time
        line = odds.spread
        # NOTE(review): predict_game() now takes (session, regression, x_df, console_out); this call still uses
        # the old team/time/line signature and will raise TypeError until a predictor row is built per game.
        predictions.append(predict_game(session, regression, home_team, away_team, start_time, line))
    return predictions


def predict_games_on_day(database, session, games, console_out=False):
    """Take a SQLalchemy query object of games, and return a prediction for each game.

    ToDO: On day versus on date?
    Args:
        database: An instantiated database object exposing table_mappings (as used in predict_all)
        session: A SQLalchemy session object
        games: a SQLalchemy query object of games containing start_time, home_tm, away_tm, and the spread
        console_out: A bool. True to print prediction outputs
    """
    results = []
    # ff_reg.main takes (session, team_stats_tbl, sched_tbl); look the tables up the same way predict_all does
    # instead of passing the old database/year keywords (year was never defined in this scope).
    league_year = Config.get_property("league_year")
    sched_tbl = database.table_mappings["schedule_{}".format(league_year)]
    team_stats_tbl = database.table_mappings["team_stats_{}".format(league_year)]
    regression = ff_reg.main(session, team_stats_tbl, sched_tbl)
    # NOTE(review): the predict_game() calls below still use the old keyword signature and will raise TypeError
    # until a predictor row (x_df) is built for each game.
    try:
        for game in games:
            prediction = predict_game(database=database, session=session, regression=regression, home_tm=game.home_team,
                                      away_tm=game.away_team, start_time=game.start_time, line=game.spread,
                                      console_out=console_out)
            results.append(prediction)
    except AttributeError:
        # If games doesn't contain spreads, catch the attribute error and pass a 0 line.
        # If games is missing other data, function will break.
        for game in games:
            prediction = predict_game(database=database, session=session, regression=regression, home_tm=game.home_team,
                                      away_tm=game.away_team, start_time=game.start_time, line=0,
                                      console_out=console_out)
            results.append(prediction)
    return results


def predict_games_on_date(database, session, league_year, date, console_out):
    """Predict games on the specified date and write the results to the database

    The session is always closed before returning.

    ToDO: On day versus on date?
    Args:
        database: An instantiated database object exposing get_table_mappings
        session: A sqlalchemy session object for queries and writes
        league_year: The league year to work with. For example, the league year of the 2018-19 season is 2019
        date: Either a datetime.datetime or a dictionary formatted as {"day": day, "month": month, "year": year}
        console_out: If true, prints prediction results to the console
    """
    # Get lines for the games
    if not isinstance(date, datetime):
        date = datetime(date["year"], date["month"], date["day"])
    # NOTE(review): this lookup passes a list while the two below pass a string -- confirm which form
    # get_table_mappings expects.
    odds_tbl = database.get_table_mappings(["odds_{}".format(league_year)])
    games_query = getters.get_spreads_for_date(odds_tbl, session, date)
    game_spreads = list(games_query)

    results = predict_games_on_day(database, session, game_spreads, console_out=console_out)

    data = DataOperator(results)

    sched_tbl = database.get_table_mappings("sched_{}".format(league_year))
    pred_tbl = database.get_table_mappings("predictions_{}".format(league_year))

    # Results are sent to DataOperator in row format, so just pass data.data instead of data.dict_to_rows()
    try:
        # insert_predictions takes (rows, session, pred_tbl, sched_tbl); passing odds_tbl as a fifth argument
        # raised TypeError.
        insert_predictions(data.data, session, pred_tbl, sched_tbl)
        session.commit()
    except IntegrityError:
        session.rollback()
        update_prediction_table(session, pred_tbl, sched_tbl, odds_tbl)
        session.commit()
    finally:
        session.close()


def predict_all(db):
    """Generate and store predictions for all games available in the odds table.

    Checks if the table exists. If it doesn't, generate a table in the database.

    Args:
        db: An instantiated database object exposing engine, table_mappings and table_exists
    """
    session = Session(bind=db.engine)
    try:
        league_year = Config.get_property("league_year")
        sched_tbl = db.table_mappings["schedule_{}".format(league_year)]
        team_stats_tbl = db.table_mappings['team_stats_{}'.format(league_year)]
        odds_tbl = db.table_mappings['odds_{}'.format(league_year)]

        regression = ff_reg.main(session, team_stats_tbl, sched_tbl)

        pred_tbl_name = "predictions_{}".format(league_year)

        if not db.table_exists(pred_tbl_name):
            # The former get_sample_prediction() call was removed: it passed an extra argument (TypeError) and
            # its result was never used.
            pred_data = predictions.format_data()
            predictions.create_table()
            pred_tbl = db.table_mappings[pred_tbl_name]
            session.add_all([pred_tbl(**row) for row in pred_data.rows])
            session.commit()
        else:
            # Data operator
            pred_tbl = db.table_mappings[pred_tbl_name]
            update_rows = predictions.insert(session, )
            session.add_all(update_rows)
            session.commit()

        # Computed on both paths; previously `results` was undefined when the table had just been created.
        results = predict_games_in_odds(session, regression, odds_tbl)
        insert_new_predictions(results, session, pred_tbl, sched_tbl, odds_tbl)

        session.commit()  # Commit here b/c update_prediction_tbl() needs the inserted values

        update_prediction_table(session, pred_tbl, sched_tbl, odds_tbl)
        session.commit()  # Persist the bet results, as predict_games_on_date does
    finally:
        session.close()


if __name__ == "__main__":
    db = Database('test', "../management")
    predict_all(db)
    # The old predict_game("Sacramento Kings", "Orlando Magic", line=-5.5, ...) demo was dropped: predict_game now
    # requires a session, a regression and a predictor row.
    session = Session(bind=db.engine)
    date = datetime(2019, 3, 26)
    predict_games_on_date(db, session, league_year=2019, date=date, console_out=True)
--------------------------------------------------------------------------------