├── short_long_mlb_names.csv
├── README.md
├── MLB_Runline_Training.ipynb
├── MLB_Runline_Production.ipynb
├── mlb-runline-daily.py
├── mlb-runline-dataset-builder.py
└── mlb-runline-dataset-production.py

--------------------------------------------------------------------------------
/short_long_mlb_names.csv:
--------------------------------------------------------------------------------
short_name,long_name
ARI Diamondbacks - ARI Diamondbacks,Arizona Diamondbacks
CHI Cubs - CHI Cubs,Chicago Cubs
MIA Marlins - MIA Marlins,Miami Marlins
TB Rays - TB Rays,Tampa Bay Rays
NY Yankees - NY Yankees,New York Yankees
BOS Red Sox - BOS Red Sox,Boston Red Sox
KC Royals - KC Royals,Kansas City Royals
MIL Brewers - MIL Brewers,Milwaukee Brewers
ATL Braves - ATL Braves,Atlanta Braves
BAL Orioles - BAL Orioles,Baltimore Orioles
DET Tigers - DET Tigers,Detroit Tigers
CLE Guardians - CLE Guardians,Cleveland Guardians
LA Angels - LA Angels,Los Angeles Angels
COL Rockies - COL Rockies,Colorado Rockies
CHI White Sox - CHI White Sox,Chicago White Sox
CIN Reds - CIN Reds,Cincinnati Reds
HOU Astros - HOU Astros,Houston Astros
OAK Athletics - OAK Athletics,Oakland Athletics
NY Mets - NY Mets,New York Mets
LA Dodgers - LA Dodgers,Los Angeles Dodgers
MIN Twins - MIN Twins,Minnesota Twins
PIT Pirates - PIT Pirates,Pittsburgh Pirates
SEA Mariners - SEA Mariners,Seattle Mariners
SF Giants - SF Giants,San Francisco Giants
PHI Phillies - PHI Phillies,Philadelphia Phillies
SD Padres - SD Padres,San Diego Padres
STL Cardinals - STL Cardinals,St. Louis Cardinals
TEX Rangers - TEX Rangers,Texas Rangers
WAS Nationals - WAS Nationals,Washington Nationals
TOR Blue Jays - TOR Blue Jays,Toronto Blue Jays

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Original Source: [The Quant's Playbook](https://quantgalore.substack.com/)

This system stores the data in MySQL databases and the models in Colab notebooks. If you don’t have experience setting these up, we highly recommend visiting
[Machine Learning for Sports Betting: MLB Edition](https://www.quantgalore.com/courses/ml-sports-betting-mlb "Machine Learning for Sports Betting: MLB Edition"),
where we walk through the entire process with a similar workflow, going from data all the way to production.


The workflow for this algorithm is as follows:

1. Register for a [prop-odds](https://www.prop-odds.com/) API key

2. Run the “mlb-runline-dataset-builder.py” file

   - This builds the original dataset and takes about 15-30 minutes

3. Run the “mlb-runline-daily.py” file

   - This builds the dataset that will be used to get predictions for that day's games.

4. In Google Colab, run the “MLB_Runline_Training.ipynb” file

   - This file is responsible for comparing and training the dozens of available models. It isn't necessary to make any changes to the model, but you have the freedom to experiment.

   - Running the file will create a .pkl file containing the model of your choice; be sure to upload this to your Google Drive.

5. In Google Colab, run the “MLB_Runline_Production.ipynb" file

   - This file will deploy the model you saved and generate predictions and theoretical odds.

6. To add new, future data points without having to re-build the entire dataset, run “mlb-runline-dataset-production.py” in lieu of “mlb-runline-dataset-builder.py”.

7. To start tracking predictions before going live, visit [The Action Network](https://www.actionnetwork.com/).

8. Finally, you're all set! 😄
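Note: the prop-odds API returns team names in the doubled short format shown in `short_long_mlb_names.csv` above, and each script resolves them to full names with its `name_converter` helper. A minimal sketch of that lookup (assuming only that pandas is installed and the CSV sits in the working directory; `lookup` is our name, not part of the scripts):

```python
import pandas as pd

# Load the short -> long mapping shipped with the repo
names = pd.read_csv("short_long_mlb_names.csv")
lookup = dict(zip(names["short_name"], names["long_name"]))

print(lookup["NY Yankees - NY Yankees"])  # -> "New York Yankees"
```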
--------------------------------------------------------------------------------
/MLB_Runline_Training.ipynb:
--------------------------------------------------------------------------------
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "gTKL-2tRlNIT"
      },
      "outputs": [],
      "source": [
        "!pip install pycaret\n",
        "!pip install mysql-connector-python\n",
        "!pip install sqlalchemy==1.4.32"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "W1oH3E8Nln9t"
      },
      "outputs": [],
      "source": [
        "import pycaret\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import sqlalchemy\n",
        "import mysql.connector\n",
        "\n",
        "from pycaret import classification\n",
        "from google.colab import drive\n",
        "from datetime import datetime\n",
        "from google.colab.data_table import DataTable\n",
        "from google.colab import files"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dNNe7YwMlpgP"
      },
      "outputs": [],
      "source": [
        "engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306/database-name')\n",
        "\n",
        "Raw_Dataset = pd.read_sql(sql = \"SELECT * FROM baseball_spread\", con = engine).set_index(\"game_datetime\").reset_index()[[\"game_datetime\", \"team_1\", \"team_1_spread_odds\", \"team_2\", \"team_2_spread_odds\", \"venue_name\", \"spread\"]]\n",
        "print(len(Raw_Dataset))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "K4RnhrRmKlwR"
      },
      "outputs": [],
      "source": [
        "Raw_Dataset"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "Classification_Setup = pycaret.classification.setup(data = Raw_Dataset, target = \"spread\", train_size = .70, use_gpu = False, data_split_stratify = False, data_split_shuffle = False, date_features = [\"game_datetime\"], categorical_features = [\"team_1\",\"team_2\",\"venue_name\"])"
      ],
      "metadata": {
        "id": "Wb0v2bKsmJOc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jnTnG8aSK74e"
      },
      "source": [
        "***Compare/Create Model***"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "Classification_Models = Classification_Setup.compare_models(turbo = False)"
      ],
      "metadata": {
        "id": "YTAPYfpanoXE"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "You can use the multi-layer perceptron (\"mlp\") as outlined in the post, but a random forest (\"rf\") classifier is more effective. A neural network is a ***bit*** of overkill."
      ],
      "metadata": {
        "id": "KK66V4NXFy0n"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "Classification_Model = Classification_Setup.create_model(\"rf\")"
      ],
      "metadata": {
        "id": "tY82ShLirM3B"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "Prediction = Classification_Setup.predict_model(Classification_Model)\n",
        "Prediction.head(20)"
      ],
      "metadata": {
        "id": "ZKYRxxPLXsE3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "Classification_Setup.evaluate_model(Classification_Model)"
      ],
      "metadata": {
        "id": "Cjj3McLgxFfT"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "odO3tScVvMTy"
      },
      "source": [
        "***Save Model***"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "d16cozQ4vLxv"
      },
      "outputs": [],
      "source": [
        "Finalized_Model = pycaret.classification.finalize_model(Classification_Model)\n",
        "Finalized_Model_save_to_file_string = f\"{datetime.today().strftime('%Y-%m-%d')} Baseball Spread\"\n",
        "pycaret.classification.save_model(Finalized_Model, Finalized_Model_save_to_file_string)\n",
        "files.download(f\"{Finalized_Model_save_to_file_string}.pkl\")"
      ]
    }
  ]
}
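A note on reading the `compare_models` table in this notebook: raw accuracy only matters relative to the breakeven win rate implied by the odds you would bet into. A standalone sketch of that conversion (the function name is ours; the math is the standard American-odds identity):

```python
def breakeven_win_rate(american_odds: int) -> float:
    """Win probability needed to break even at the given American odds."""
    if american_odds < 0:
        return -american_odds / (-american_odds + 100)
    return 100 / (american_odds + 100)

print(round(breakeven_win_rate(-110), 3))  # 0.524 -- a -110 runline bet must win ~52.4% of the time
print(round(breakeven_win_rate(+150), 3))  # 0.400
```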
--------------------------------------------------------------------------------
/MLB_Runline_Production.ipynb:
--------------------------------------------------------------------------------
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "**Install Packages**"
      ],
      "metadata": {
        "id": "Si6oYuigEsWJ"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-AV9WzDzEcTt"
      },
      "outputs": [],
      "source": [
        "!pip install pycaret\n",
        "!pip install mysql-connector-python\n",
        "!pip install sqlalchemy==1.4.32\n",
        "!pip install MLB-StatsAPI"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "**Import Packages**"
      ],
      "metadata": {
        "id": "AL_lGnkTE7af"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import pycaret\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import sqlalchemy\n",
        "import mysql.connector\n",
        "import sys\n",
        "import requests\n",
        "import statsapi\n",
        "\n",
        "from pycaret import classification\n",
        "from datetime import datetime\n",
        "from google.colab.data_table import DataTable\n",
        "from google.colab import files\n",
        "from google.colab import drive\n",
        "\n",
        "sys.path.append(\"/content/drive/MyDrive\")\n",
        "\n",
        "drive.mount('/content/drive')\n",
        "\n",
        "def odds_calculator(probability):\n",
        "\n",
        "    return round(-100 / ((1/probability)-1))"
      ],
      "metadata": {
        "id": "N3dnZwMJFEe5"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "**Import Data**"
      ],
      "metadata": {
        "id": "Bx0rUFJ1FraV"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306/database-name')\n",
        "\n",
        "Raw_Dataset = pd.read_sql(sql = \"SELECT * FROM baseball_spread_production\", con = engine).set_index(\"game_datetime\").reset_index()[[\"game_datetime\", \"team_1\", \"team_1_spread_odds\", \"team_2\", \"team_2_spread_odds\", \"venue_name\"]]\n",
        "\n",
        "print(len(Raw_Dataset))"
      ],
      "metadata": {
        "id": "4nMUvN_HFvoi"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "Raw_Dataset"
      ],
      "metadata": {
        "id": "8-tmvY2cagZd"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### **Load Models**"
      ],
      "metadata": {
        "id": "9P_9ThrHGyPd"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "#Finalized_Model_save_to_file_string = f\"2023-07-22 Baseball Spread\"\n",
        "Finalized_Model_save_to_file_string = f\"{datetime.today().strftime('%Y-%m-%d')} Baseball Spread\"\n",
        "\n",
        "Classification_Model = pycaret.classification.load_model(f\"/content/drive/MyDrive/{Finalized_Model_save_to_file_string}\")\n",
        "\n",
        "print(Finalized_Model_save_to_file_string)"
      ],
      "metadata": {
        "id": "-j2zDCd7Y_8z"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### **Get Predictions of Day**"
      ],
      "metadata": {
        "id": "L6GgjUXpTQCv"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "Prediction = pycaret.classification.predict_model(estimator = Classification_Model, data = Raw_Dataset)"
      ],
      "metadata": {
        "id": "e-Djt_9BTVrd"
      },
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "Prediction[\"theo_odds\"] = Prediction[\"prediction_score\"].apply(odds_calculator)"
      ],
      "metadata": {
        "id": "KyvX6ywuOlDp"
      },
      "execution_count": 11,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "Prediction[[\"team_1\", \"team_2\", \"prediction_label\", \"prediction_score\", \"theo_odds\"]].sort_values(by = \"prediction_score\", ascending = False)"
      ],
      "metadata": {
        "id": "km4EQOOOZ7tI"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}
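The notebook's `odds_calculator` turns a predicted probability into theoretical American odds via `-100 / ((1/p) - 1)`, which is algebraically `-100p / (1 - p)`; for example, p = 0.60 gives -150. That form is only valid for p > 0.5, which usually holds here because PyCaret's `prediction_score` is the probability of the *predicted* label. A sketch that also handles the underdog side (our extension, not the author's code):

```python
def odds_calculator_extended(probability: float) -> int:
    # Favorite side: identical to the notebook's formula, e.g. 0.60 -> -150
    if probability > 0.5:
        return round(-100 * probability / (1 - probability))
    # Underdog side (our assumption/extension), e.g. 0.40 -> +150
    return round(100 * (1 - probability) / probability)
```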
"source": [ 84 | "**Import Data**" 85 | ], 86 | "metadata": { 87 | "id": "Bx0rUFJ1FraV" 88 | } 89 | }, 90 | { 91 | "cell_type": "code", 92 | "source": [ 93 | "engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306/database-name')\n", 94 | "\n", 95 | "Raw_Dataset = pd.read_sql(sql = \"SELECT * FROM baseball_spread_production\", con = engine).set_index(\"game_datetime\").reset_index()[[\"game_datetime\", \"team_1\",\t\"team_1_spread_odds\",\t\"team_2\",\t\"team_2_spread_odds\",\t\"venue_name\"]]\n", 96 | "\n", 97 | "print(len(Raw_Dataset))" 98 | ], 99 | "metadata": { 100 | "id": "4nMUvN_HFvoi" 101 | }, 102 | "execution_count": null, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "source": [ 108 | "Raw_Dataset" 109 | ], 110 | "metadata": { 111 | "id": "8-tmvY2cagZd" 112 | }, 113 | "execution_count": null, 114 | "outputs": [] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "source": [ 119 | "### **Load Models**" 120 | ], 121 | "metadata": { 122 | "id": "9P_9ThrHGyPd" 123 | } 124 | }, 125 | { 126 | "cell_type": "code", 127 | "source": [ 128 | "#Finalized_Model_save_to_file_string = f\"2023-07-22 Baseball Spread\"\n", 129 | "Finalized_Model_save_to_file_string = f\"{datetime.today().strftime('%Y-%m-%d')} Baseball Spread\"\n", 130 | "\n", 131 | "Classification_Model = pycaret.classification.load_model(f\"/content/drive/MyDrive/{Finalized_Model_save_to_file_string}\")\n", 132 | "\n", 133 | "print(Finalized_Model_save_to_file_string)" 134 | ], 135 | "metadata": { 136 | "id": "-j2zDCd7Y_8z" 137 | }, 138 | "execution_count": null, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "source": [ 144 | "### **Get Predictions of Day**" 145 | ], 146 | "metadata": { 147 | "id": "L6GgjUXpTQCv" 148 | } 149 | }, 150 | { 151 | "cell_type": "code", 152 | "source": [ 153 | "Prediction = pycaret.classification.predict_model(estimator = Classification_Model, data = Raw_Dataset)" 154 | ], 155 | "metadata": { 156 | "id": "e-Djt_9BTVrd" 157 | }, 158 | "execution_count": 10, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "source": [ 164 | "Prediction[\"theo_odds\"] = Prediction[\"prediction_score\"].apply(odds_calculator)" 165 | ], 166 | "metadata": { 167 | "id": "KyvX6ywuOlDp" 168 | }, 169 | "execution_count": 11, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "source": [ 175 | "Prediction[[\"team_1\", \"team_2\",\"prediction_label\",\"prediction_score\", \"theo_odds\"]].sort_values(by = \"prediction_score\", ascending = False)" 176 | ], 177 | "metadata": { 178 | "id": "km4EQOOOZ7tI" 179 | }, 180 | "execution_count": null, 181 | "outputs": [] 182 | } 183 | ] 184 | } -------------------------------------------------------------------------------- /mlb-runline-daily.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created 2023 4 | 5 | @author: Quant Galore 6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | 10 | import pandas as pd 11 | import statsapi 12 | import requests 13 | import numpy as np 14 | import sqlalchemy 15 | import mysql.connector 16 | 17 | Short_Long_Names = pd.read_csv("short_long_mlb_names.csv") 18 | 19 | def name_converter(short_name): 20 | 21 | long_name = Short_Long_Names[Short_Long_Names["short_name"] == short_name]["long_name"] 22 | 23 | if len(long_name) < 1 : 24 | 25 | return np.nan 26 | else: 27 | 28 | return long_name.iloc[0] 29 | 30 | API_KEY = "your prop-odds.com api 
key" 31 | 32 | # ============================================================================= 33 | # Start 34 | # ============================================================================= 35 | 36 | # This is the production dataset, designed to only append new values, as oppposed to having to constantly re-build the dataset 37 | # So, we set the start date to 7 days prior. This way, we check all of the games in that period to include any days that we missed 38 | 39 | begin_date = (datetime.today()).strftime("%Y-%m-%d") 40 | ending_date = (datetime.today()).strftime("%Y-%m-%d") 41 | 42 | Schedule = statsapi.schedule(start_date = begin_date, end_date = ending_date) 43 | Schedule_DataFrame = pd.json_normalize(Schedule) 44 | 45 | date_range = pd.date_range(start = begin_date, end = ending_date) 46 | 47 | odds_list = [] 48 | 49 | # The spread market represents the "runline" bet 50 | 51 | market = "spread" 52 | 53 | for date in date_range: 54 | 55 | date = date.strftime("%Y-%m-%d") 56 | url = f"https://api.prop-odds.com/beta/games/mlb?date={date}&tz=America/Chicago&api_key={API_KEY}" 57 | games_url = f"https://api.prop-odds.com/beta/games/mlb?date={date}&api_key={API_KEY}" 58 | 59 | 60 | games = pd.json_normalize(requests.get(games_url).json()["games"]) 61 | 62 | if len(games) < 1: 63 | 64 | continue 65 | 66 | for game_id in games["game_id"]: 67 | 68 | Game = games[games["game_id"] == game_id] 69 | 70 | sportsbook = [] 71 | 72 | odds_url = f"https://api.prop-odds.com/beta/odds/{game_id}/{market}?api_key={API_KEY}" 73 | odds = requests.get(odds_url).json() 74 | 75 | if len(odds) < 2: 76 | continue 77 | 78 | else: 79 | 80 | # DraftKings generally offers the best odds, so for uniformity, we only include odds sourced from DraftKings 81 | 82 | for book in odds["sportsbooks"]: 83 | 84 | if book["bookie_key"] == "draftkings": 85 | sportsbook = book 86 | else: 87 | continue 88 | 89 | if len(sportsbook) < 1: 90 | 91 | continue 92 | 93 | odds_data = pd.json_normalize(sportsbook["market"]["outcomes"]) 94 | 95 | # The runline (-1.5) refers to the favorite winning by 2 or more points, so we have to first pull who the favorite is 96 | 97 | moneyline_url = f"https://api.prop-odds.com/beta/odds/{game_id}/moneyline?api_key={API_KEY}" 98 | moneyline_odds = requests.get(moneyline_url).json() 99 | 100 | if len(moneyline_odds) < 2: 101 | continue 102 | 103 | else: 104 | 105 | for moneyline_book in moneyline_odds["sportsbooks"]: 106 | 107 | if moneyline_book["bookie_key"] == "draftkings": 108 | moneyline_sportsbook = moneyline_book 109 | else: 110 | continue 111 | 112 | if len(moneyline_sportsbook) < 1: 113 | 114 | continue 115 | 116 | moneyline_odds_data = pd.json_normalize(moneyline_sportsbook["market"]["outcomes"]) 117 | 118 | if moneyline_odds_data["odds"].max() < 0: 119 | continue 120 | 121 | moneyline_favorite = moneyline_odds_data[moneyline_odds_data["odds"] < 0].sort_values(by = "timestamp", ascending = True).head(1)["name"].iloc[0] 122 | moneyline_underdog = moneyline_odds_data[moneyline_odds_data["odds"] > 0].sort_values(by = "timestamp", ascending = True).head(1)["name"].iloc[0] 123 | 124 | # We sort by earliest available pre-game odds first, since the API may occasionally include odds that were set mid-game. 

        favorite = odds_data[(odds_data["handicap"] == -1.5) & (odds_data["name"] == moneyline_favorite)].sort_values(by = "timestamp", ascending = True).head(1)
        underdog = odds_data[(odds_data["handicap"] == 1.5) & (odds_data["name"] == moneyline_underdog)].sort_values(by = "timestamp", ascending = True).head(1)

        if len(favorite) < 1:
            continue
        elif len(underdog) < 1:
            continue

        team_1_favorite = favorite["name"].drop_duplicates().iloc[0]
        team_2_underdog = underdog["name"].drop_duplicates().iloc[0]

        team_1_favorite_odds = favorite["odds"].iloc[0]
        team_2_underdog_odds = underdog["odds"].iloc[0]

        odds_dataframe = pd.DataFrame([[team_1_favorite, team_1_favorite_odds, team_2_underdog, team_2_underdog_odds]],
                                      columns = ["team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds"])

        full_odds_dataframe = pd.concat([Game.reset_index(drop = True), odds_dataframe], axis = 1)

        if len(full_odds_dataframe) > 1:
            continue

        odds_list.append(full_odds_dataframe)

full_odds = pd.concat(odds_list).reset_index(drop = True).rename(columns = {"away_team":"away_name",
                                                                            "home_team":"home_name",
                                                                            "start_timestamp":"game_datetime"})

Merged_DataFrame = pd.merge(Schedule_DataFrame, full_odds, on = ["game_datetime", "away_name", "home_name"])

Merged_DataFrame["team_1"] = Merged_DataFrame["team_1"].apply(name_converter)
Merged_DataFrame["team_2"] = Merged_DataFrame["team_2"].apply(name_converter)

Featured_Merged_DataFrame = Merged_DataFrame[["game_datetime", "away_name", "home_name", "team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds", "venue_name"]].copy().set_index("game_datetime")

# "team_1" always represents the favorite

Featured_Spread_DataFrame = Featured_Merged_DataFrame[["team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds", "venue_name"]].copy().reset_index().set_index("game_datetime")

# To weed out any errors in the data set, we only include rows where the odds are between -200 and +200
# The odds for these bets are almost never set outside of that range, so anything beyond it is excluded

Featured_Spread_DataFrame = Featured_Spread_DataFrame[(abs(Featured_Spread_DataFrame["team_1_spread_odds"]) < 200) & (abs(Featured_Spread_DataFrame["team_2_spread_odds"]) < 200)]
Featured_Spread_DataFrame = Featured_Spread_DataFrame[Featured_Spread_DataFrame["team_1"] != Featured_Spread_DataFrame["team_2"]]
Featured_Spread_DataFrame.index = pd.to_datetime(Featured_Spread_DataFrame.index).tz_convert("America/Chicago")

# =============================================================================
# End
# =============================================================================

# We initialize our sqlalchemy engine, then submit the data to the database

engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306/database-name')

# The daily production table is dropped each day so that it only ever holds the current day's games

with engine.connect() as conn:
    result = conn.execute(sqlalchemy.text('DROP TABLE IF EXISTS baseball_spread_production'))

Featured_Spread_DataFrame.to_sql("baseball_spread_production", con = engine, if_exists = "append")

--------------------------------------------------------------------------------
/mlb-runline-dataset-builder.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created 2023

@author: Quant Galore
"""

from datetime import datetime, timedelta

import pandas as pd
import statsapi
import requests
import numpy as np
import sqlalchemy
import mysql.connector

Short_Long_Names = pd.read_csv("short_long_mlb_names.csv")

def name_converter(short_name):

    long_name = Short_Long_Names[Short_Long_Names["short_name"] == short_name]["long_name"]

    if len(long_name) < 1:

        return np.nan
    else:

        return long_name.iloc[0]

API_KEY = "your prop-odds.com api key"

# =============================================================================
# Start
# =============================================================================

# First, we call the schedule API to get a list of all games played from the start of the season up to yesterday.

begin_date = "2023-03-30"
ending_date = (datetime.today() - timedelta(days = 1)).strftime("%Y-%m-%d")

Schedule = statsapi.schedule(start_date = begin_date, end_date = ending_date)
Schedule_DataFrame = pd.json_normalize(Schedule)
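# Fields relied on downstream from the statsapi schedule records (per the
# column selections later in this script): game_datetime, away_name, home_name,
# away_score, home_score, venue_name, winning_team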

date_range = pd.date_range(start = begin_date, end = ending_date)

odds_list = []

# The spread market represents the "runline" bet

market = "spread"

for date in date_range:

    date = date.strftime("%Y-%m-%d")
    games_url = f"https://api.prop-odds.com/beta/games/mlb?date={date}&api_key={API_KEY}"

    games = pd.json_normalize(requests.get(games_url).json()["games"])

    if len(games) < 1:

        continue

    for game_id in games["game_id"]:

        Game = games[games["game_id"] == game_id]

        sportsbook = []
        moneyline_sportsbook = []

        odds_url = f"https://api.prop-odds.com/beta/odds/{game_id}/{market}?api_key={API_KEY}"
        odds = requests.get(odds_url).json()

        if len(odds) < 2:
            continue

        else:

            # DraftKings generally offers the best odds, so for uniformity, we only include odds sourced from DraftKings

            for book in odds["sportsbooks"]:

                if book["bookie_key"] == "draftkings":
                    sportsbook = book
                else:
                    continue

        if len(sportsbook) < 1:

            continue

        odds_data = pd.json_normalize(sportsbook["market"]["outcomes"])

        # The runline (-1.5) refers to the favorite winning by 2 or more runs, so we first have to pull who the favorite is

        moneyline_url = f"https://api.prop-odds.com/beta/odds/{game_id}/moneyline?api_key={API_KEY}"
        moneyline_odds = requests.get(moneyline_url).json()

        if len(moneyline_odds) < 2:
            continue

        else:

            for moneyline_book in moneyline_odds["sportsbooks"]:

                if moneyline_book["bookie_key"] == "draftkings":
                    moneyline_sportsbook = moneyline_book
                else:
                    continue

        if len(moneyline_sportsbook) < 1:

            continue

        moneyline_odds_data = pd.json_normalize(moneyline_sportsbook["market"]["outcomes"])

        if moneyline_odds_data["odds"].max() < 0:
            continue

        moneyline_favorite = moneyline_odds_data[moneyline_odds_data["odds"] < 0].sort_values(by = "timestamp", ascending = True).head(1)["name"].iloc[0]
        moneyline_underdog = moneyline_odds_data[moneyline_odds_data["odds"] > 0].sort_values(by = "timestamp", ascending = True).head(1)["name"].iloc[0]

        # We sort by earliest available pre-game odds first, since the API may occasionally include odds that were set mid-game.

        favorite = odds_data[(odds_data["handicap"] == -1.5) & (odds_data["name"] == moneyline_favorite)].sort_values(by = "timestamp", ascending = True).head(1)
        underdog = odds_data[(odds_data["handicap"] == 1.5) & (odds_data["name"] == moneyline_underdog)].sort_values(by = "timestamp", ascending = True).head(1)

        if len(favorite) < 1:
            continue
        elif len(underdog) < 1:
            continue

        team_1_favorite = favorite["name"].drop_duplicates().iloc[0]
        team_2_underdog = underdog["name"].drop_duplicates().iloc[0]

        team_1_favorite_odds = favorite["odds"].iloc[0]
        team_2_underdog_odds = underdog["odds"].iloc[0]

        odds_dataframe = pd.DataFrame([[team_1_favorite, team_1_favorite_odds, team_2_underdog, team_2_underdog_odds]],
                                      columns = ["team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds"])

        full_odds_dataframe = pd.concat([Game.reset_index(drop = True), odds_dataframe], axis = 1)

        if len(full_odds_dataframe) > 1:
            continue

        odds_list.append(full_odds_dataframe)

full_odds = pd.concat(odds_list).reset_index(drop = True).rename(columns = {"away_team":"away_name",
                                                                            "home_team":"home_name",
                                                                            "start_timestamp":"game_datetime"})

Merged_DataFrame = pd.merge(Schedule_DataFrame, full_odds, on = ["game_datetime", "away_name", "home_name"])

Merged_DataFrame["team_1"] = Merged_DataFrame["team_1"].apply(name_converter)
Merged_DataFrame["team_2"] = Merged_DataFrame["team_2"].apply(name_converter)

Featured_Merged_DataFrame = Merged_DataFrame[["game_datetime", "away_name", "home_name", "away_score", "home_score", "team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds", "venue_name", "winning_team"]].copy().set_index("game_datetime")

# "team_1" always represents the favorite

# If the favorite was the home team, then home score - away score gives us the spread -- vice versa if the favorite was the away team

team_1_away_wins = Featured_Merged_DataFrame[Featured_Merged_DataFrame["away_name"] == Featured_Merged_DataFrame["team_1"]].copy()
team_1_away_wins["spread"] = team_1_away_wins["away_score"].astype(int) - team_1_away_wins["home_score"].astype(int)

team_1_home_wins = Featured_Merged_DataFrame[Featured_Merged_DataFrame["home_name"] == Featured_Merged_DataFrame["team_1"]].copy()
team_1_home_wins["spread"] = team_1_home_wins["home_score"].astype(int) - team_1_home_wins["away_score"].astype(int)

spread_dataframe = pd.concat([team_1_away_wins, team_1_home_wins], axis = 0)

def spread_converter(spread):

    if spread >= 2:

        return 1
    else:
        return 0


# If the favorite won the game by 2 or more runs, we assign a 1
# If the favorite wins by fewer than 2 runs, or if the underdog wins, we assign a 0

spread_dataframe["spread"] = spread_dataframe["spread"].apply(spread_converter)
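# Worked example: a home favorite that wins 5-3 has spread = 2 -> label 1 (runline covered);
# a 4-3 win gives spread = 1 -> label 0; a 3-4 loss gives spread = -1 -> label 0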

Featured_Spread_DataFrame = spread_dataframe[["team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds", "venue_name", "spread"]].copy().reset_index().set_index("game_datetime")

# To weed out any errors in the data set, we only include rows where the odds are between -200 and +200
# The odds for these bets are almost never set outside of that range, so anything beyond it is excluded

Featured_Spread_DataFrame = Featured_Spread_DataFrame[(abs(Featured_Spread_DataFrame["team_1_spread_odds"]) < 200) & (abs(Featured_Spread_DataFrame["team_2_spread_odds"]) < 200)]
Featured_Spread_DataFrame = Featured_Spread_DataFrame[Featured_Spread_DataFrame["team_1"] != Featured_Spread_DataFrame["team_2"]]
Featured_Spread_DataFrame.index = pd.to_datetime(Featured_Spread_DataFrame.index).tz_convert("America/Chicago")

# We initialize our sqlalchemy engine, then submit the data to the database
# Note: the database created here must match the "database-name" placeholder used in the connection strings

initial_engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306')

with initial_engine.connect() as conn:
    result = conn.execute(sqlalchemy.text('CREATE DATABASE IF NOT EXISTS `database-name`'))

engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306/database-name')

Featured_Spread_DataFrame.to_sql("baseball_spread", con = engine, if_exists = "append")

# If you make a mistake, or wish to re-build the dataset, you can drop the table and start over.
# This is left commented out so a normal run doesn't delete the table it just built:

# with engine.connect() as conn:
#     result = conn.execute(sqlalchemy.text('DROP TABLE baseball_spread'))
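The builder above re-creates the full history in one shot; the production script below appends only rows it hasn't stored yet, keyed on a composite identifier of game datetime + teams + venue. A runnable miniature of that dedup step (toy data; the frame and variable names are ours, the column name matches the script):

```python
import pandas as pd

existing = pd.DataFrame({"unique_identifier": ["2023-07-01 18:05:00New York YankeesBoston Red SoxYankee Stadium"]})

fresh = pd.DataFrame({
    "unique_identifier": [
        "2023-07-01 18:05:00New York YankeesBoston Red SoxYankee Stadium",  # already stored
        "2023-07-02 18:05:00New York YankeesBoston Red SoxYankee Stadium",  # new row
    ],
    "spread": [1, 0],
})

# Keep only rows whose identifier is not already in the database
new_data = fresh[~fresh["unique_identifier"].isin(existing["unique_identifier"])]
print(new_data)
```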

--------------------------------------------------------------------------------
/mlb-runline-dataset-production.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created 2023

@author: Quant Galore
"""

from datetime import datetime, timedelta

import pandas as pd
import statsapi
import requests
import numpy as np
import sqlalchemy
import mysql.connector

Short_Long_Names = pd.read_csv("short_long_mlb_names.csv")

def name_converter(short_name):

    long_name = Short_Long_Names[Short_Long_Names["short_name"] == short_name]["long_name"]

    if len(long_name) < 1:

        return np.nan
    else:

        return long_name.iloc[0]

API_KEY = "your prop-odds.com api key"

# =============================================================================
# Start
# =============================================================================

# This is the production dataset, designed to only append new values, as opposed to having to constantly re-build the dataset
# So, we set the start date to 7 days prior; this way, we check all of the games in that period and pick up any days we missed

begin_date = (datetime.today() - timedelta(days = 7)).strftime("%Y-%m-%d")
ending_date = (datetime.today() - timedelta(days = 1)).strftime("%Y-%m-%d")

Schedule = statsapi.schedule(start_date = begin_date, end_date = ending_date)
Schedule_DataFrame = pd.json_normalize(Schedule)

date_range = pd.date_range(start = begin_date, end = ending_date)

odds_list = []

# The spread market represents the "runline" bet

market = "spread"

for date in date_range:

    date = date.strftime("%Y-%m-%d")
    games_url = f"https://api.prop-odds.com/beta/games/mlb?date={date}&api_key={API_KEY}"

    games = pd.json_normalize(requests.get(games_url).json()["games"])

    if len(games) < 1:

        continue

    for game_id in games["game_id"]:

        Game = games[games["game_id"] == game_id]

        sportsbook = []
        moneyline_sportsbook = []

        odds_url = f"https://api.prop-odds.com/beta/odds/{game_id}/{market}?api_key={API_KEY}"
        odds = requests.get(odds_url).json()

        if len(odds) < 2:
            continue

        else:

            # DraftKings generally offers the best odds, so for uniformity, we only include odds sourced from DraftKings

            for book in odds["sportsbooks"]:

                if book["bookie_key"] == "draftkings":
                    sportsbook = book
                else:
                    continue

        if len(sportsbook) < 1:

            continue

        odds_data = pd.json_normalize(sportsbook["market"]["outcomes"])

        # The runline (-1.5) refers to the favorite winning by 2 or more runs, so we first have to pull who the favorite is

        moneyline_url = f"https://api.prop-odds.com/beta/odds/{game_id}/moneyline?api_key={API_KEY}"
        moneyline_odds = requests.get(moneyline_url).json()

        if len(moneyline_odds) < 2:
            continue

        else:

            for moneyline_book in moneyline_odds["sportsbooks"]:

                if moneyline_book["bookie_key"] == "draftkings":
                    moneyline_sportsbook = moneyline_book
                else:
                    continue

        if len(moneyline_sportsbook) < 1:

            continue

        moneyline_odds_data = pd.json_normalize(moneyline_sportsbook["market"]["outcomes"])

        if moneyline_odds_data["odds"].max() < 0:
            continue

        moneyline_favorite = moneyline_odds_data[moneyline_odds_data["odds"] < 0].sort_values(by = "timestamp", ascending = True).head(1)["name"].iloc[0]
        moneyline_underdog = moneyline_odds_data[moneyline_odds_data["odds"] > 0].sort_values(by = "timestamp", ascending = True).head(1)["name"].iloc[0]

        # We sort by earliest available pre-game odds first, since the API may occasionally include odds that were set mid-game.

        favorite = odds_data[(odds_data["handicap"] == -1.5) & (odds_data["name"] == moneyline_favorite)].sort_values(by = "timestamp", ascending = True).head(1)
        underdog = odds_data[(odds_data["handicap"] == 1.5) & (odds_data["name"] == moneyline_underdog)].sort_values(by = "timestamp", ascending = True).head(1)

        if len(favorite) < 1:
            continue
        elif len(underdog) < 1:
            continue

        team_1_favorite = favorite["name"].drop_duplicates().iloc[0]
        team_2_underdog = underdog["name"].drop_duplicates().iloc[0]

        team_1_favorite_odds = favorite["odds"].iloc[0]
        team_2_underdog_odds = underdog["odds"].iloc[0]

        odds_dataframe = pd.DataFrame([[team_1_favorite, team_1_favorite_odds, team_2_underdog, team_2_underdog_odds]],
                                      columns = ["team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds"])

        full_odds_dataframe = pd.concat([Game.reset_index(drop = True), odds_dataframe], axis = 1)

        if len(full_odds_dataframe) > 1:
            continue

        odds_list.append(full_odds_dataframe)

full_odds = pd.concat(odds_list).reset_index(drop = True).rename(columns = {"away_team":"away_name",
                                                                            "home_team":"home_name",
                                                                            "start_timestamp":"game_datetime"})

Merged_DataFrame = pd.merge(Schedule_DataFrame, full_odds, on = ["game_datetime", "away_name", "home_name"])

Merged_DataFrame["team_1"] = Merged_DataFrame["team_1"].apply(name_converter)
Merged_DataFrame["team_2"] = Merged_DataFrame["team_2"].apply(name_converter)

Featured_Merged_DataFrame = Merged_DataFrame[["game_datetime", "away_name", "home_name", "away_score", "home_score", "team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds", "venue_name", "winning_team"]].copy().set_index("game_datetime")

# "team_1" always represents the favorite

# If the favorite was the home team, then home score - away score gives us the spread -- vice versa if the favorite was the away team

team_1_away_wins = Featured_Merged_DataFrame[Featured_Merged_DataFrame["away_name"] == Featured_Merged_DataFrame["team_1"]].copy()
team_1_away_wins["spread"] = team_1_away_wins["away_score"].astype(int) - team_1_away_wins["home_score"].astype(int)

team_1_home_wins = Featured_Merged_DataFrame[Featured_Merged_DataFrame["home_name"] == Featured_Merged_DataFrame["team_1"]].copy()
team_1_home_wins["spread"] = team_1_home_wins["home_score"].astype(int) - team_1_home_wins["away_score"].astype(int)

spread_dataframe = pd.concat([team_1_away_wins, team_1_home_wins], axis = 0)

def spread_converter(spread):

    if spread >= 2:

        return 1
    else:
        return 0


# If the favorite won the game by 2 or more runs, we assign a 1
# If the favorite wins by fewer than 2 runs, or if the underdog wins, we assign a 0

spread_dataframe["spread"] = spread_dataframe["spread"].apply(spread_converter)

Featured_Spread_DataFrame = spread_dataframe[["team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds", "venue_name", "spread"]].copy().reset_index().set_index("game_datetime")

# To weed out any errors in the data set, we only include rows where the odds are between -200 and +200
# The odds for these bets are almost never set outside of that range, so anything beyond it is excluded

Featured_Spread_DataFrame = Featured_Spread_DataFrame[(abs(Featured_Spread_DataFrame["team_1_spread_odds"]) < 200) & (abs(Featured_Spread_DataFrame["team_2_spread_odds"]) < 200)]
Featured_Spread_DataFrame = Featured_Spread_DataFrame[Featured_Spread_DataFrame["team_1"] != Featured_Spread_DataFrame["team_2"]]
Featured_Spread_DataFrame.index = pd.to_datetime(Featured_Spread_DataFrame.index).tz_convert("America/Chicago")

# We initialize our sqlalchemy engine, then submit only the new rows to the database

engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306/database-name')

existing_data = pd.read_sql("SELECT * FROM baseball_spread", con = engine)
existing_data["unique_identifier"] = existing_data["game_datetime"].astype(str) + existing_data["team_1"].astype(str) + existing_data["team_2"].astype(str) + existing_data["venue_name"].astype(str)

Featured_Spread_DataFrame["unique_identifier"] = Featured_Spread_DataFrame.index.strftime("%Y-%m-%d %H:%M:%S") + Featured_Spread_DataFrame["team_1"].astype(str) + Featured_Spread_DataFrame["team_2"].astype(str) + Featured_Spread_DataFrame["venue_name"].astype(str)

new_data = Featured_Spread_DataFrame[~Featured_Spread_DataFrame["unique_identifier"].isin(existing_data["unique_identifier"])]
new_data = new_data.drop("unique_identifier", axis = 1)

new_data.to_sql("baseball_spread", con = engine, if_exists = "append")
--------------------------------------------------------------------------------
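After an append, a quick row count confirms the table actually grew. A minimal sketch, assuming the same placeholder connection details used throughout:

```python
import sqlalchemy

engine = sqlalchemy.create_engine("mysql+mysqlconnector://username:password@database-host-name:3306/database-name")

# Count the stored training rows after the append
with engine.connect() as conn:
    count = conn.execute(sqlalchemy.text("SELECT COUNT(*) FROM baseball_spread")).scalar()

print(count)
```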