├── short_long_mlb_names.csv
├── README.md
├── MLB_Runline_Training.ipynb
├── MLB_Runline_Production.ipynb
├── mlb-runline-daily.py
├── mlb-runline-dataset-builder.py
└── mlb-runline-dataset-production.py

--------------------------------------------------------------------------------
/short_long_mlb_names.csv:
--------------------------------------------------------------------------------
short_name,long_name
ARI Diamondbacks - ARI Diamondbacks,Arizona Diamondbacks
CHI Cubs - CHI Cubs,Chicago Cubs
MIA Marlins - MIA Marlins,Miami Marlins
TB Rays - TB Rays,Tampa Bay Rays
NY Yankees - NY Yankees,New York Yankees
BOS Red Sox - BOS Red Sox,Boston Red Sox
KC Royals - KC Royals,Kansas City Royals
MIL Brewers - MIL Brewers,Milwaukee Brewers
ATL Braves - ATL Braves,Atlanta Braves
BAL Orioles - BAL Orioles,Baltimore Orioles
DET Tigers - DET Tigers,Detroit Tigers
CLE Guardians - CLE Guardians,Cleveland Guardians
LA Angels - LA Angels,Los Angeles Angels
COL Rockies - COL Rockies,Colorado Rockies
CHI White Sox - CHI White Sox,Chicago White Sox
CIN Reds - CIN Reds,Cincinnati Reds
HOU Astros - HOU Astros,Houston Astros
OAK Athletics - OAK Athletics,Oakland Athletics
NY Mets - NY Mets,New York Mets
LA Dodgers - LA Dodgers,Los Angeles Dodgers
MIN Twins - MIN Twins,Minnesota Twins
PIT Pirates - PIT Pirates,Pittsburgh Pirates
SEA Mariners - SEA Mariners,Seattle Mariners
SF Giants - SF Giants,San Francisco Giants
PHI Phillies - PHI Phillies,Philadelphia Phillies
SD Padres - SD Padres,San Diego Padres
STL Cardinals - STL Cardinals,St. Louis Cardinals
TEX Rangers - TEX Rangers,Texas Rangers
WAS Nationals - WAS Nationals,Washington Nationals
TOR Blue Jays - TOR Blue Jays,Toronto Blue Jays

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Original Source: [The Quant's Playbook](https://quantgalore.substack.com/)

This system stores the data in MySQL databases and the models in Colab notebooks. If you don’t have experience setting these up, we highly recommend visiting
[Machine Learning for Sports Betting: MLB Edition](https://www.quantgalore.com/courses/ml-sports-betting-mlb "Machine Learning for Sports Betting: MLB Edition"),
where we walk through the entire process with a similar workflow, going from data all the way to production.


The workflow for this algorithm is as follows:

1. Register for a [prop-odds](https://www.prop-odds.com/) API key

2. Run the “mlb-runline-dataset-builder.py” file

   - This builds the original dataset and takes about 15-30 minutes

3. Run the “mlb-runline-daily.py” file

   - This builds the dataset that will be used to get predictions for that day's games.

4. In Google Colab, run the “MLB_Runline_Training.ipynb” file

   - This file is responsible for comparing and training the dozens of available models. It isn't necessary to make any changes to the model, but you have the freedom to experiment.

   - Running the file will create a .pkl file containing the model of your choice; be sure to upload this to your Google Drive.

5. In Google Colab, run the “MLB_Runline_Production.ipynb" file

   - This file will deploy the model you saved and generate predictions and theoretical odds.

6. To add new, future data points without having to re-build the entire dataset, run “mlb-runline-dataset-production.py” in lieu of “mlb-runline-dataset-builder.py”.

7. To start tracking predictions before going live, visit [The Action Network](https://www.actionnetwork.com/).

8. Finally, you're all set! 😄
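Note: the prop-odds API returns team names in the doubled short format shown in `short_long_mlb_names.csv` above, and each script resolves them to full names with its `name_converter` helper. A minimal sketch of that lookup (assuming only that pandas is installed and the CSV sits in the working directory; `lookup` is our name, not part of the scripts):

```python
import pandas as pd

# Load the short -> long mapping shipped with the repo
names = pd.read_csv("short_long_mlb_names.csv")
lookup = dict(zip(names["short_name"], names["long_name"]))

print(lookup["NY Yankees - NY Yankees"])  # -> "New York Yankees"
```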
--------------------------------------------------------------------------------
/MLB_Runline_Training.ipynb:
--------------------------------------------------------------------------------
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "gTKL-2tRlNIT"
      },
      "outputs": [],
      "source": [
        "!pip install pycaret\n",
        "!pip install mysql-connector-python\n",
        "!pip install sqlalchemy==1.4.32"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "W1oH3E8Nln9t"
      },
      "outputs": [],
      "source": [
        "import pycaret\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import sqlalchemy\n",
        "import mysql.connector\n",
        "\n",
        "from pycaret import classification\n",
        "from google.colab import drive\n",
        "from datetime import datetime\n",
        "from google.colab.data_table import DataTable\n",
        "from google.colab import files"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dNNe7YwMlpgP"
      },
      "outputs": [],
      "source": [
        "engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306/database-name')\n",
        "\n",
        "Raw_Dataset = pd.read_sql(sql = \"SELECT * FROM baseball_spread\", con = engine).set_index(\"game_datetime\").reset_index()[[\"game_datetime\", \"team_1\", \"team_1_spread_odds\", \"team_2\", \"team_2_spread_odds\", \"venue_name\", \"spread\"]]\n",
        "print(len(Raw_Dataset))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "K4RnhrRmKlwR"
      },
      "outputs": [],
      "source": [
        "Raw_Dataset"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "Classification_Setup = pycaret.classification.setup(data = Raw_Dataset, target = \"spread\", train_size = .70, use_gpu = False, data_split_stratify = False, data_split_shuffle = False, date_features = [\"game_datetime\"], categorical_features = [\"team_1\",\"team_2\",\"venue_name\"])"
      ],
      "metadata": {
        "id": "Wb0v2bKsmJOc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jnTnG8aSK74e"
      },
      "source": [
        "***Compare/Create Model***"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "Classification_Models = Classification_Setup.compare_models(turbo = False)"
      ],
      "metadata": {
        "id": "YTAPYfpanoXE"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "You can use the multi-layer perceptron (\"mlp\") as outlined in the post, but a random forest (\"rf\") classifier is more effective. A neural network is a ***bit*** of overkill."
      ],
      "metadata": {
        "id": "KK66V4NXFy0n"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "Classification_Model = Classification_Setup.create_model(\"rf\")"
      ],
      "metadata": {
        "id": "tY82ShLirM3B"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "Prediction = Classification_Setup.predict_model(Classification_Model)\n",
        "Prediction.head(20)"
      ],
      "metadata": {
        "id": "ZKYRxxPLXsE3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "Classification_Setup.evaluate_model(Classification_Model)"
      ],
      "metadata": {
        "id": "Cjj3McLgxFfT"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "odO3tScVvMTy"
      },
      "source": [
        "***Save Model***"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "d16cozQ4vLxv"
      },
      "outputs": [],
      "source": [
        "Finalized_Model = pycaret.classification.finalize_model(Classification_Model)\n",
        "Finalized_Model_save_to_file_string = f\"{datetime.today().strftime('%Y-%m-%d')} Baseball Spread\"\n",
        "pycaret.classification.save_model(Finalized_Model, Finalized_Model_save_to_file_string)\n",
        "files.download(f\"{Finalized_Model_save_to_file_string}.pkl\")"
      ]
    }
  ]
}
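A note on reading the `compare_models` table in this notebook: raw accuracy only matters relative to the breakeven win rate implied by the odds you would bet into. A standalone sketch of that conversion (the function name is ours; the math is the standard American-odds identity):

```python
def breakeven_win_rate(american_odds: int) -> float:
    """Win probability needed to break even at the given American odds."""
    if american_odds < 0:
        return -american_odds / (-american_odds + 100)
    return 100 / (american_odds + 100)

print(round(breakeven_win_rate(-110), 3))  # 0.524 -- a -110 runline bet must win ~52.4% of the time
print(round(breakeven_win_rate(+150), 3))  # 0.400
```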
--------------------------------------------------------------------------------
/MLB_Runline_Production.ipynb:
--------------------------------------------------------------------------------
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "**Install Packages**"
      ],
      "metadata": {
        "id": "Si6oYuigEsWJ"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-AV9WzDzEcTt"
      },
      "outputs": [],
      "source": [
        "!pip install pycaret\n",
        "!pip install mysql-connector-python\n",
        "!pip install sqlalchemy==1.4.32\n",
        "!pip install MLB-StatsAPI"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "**Import Packages**"
      ],
      "metadata": {
        "id": "AL_lGnkTE7af"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import pycaret\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import sqlalchemy\n",
        "import mysql.connector\n",
        "import sys\n",
        "import requests\n",
        "import statsapi\n",
        "\n",
        "from pycaret import classification\n",
        "from datetime import datetime\n",
        "from google.colab.data_table import DataTable\n",
        "from google.colab import files\n",
        "from google.colab import drive\n",
        "\n",
        "sys.path.append(\"/content/drive/MyDrive\")\n",
        "\n",
        "drive.mount('/content/drive')\n",
        "\n",
        "def odds_calculator(probability):\n",
        "\n",
        "    return round(-100 / ((1/probability)-1))"
      ],
      "metadata": {
        "id": "N3dnZwMJFEe5"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "**Import Data**"
      ],
      "metadata": {
        "id": "Bx0rUFJ1FraV"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306/database-name')\n",
        "\n",
        "Raw_Dataset = pd.read_sql(sql = \"SELECT * FROM baseball_spread_production\", con = engine).set_index(\"game_datetime\").reset_index()[[\"game_datetime\", \"team_1\", \"team_1_spread_odds\", \"team_2\", \"team_2_spread_odds\", \"venue_name\"]]\n",
        "\n",
        "print(len(Raw_Dataset))"
      ],
      "metadata": {
        "id": "4nMUvN_HFvoi"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "Raw_Dataset"
      ],
      "metadata": {
        "id": "8-tmvY2cagZd"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### **Load Models**"
      ],
      "metadata": {
        "id": "9P_9ThrHGyPd"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "#Finalized_Model_save_to_file_string = f\"2023-07-22 Baseball Spread\"\n",
        "Finalized_Model_save_to_file_string = f\"{datetime.today().strftime('%Y-%m-%d')} Baseball Spread\"\n",
        "\n",
        "Classification_Model = pycaret.classification.load_model(f\"/content/drive/MyDrive/{Finalized_Model_save_to_file_string}\")\n",
        "\n",
        "print(Finalized_Model_save_to_file_string)"
      ],
      "metadata": {
        "id": "-j2zDCd7Y_8z"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### **Get Predictions of Day**"
      ],
      "metadata": {
        "id": "L6GgjUXpTQCv"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "Prediction = pycaret.classification.predict_model(estimator = Classification_Model, data = Raw_Dataset)"
      ],
      "metadata": {
        "id": "e-Djt_9BTVrd"
      },
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "Prediction[\"theo_odds\"] = Prediction[\"prediction_score\"].apply(odds_calculator)"
      ],
      "metadata": {
        "id": "KyvX6ywuOlDp"
      },
      "execution_count": 11,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "Prediction[[\"team_1\", \"team_2\", \"prediction_label\", \"prediction_score\", \"theo_odds\"]].sort_values(by = \"prediction_score\", ascending = False)"
      ],
      "metadata": {
        "id": "km4EQOOOZ7tI"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}
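The notebook's `odds_calculator` turns a predicted probability into theoretical American odds via `-100 / ((1/p) - 1)`, which is algebraically `-100p / (1 - p)`; for example, p = 0.60 gives -150. That form is only valid for p > 0.5, which usually holds here because PyCaret's `prediction_score` is the probability of the *predicted* label. A sketch that also handles the underdog side (our extension, not the author's code):

```python
def odds_calculator_extended(probability: float) -> int:
    # Favorite side: identical to the notebook's formula, e.g. 0.60 -> -150
    if probability > 0.5:
        return round(-100 * probability / (1 - probability))
    # Underdog side (our assumption/extension), e.g. 0.40 -> +150
    return round(100 * (1 - probability) / probability)
```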
"source": [ 84 | "**Import Data**" 85 | ], 86 | "metadata": { 87 | "id": "Bx0rUFJ1FraV" 88 | } 89 | }, 90 | { 91 | "cell_type": "code", 92 | "source": [ 93 | "engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306/database-name')\n", 94 | "\n", 95 | "Raw_Dataset = pd.read_sql(sql = \"SELECT * FROM baseball_spread_production\", con = engine).set_index(\"game_datetime\").reset_index()[[\"game_datetime\", \"team_1\",\t\"team_1_spread_odds\",\t\"team_2\",\t\"team_2_spread_odds\",\t\"venue_name\"]]\n", 96 | "\n", 97 | "print(len(Raw_Dataset))" 98 | ], 99 | "metadata": { 100 | "id": "4nMUvN_HFvoi" 101 | }, 102 | "execution_count": null, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "source": [ 108 | "Raw_Dataset" 109 | ], 110 | "metadata": { 111 | "id": "8-tmvY2cagZd" 112 | }, 113 | "execution_count": null, 114 | "outputs": [] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "source": [ 119 | "### **Load Models**" 120 | ], 121 | "metadata": { 122 | "id": "9P_9ThrHGyPd" 123 | } 124 | }, 125 | { 126 | "cell_type": "code", 127 | "source": [ 128 | "#Finalized_Model_save_to_file_string = f\"2023-07-22 Baseball Spread\"\n", 129 | "Finalized_Model_save_to_file_string = f\"{datetime.today().strftime('%Y-%m-%d')} Baseball Spread\"\n", 130 | "\n", 131 | "Classification_Model = pycaret.classification.load_model(f\"/content/drive/MyDrive/{Finalized_Model_save_to_file_string}\")\n", 132 | "\n", 133 | "print(Finalized_Model_save_to_file_string)" 134 | ], 135 | "metadata": { 136 | "id": "-j2zDCd7Y_8z" 137 | }, 138 | "execution_count": null, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "source": [ 144 | "### **Get Predictions of Day**" 145 | ], 146 | "metadata": { 147 | "id": "L6GgjUXpTQCv" 148 | } 149 | }, 150 | { 151 | "cell_type": "code", 152 | "source": [ 153 | "Prediction = pycaret.classification.predict_model(estimator = Classification_Model, data = Raw_Dataset)" 154 | ], 155 | "metadata": { 156 | "id": "e-Djt_9BTVrd" 157 | }, 158 | "execution_count": 10, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "source": [ 164 | "Prediction[\"theo_odds\"] = Prediction[\"prediction_score\"].apply(odds_calculator)" 165 | ], 166 | "metadata": { 167 | "id": "KyvX6ywuOlDp" 168 | }, 169 | "execution_count": 11, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "source": [ 175 | "Prediction[[\"team_1\", \"team_2\",\"prediction_label\",\"prediction_score\", \"theo_odds\"]].sort_values(by = \"prediction_score\", ascending = False)" 176 | ], 177 | "metadata": { 178 | "id": "km4EQOOOZ7tI" 179 | }, 180 | "execution_count": null, 181 | "outputs": [] 182 | } 183 | ] 184 | } -------------------------------------------------------------------------------- /mlb-runline-daily.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created 2023 4 | 5 | @author: Quant Galore 6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | 10 | import pandas as pd 11 | import statsapi 12 | import requests 13 | import numpy as np 14 | import sqlalchemy 15 | import mysql.connector 16 | 17 | Short_Long_Names = pd.read_csv("short_long_mlb_names.csv") 18 | 19 | def name_converter(short_name): 20 | 21 | long_name = Short_Long_Names[Short_Long_Names["short_name"] == short_name]["long_name"] 22 | 23 | if len(long_name) < 1 : 24 | 25 | return np.nan 26 | else: 27 | 28 | return long_name.iloc[0] 29 | 30 | API_KEY = "your prop-odds.com api 
key" 31 | 32 | # ============================================================================= 33 | # Start 34 | # ============================================================================= 35 | 36 | # This is the production dataset, designed to only append new values, as oppposed to having to constantly re-build the dataset 37 | # So, we set the start date to 7 days prior. This way, we check all of the games in that period to include any days that we missed 38 | 39 | begin_date = (datetime.today()).strftime("%Y-%m-%d") 40 | ending_date = (datetime.today()).strftime("%Y-%m-%d") 41 | 42 | Schedule = statsapi.schedule(start_date = begin_date, end_date = ending_date) 43 | Schedule_DataFrame = pd.json_normalize(Schedule) 44 | 45 | date_range = pd.date_range(start = begin_date, end = ending_date) 46 | 47 | odds_list = [] 48 | 49 | # The spread market represents the "runline" bet 50 | 51 | market = "spread" 52 | 53 | for date in date_range: 54 | 55 | date = date.strftime("%Y-%m-%d") 56 | url = f"https://api.prop-odds.com/beta/games/mlb?date={date}&tz=America/Chicago&api_key={API_KEY}" 57 | games_url = f"https://api.prop-odds.com/beta/games/mlb?date={date}&api_key={API_KEY}" 58 | 59 | 60 | games = pd.json_normalize(requests.get(games_url).json()["games"]) 61 | 62 | if len(games) < 1: 63 | 64 | continue 65 | 66 | for game_id in games["game_id"]: 67 | 68 | Game = games[games["game_id"] == game_id] 69 | 70 | sportsbook = [] 71 | 72 | odds_url = f"https://api.prop-odds.com/beta/odds/{game_id}/{market}?api_key={API_KEY}" 73 | odds = requests.get(odds_url).json() 74 | 75 | if len(odds) < 2: 76 | continue 77 | 78 | else: 79 | 80 | # DraftKings generally offers the best odds, so for uniformity, we only include odds sourced from DraftKings 81 | 82 | for book in odds["sportsbooks"]: 83 | 84 | if book["bookie_key"] == "draftkings": 85 | sportsbook = book 86 | else: 87 | continue 88 | 89 | if len(sportsbook) < 1: 90 | 91 | continue 92 | 93 | odds_data = pd.json_normalize(sportsbook["market"]["outcomes"]) 94 | 95 | # The runline (-1.5) refers to the favorite winning by 2 or more points, so we have to first pull who the favorite is 96 | 97 | moneyline_url = f"https://api.prop-odds.com/beta/odds/{game_id}/moneyline?api_key={API_KEY}" 98 | moneyline_odds = requests.get(moneyline_url).json() 99 | 100 | if len(moneyline_odds) < 2: 101 | continue 102 | 103 | else: 104 | 105 | for moneyline_book in moneyline_odds["sportsbooks"]: 106 | 107 | if moneyline_book["bookie_key"] == "draftkings": 108 | moneyline_sportsbook = moneyline_book 109 | else: 110 | continue 111 | 112 | if len(moneyline_sportsbook) < 1: 113 | 114 | continue 115 | 116 | moneyline_odds_data = pd.json_normalize(moneyline_sportsbook["market"]["outcomes"]) 117 | 118 | if moneyline_odds_data["odds"].max() < 0: 119 | continue 120 | 121 | moneyline_favorite = moneyline_odds_data[moneyline_odds_data["odds"] < 0].sort_values(by = "timestamp", ascending = True).head(1)["name"].iloc[0] 122 | moneyline_underdog = moneyline_odds_data[moneyline_odds_data["odds"] > 0].sort_values(by = "timestamp", ascending = True).head(1)["name"].iloc[0] 123 | 124 | # We sort by earliest available pre-game odds first, since the API may occasionally include odds that were set mid-game. 

        favorite = odds_data[(odds_data["handicap"] == -1.5) & (odds_data["name"] == moneyline_favorite)].sort_values(by = "timestamp", ascending = True).head(1)
        underdog = odds_data[(odds_data["handicap"] == 1.5) & (odds_data["name"] == moneyline_underdog)].sort_values(by = "timestamp", ascending = True).head(1)

        if len(favorite) < 1:
            continue
        elif len(underdog) < 1:
            continue

        team_1_favorite = favorite["name"].drop_duplicates().iloc[0]
        team_2_underdog = underdog["name"].drop_duplicates().iloc[0]

        team_1_favorite_odds = favorite["odds"].iloc[0]
        team_2_underdog_odds = underdog["odds"].iloc[0]

        odds_dataframe = pd.DataFrame([[team_1_favorite, team_1_favorite_odds, team_2_underdog, team_2_underdog_odds]],
                                      columns = ["team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds"])

        full_odds_dataframe = pd.concat([Game.reset_index(drop = True), odds_dataframe], axis = 1)

        if len(full_odds_dataframe) > 1:
            continue

        odds_list.append(full_odds_dataframe)

full_odds = pd.concat(odds_list).reset_index(drop = True).rename(columns = {"away_team":"away_name",
                                                                            "home_team":"home_name",
                                                                            "start_timestamp":"game_datetime"})

Merged_DataFrame = pd.merge(Schedule_DataFrame, full_odds, on = ["game_datetime", "away_name", "home_name"])

Merged_DataFrame["team_1"] = Merged_DataFrame["team_1"].apply(name_converter)
Merged_DataFrame["team_2"] = Merged_DataFrame["team_2"].apply(name_converter)

Featured_Merged_DataFrame = Merged_DataFrame[["game_datetime", "away_name", "home_name", "team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds", "venue_name"]].copy().set_index("game_datetime")

# "team_1" always represents the favorite

Featured_Spread_DataFrame = Featured_Merged_DataFrame[["team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds", "venue_name"]].copy().reset_index().set_index("game_datetime")

# To weed out any errors in the data set, we only include rows where the odds are between -200 and +200
# The odds for these bets are almost never set outside of that range, so anything beyond it is excluded

Featured_Spread_DataFrame = Featured_Spread_DataFrame[(abs(Featured_Spread_DataFrame["team_1_spread_odds"]) < 200) & (abs(Featured_Spread_DataFrame["team_2_spread_odds"]) < 200)]
Featured_Spread_DataFrame = Featured_Spread_DataFrame[Featured_Spread_DataFrame["team_1"] != Featured_Spread_DataFrame["team_2"]]
Featured_Spread_DataFrame.index = pd.to_datetime(Featured_Spread_DataFrame.index).tz_convert("America/Chicago")

# =============================================================================
# End
# =============================================================================

# We initialize our sqlalchemy engine, then submit the data to the database

engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306/database-name')

# The daily production table is dropped each day so that it only ever holds the current day's games

with engine.connect() as conn:
    result = conn.execute(sqlalchemy.text('DROP TABLE IF EXISTS baseball_spread_production'))

Featured_Spread_DataFrame.to_sql("baseball_spread_production", con = engine, if_exists = "append")

--------------------------------------------------------------------------------
/mlb-runline-dataset-builder.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created 2023

@author: Quant Galore
"""

from datetime import datetime, timedelta

import pandas as pd
import statsapi
import requests
import numpy as np
import sqlalchemy
import mysql.connector

Short_Long_Names = pd.read_csv("short_long_mlb_names.csv")

def name_converter(short_name):

    long_name = Short_Long_Names[Short_Long_Names["short_name"] == short_name]["long_name"]

    if len(long_name) < 1:

        return np.nan
    else:

        return long_name.iloc[0]

API_KEY = "your prop-odds.com api key"

# =============================================================================
# Start
# =============================================================================

# First, we call the schedule API to get a list of all games played from the start of the season up to yesterday.

begin_date = "2023-03-30"
ending_date = (datetime.today() - timedelta(days = 1)).strftime("%Y-%m-%d")

Schedule = statsapi.schedule(start_date = begin_date, end_date = ending_date)
Schedule_DataFrame = pd.json_normalize(Schedule)
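# Fields relied on downstream from the statsapi schedule records (per the
# column selections later in this script): game_datetime, away_name, home_name,
# away_score, home_score, venue_name, winning_team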

date_range = pd.date_range(start = begin_date, end = ending_date)

odds_list = []

# The spread market represents the "runline" bet

market = "spread"

for date in date_range:

    date = date.strftime("%Y-%m-%d")
    games_url = f"https://api.prop-odds.com/beta/games/mlb?date={date}&api_key={API_KEY}"

    games = pd.json_normalize(requests.get(games_url).json()["games"])

    if len(games) < 1:

        continue

    for game_id in games["game_id"]:

        Game = games[games["game_id"] == game_id]

        sportsbook = []
        moneyline_sportsbook = []

        odds_url = f"https://api.prop-odds.com/beta/odds/{game_id}/{market}?api_key={API_KEY}"
        odds = requests.get(odds_url).json()

        if len(odds) < 2:
            continue

        else:

            # DraftKings generally offers the best odds, so for uniformity, we only include odds sourced from DraftKings

            for book in odds["sportsbooks"]:

                if book["bookie_key"] == "draftkings":
                    sportsbook = book
                else:
                    continue

        if len(sportsbook) < 1:

            continue

        odds_data = pd.json_normalize(sportsbook["market"]["outcomes"])

        # The runline (-1.5) refers to the favorite winning by 2 or more runs, so we first have to pull who the favorite is

        moneyline_url = f"https://api.prop-odds.com/beta/odds/{game_id}/moneyline?api_key={API_KEY}"
        moneyline_odds = requests.get(moneyline_url).json()

        if len(moneyline_odds) < 2:
            continue

        else:

            for moneyline_book in moneyline_odds["sportsbooks"]:

                if moneyline_book["bookie_key"] == "draftkings":
                    moneyline_sportsbook = moneyline_book
                else:
                    continue

        if len(moneyline_sportsbook) < 1:

            continue

        moneyline_odds_data = pd.json_normalize(moneyline_sportsbook["market"]["outcomes"])

        if moneyline_odds_data["odds"].max() < 0:
            continue

        moneyline_favorite = moneyline_odds_data[moneyline_odds_data["odds"] < 0].sort_values(by = "timestamp", ascending = True).head(1)["name"].iloc[0]
        moneyline_underdog = moneyline_odds_data[moneyline_odds_data["odds"] > 0].sort_values(by = "timestamp", ascending = True).head(1)["name"].iloc[0]

        # We sort by earliest available pre-game odds first, since the API may occasionally include odds that were set mid-game.

        favorite = odds_data[(odds_data["handicap"] == -1.5) & (odds_data["name"] == moneyline_favorite)].sort_values(by = "timestamp", ascending = True).head(1)
        underdog = odds_data[(odds_data["handicap"] == 1.5) & (odds_data["name"] == moneyline_underdog)].sort_values(by = "timestamp", ascending = True).head(1)

        if len(favorite) < 1:
            continue
        elif len(underdog) < 1:
            continue

        team_1_favorite = favorite["name"].drop_duplicates().iloc[0]
        team_2_underdog = underdog["name"].drop_duplicates().iloc[0]

        team_1_favorite_odds = favorite["odds"].iloc[0]
        team_2_underdog_odds = underdog["odds"].iloc[0]

        odds_dataframe = pd.DataFrame([[team_1_favorite, team_1_favorite_odds, team_2_underdog, team_2_underdog_odds]],
                                      columns = ["team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds"])

        full_odds_dataframe = pd.concat([Game.reset_index(drop = True), odds_dataframe], axis = 1)

        if len(full_odds_dataframe) > 1:
            continue

        odds_list.append(full_odds_dataframe)

full_odds = pd.concat(odds_list).reset_index(drop = True).rename(columns = {"away_team":"away_name",
                                                                            "home_team":"home_name",
                                                                            "start_timestamp":"game_datetime"})

Merged_DataFrame = pd.merge(Schedule_DataFrame, full_odds, on = ["game_datetime", "away_name", "home_name"])

Merged_DataFrame["team_1"] = Merged_DataFrame["team_1"].apply(name_converter)
Merged_DataFrame["team_2"] = Merged_DataFrame["team_2"].apply(name_converter)

Featured_Merged_DataFrame = Merged_DataFrame[["game_datetime", "away_name", "home_name", "away_score", "home_score", "team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds", "venue_name", "winning_team"]].copy().set_index("game_datetime")

# "team_1" always represents the favorite

# If the favorite was the home team, then home score - away score gives us the spread -- vice versa if the favorite was the away team

team_1_away_wins = Featured_Merged_DataFrame[Featured_Merged_DataFrame["away_name"] == Featured_Merged_DataFrame["team_1"]].copy()
team_1_away_wins["spread"] = team_1_away_wins["away_score"].astype(int) - team_1_away_wins["home_score"].astype(int)

team_1_home_wins = Featured_Merged_DataFrame[Featured_Merged_DataFrame["home_name"] == Featured_Merged_DataFrame["team_1"]].copy()
team_1_home_wins["spread"] = team_1_home_wins["home_score"].astype(int) - team_1_home_wins["away_score"].astype(int)

spread_dataframe = pd.concat([team_1_away_wins, team_1_home_wins], axis = 0)

def spread_converter(spread):

    if spread >= 2:

        return 1
    else:
        return 0


# If the favorite won the game by 2 or more runs, we assign a 1
# If the favorite wins by fewer than 2 runs, or if the underdog wins, we assign a 0

spread_dataframe["spread"] = spread_dataframe["spread"].apply(spread_converter)
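# Worked example: a home favorite that wins 5-3 has spread = 2 -> label 1 (runline covered);
# a 4-3 win gives spread = 1 -> label 0; a 3-4 loss gives spread = -1 -> label 0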

Featured_Spread_DataFrame = spread_dataframe[["team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds", "venue_name", "spread"]].copy().reset_index().set_index("game_datetime")

# To weed out any errors in the data set, we only include rows where the odds are between -200 and +200
# The odds for these bets are almost never set outside of that range, so anything beyond it is excluded

Featured_Spread_DataFrame = Featured_Spread_DataFrame[(abs(Featured_Spread_DataFrame["team_1_spread_odds"]) < 200) & (abs(Featured_Spread_DataFrame["team_2_spread_odds"]) < 200)]
Featured_Spread_DataFrame = Featured_Spread_DataFrame[Featured_Spread_DataFrame["team_1"] != Featured_Spread_DataFrame["team_2"]]
Featured_Spread_DataFrame.index = pd.to_datetime(Featured_Spread_DataFrame.index).tz_convert("America/Chicago")

# We initialize our sqlalchemy engine, then submit the data to the database
# Note: the database created here must match the "database-name" placeholder used in the connection strings

initial_engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306')

with initial_engine.connect() as conn:
    result = conn.execute(sqlalchemy.text('CREATE DATABASE IF NOT EXISTS `database-name`'))

engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306/database-name')

Featured_Spread_DataFrame.to_sql("baseball_spread", con = engine, if_exists = "append")

# If you make a mistake, or wish to re-build the dataset, you can drop the table and start over.
# This is left commented out so a normal run doesn't delete the table it just built:

# with engine.connect() as conn:
#     result = conn.execute(sqlalchemy.text('DROP TABLE baseball_spread'))
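The builder above re-creates the full history in one shot; the production script below appends only rows it hasn't stored yet, keyed on a composite identifier of game datetime + teams + venue. A runnable miniature of that dedup step (toy data; the frame and variable names are ours, the column name matches the script):

```python
import pandas as pd

existing = pd.DataFrame({"unique_identifier": ["2023-07-01 18:05:00New York YankeesBoston Red SoxYankee Stadium"]})

fresh = pd.DataFrame({
    "unique_identifier": [
        "2023-07-01 18:05:00New York YankeesBoston Red SoxYankee Stadium",  # already stored
        "2023-07-02 18:05:00New York YankeesBoston Red SoxYankee Stadium",  # new row
    ],
    "spread": [1, 0],
})

# Keep only rows whose identifier is not already in the database
new_data = fresh[~fresh["unique_identifier"].isin(existing["unique_identifier"])]
print(new_data)
```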

--------------------------------------------------------------------------------
/mlb-runline-dataset-production.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created 2023

@author: Quant Galore
"""

from datetime import datetime, timedelta

import pandas as pd
import statsapi
import requests
import numpy as np
import sqlalchemy
import mysql.connector

Short_Long_Names = pd.read_csv("short_long_mlb_names.csv")

def name_converter(short_name):

    long_name = Short_Long_Names[Short_Long_Names["short_name"] == short_name]["long_name"]

    if len(long_name) < 1:

        return np.nan
    else:

        return long_name.iloc[0]

API_KEY = "your prop-odds.com api key"

# =============================================================================
# Start
# =============================================================================

# This is the production dataset, designed to only append new values, as opposed to having to constantly re-build the dataset
# So, we set the start date to 7 days prior; this way, we check all of the games in that period and pick up any days we missed

begin_date = (datetime.today() - timedelta(days = 7)).strftime("%Y-%m-%d")
ending_date = (datetime.today() - timedelta(days = 1)).strftime("%Y-%m-%d")

Schedule = statsapi.schedule(start_date = begin_date, end_date = ending_date)
Schedule_DataFrame = pd.json_normalize(Schedule)

date_range = pd.date_range(start = begin_date, end = ending_date)

odds_list = []

# The spread market represents the "runline" bet

market = "spread"

for date in date_range:

    date = date.strftime("%Y-%m-%d")
    games_url = f"https://api.prop-odds.com/beta/games/mlb?date={date}&api_key={API_KEY}"

    games = pd.json_normalize(requests.get(games_url).json()["games"])

    if len(games) < 1:

        continue

    for game_id in games["game_id"]:

        Game = games[games["game_id"] == game_id]

        sportsbook = []
        moneyline_sportsbook = []

        odds_url = f"https://api.prop-odds.com/beta/odds/{game_id}/{market}?api_key={API_KEY}"
        odds = requests.get(odds_url).json()

        if len(odds) < 2:
            continue

        else:

            # DraftKings generally offers the best odds, so for uniformity, we only include odds sourced from DraftKings

            for book in odds["sportsbooks"]:

                if book["bookie_key"] == "draftkings":
                    sportsbook = book
                else:
                    continue

        if len(sportsbook) < 1:

            continue

        odds_data = pd.json_normalize(sportsbook["market"]["outcomes"])

        # The runline (-1.5) refers to the favorite winning by 2 or more runs, so we first have to pull who the favorite is

        moneyline_url = f"https://api.prop-odds.com/beta/odds/{game_id}/moneyline?api_key={API_KEY}"
        moneyline_odds = requests.get(moneyline_url).json()

        if len(moneyline_odds) < 2:
            continue

        else:

            for moneyline_book in moneyline_odds["sportsbooks"]:

                if moneyline_book["bookie_key"] == "draftkings":
                    moneyline_sportsbook = moneyline_book
                else:
                    continue

        if len(moneyline_sportsbook) < 1:

            continue

        moneyline_odds_data = pd.json_normalize(moneyline_sportsbook["market"]["outcomes"])

        if moneyline_odds_data["odds"].max() < 0:
            continue

        moneyline_favorite = moneyline_odds_data[moneyline_odds_data["odds"] < 0].sort_values(by = "timestamp", ascending = True).head(1)["name"].iloc[0]
        moneyline_underdog = moneyline_odds_data[moneyline_odds_data["odds"] > 0].sort_values(by = "timestamp", ascending = True).head(1)["name"].iloc[0]

        # We sort by earliest available pre-game odds first, since the API may occasionally include odds that were set mid-game.

        favorite = odds_data[(odds_data["handicap"] == -1.5) & (odds_data["name"] == moneyline_favorite)].sort_values(by = "timestamp", ascending = True).head(1)
        underdog = odds_data[(odds_data["handicap"] == 1.5) & (odds_data["name"] == moneyline_underdog)].sort_values(by = "timestamp", ascending = True).head(1)

        if len(favorite) < 1:
            continue
        elif len(underdog) < 1:
            continue

        team_1_favorite = favorite["name"].drop_duplicates().iloc[0]
        team_2_underdog = underdog["name"].drop_duplicates().iloc[0]

        team_1_favorite_odds = favorite["odds"].iloc[0]
        team_2_underdog_odds = underdog["odds"].iloc[0]

        odds_dataframe = pd.DataFrame([[team_1_favorite, team_1_favorite_odds, team_2_underdog, team_2_underdog_odds]],
                                      columns = ["team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds"])

        full_odds_dataframe = pd.concat([Game.reset_index(drop = True), odds_dataframe], axis = 1)

        if len(full_odds_dataframe) > 1:
            continue

        odds_list.append(full_odds_dataframe)

full_odds = pd.concat(odds_list).reset_index(drop = True).rename(columns = {"away_team":"away_name",
                                                                            "home_team":"home_name",
                                                                            "start_timestamp":"game_datetime"})

Merged_DataFrame = pd.merge(Schedule_DataFrame, full_odds, on = ["game_datetime", "away_name", "home_name"])

Merged_DataFrame["team_1"] = Merged_DataFrame["team_1"].apply(name_converter)
Merged_DataFrame["team_2"] = Merged_DataFrame["team_2"].apply(name_converter)

Featured_Merged_DataFrame = Merged_DataFrame[["game_datetime", "away_name", "home_name", "away_score", "home_score", "team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds", "venue_name", "winning_team"]].copy().set_index("game_datetime")

# "team_1" always represents the favorite

# If the favorite was the home team, then home score - away score gives us the spread -- vice versa if the favorite was the away team

team_1_away_wins = Featured_Merged_DataFrame[Featured_Merged_DataFrame["away_name"] == Featured_Merged_DataFrame["team_1"]].copy()
team_1_away_wins["spread"] = team_1_away_wins["away_score"].astype(int) - team_1_away_wins["home_score"].astype(int)

team_1_home_wins = Featured_Merged_DataFrame[Featured_Merged_DataFrame["home_name"] == Featured_Merged_DataFrame["team_1"]].copy()
team_1_home_wins["spread"] = team_1_home_wins["home_score"].astype(int) - team_1_home_wins["away_score"].astype(int)

spread_dataframe = pd.concat([team_1_away_wins, team_1_home_wins], axis = 0)

def spread_converter(spread):

    if spread >= 2:

        return 1
    else:
        return 0


# If the favorite won the game by 2 or more runs, we assign a 1
# If the favorite wins by fewer than 2 runs, or if the underdog wins, we assign a 0

spread_dataframe["spread"] = spread_dataframe["spread"].apply(spread_converter)

Featured_Spread_DataFrame = spread_dataframe[["team_1", "team_1_spread_odds", "team_2", "team_2_spread_odds", "venue_name", "spread"]].copy().reset_index().set_index("game_datetime")

# To weed out any errors in the data set, we only include rows where the odds are between -200 and +200
# The odds for these bets are almost never set outside of that range, so anything beyond it is excluded

Featured_Spread_DataFrame = Featured_Spread_DataFrame[(abs(Featured_Spread_DataFrame["team_1_spread_odds"]) < 200) & (abs(Featured_Spread_DataFrame["team_2_spread_odds"]) < 200)]
Featured_Spread_DataFrame = Featured_Spread_DataFrame[Featured_Spread_DataFrame["team_1"] != Featured_Spread_DataFrame["team_2"]]
Featured_Spread_DataFrame.index = pd.to_datetime(Featured_Spread_DataFrame.index).tz_convert("America/Chicago")

# We initialize our sqlalchemy engine, then submit only the new rows to the database

engine = sqlalchemy.create_engine('mysql+mysqlconnector://username:password@database-host-name:3306/database-name')

existing_data = pd.read_sql("SELECT * FROM baseball_spread", con = engine)
existing_data["unique_identifier"] = existing_data["game_datetime"].astype(str) + existing_data["team_1"].astype(str) + existing_data["team_2"].astype(str) + existing_data["venue_name"].astype(str)

Featured_Spread_DataFrame["unique_identifier"] = Featured_Spread_DataFrame.index.strftime("%Y-%m-%d %H:%M:%S") + Featured_Spread_DataFrame["team_1"].astype(str) + Featured_Spread_DataFrame["team_2"].astype(str) + Featured_Spread_DataFrame["venue_name"].astype(str)

new_data = Featured_Spread_DataFrame[~Featured_Spread_DataFrame["unique_identifier"].isin(existing_data["unique_identifier"])]
new_data = new_data.drop("unique_identifier", axis = 1)

new_data.to_sql("baseball_spread", con = engine, if_exists = "append")
--------------------------------------------------------------------------------
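After an append, a quick row count confirms the table actually grew. A minimal sketch, assuming the same placeholder connection details used throughout:

```python
import sqlalchemy

engine = sqlalchemy.create_engine("mysql+mysqlconnector://username:password@database-host-name:3306/database-name")

# Count the stored training rows after the append
with engine.connect() as conn:
    count = conn.execute(sqlalchemy.text("SELECT COUNT(*) FROM baseball_spread")).scalar()

print(count)
```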