├── year_count.yaml
├── ROC_curve_class.png
├── pca_components.png
├── binary_keras_deep.h5
├── class_label_count.png
├── correlations_class.png
├── SHAP_feature_importances.png
├── feature_importance_xgb_classifier.png
├── extra
│   ├── exact_match.py
│   └── analyze_output.py
├── cbb.yaml
├── LICENSE
├── README.md
├── .gitignore
├── teams_sports_ref_format.csv
├── all_teams_cbb.csv
├── cbb_web_scraper.py
├── deep_learn.py
├── deep_learn_regressor.py
├── deep_learn_MA.py
├── cbb_regression.py
└── cbb_classification.py
/year_count.yaml:
--------------------------------------------------------------------------------
1 | year:
2 | - 2024
3 | - 2023
4 |
--------------------------------------------------------------------------------
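The file above lists the seasons to pull. A minimal sketch of reading it, assuming the scraping/training scripts consume it with PyYAML (the consuming code is not shown in this part of the dump):

```python
# Hedged sketch: load the season list from year_count.yaml (assumes PyYAML is installed).
import yaml

with open('year_count.yaml') as f:
    config = yaml.safe_load(f)

years = config['year']
print(years)  # [2024, 2023]
```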
/ROC_curve_class.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bszek213/cbb_machine_learning/HEAD/ROC_curve_class.png
--------------------------------------------------------------------------------
/pca_components.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bszek213/cbb_machine_learning/HEAD/pca_components.png
--------------------------------------------------------------------------------
/binary_keras_deep.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bszek213/cbb_machine_learning/HEAD/binary_keras_deep.h5
--------------------------------------------------------------------------------
/class_label_count.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bszek213/cbb_machine_learning/HEAD/class_label_count.png
--------------------------------------------------------------------------------
/correlations_class.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bszek213/cbb_machine_learning/HEAD/correlations_class.png
--------------------------------------------------------------------------------
/SHAP_feature_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bszek213/cbb_machine_learning/HEAD/SHAP_feature_importances.png
--------------------------------------------------------------------------------
/feature_importance_xgb_classifier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bszek213/cbb_machine_learning/HEAD/feature_importance_xgb_classifier.png
--------------------------------------------------------------------------------
/extra/exact_match.py:
--------------------------------------------------------------------------------
1 | from fuzzywuzzy import process
2 | import pandas as pd
3 |
4 | df = pd.read_csv('all_teams_cbb.csv')
5 | teams = pd.read_csv('teams_sports_ref_format.csv')
6 | def find_closest_match(school_name):
7 | closest_match = process.extractOne(school_name.lower(), teams['teams'])
8 | return closest_match[0]
9 |
10 | df['School'] = df['School'].apply(find_closest_match)
11 |
12 | for val in df['School']:
13 | print(val)
--------------------------------------------------------------------------------
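For context, `process.extractOne` returns a tuple whose first element is the best-matching string (with a similarity score, plus an index when the choices are a pandas Series), which is why the function above takes `[0]`. A small self-contained illustration of the mapping it performs, with a tiny slug list standing in for teams_sports_ref_format.csv:

```python
# Illustrative only; scores will vary with the fuzzywuzzy scorer used.
from fuzzywuzzy import process

slugs = ['abilene-christian', 'air-force', 'akron']
best = process.extractOne('Abilene Christian'.lower(), slugs)
print(best)     # e.g. ('abilene-christian', 95), i.e. (match, score)
print(best[0])  # 'abilene-christian'
```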
/cbb.yaml:
--------------------------------------------------------------------------------
1 | name: cbb
2 | channels:
3 | - conda-forge
4 | - robostack
5 | - anaconda
6 | - intel
7 | - rapidsai
8 | dependencies:
9 | - hvplot
10 | - numpy
11 | - pandas
12 | - holoviews
13 | - scikit-learn
14 | - keras
15 | - cudatoolkit=11.2
16 | - cudnn=8.1.0
17 | - scipy
18 | - ipython
19 | - plotly
20 | - seaborn
21 | - ipywidgets
22 | - ipykernel
23 | - matplotlib
24 | - spyder
25 | - notebook
26 | - keyboard
27 | - eli5
28 | - pip
29 | - pip:
30 | - sportsipy
31 | - tensorflow
32 | - beautifulsoup4
34 | - cfbd
35 |
--------------------------------------------------------------------------------
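This is a conda environment spec: `conda env create -f cbb.yaml` followed by `conda activate cbb` should reproduce it. The `cudatoolkit=11.2`/`cudnn=8.1.0` pins match the GPU requirements of the TensorFlow 2.5-2.10 series, so the pip-installed `tensorflow` presumably falls in that range.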
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Brian Szekely
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # College Basketball Game Predictions
2 |
3 | Machine learning models that predict the outcome of any Division I college basketball game, trained on data from the 2010-2024 seasons.
4 |
5 | Data are from SportsReference.com
6 |
7 | ## Usage
8 |
9 | ```bash
10 | python cbb_classification.py tune   # or: python cbb_classification.py notune
11 | ```
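Based on the console output below, the `tune` argument appears to run the cross-validated hyperparameter search before fitting, while `notune` fits with previously saved best parameters.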
12 |
13 | ### Current prediction accuracies
14 | Sample output after pre-processing and 5-fold cross-validation:
15 | ```bash
16 | Removed features (>=0.9 correlation): ['fta', 'fta_per_fga_pct', 'fg3a_per_fga_pct', 'ts_pct', 'stl_pct', 'blk_pct', 'efg_pct', 'tov_pct', 'orb_pct', 'ft_rate']
17 | dataset shape: (27973 samples, 55 features)
18 |
19 | Current XGBoost Classifier - best params: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 1, 'n_estimators': 200, 'reg_alpha': 0.01, 'reg_lambda': 0.01, 'scale_pos_weight': 1, 'subsample': 1.0}
20 |
21 | # Classification - XGBoost
22 | Confusion Matrix: [[1316   46]
23 |                   [  31 1404]]
24 | Model accuracy on test data: 0.9688952449052556
25 |
26 | # Classification - DNN Keras
27 | Final model test loss 0.07359004765748978 and accuracy 0.9760457873344421
28 | ```
30 | ### Correlation Matrix
31 | 
32 |
33 |
35 | ### Feature Importances Classification
36 | XGBoost
37 | 
38 | Deep Neural Network
39 | 
40 |
41 | ## Contributing
42 | Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
43 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | # key file
6 | key.txt
7 | errors.log
8 | #Compression
9 | randomForestModelTuned.pkl
10 | randomForestModelTuned.joblib
11 | classifierModelTuned.joblib
12 | classifierModelTuned_xgb.joblib
13 | #training directory
14 | cbb_sequential_hp/
15 | # C extensions
16 | *.so
17 |
18 | # Distribution / packaging
19 | .Python
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | wheels/
32 | pip-wheel-metadata/
33 | share/python-wheels/
34 | *.egg-info/
35 | .installed.cfg
36 | *.egg
37 | MANIFEST
38 |
39 | # PyInstaller
40 | # Usually these files are written by a python script from a template
41 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
42 | *.manifest
43 | *.spec
44 |
45 | #Python Files
46 | github.py
47 |
48 | # Installer logs
49 | pip-log.txt
50 | pip-delete-this-directory.txt
51 |
52 | # Unit test / coverage reports
53 | htmlcov/
54 | .tox/
55 | .nox/
56 | .coverage
57 | .coverage.*
58 | .cache
59 | nosetests.xml
60 | coverage.xml
61 | *.cover
62 | *.py,cover
63 | .hypothesis/
64 | .pytest_cache/
65 |
66 | # Translations
67 | *.mo
68 | *.pot
69 |
70 | # Django stuff:
71 | *.log
72 | local_settings.py
73 | db.sqlite3
74 | db.sqlite3-journal
75 |
76 | # Flask stuff:
77 | instance/
78 | .webassets-cache
79 |
80 | # Scrapy stuff:
81 | .scrapy
82 |
83 | # Sphinx documentation
84 | docs/_build/
85 |
86 | # PyBuilder
87 | target/
88 |
89 | # Jupyter Notebook
90 | .ipynb_checkpoints
91 |
92 | # IPython
93 | profile_default/
94 | ipython_config.py
95 |
96 | # pyenv
97 | .python-version
98 |
99 | # pipenv
100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
103 | # install all needed dependencies.
104 | #Pipfile.lock
105 |
106 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
107 | __pypackages__/
108 |
109 | # Celery stuff
110 | celerybeat-schedule
111 | celerybeat.pid
112 |
113 | # SageMath parsed files
114 | *.sage.py
115 |
116 | # Environments
117 | .env
118 | .venv
119 | env/
120 | venv/
121 | ENV/
122 | env.bak/
123 | venv.bak/
124 |
125 | # Spyder project settings
126 | .spyderproject
127 | .spyproject
128 |
129 | # Rope project settings
130 | .ropeproject
131 |
132 | # mkdocs documentation
133 | /site
134 |
135 | # mypy
136 | .mypy_cache/
137 | .dmypy.json
138 | dmypy.json
139 |
140 | # Pyre type checker
141 | .pyre/
142 |
--------------------------------------------------------------------------------
/extra/analyze_output.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | analyze output from machine learning model to determine why the model gets some games wrong
5 | @author: brianszekely
6 | """
7 | from pandas import read_csv, DataFrame
8 | import matplotlib.pyplot as plt
9 | from numpy import where, mean
10 | from scipy.stats import ttest_ind, pearsonr
11 | from seaborn import regplot
12 | def get_data(path):
13 | return read_csv(path)
14 | def basic_stats(df):
15 | df.dropna(inplace=True)
16 | #get difference in outcomes
17 | df['Team_1_pt_diff'] = abs(df['Team 1 Score'] - df['Team 1 Score Pred'])
18 | df['Team_2_pt_diff'] = abs(df['Team 2 Score'] - df['Team 2 Score Pred'])
19 | corr_median = where(df['Correct Median'] == 1)[0]
20 | incorr_median = where(df['Correct Median'] == 0)[0]
21 | #NO DIFFERENCE BETWEEN TEAM 1 VAR AND TEAM 2 VAR IN THE INCORRECT OUTCOMES
22 | # plt.bar('team_1_var',df['Team 1 Var'].iloc[incorr])
23 | # plt.bar('team_2_var',df['Team 2 Var'].iloc[incorr])
24 | # print(ttest_ind(df['Team 1 Var'].iloc[incorr],df['Team 2 Var'].iloc[incorr]))
25 | #NO DIFFERENCE BETWEEN TEAM 1 VAR AND TEAM 2 VAR IN THE CORRECT OUTCOMES
26 | # plt.bar('team_1_var',df['Team 1 Var'].iloc[corr])
27 | # plt.bar('team_2_var',df['Team 2 Var'].iloc[corr])
28 | # print(ttest_ind(df['Team 1 Var'].iloc[corr],df['Team 2 Var'].iloc[corr]))
29 | #LOW CORRELATIONS BETWEEN VARIABILITY AND ESTIMATED : ACTUAL POINT OUTCOMES
30 | # regplot(data=df,x='Team 1 Var',y='Team_1_pt_diff',scatter=True,fit_reg=True,label='team1')
31 | # regplot(data=df,x='Team 2 Var',y='Team_2_pt_diff',scatter=True,fit_reg=True,label='team2')
32 | # print(pearsonr(df['Team_1_pt_diff'],df['Team 1 Var']))
33 | # print(pearsonr(df['Team_2_pt_diff'],df['Team 2 Var']))
34 | # plt.legend()
35 | #NO DIFFERENCE IN VARIABILITY IN GAMES THAT ARE INCORRECTLY PREDICTED AND HAVE A LARGE PT DIFFERENTIAL COMPARED TO THE
36 | #TEAM THAT WAS CLOSER TO THE PREDICTED OUTCOME
37 | #THERE IS A SIGNIFICANT DIFFERENCE BETWEEN THE TEAM THAT IS BIGGER IN DIFFERENCE THAN THE TEAM HAS A SMALLER DIFFERENCE
38 | #MAY MEAN THAT ONE TEAM IS BEING INCORRECTLY PREDICTED, WHILE THE OTHER TEAM IS ALMOST SPOT ON
39 | # greater_diff = []
40 | # lesser_diff = []
41 | # for i in range(len(incorr_median)):
42 | # if df['Team_1_pt_diff'].iloc[i] > df['Team_2_pt_diff'].iloc[i]:
43 | # greater_diff.append(df['Team_1_pt_diff'].iloc[i])
44 | # lesser_diff.append(df['Team_2_pt_diff'].iloc[i])
45 | # else:
46 | # greater_diff.append(df['Team_2_pt_diff'].iloc[i])
47 | # lesser_diff.append(df['Team_1_pt_diff'].iloc[i])
48 | # plt.bar('greater_diff',mean(greater_diff))
49 | # plt.bar('lesser_diff',mean(lesser_diff))
50 | # print(ttest_ind(greater_diff,lesser_diff))
51 | #NO CORRELATION BETWEEN BEST TWO FEATURES STD AND PTS DIFF BETWEEN TEAMS WITH HIGH DIFF AND LOW DIFF
52 | greater_diff = []
53 | greater_var = []
54 | lesser_diff = []
55 | lesser_var = []
56 | for i in range(len(df)):
57 | if df['Team_1_pt_diff'].iloc[i] > df['Team_2_pt_diff'].iloc[i]:
58 | # greater_var.append(df['Team 1 Var'].iloc[i])
59 | # greater_diff.append(df['Team_1_pt_diff'].iloc[i])
60 | # lesser_diff.append(df['Team_2_pt_diff'].iloc[i])
61 | # lesser_var.append(df['Team 2 Var'].iloc[i])
62 | greater_diff.append(df['Team 1 Var'].iloc[i] / df['Team_1_pt_var'].iloc[i])
63 | lesser_diff.append(df['Team 2 Var'].iloc[i] / df['Team_2_pt_var'].iloc[i])
64 | else:
65 | greater_diff.append(df['Team 2 Var'].iloc[i] / df['Team_2_pt_var'].iloc[i])
66 | lesser_diff.append(df['Team 1 Var'].iloc[i] / df['Team_1_pt_var'].iloc[i])
67 | # greater_var.append(df['Team 2 Var'].iloc[i])
68 | # greater_diff.append(df['Team_2_pt_diff'].iloc[i])
69 | # lesser_diff.append(df['Team_1_pt_diff'].iloc[i])
70 | # lesser_var.append(df['Team 1 Var'].iloc[i])
71 | plt.bar('greater_diff',mean(greater_diff))
72 | plt.bar('lesser_diff',mean(lesser_diff))
73 | print(ttest_ind(greater_diff,lesser_diff))
74 | # greater_diff_df = DataFrame({'Team 1 Var': greater_var,'Team_1_pt_diff': greater_diff})
75 | # lesser_diff_df = DataFrame({'Team 2 Var': lesser_var,'Team_2_pt_diff': lesser_diff})
76 | # regplot(data=greater_diff_df,x='Team 1 Var',y='Team_1_pt_diff',scatter=True,fit_reg=True,label='greater')
77 | # regplot(data=lesser_diff_df,x='Team 2 Var',y='Team_2_pt_diff',scatter=True,fit_reg=True,label='lesser')
78 | # plt.legend()
79 | # print(pearsonr(greater_diff_df['Team_1_pt_diff'],greater_diff_df['Team 1 Var']))
80 | # print(pearsonr(lesser_diff_df['Team_2_pt_diff'],lesser_diff_df['Team 2 Var']))
81 | plt.show()
82 | def main():
83 | df = get_data('test_acc_regression.csv')
84 | basic_stats(df)
85 | if __name__ == "__main__":
86 | main()
--------------------------------------------------------------------------------
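A minimal smoke test for `basic_stats`, using hypothetical rows with the column names the function reads; the real input, `test_acc_regression.csv`, is produced elsewhere and is not included in this dump:

```python
# Run from the extra/ directory. The two rows are made up for illustration.
import pandas as pd
from analyze_output import basic_stats

df = pd.DataFrame({
    'Team 1 Score': [70, 65], 'Team 1 Score Pred': [68, 70],
    'Team 2 Score': [60, 72], 'Team 2 Score Pred': [63, 71],
    'Correct Median': [1, 0],
    'Team 1 Var': [1.2, 0.8], 'Team 2 Var': [0.9, 1.1],
})
basic_stats(df)  # prints a t-test result and shows a two-bar plot
```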
/teams_sports_ref_format.csv:
--------------------------------------------------------------------------------
1 | teams
2 | abilene-christian
3 | air-force
4 | akron
5 | alabama
6 | alabama-am
7 | alabama-state
8 | albany-ny
9 | alcorn-state
10 | american
11 | appalachian-state
12 | arizona
13 | arizona-state
14 | arkansas
15 | arkansas-state
16 | arkansas-pine-bluff
17 | army
18 | auburn
19 | austin-peay
20 | ball-state
21 | baylor
22 | belmont
23 | bethune-cookman
24 | binghamton
25 | boise-state
26 | boston-college
27 | boston-university
28 | bowling-green-state
29 | bradley
30 | brigham-young
31 | brown
32 | bryant
33 | bucknell
34 | buffalo
35 | butler
36 | cal-poly
37 | cal-state-bakersfield
38 | cal-state-fullerton
39 | cal-state-northridge
40 | california
41 | campbell
42 | canisius
43 | central-arkansas
44 | central-connecticut-state
45 | central-florida
46 | central-michigan
47 | charleston-southern
48 | charlotte
49 | chattanooga
50 | chicago-state
51 | cincinnati
52 | clemson
53 | cleveland-state
54 | coastal-carolina
55 | colgate
56 | college-of-charleston
57 | colorado
58 | colorado-state
59 | columbia
60 | connecticut
61 | coppin-state
62 | cornell
63 | creighton
64 | dartmouth
65 | davidson
66 | dayton
67 | delaware
68 | delaware-state
69 | denver
70 | depaul
71 | detroit-mercy
72 | drake
73 | drexel
74 | duke
75 | duquesne
76 | east-carolina
77 | east-tennessee-state
78 | eastern-illinois
79 | eastern-kentucky
80 | eastern-michigan
81 | eastern-washington
82 | elon
83 | evansville
84 | fairfield
85 | fairleigh-dickinson
86 | florida
87 | florida-am
88 | florida-atlantic
89 | florida-gulf-coast
90 | florida-international
91 | florida-state
92 | fordham
93 | fresno-state
94 | furman
95 | gardner-webb
96 | george-mason
97 | george-washington
98 | georgetown
99 | georgia
100 | georgia-southern
101 | georgia-state
102 | georgia-tech
103 | gonzaga
104 | grambling
105 | grand-canyon
106 | green-bay
107 | hampton
108 | hartford
109 | harvard
110 | hawaii
111 | high-point
112 | hofstra
113 | holy-cross
114 | houston
115 | houston-baptist
116 | howard
117 | idaho
118 | idaho-state
119 | illinois
120 | illinois-state
121 | illinois-chicago
122 | indiana
123 | indiana-state
124 | iona
125 | iowa
126 | iowa-state
127 | iupui
128 | jackson-state
129 | jacksonville
130 | jacksonville-state
131 | james-madison
132 | kansas
133 | missouri-kansas-city
134 | kansas-state
135 | kennesaw-state
136 | kent-state
137 | kentucky
138 | la-salle
139 | lafayette
140 | lamar
141 | lehigh
142 | liberty
143 | lipscomb
144 | arkansas-little-rock
145 | long-beach-state
146 | long-island-university
147 | longwood
148 | louisiana-lafayette
149 | louisiana-state
150 | louisiana-tech
151 | louisiana-monroe
152 | louisville
153 | loyola-il
154 | loyola-md
155 | loyola-marymount
156 | maine
157 | manhattan
158 | marist
159 | marquette
160 | marshall
161 | maryland
162 | maryland-baltimore-county
163 | maryland-eastern-shore
164 | massachusetts
165 | massachusetts-lowell
166 | mcneese-state
167 | memphis
168 | mercer
169 | miami-fl
170 | miami-oh
171 | michigan
172 | michigan-state
173 | middle-tennessee
174 | milwaukee
175 | minnesota
176 | mississippi
177 | mississippi-state
178 | mississippi-valley-state
179 | missouri
180 | missouri-state
181 | monmouth
182 | montana
183 | montana-state
184 | morehead-state
185 | morgan-state
186 | mount-st-marys
187 | murray-state
188 | navy
189 | north-carolina-state
190 | nebraska
191 | nevada
192 | nevada-las-vegas
193 | new-hampshire
194 | new-mexico
195 | new-mexico-state
196 | new-orleans
197 | niagara
198 | nicholls-state
199 | njit
200 | norfolk-state
201 | north-carolina
202 | north-carolina-at
203 | north-carolina-central
204 | north-dakota
205 | north-dakota-state
206 | north-florida
207 | north-texas
208 | northeastern
209 | northern-arizona
210 | northern-colorado
211 | northern-illinois
212 | northern-iowa
213 | northwestern
214 | northwestern-state
215 | notre-dame
216 | oakland
217 | ohio
218 | ohio-state
219 | oklahoma
220 | oklahoma-state
221 | old-dominion
222 | nebraska-omaha
223 | oral-roberts
224 | oregon
225 | oregon-state
226 | pacific
227 | penn-state
228 | pennsylvania
229 | pepperdine
230 | pittsburgh
231 | portland
232 | portland-state
233 | prairie-view
234 | presbyterian
235 | princeton
236 | providence
237 | purdue
238 | ipfw
239 | quinnipiac
240 | radford
241 | rhode-island
242 | rice
243 | richmond
244 | rider
245 | robert-morris
246 | rutgers
247 | sacramento-state
248 | sacred-heart
249 | saint-francis-pa
250 | saint-josephs
251 | saint-louis
252 | saint-marys-ca
253 | saint-peters
254 | sam-houston-state
255 | samford
256 | san-diego
257 | san-diego-state
258 | san-francisco
259 | san-jose-state
260 | santa-clara
261 | seattle
262 | seton-hall
263 | siena
264 | south-alabama
265 | south-carolina
266 | south-carolina-state
267 | south-carolina-upstate
268 | south-dakota
269 | south-dakota-state
270 | south-florida
271 | southeast-missouri-state
272 | southeastern-louisiana
273 | southern
274 | southern-california
275 | southern-illinois
276 | southern-illinois-edwardsville
277 | southern-methodist
278 | southern-mississippi
279 | southern-utah
280 | st-bonaventure
281 | st-francis-ny
282 | st-johns-ny
283 | stanford
284 | stephen-f-austin
285 | stetson
286 | stony-brook
287 | syracuse
288 | texas-christian
289 | temple
290 | tennessee
291 | tennessee-state
292 | tennessee-tech
293 | tennessee-martin
294 | texas
295 | texas-am
296 | texas-am-corpus-christi
297 | texas-southern
298 | texas-state
299 | texas-tech
300 | texas-pan-american
301 | citadel
302 | toledo
303 | towson
304 | troy
305 | tulane
306 | tulsa
307 | alabama-birmingham
308 | california-davis
309 | california-irvine
310 | california-riverside
311 | california-santa-barbara
312 | ucla
313 | north-carolina-asheville
314 | north-carolina-greensboro
315 | north-carolina-wilmington
316 | texas-arlington
317 | utah
318 | utah-state
319 | utah-valley
320 | texas-el-paso
321 | texas-san-antonio
322 | valparaiso
323 | vanderbilt
324 | vermont
325 | villanova
326 | virginia
327 | virginia-commonwealth
328 | virginia-military-institute
329 | virginia-tech
330 | wagner
331 | wake-forest
332 | washington
333 | washington-state
334 | weber-state
335 | west-virginia
336 | western-carolina
337 | western-illinois
338 | western-kentucky
339 | western-michigan
340 | wichita-state
341 | william-mary
342 | winthrop
343 | wisconsin
344 | wofford
345 | wright-state
346 | wyoming
347 | xavier
348 | yale
349 | youngstown-state
350 |
--------------------------------------------------------------------------------
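These slugs follow the sports-reference.com URL naming scheme; they are the match targets both for `extra/exact_match.py` above and for the `get_close_matches` spell-check applied to user input in the prediction code below.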
/all_teams_cbb.csv:
--------------------------------------------------------------------------------
1 | School,From,To
2 | Abilene Christian,1971,2024
3 | Air Force,1958,2024
4 | Akron,1902,2024
5 | Alabama,1913,2024
6 | Alabama A&M,2000,2024
7 | Alabama State,1983,2024
8 | Albany (NY),2000,2024
9 | Alcorn State,1978,2024
10 | Allegheny Gators,1896,1916
11 | American,1967,2024
12 | Amherst Lord Jeffs,1901,1902
13 | Appalachian State,1974,2024
14 | Arizona,1905,2024
15 | Arizona State,1912,2024
16 | Arkansas,1924,2024
17 | Arkansas State,1971,2024
18 | Arkansas-Pine Bluff,1999,2024
19 | Armstrong Pirates,1987,1987
20 | Army,1903,2024
21 | Auburn,1906,2024
22 | Augusta Jaguars,1985,1991
23 | Augustana (IL) Vikings,1902,1917
24 | Austin Peay,1964,2024
25 | Baker University Wildcats,1903,1908
26 | Baldwin-Wallace Yellow Jackets,1948,1953
27 | Ball State,1972,2024
28 | Baltimore Super Bees,1979,1983
29 | Baylor,1907,2024
30 | Bellarmine,2021,2024
31 | Belmont,2000,2024
32 | Beloit Buccaneers,1911,1924
33 | Bethune-Cookman,1981,2024
34 | Binghamton,2002,2024
35 | Birmingham-Southern Panthers,1920,2006
36 | Bloomsburg Huskies,1896,1911
37 | Boise State,1972,2024
38 | Boston College,1946,2024
39 | Boston University,1916,2024
40 | Bowling Green State,1916,2024
41 | Bradley,1903,2024
42 | Brigham Young,1903,2024
43 | Brigham Young College,1908,1908
44 | Brooklyn Bulldogs,1934,1992
45 | Brown,1901,2024
46 | Bryant,2011,2024
47 | Bucknell,1896,2024
48 | Buffalo,1907,2024
49 | Butler,1897,2024
50 | Cal Poly,1995,2024
51 | Cal State Bakersfield,2011,2024
52 | Cal State Fullerton,1975,2024
53 | Cal State Los Angeles Golden Eagles,1971,1975
54 | Cal State Northridge,1991,2024
55 | California,1908,2024
56 | California Baptist,2019,2024
57 | Campbell,1978,2024
58 | Canisius,1904,2024
59 | Canterbury College,1931,1931
60 | Carleton College Knights,1910,1934
61 | Carnegie Mellon Tartans,1933,1939
62 | Case Western Reserve Spartans,1898,1955
63 | Catholic Cardinals,1913,1981
64 | Centenary (LA) Gents,1960,2011
65 | Central Arkansas,2011,2024
66 | Central Connecticut State,1987,2024
67 | Central Michigan,1974,2024
68 | Central Missouri Mules,1913,1937
69 | Central Pennsylvania College Knights,1896,1900
70 | Centre (KY) Colonels,1910,1919
71 | Charleston Southern,1975,2024
72 | Charlotte,1973,2024
73 | Chattanooga,1978,2024
74 | Cheyenne Business College,1903,1903
75 | Chicago Maroons,1896,1946
76 | Chicago State,1985,2024
77 | Cincinnati,1902,2024
78 | City College of New York Beavers,1906,1953
79 | Clemson,1912,2024
80 | Cleveland State,1973,2024
81 | Coastal Carolina,1987,2024
82 | Colgate,1901,2024
83 | College of Charleston,1992,2024
84 | College of New Jersey Lions,1900,1900
85 | Colorado,1902,2024
86 | Colorado College Tigers,1915,1937
87 | Colorado School of Mines Orediggers,1908,1937
88 | Colorado State,1902,2024
89 | Columbia,1901,2024
90 | Concordia Seminary Preachers,1907,1923
91 | Connecticut,1901,2024
92 | Coppin State,1986,2024
93 | Cornell,1899,2024
94 | Cotner College,1910,1911
95 | Creighton,1912,2024
96 | Cumberland,1904,1904
97 | Dakota Wesleyan Tigers,1932,1932
98 | Dartmouth,1900,2024
99 | Davidson,1909,2024
100 | Dayton,1904,2024
101 | Delaware,1906,2024
102 | Delaware State,1974,2024
103 | Denison Big Red,1905,1944
104 | Denver,1904,2024
105 | DePaul,1924,2024
106 | DePauw Tigers,1916,1932
107 | Detroit Mercy,1910,2024
108 | Dickinson College Red Devils,1926,1947
109 | Drake,1907,2024
110 | Drexel,1895,2024
111 | Duke,1906,2024
112 | Duquesne,1914,2024
113 | East Carolina,1967,2024
114 | East Central Tigers,1929,1931
115 | East Tennessee State,1959,2024
116 | Eastern Illinois,1982,2024
117 | Eastern Kentucky,1948,2024
118 | Eastern Michigan,1933,2024
119 | Eastern Washington,1984,2024
120 | Elon,2000,2024
121 | Emporia State Hornets,1934,1934
122 | Ensign College,1903,1903
123 | Evansville,1925,2024
124 | Fairfield,1965,2024
125 | FDU,1968,2024
126 | Florida,1921,2024
127 | Florida A&M,1980,2024
128 | Florida Atlantic,1994,2024
129 | Florida Gulf Coast,2011,2024
130 | Florida International,1988,2024
131 | Florida State,1957,2024
132 | Fordham,1903,2024
133 | Franklin Grizzlies,1907,1925
134 | Fresno State,1956,2024
135 | Furman,1920,2024
136 | Gardner-Webb,2003,2024
137 | Geneva Golden Tornadoes,1893,1943
138 | George Mason,1979,2024
139 | George Washington,1913,2024
140 | Georgetown,1907,2024
141 | Georgia,1906,2024
142 | Georgia Southern,1972,2024
143 | Georgia State,1974,2024
144 | Georgia Tech,1920,2024
145 | Gettysburg Bullets,1901,1973
146 | Gonzaga,1944,2024
147 | Grambling,1978,2024
148 | Grand Canyon,2014,2024
149 | Green Bay,1982,2024
150 | Grinnell Pioneers,1901,1939
151 | Grove City Wolverines,1899,1925
152 | Hamline Pipers,1945,1948
153 | Hampton,1996,2024
154 | Hardin-Simmons Cowboys,1923,1990
155 | Hartford Hawks,1985,2023
156 | Harvard,1901,2024
157 | Haskell (KS) Fighting Indians,1903,1908
158 | Hawaii,1971,2024
159 | High Point,2000,2024
160 | Hiram Terriers,1894,1904
161 | Hofstra,1943,2024
162 | Holy Cross,1901,2024
163 | Hope Flying Dutchmen,1908,1913
164 | Houston,1951,2024
165 | Houston Christian,1974,2024
166 | Howard,1974,2024
167 | Idaho,1906,2024
168 | Idaho State,1959,2024
169 | Illinois,1906,2024
170 | Illinois State,1899,2024
171 | Illinois Wesleyan Titans,1928,1928
172 | Illinois-Chicago,1982,2024
173 | Incarnate Word,2014,2024
174 | Indiana,1901,2024
175 | Indiana State,1900,2024
176 | Iona,1954,2024
177 | Iowa,1893,2024
178 | Iowa State,1908,2024
179 | IUPUI,1999,2024
180 | Jackson State,1978,2024
181 | Jacksonville,1967,2024
182 | Jacksonville State,1996,2024
183 | James Madison,1977,2024
184 | John Carroll Blue Streaks,1948,1955
185 | Kalamazoo Hornets,1908,1923
186 | Kansas,1899,2024
187 | Missouri Kansas City,1990,2024
188 | Kansas State,1906,2024
189 | Kennesaw State,2010,2024
190 | Kent State,1914,2024
191 | Kentucky,1903,2024
192 | Kentucky Wesleyan Panthers,1957,1958
193 | La Salle,1932,2024
194 | Lafayette,1901,2024
195 | Lake Forest Foresters,1905,1916
196 | Lamar,1970,2024
197 | Lawrence Tech,1948,1948
198 | Le Moyne,2024,2024
199 | Lehigh,1902,2024
200 | Lewis Flyers,1905,1905
201 | Liberty,1989,2024
202 | Lindenwood,2023,2024
203 | Lipscomb,2004,2024
204 | Arkansas Little Rock,1979,2024
205 | Long Beach State,1970,2024
206 | Long Island University,1929,2024
207 | Longwood,2008,2024
208 | Louisiana Lafayette,1972,2024
209 | Louisiana State,1909,2024
210 | Louisiana Tech,1974,2024
211 | Louisiana-Monroe,1974,2024
212 | Louisville,1912,2024
213 | Loyola (IL),1921,2024
214 | Loyola (LA) Wolfpack,1952,1972
215 | Loyola (MD),1908,2024
216 | Loyola Marymount,1943,2024
217 | Macalester Scots,1896,1899
218 | Maine,1904,2024
219 | Manchester Spartans,1926,1926
220 | Manhattan,1905,2024
221 | Marietta Pioneers,1908,1920
222 | Marist,1982,2024
223 | Marquette,1917,2024
224 | Marshall,1919,2024
225 | Maryland,1924,2024
226 | Maryland-Baltimore County,1987,2024
227 | Maryland-Eastern Shore,1974,2024
228 | Massachusetts,1926,2024
229 | Massachusetts Institute of Technology Engineers,1909,1909
230 | Massachusetts-Lowell,1906,2024
231 | McNeese State,1974,2024
232 | Memphis,1956,2024
233 | Mercer,1974,2024
234 | Merchant Marine Mariners,1946,1947
235 | Merrimack,2020,2024
236 | Miami (FL),1949,2024
237 | Miami (OH),1906,2024
238 | Michigan,1918,2024
239 | Michigan State,1899,2024
240 | Middle Tennessee,1959,2024
241 | Millikin Big Blue,1910,1921
242 | Millsaps Majors,1911,1921
243 | Milwaukee,1974,2024
244 | Minnesota,1896,2024
245 | Minnesota A&M Aggies,1896,1903
246 | Mississippi,1909,2024
247 | Mississippi State,1909,2024
248 | Mississippi Valley State,1980,2024
249 | Missouri,1907,2024
250 | Missouri State,1983,2024
251 | Monmouth,1984,2024
252 | Montana,1912,2024
253 | Montana State,1902,2024
254 | Morehead State,1956,2024
255 | Morgan State,1985,2024
256 | Morris Brown Wolverines,2002,2003
257 | Mount St. Mary's,1989,2024
258 | Mount Union Purple Raiders,1896,1932
259 | Muhlenberg Mules,1901,1963
260 | Murray State,1954,2024
261 | Muskingum Fighting Muskies,1905,1927
262 | Navy,1908,2024
263 | North Carolina State,1913,2024
264 | Nebraska,1897,2024
265 | Nebraska Wesleyan Prairie Wolves,1906,1917
266 | Nevada,1913,2024
267 | Nevada-Las Vegas,1970,2024
268 | New Hampshire,1927,2024
269 | New Mexico,1900,2024
270 | New Mexico State,1905,2024
271 | New Orleans,1976,2024
272 | New York University Violets,1907,1971
273 | Newberry Wolves,1921,1921
274 | Niagara,1906,2024
275 | Nicholls State,1981,2024
276 | NJIT,2010,2024
277 | Norfolk State,1998,2024
278 | North Alabama,2019,2024
279 | North Carolina,1911,2024
280 | North Carolina A&T,1974,2024
281 | North Carolina Central,2011,2024
282 | North Central Cardinals,1911,1922
283 | North Dakota,1905,2024
284 | North Dakota State,1898,2024
285 | North Florida,2010,2024
286 | North Texas,1922,2024
287 | Northeastern,1938,2024
288 | Northeastern Illinois Golden Eagles,1991,1998
289 | Northern Arizona,1919,2024
290 | Northern Colorado,1911,2024
291 | Northern Illinois,1927,2024
292 | Northern Iowa,1981,2024
293 | Northern Kentucky,2013,2024
294 | Northwest Missouri State Bearcats,1930,1932
295 | Northwestern,1905,2024
296 | Northwestern State,1977,2024
297 | Notre Dame,1897,2024
298 | Oakland,2000,2024
299 | Oberlin Yeomen,1905,1921
300 | Ohio,1908,2024
301 | Ohio State,1899,2024
302 | Ohio Wesleyan Battling Bishops,1929,1935
303 | Oklahoma,1908,2024
304 | Oklahoma City Chiefs,1951,1985
305 | Oklahoma State,1908,2024
306 | Old Dominion,1977,2024
307 | Nebraska Omaha,2013,2024
308 | Oral Roberts,1972,2024
309 | Oregon,1903,2024
310 | Oregon State,1902,2024
311 | Pacific,1938,2024
312 | Penn State,1897,2024
313 | Pennsylvania,1897,2024
314 | Pepperdine,1944,2024
315 | Phillips Haymakers,1920,1920
316 | Pittsburg State Gorillas,1927,1931
317 | Pittsburgh,1906,2024
318 | Portland,1954,2024
319 | Portland State,1973,2024
320 | Prairie View,1981,2024
321 | Pratt Institute Cannoneers,1934,1934
322 | Presbyterian,2011,2024
323 | Princeton,1901,2024
324 | Providence,1929,2024
325 | Purdue,1897,2024
326 | Purdue Fort Wayne,2003,2024
327 | Queens (NC),2023,2024
328 | Quinnipiac,1999,2024
329 | Radford,1985,2024
330 | Regis (CO) Rangers,1962,1964
331 | Rensselaer Engineers,1901,1924
332 | Rhode Island,1904,2024
333 | Rice,1915,2024
334 | Richmond,1913,2024
335 | Rider,1929,2024
336 | Ripon Red Hawks,1902,1922
337 | Roanoke Maroons,1912,1919
338 | Robert Morris,1977,2024
339 | Rochester (NY) Yellowjackets,1910,1944
340 | Rose-Hulman Fightin' Engineers,1898,1898
341 | Rutgers,1914,2024
342 | Sacramento State,1992,2024
343 | Sacred Heart,2000,2024
344 | Saint Francis (PA),1956,2024
345 | Saint Joseph's,1910,2024
346 | Saint Louis,1916,2024
347 | Saint Mary's (CA),1910,2024
348 | Saint Peter's,1966,2024
349 | Sam Houston,1987,2024
350 | Samford,1973,2024
351 | San Diego,1980,2024
352 | San Diego State,1971,2024
353 | San Francisco,1924,2024
354 | San Jose State,1938,2024
355 | Santa Clara,1909,2024
356 | Savage School of Physical Education,1896,1898
357 | Savannah State Tigers,2003,2019
358 | Scranton Royals,1948,1948
359 | Seattle,1953,2024
360 | Seton Hall,1909,2024
361 | Sewanee Tigers,1923,1941
362 | Siena,1939,2024
363 | South Alabama,1972,2024
364 | South Carolina,1909,2024
365 | South Carolina State,1972,2024
366 | South Carolina Upstate,2011,2024
367 | South Dakota,2011,2024
368 | South Dakota State,2009,2024
369 | South Florida,1974,2024
370 | Southeast Missouri State,1992,2024
371 | Southeastern Louisiana,1981,2024
372 | Southern,1978,2024
373 | Southern California,1907,2024
374 | Southern Illinois,1968,2024
375 | Southern Illinois-Edwardsville,2011,2024
376 | Southern Indiana,2023,2024
377 | Southern Methodist,1917,2024
378 | Southern Mississippi,1973,2024
379 | Southern Utah,1989,2024
380 | Southwestern (KS) Moundbuilders,1905,1923
381 | Southwestern (TX) Pirates,1915,1916
382 | Springfield Pride,1897,1935
383 | St. Bonaventure,1920,2024
384 | St. Francis (NY) Terriers,1902,2023
385 | St. John's (NY),1908,2024
386 | St. John's College (OH),1921,1921
387 | St. Lawrence Saints,1902,1914
388 | St. Thomas,2022,2024
389 | Stanford,1914,2024
390 | Stephen F. Austin,1987,2024
391 | Stetson,1972,2024
392 | Stevens Institute Ducks,1917,1920
393 | Stonehill,2023,2024
394 | Stony Brook,2000,2024
395 | SUNY-Potsdam Bears,1910,1913
396 | Swarthmore Garnet,1906,1919
397 | Syracuse,1901,2024
398 | Tarleton State,2021,2024
399 | TCU,1914,2024
400 | Temple,1895,2024
401 | Tennessee,1909,2024
402 | Tennessee State,1978,2024
403 | Tennessee Tech,1944,2024
404 | Tennessee-Martin,1993,2024
405 | Texas,1906,2024
406 | Texas A&M,1913,2024
407 | Texas A&M-Commerce,2023,2024
408 | Texas A&M-Corpus Christi,2003,2024
409 | Texas Southern,1978,2024
410 | Texas State,1985,2024
411 | Texas Tech,1926,2024
412 | Texas Wesleyan Rams,1948,1948
413 | Texas-Rio Grande Valley,1969,2024
414 | The Citadel,1913,2024
415 | Toledo,1916,2024
416 | Towson,1980,2024
417 | Trinity (CT) Bantams,1897,1911
418 | Trinity (TX) Tigers,1971,1973
419 | Troy,1994,2024
420 | Tulane,1906,2024
421 | Tulsa,1914,2024
422 | U.S. International Gulls,1982,1991
423 | Alabama Birmingham,1980,2024
424 | California Davis,2008,2024
425 | California Irvine,1978,2024
426 | California Riverside,2002,2024
427 | California San Diego,2021,2024
428 | California Santa Barbara,1964,2024
429 | Central Florida,1985,2024
430 | UCLA,1920,2024
431 | North Carolina Asheville,1987,2024
432 | North Carolina Greensboro,1992,2024
433 | North Carolina Wilmington,1977,2024
434 | Union (NY) Dutchmen,1907,1925
435 | Texas Arlington,1969,2024
436 | Utah,1909,2024
437 | Utah State,1904,2024
438 | Utah Tech,2021,2024
439 | Utah Valley,2010,2024
440 | University Texas El Paso,1923,2024
441 | Utica Pioneers,1982,1987
442 | University Texas San Antonio,1982,2024
443 | Valparaiso,1918,2024
444 | Vanderbilt,1901,2024
445 | Vermont,1921,2024
446 | Villanova,1921,2024
447 | Virginia,1906,2024
448 | Virginia Commonwealth,1974,2024
449 | Virginia Military Institute,1909,2024
450 | Virginia Tech,1909,2024
451 | Wabash Little Giants,1897,1925
452 | Wagner,1966,2024
453 | Wake Forest,1906,2024
454 | Washburn Ichabods,1906,1941
455 | Washington,1896,2024
456 | Washington & Jefferson Presidents,1913,1944
457 | Washington & Lee Generals,1907,1959
458 | Washington (MO) Bears,1905,1960
459 | Washington College Shoremen,1913,1925
460 | Washington State,1902,2024
461 | Wayne State (MI) Warriors,1928,1950
462 | Weber State,1964,2024
463 | Wesleyan (CT) Cardinals,1896,1913
464 | West Chester Golden Rams,1899,1982
465 | West Texas A&M Buffaloes,1921,1986
466 | West Virginia,1904,2024
467 | Western Carolina,1977,2024
468 | Western Colorado Mountaineers,1924,1937
469 | Western Illinois,1982,2024
470 | Western Kentucky,1922,2024
471 | Western Michigan,1914,2024
472 | Westminster (MO) Blue Jays,1920,1920
473 | Westminster (PA) Titans,1898,1935
474 | Wheaton (IL) Thunder,1902,1905
475 | Whittier Poets,1909,1915
476 | Wichita State,1906,2024
477 | Widener Pride,1899,1909
478 | William Mary,1906,2024
479 | Williams Ephs,1901,1911
480 | Winthrop,1987,2024
481 | Wisconsin,1899,2024
482 | Wisconsin-Stevens Point Pointers,1898,1918
483 | Wisconsin-Superior Yellowjackets,1900,1901
484 | Wittenberg Tigers,1931,1931
485 | Wofford,1996,2024
486 | Wooster Fighting Scots,1901,1931
487 | WPI Engineers,1920,1920
488 | Wright State,1988,2024
489 | Wyoming,1905,2024
490 | Xavier,1920,2024
491 | Yale,1896,2024
492 | Youngstown State,1948,2024
493 |
--------------------------------------------------------------------------------
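The `From`/`To` columns are what `get_teams_year` in cbb_web_scraper.py (below) filters on when deciding which schools to scrape for a given span of seasons, after normalizing each display name to its sports-reference slug.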
/cbb_web_scraper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | html parse code - college basketball
5 | @author: brianszekely
6 | """
7 | import requests
8 | from bs4 import BeautifulSoup
9 | from pandas import DataFrame
10 | from numpy import nan
11 | from time import sleep
12 | from os.path import join, exists
13 | from os import getcwd
14 | from urllib import request
15 | from urllib.request import Request, urlopen
16 | from pandas import read_csv
17 | from numpy import where
18 | from re import search
19 | from difflib import get_close_matches
20 | from datetime import datetime
22 | #TODO: CREATE A FEATURE OF opp_simple_rating_system
23 |
24 | def get_teams_year(year_min,year_max):
25 | #Try to redo this when 429 is not an issue
26 | # URL = 'https://www.sports-reference.com/cbb/schools/'
27 | # hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
28 | # req = Request(URL,headers=hdr)
29 | # html = request.urlopen(req)
30 | # soup = BeautifulSoup(html, "html.parser")
31 | # table = soup.find(class_="table_container is_setup")
32 | # print(soup)
33 | # input()
34 | #Read in from csv
35 | teams_save = []
36 | teams = read_csv('all_teams_cbb.csv')
37 | teams_with_year = where((teams['From'] <= year_min) & (teams['To'] == year_max))[0]
38 |     for team in teams['School'].iloc[teams_with_year]:
39 |         teams_save.append(alter_string(team)) #normalize the display name to its sports-reference slug (defined below)
52 | return teams_save
53 |
54 | def alter_string(team):
55 | team = team.replace(' ', '-').lower()
56 | if '.' in team:
57 | team = team.replace(".", "")
58 | if 'the' in team:
59 | team = team.replace("the-", "")
60 | if '&' in team:
61 | team = team.replace("&", "")
62 | if '(' in team and ')' in team:
63 | team = team.replace("(", "")
64 | team = team.replace(")", "")
65 | if "'" in team:
66 | team = team.replace("'", "")
67 | return team
68 | def get_latest_srs(team):
69 | sleep(4)
70 | url_srs = f'https://www.sports-reference.com/cbb/schools/{team}/men/2024-schedule.html'
71 | hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
72 | req_1 = Request(url_srs,headers=hdr)
73 | html_1 = request.urlopen(req_1)
74 | soup_3 = BeautifulSoup(html_1, "html.parser")
75 | table3 = soup_3.find(id='div_schedule')
76 | tbody2 = table3.find('tbody')
77 | tr_body2 = tbody2.find_all('tr')
78 | srs = []
79 | for trb in tr_body2:
80 | for td in trb.find_all('td'):
81 | if td.get('data-stat') == "srs":
82 | srs.append(td.get_text())
83 | return float(srs[-1])
84 |
85 | def get_adv_opp_variables(team,parsed_date):
86 | date_without_time = parsed_date.strftime('%Y-%m-%d')
87 | sleep(3)
88 | url ='https://www.sports-reference.com/cbb/schools/' + team + '/' + str(2024) + '-gamelogs-advanced.html'
89 | hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
90 | req_1 = Request(url,headers=hdr)
91 | html_1 = request.urlopen(req_1)
92 | soup_2 = BeautifulSoup(html_1, "html.parser")
93 | table2 = soup_2.find(id="all_sgl-advanced")
94 | tbody2 = table2.find('tbody')
95 | tr_body2 = tbody2.find_all('tr')
96 | # off_rtg, def_rtg = [], []
97 | efg_pct = None
98 | print(f'team they played: {team}')
99 | for trb in tr_body2:
100 | for td in trb.find_all('td'):
101 | if td.get('data-stat') == 'date':
102 | if td.get_text() == date_without_time:
103 | continue
104 | else:
105 | break
106 | if td.get('data-stat') == "efg_pct":
107 | efg_pct = td.get_text()
108 | return efg_pct
109 |
110 | def html_to_df_web_scrape_cbb(URL,URL1,team,year):
111 | #URL = Basic data ; URL1 = Advanced stats
112 | url_srs = f'https://www.sports-reference.com/cbb/schools/{team}/men/{year}-schedule.html'
113 | hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
114 | req_1 = Request(URL,headers=hdr)
115 | html_1 = request.urlopen(req_1)
116 | sleep(4)
117 | req_2 = Request(URL1,headers=hdr)
118 | html_2 = request.urlopen(req_2)
119 | sleep(4)
120 | req_3 = Request(url_srs,headers=hdr)
121 | html_3 = request.urlopen(req_3)
122 |     # while True:
123 | # try:
124 | soup_1 = BeautifulSoup(html_1, "html.parser")
125 | soup_2 = BeautifulSoup(html_2, "html.parser")
126 | soup_3 = BeautifulSoup(html_3, "html.parser")
127 | # page = requests.get(URL)
128 | # soup = BeautifulSoup(page.content, "html.parser")
129 | # page1 = requests.get(URL1)
130 | # soup1 = BeautifulSoup(page1.content, "html.parser")
131 | # break
132 | # except:
133 | # print('HTTPSConnectionPool(host="www.sports-reference.com", port=443): Max retries exceeded. Retry in 10 seconds')
134 | # sleep(10)
135 | # table = soup_1.find(id="all_sgl-basic")
136 | table = soup_1.select_one('table[id^="sgl-basic"]')
137 | table1 = soup_2.find(id="all_sgl-advanced")
138 | table3 = soup_3.find(id='div_schedule')
139 | tbody = table.find('tbody')
140 | tbody1 = table1.find('tbody')
141 | tbody2 = table3.find('tbody')
142 | tr_body = tbody.find_all('tr')
143 | tr_body1 = tbody1.find_all('tr')
144 | tr_body2 = tbody2.find_all('tr')
145 | # game_season = []
146 | # date_game = []
147 | # game_location = []
148 | # opp_id= []
149 | # BASIC STATS
150 | game_result= []
151 | pts= []
152 | opp_pts= []
153 | fg= []
154 | fga= []
155 | fg_pct= []
156 | fg3= []
157 | fg3a= []
158 | fg3_pct= []
159 | ft= []
160 | fta= []
161 | ft_pct= []
162 | orb= []
163 | total_board= []
164 | ast= []
165 | stl= []
166 | blk= []
167 | tov= []
168 | pf= []
169 | opp_fg = []
170 | opp_fga= []
171 | opp_fg_pct= []
172 | opp_fg3= []
173 | opp_fg3a= []
174 | opp_fg3_pct= []
175 | opp_ft= []
176 | opp_fta= []
177 | opp_ft_pct= []
178 | opp_orb= []
179 | opp_trb= []
180 | opp_ast= []
181 | opp_stl= []
182 | opp_blk= []
183 | opp_tov= []
184 | opp_pf= []
185 | game_loc = []
186 | srs = []
187 | date_save = []
188 | efg_percent_opp = []
189 | # opp_srs = []
190 | #SIMPLE RATING SYSTEM
191 | # teams_sports_ref = read_csv('teams_sports_ref_format.csv')
192 | for trb in tr_body2:
193 | for td in trb.find_all('td'):
194 | # if td.get('data-stat') == 'opp_name':
195 | # get_close_matches(td.get_text(),teams_sports_ref['teams'].tolist(),n=1)[0]
196 | # print(td.get_text())
197 | if td.get('data-stat') == "srs":
198 | if td.get_text() == '':
199 | srs.append(nan)
200 | else:
201 | srs.append(td.get_text())
202 | #SIMPLE RATING SYSTEM - OPPONENT ?
203 | #BASIC STATS - change td.get_text() to float(td.get_text()) ?
204 | for trb in tr_body:
205 | for td in trb.find_all('td'):
206 | if td.get('data-stat') == "game_location":
207 | #home = 0, away = 1, N = 2
208 | if td.get_text() == 'N':
209 | game_loc.append(2)
210 | elif td.get_text() == '@':
211 | game_loc.append(1)
212 | elif td.get_text() == '':
213 | game_loc.append(0)
214 | if td.get('data-stat') == "game_result":
215 | if 'W' in td.get_text():
216 | game_result.append(1)
217 | else:
218 | game_result.append(0)
219 | if td.get('data-stat') == "date":
220 | parsed_date = datetime.strptime(td.get_text(), '%Y-%m-%d')
221 | month = parsed_date.month
222 | day = parsed_date.day
223 | date_save.append(float(f'{month}.{day}'))
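#NOTE: this float encoding is lossy; e.g., day 1 and day 10 of the same month both become x.1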
224 | #TODO: FIX THIS IN THE FUTURE TO ADD OPPONENT VARIABLES
225 | # if td.get('data-stat') == "opp_team_id":
226 | # opp_name = alter_string(td.get_text())
227 | # try:
228 | # efg_percent_opp.append(get_adv_opp_variables(opp_name,parsed_date))
229 | # except:
230 | # print(f'no advanced data for {opp_name}, advanced opponent variables are None')
231 | # efg_percent_opp.append(nan)
232 | if td.get('data-stat') == "pts":
233 | pts.append(td.get_text())
234 | if td.get('data-stat') == "opp_pts":
235 | opp_pts.append(td.get_text())
236 | if td.get('data-stat') == "fg":
237 | fg.append(td.get_text())
238 | if td.get('data-stat') == "fga":
239 | fga.append(td.get_text())
240 | if td.get('data-stat') == "fg_pct":
241 | fg_pct.append(td.get_text())
242 | if td.get('data-stat') == "fg3":
243 | fg3.append(td.get_text())
244 | if td.get('data-stat') == "fg3a":
245 | fg3a.append(td.get_text())
246 | if td.get('data-stat') == "fg3_pct":
247 | fg3_pct.append(td.get_text())
248 | if td.get('data-stat') == "ft":
249 | ft.append(td.get_text())
250 | if td.get('data-stat') == "fta":
251 | fta.append(td.get_text())
252 | if td.get('data-stat') == "ft_pct":
253 | ft_pct.append(td.get_text())
254 | if td.get('data-stat') == "orb":
255 | orb.append(td.get_text())
256 | if td.get('data-stat') == "trb":
257 | total_board.append(td.get_text())
258 | if td.get('data-stat') == "ast":
259 | ast.append(td.get_text())
260 | if td.get('data-stat') == "stl":
261 | stl.append(td.get_text())
262 | if td.get('data-stat') == "blk":
263 | blk.append(td.get_text())
264 | if td.get('data-stat') == "tov":
265 | tov.append(td.get_text())
266 | if td.get('data-stat') == "pf":
267 | pf.append(td.get_text())
268 | if td.get('data-stat') == "opp_fg":
269 | opp_fg.append(td.get_text())
270 | if td.get('data-stat') == "opp_fga":
271 | opp_fga.append(td.get_text())
272 | if td.get('data-stat') == "opp_fg_pct":
273 | opp_fg_pct.append(td.get_text())
274 | if td.get('data-stat') == "opp_fg3":
275 | opp_fg3.append(td.get_text())
276 | if td.get('data-stat') == "opp_fg3a":
277 | opp_fg3a.append(td.get_text())
278 | if td.get('data-stat') == "opp_fg3_pct":
279 | opp_fg3_pct.append(td.get_text())
280 | if td.get('data-stat') == "opp_ft":
281 | opp_ft.append(td.get_text())
282 | if td.get('data-stat') == "opp_fta":
283 | opp_fta.append(td.get_text())
284 | if td.get('data-stat') == "opp_ft_pct":
285 | opp_ft_pct.append(td.get_text())
286 | if td.get('data-stat') == "opp_orb":
287 | opp_orb.append(td.get_text())
288 | if td.get('data-stat') == "opp_trb":
289 | opp_trb.append(td.get_text())
290 | if td.get('data-stat') == "opp_ast":
291 | opp_ast.append(td.get_text())
292 | if td.get('data-stat') == "opp_stl":
293 | opp_stl.append(td.get_text())
294 | if td.get('data-stat') == "opp_blk":
295 | opp_blk.append(td.get_text())
296 | if td.get('data-stat') == "opp_tov":
297 | opp_tov.append(td.get_text())
298 | if td.get('data-stat') == "opp_pf":
299 | opp_pf.append(td.get_text())
300 | #ADVANCED STATS
301 | off_rtg = []
302 | def_rtg = []
303 | off_rtg_opp = []
304 | def_rtg_opp = []
305 | pace = []
306 | fta_per_fga_pct = []
307 | fg3a_per_fga_pct = []
308 | ts_pct = []
309 | trb_pct = []
310 | ast_pct = []
311 | stl_pct = []
312 | blk_pct = []
313 | efg_pct = []
314 | tov_pct = []
315 | orb_pct = []
316 | ft_rate = []
317 | opp_efg_pct= []
318 | opp_tov_pct = []
319 | drb_pct = []
320 | opp_ft_rate = []
321 | for trb in tr_body1:
322 | for td in trb.find_all('td'):
323 | if td.get('data-stat') == "off_rtg":
324 | off_rtg.append(td.get_text())
325 | def_rtg_opp.append(td.get_text())
326 | if td.get('data-stat') == "def_rtg":
327 | off_rtg_opp.append(td.get_text())
328 | def_rtg.append(td.get_text())
329 | if td.get('data-stat') == "pace":
330 | pace.append(td.get_text())
331 | if td.get('data-stat') == "fta_per_fga_pct":
332 | fta_per_fga_pct.append(td.get_text())
333 | if td.get('data-stat') == "fg3a_per_fga_pct":
334 | fg3a_per_fga_pct.append(td.get_text())
335 | if td.get('data-stat') == "ts_pct":
336 | ts_pct.append(td.get_text())
337 | if td.get('data-stat') == "trb_pct":
338 | trb_pct.append(td.get_text())
339 | if td.get('data-stat') == "ast_pct":
340 | ast_pct.append(td.get_text())
341 | if td.get('data-stat') == "stl_pct":
342 | stl_pct.append(td.get_text())
343 | if td.get('data-stat') == "blk_pct":
344 | blk_pct.append(td.get_text())
345 | if td.get('data-stat') == "efg_pct":
346 | efg_pct.append(td.get_text())
347 | if td.get('data-stat') == "tov_pct":
348 | tov_pct.append(td.get_text())
349 | if td.get('data-stat') == "orb_pct":
350 | orb_pct.append(td.get_text())
351 | if td.get('data-stat') == "ft_rate":
352 | ft_rate.append(td.get_text())
353 | if td.get('data-stat') == "opp_efg_pct":
354 | opp_efg_pct.append(td.get_text())
355 | if td.get('data-stat') == "opp_tov_pct":
356 | opp_tov_pct.append(td.get_text())
357 | if td.get('data-stat') == "drb_pct":
358 | drb_pct.append(td.get_text())
359 | if td.get('data-stat') == "opp_ft_rate":
360 | opp_ft_rate.append(td.get_text())
361 | return DataFrame(list(zip(game_result,pts,opp_pts,fg,fga,
362 | fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,total_board,ast,
363 | stl,blk,tov,pf,opp_fg,opp_fga,opp_fg_pct,opp_fg3,opp_fg3a,opp_fg3_pct,
364 | opp_ft,opp_fta,opp_ft_pct,opp_orb,opp_trb,opp_ast,opp_stl,opp_blk,opp_tov,
365 | opp_pf, off_rtg,def_rtg,pace,fta_per_fga_pct,fg3a_per_fga_pct,ts_pct,
366 | trb_pct,ast_pct,stl_pct,blk_pct,efg_pct,tov_pct,orb_pct,ft_rate,opp_efg_pct,
367 | opp_tov_pct,drb_pct,opp_ft_rate,game_loc,srs,date_save,
368 | off_rtg_opp,def_rtg_opp)),
369 | columns =['game_result','pts','opp_pts','fg','fga',
370 | 'fg_pct','fg3','fg3a','fg3_pct','ft','fta','ft_pct','orb','total_board','ast',
371 | 'stl','blk','tov','pf','opp_fg','opp_fga','opp_fg_pct','opp_fg3','opp_fg3a','opp_fg3_pct',
372 | 'opp_ft','opp_fta','opp_ft_pct','opp_orb','opp_trb','opp_ast','opp_stl','opp_blk','opp_tov',
373 | 'opp_pf','off_rtg','def_rtg','pace','fta_per_fga_pct','fg3a_per_fga_pct','ts_pct',
374 | 'trb_pct','ast_pct','stl_pct','blk_pct','efg_pct','tov_pct','orb_pct','ft_rate','opp_efg_pct',
375 | 'opp_tov_pct','drb_pct','opp_ft_rate','game_loc','simple_rating_system','date_played',
376 | 'opp_off_rtg','opp_def_rtg'])
377 |
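# Hypothetical usage (mirrors how the prediction code later in this dump builds the URLs;
# 'duke' stands in for any slug from teams_sports_ref_format.csv):
#   basic = 'https://www.sports-reference.com/cbb/schools/duke/2023-gamelogs.html'
#   adv = 'https://www.sports-reference.com/cbb/schools/duke/2023-gamelogs-advanced.html'
#   game_df = html_to_df_web_scrape_cbb(basic, adv, 'duke', 2023)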
378 | def get_espn(URL,team_1,team_2):
379 | team_1 = create_acr(team_1)
380 | team_2 = create_acr(team_2)
381 | # URL = "https://www.espn.com/mens-college-basketball/schedule/_/date/20230131"
382 | hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
383 | req_1 = Request(URL,headers=hdr)
384 | html_1 = request.urlopen(req_1)
385 | soup_1 = BeautifulSoup(html_1, "html.parser")
386 | table = soup_1.find(class_="ResponsiveTable")
387 | table1 = table.find(class_="Table__Scroller")
388 | table2 = table.find(class_="Table")
389 | table3 = table.find(class_="Table__TBODY")
390 | for td in table3.find_all(class_="Table__TR Table__TR--sm Table__even"):
391 | try:
392 | #Get team names
393 | inst = td.find(class_="events__col Table__TD")
394 | href_team = inst.find(class_="AnchorLink").get("href")
395 | if team_1 in href_team:
396 | #Get game link
397 | inst = td.find(class_="date__col Table__TD")
398 | href_val = inst.find(class_="AnchorLink").get("href")
399 | game = "https://www.espn.com" + href_val
400 | req_second = Request(game,headers=hdr)
401 | html_second = request.urlopen(req_second)
402 | soup_second = BeautifulSoup(html_second, "html.parser")
403 | #Team 1 - left-0 top-0 = Away
404 | team_1_predict = soup_second.find(class_="matchupPredictor__teamValue matchupPredictor__teamValue--b left-0 top-0 flex items-baseline absolute copy")
405 | start = '>'
406 |                 end = "
--------------------------------------------------------------------------------
[dump truncated: the remainder of cbb_web_scraper.py and the beginning of the next file, a Keras binary-classification script, are missing; the text resumes mid-file below]
--------------------------------------------------------------------------------
115 | self.drop_cols = to_drop
116 | self.x_no_corr = self.x.drop(columns=to_drop)
117 | cols = self.x_no_corr.columns
118 | print(f'Columns dropped >= 0.90: {to_drop}')
119 | #Drop samples that are outliers
120 | print(f'old feature dataframe shape before outlier removal: {self.x_no_corr.shape}')
121 | for col_name in cols:
122 | Q1 = np.percentile(self.x_no_corr[col_name], 25)
123 | Q3 = np.percentile(self.x_no_corr[col_name], 75)
124 | IQR = Q3 - Q1
125 |             upper = np.where(self.x_no_corr[col_name] >= (Q3+2.5*IQR)) #1.5 is the standard multiplier; the looser 2.5 keeps more samples, to see if more data improves model performance
126 | lower = np.where(self.x_no_corr[col_name] <= (Q1-2.5*IQR))
127 | self.x_no_corr.drop(upper[0], inplace = True)
128 | self.x_no_corr.drop(lower[0], inplace = True)
129 | self.y.drop(upper[0], inplace = True)
130 | self.y.drop(lower[0], inplace = True)
131 | if 'level_0' in self.x_no_corr.columns:
132 | self.x_no_corr.drop(columns=['level_0'],inplace = True)
133 | self.x_no_corr.reset_index(inplace = True)
134 | self.y.reset_index(inplace = True, drop=True)
135 | self.x_no_corr.drop(columns=['level_0','index'],inplace = True)
136 | print(f'new feature dataframe shape after outlier removal: {self.x_no_corr.shape}')
137 | top_corr_features = corr_matrix.index
138 | def create_model(self, neurons=32, learning_rate=0.001, dropout_rate=0.2, alpha=0.1):
139 | model = keras.Sequential([
140 | layers.Dense(neurons, input_shape=(self.x_no_corr.shape[1],)),
141 | layers.LeakyReLU(alpha=alpha),
142 | layers.Dropout(dropout_rate),
143 | layers.Dense(neurons),
144 | layers.LeakyReLU(alpha=alpha),
145 | layers.Dropout(dropout_rate),
146 | layers.Dense(neurons),
147 | layers.LeakyReLU(alpha=alpha),
148 | layers.Dropout(dropout_rate),
149 | layers.Dense(1, activation='sigmoid')
150 | ])
151 | optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
152 | model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
153 | return model
154 | def deep_learn(self):
155 | if exists('deep_learning.h5'):
156 | self.model = keras.models.load_model('deep_learning.h5')
157 | else:
158 | #best params
159 | # Best: 0.999925 using {'alpha': 0.1, 'batch_size': 32, 'dropout_rate': 0.2,
160 | # 'learning_rate': 0.001, 'neurons': 16}
161 | optimizer = keras.optimizers.Adam(learning_rate=0.001)
162 | self.model = keras.Sequential([
163 | layers.Dense(16, input_shape=(self.x_no_corr.shape[1],)),
164 | layers.LeakyReLU(alpha=0.1),
165 | layers.BatchNormalization(),
166 | layers.Dropout(0.2),
167 | layers.Dense(16),
168 | layers.LeakyReLU(alpha=0.1),
169 | layers.BatchNormalization(),
170 | layers.Dropout(0.2),
171 | layers.Dense(16),
172 | layers.LeakyReLU(alpha=0.1),
173 | layers.BatchNormalization(),
174 | layers.Dropout(0.2),
175 | layers.Dense(16),
176 | layers.LeakyReLU(alpha=0.1),
177 | layers.BatchNormalization(),
178 | layers.Dropout(0.2),
179 | layers.Dense(16),
180 | layers.LeakyReLU(alpha=0.1),
181 | layers.BatchNormalization(),
182 | layers.Dropout(0.2),
183 | layers.Dense(16),
184 | layers.LeakyReLU(alpha=0.1),
185 | layers.BatchNormalization(),
186 | layers.Dropout(0.2),
187 | layers.Dense(1, activation='sigmoid')
188 | ])
189 | self.model.compile(optimizer=optimizer,
190 | loss='binary_crossentropy',
191 | metrics=['accuracy'])
192 | history = self.model.fit(self.x_train, self.y_train,
193 | epochs=50, batch_size=32,
194 | validation_data=(self.x_test,self.y_test))
195 | #validation_split=0.2)
196 | # param_grid = {
197 | # 'neurons': [16, 32, 64],
198 | # 'learning_rate': [0.01, 0.001, 0.0001],
199 | # 'dropout_rate': [0.1, 0.2, 0.3],
200 | # 'alpha': [0.01, 0.1, 0.2],
201 | # 'batch_size': [16, 32, 64]
202 | # }
203 | # param_grid = {
204 | # 'neurons': [16, 32],
205 | # 'learning_rate': [0.01, 0.001],
206 | # 'dropout_rate': [0.2],
207 | # 'alpha': [0.1],
208 | # 'batch_size': [32, 64]
209 | # }
210 | # model = KerasClassifier(build_fn=self.create_model,
211 | # epochs=50, batch_size=32, verbose=4)
212 | # grid = GridSearchCV(estimator=model,
213 | # param_grid=param_grid,
214 | # cv=3,
215 | # verbose=3)
216 | # self.grid_result = grid.fit(self.x_train, self.y_train)
217 | # print("Best: %f using %s" % (self.grid_result.best_score_, self.grid_result.best_params_))
218 | # self.model = self.grid_result
219 | # input()
220 | self.model.save('deep_learning.h5')
221 | plt.figure()
222 | plt.plot(history.history['accuracy'], label='training accuracy')
223 | plt.plot(history.history['val_accuracy'], label='validation accuracy')
224 | plt.title('Accuracy History')
225 | plt.xlabel('Epoch')
226 | plt.ylabel('Accuracy')
227 | plt.legend()
228 | plt.savefig('Accuracy.png',dpi=300)
229 | plt.close()
230 |
231 | # plot loss history
232 | plt.figure()
233 | plt.plot(history.history['loss'], label='training loss')
234 | plt.plot(history.history['val_loss'], label='validation loss')
235 | plt.title('Loss History')
236 | plt.xlabel('Epoch')
237 | plt.ylabel('Loss')
238 | plt.legend()
239 | plt.savefig('Loss.png',dpi=300)
240 | plt.close()
241 | def predict_two_teams(self):
242 | teams_sports_ref = read_csv('teams_sports_ref_format.csv')
243 | while True:
244 | try:
245 | team_1 = input('team_1: ')
246 | if team_1 == 'exit':
247 | break
248 | team_2 = input('team_2: ')
249 | #Game location
250 | game_loc_team1 = int(input(f'{team_1} : home = 0, away = 1, neutral = 2: '))
251 | if game_loc_team1 == 0:
252 | game_loc_team2 = 1
253 | elif game_loc_team1 == 1:
254 | game_loc_team2 = 0
255 | elif game_loc_team1 == 2:
256 | game_loc_team2 = 2
257 | #Check to see if the team was spelled right
258 | team_1 = get_close_matches(team_1,teams_sports_ref['teams'].tolist(),n=1)[0]
259 | team_2 = get_close_matches(team_2,teams_sports_ref['teams'].tolist(),n=1)[0]
260 | #2023 data
261 | year = 2023
262 | sleep(4)
263 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs.html'
264 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs-advanced.html'
265 | team_1_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_1.lower(),year)
266 | sleep(4) #I get banned for a short period of time if I do not do this
267 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs.html'
268 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs-advanced.html'
269 | team_2_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_2.lower(),year)
270 | #Remove empty cells
271 | team_1_df2023['pts'].replace('', np.nan, inplace=True)
272 | team_1_df2023.replace('', np.nan, inplace=True)
273 | team_1_df2023.dropna(inplace=True)
274 | team_2_df2023['pts'].replace('', np.nan, inplace=True)
275 | team_2_df2023.replace('', np.nan, inplace=True)
276 | team_2_df2023.dropna(inplace=True)
277 | #Remove pts and game result
278 | # for col in team_1_df2023.columns:
279 | # if 'opp' in col:
280 | # team_1_df2023.drop(columns=col,inplace=True)
281 | # for col in team_2_df2023.columns:
282 | # if 'opp' in col:
283 | # team_2_df2023.drop(columns=col,inplace=True)
284 | team_1_df2023.drop(columns=['game_result'],inplace=True)
285 | team_2_df2023.drop(columns=['game_result'],inplace=True)
286 | #Drop the correlated features
287 | team_1_df2023.drop(columns=self.drop_cols, inplace=True)
288 | team_2_df2023.drop(columns=self.drop_cols, inplace=True)
289 | ma_range = np.arange(2,5,1) #spans 2-4; in testing, 2 was the most accurate span for the mean and 8 for the median, and an odd number of spans avoids tied votes
290 | team_1_count = 0
291 | team_2_count = 0
292 | team_1_count_mean = 0
293 | team_2_count_mean = 0
294 | team_1_ma_win = []
295 | team_1_ma_loss = []
296 | team_2_ma = []
297 | #get SRS
298 | team_srs = cbb_web_scraper.get_latest_srs(team_1)
299 | for ma in tqdm(ma_range):
300 | # data1_median = team_1_df2023.rolling(ma).median()
301 | # data1_median['game_loc'] = game_loc_team1
302 | # data2_median = team_2_df2023.rolling(ma).median()
303 | # data2_median['game_loc'] = game_loc_team2
304 | # data1_mean_old = team_1_df2023.rolling(ma).mean()
305 | # data2_mean_old = team_2_df2023.rolling(ma).mean()
306 | # TEAM 1
307 | data1_mean = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean()
308 | data1_mean['game_loc'] = game_loc_team1
309 | data2_mean = team_2_df2023.ewm(span=ma,min_periods=ma-1).mean()
310 | data2_mean['game_loc'] = game_loc_team2
311 | # team_1_predict_median = self.RandForclass.predict(data1_median.iloc[-1:])
312 | # team_2_predict_median = self.RandForclass.predict(data2_median.iloc[-1:])
313 | #Here replace opponent metrics with the features of the second team
314 | for col in team_1_df2023.columns:
315 | if "opp" in col:
316 | if col == 'opp_trb':
317 | # new_col = col.replace("opp_", "")
318 | data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board']
319 | else:
320 | new_col = col.replace("opp_", "")
321 | data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col]
322 | #get latest SRS value
323 | data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_srs
324 | # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1)# float(input(f'input {team_1} current simple rating system value: '))
325 | #TEAM 1 Prediction
326 | x_new = self.scaler.transform(data1_mean.iloc[-1:])
327 | prediction = self.model.predict(x_new)
328 | print(f'prediction: {prediction[0][0]*100}%')
329 | probability = float(prediction[0][0]) #unwrap the (1,1) array that Keras predict returns
330 | if probability > 0.5:
331 | team_1_count += 1
332 | elif probability < 0.5:
333 | team_2_count += 1
334 | # team_1_predict_mean = self.RandForclass.predict_proba(data1_mean.iloc[-1:])
335 | #TEAM
336 | # data1_mean_change = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean()
337 | # data1_mean_change['game_loc'] = game_loc_team1
338 | # data2_mean_change = team_2_df2023.ewm(span=ma,min_periods=ma-1).mean()
339 | # data2_mean_change['game_loc'] = game_loc_team2
340 | # x_new = self.scaler.transform(data2_mean_change.iloc[-1:])
341 | # prediction = self.model.predict(x_new)
342 | # probability = prediction[0]
343 | # team_1_predict_median = self.RandForclass.predict(data1_median.iloc[-1:])
344 | # team_2_predict_median = self.RandForclass.predict(data2_median.iloc[-1:])
345 | #Here replace opponent metrics with the features of the second team
346 | # for col in team_2_df2023.columns:
347 | # if "opp" in col:
348 | # if col == 'opp_trb':
349 | # # new_col = col.replace("opp_", "")
350 | # data2_mean_change.loc[data2_mean_change.index[-1], 'opp_trb'] = data1_mean_change.loc[data1_mean_change.index[-1], 'total_board']
351 | # else:
352 | # new_col = col.replace("opp_", "")
353 | # data2_mean_change.loc[data2_mean_change.index[-1], col] = data1_mean_change.loc[data1_mean_change.index[-1], new_col]
354 | # team_2_predict_mean = self.RandForclass.predict_proba(data2_mean_change.iloc[-1:])
355 | # team_2_ma.append(team_2_predict_mean[0][1])
356 | # print('===============================================================')
357 | # print(f'{team_1} win probability {round(np.mean(team_1_ma_win),4)*100}%')
358 | # print(f'{team_2} win probability {round(np.median(team_2_predict_mean),4)*100}%')
359 | # print(f'{team_2} winning: {np.mean(team_2_ma)}%')
360 | print('===============================================================')
361 | # if np.mean(team_1_ma_win) > np.mean(team_1_ma_loss):
362 | # print(f'{team_1} wins over {team_2}')
363 | # else:
364 | # print(f'{team_2} wins over {team_1}')
365 | if team_1_count > team_2_count:
366 | print(f'{team_1} wins over {team_2}')
367 | elif team_1_count < team_2_count:
368 | print(f'{team_2} wins over {team_1}')
369 | print('===============================================================')
370 | except Exception as e:
371 | print(f'The error: {e}')
372 | def run_analysis(self):
373 | self.get_teams()
374 | self.split()
375 | self.deep_learn()
376 | self.predict_two_teams()
377 | def main():
378 | cbbDeep().run_analysis()
379 | if __name__ == '__main__':
380 | main()
--------------------------------------------------------------------------------
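A quick illustration of the core feature trick in deep_learn.py's predict_two_teams: each team's game log is smoothed with an exponentially weighted mean, and team 2's latest smoothed stats are grafted into team 1's opp_* columns before the matchup row is scaled and scored. A minimal standalone sketch, with toy column names (pts, opp_pts) standing in for the scraper's real schema:

import pandas as pd

# toy game logs; the real columns come from cbb_web_scraper and are all numeric
team_1 = pd.DataFrame({'pts': [70, 75, 80, 78], 'opp_pts': [60, 72, 65, 70]})
team_2 = pd.DataFrame({'pts': [55, 60, 58, 66], 'opp_pts': [50, 59, 61, 57]})

ma = 3  # one of the spans in ma_range
smooth_1 = team_1.ewm(span=ma, min_periods=ma - 1).mean()
smooth_2 = team_2.ewm(span=ma, min_periods=ma - 1).mean()

# graft team 2's smoothed stats into team 1's opp_* slots on the most recent row
for col in smooth_1.columns:
    if col.startswith('opp_'):
        base = col.replace('opp_', '')
        smooth_1.loc[smooth_1.index[-1], col] = smooth_2.loc[smooth_2.index[-1], base]

matchup_row = smooth_1.iloc[-1:]  # this is what gets scaled and passed to model.predict
print(matchup_row)
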
/deep_learn_regressor.py:
--------------------------------------------------------------------------------
1 | #deep learning implementation
2 | import tensorflow as tf
3 | from tensorflow import keras
4 | from tensorflow.keras import layers
5 | from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
6 | from sklearn.preprocessing import StandardScaler
7 | import cbb_web_scraper
8 | from os import getcwd
9 | from os.path import join, exists
10 | import yaml
11 | from tqdm import tqdm
12 | from time import sleep
13 | from pandas import DataFrame, concat, read_csv, isnull
14 | from sklearn.model_selection import train_test_split
15 | from sklearn.model_selection import GridSearchCV
16 | # from sklearn.ensemble import RandomForestClassifier
17 | import numpy as np
18 | import matplotlib.pyplot as plt
19 | import seaborn as sns
20 | # from sys import argv
21 | import joblib
22 | from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
23 | from difflib import get_close_matches
24 | # from datetime import datetime, timedelta
25 | # from sklearn.metrics import roc_curve
26 | import seaborn as sns
27 |
28 | #TODO: CREATE A FEATURE OF opp_simple_rating_system
29 |
30 | class cbbDeep():
31 | def __init__(self):
32 | print('instantiate class cbbDeep')
33 | self.all_data = DataFrame()
34 | # if exists(join(getcwd(),'randomForestModelTuned.joblib')):
35 | # self.RandForRegressor=joblib.load("./randomForestModelTuned.joblib")
36 | def get_teams(self):
37 | year_list_find = []
38 | year_list = [2023,2022,2021,2019,2018,2017,2016,2015,2014,2013,2012] #,2014,2013,2012,2011,2010
39 | if exists(join(getcwd(),'year_count.yaml')):
40 | with open(join(getcwd(),'year_count.yaml')) as file:
41 | year_counts = yaml.load(file, Loader=yaml.FullLoader)
42 | else:
43 | year_counts = {'year':year_list_find}
44 | #Remove any years that have already been collected
45 | if year_counts['year']:
46 | year_list_check = year_counts['year']
47 | year_list_find = year_counts['year']
48 | year_list = [i for i in year_list if i not in year_list_check]
49 | print(f'Need data for year: {year_list}')
50 | #Collect data per year
51 | if year_list:
52 | for year in tqdm(year_list):
53 | all_teams = cbb_web_scraper.get_teams_year(year_list[-1],year_list[0])
54 | team_names = sorted(all_teams)
55 | final_list = []
56 | self.year_store = year
57 | for abv in tqdm(team_names):
58 | try:
59 | print() #tqdm things
60 | print(f'current team: {abv}, year: {year}')
61 | basic = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs.html'
62 | adv = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs-advanced.html'
63 | df_inst = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,abv,self.year_store)
64 | df_inst['pts'].replace('', np.nan, inplace=True)
65 | df_inst.dropna(inplace=True)
66 | final_list.append(df_inst)
67 | except Exception as e:
68 | print(e)
69 | print(f'{abv} data are not available')
70 | sleep(4) #I get banned for a short period of time if I do not do this
71 | final_data = concat(final_list)
72 | if exists(join(getcwd(),'all_data_regressor.csv')):
73 | self.all_data = read_csv(join(getcwd(),'all_data_regressor.csv'))
74 | self.all_data = concat([self.all_data, final_data.dropna()])
75 | #write (or overwrite) the aggregate CSV after every year so progress
76 | #is not lost if a later scrape fails
77 | self.all_data.to_csv(join(getcwd(),'all_data_regressor.csv'),index=False)
78 | year_list_find.append(year)
79 | print(f'year list after loop: {year_list_find}')
80 | with open(join(getcwd(),'year_count.yaml'), 'w') as write_file:
81 | yaml.dump(year_counts, write_file)
82 | print(f'writing {year} to yaml file')
83 | else:
84 | self.all_data = read_csv(join(getcwd(),'all_data_regressor.csv'))
85 | print('len data: ', len(self.all_data))
86 | self.all_data = self.all_data.drop_duplicates(keep='last')
87 | print(f'length of data after duplicates are dropped: {len(self.all_data)}')
88 | def convert_to_float(self):
89 | for col in self.all_data.columns:
90 | self.all_data[col].replace('', np.nan, inplace=True)
91 | self.all_data[col] = self.all_data[col].astype(float)
92 | def split(self):
93 | # self.delete_opp()
94 | for col in self.all_data.columns:
95 | if 'Unnamed' in col:
96 | self.all_data.drop(columns=col,inplace=True)
97 | self.convert_to_float()
98 | self.y = self.all_data['pts'].astype(float)
99 | self.x = self.all_data.drop(columns=['game_result','pts'])
100 | self.pre_process()
101 | #Dropna and remove all data from subsequent y data
102 | real_values = ~self.x_no_corr.isna().any(axis=1)
103 | self.x_no_corr.dropna(inplace=True)
104 | self.y = self.y.loc[real_values]
105 | self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_no_corr, self.y, train_size=0.8)
106 | # normalize data
107 | self.scaler = StandardScaler()
108 | self.x_train = self.scaler.fit_transform(self.x_train)
109 | self.x_test = self.scaler.transform(self.x_test)
110 | def pre_process(self):
111 | # Remove one of each pair of features with a correlation coefficient >= 0.90
112 | corr_matrix = np.abs(self.x.astype(float).corr())
113 | upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
114 | to_drop = [column for column in upper.columns if any(upper[column] >= 0.90)]
115 | self.drop_cols = to_drop
116 | self.x_no_corr = self.x.drop(columns=to_drop)
117 | cols = self.x_no_corr.columns
118 | self.drop_cols.append('game_result')
119 | print(f'Columns dropped >= 0.90: {to_drop}')
120 | #Drop samples that are outliers
121 | print(f'old feature dataframe shape before outlier removal: {self.x_no_corr.shape}')
122 | for col_name in cols:
123 | Q1 = np.percentile(self.x_no_corr[col_name], 25)
124 | Q3 = np.percentile(self.x_no_corr[col_name], 75)
125 | IQR = Q3 - Q1
126 | upper = np.where(self.x_no_corr[col_name] >= (Q3+2.5*IQR)) #1.5*IQR is the standard; 2.5 keeps more samples to see if that improves model performance
127 | lower = np.where(self.x_no_corr[col_name] <= (Q1-2.5*IQR))
128 | self.x_no_corr.drop(upper[0], inplace = True)
129 | self.x_no_corr.drop(lower[0], inplace = True)
130 | self.y.drop(upper[0], inplace = True)
131 | self.y.drop(lower[0], inplace = True)
132 | if 'level_0' in self.x_no_corr.columns:
133 | self.x_no_corr.drop(columns=['level_0'],inplace = True)
134 | self.x_no_corr.reset_index(inplace = True)
135 | self.y.reset_index(inplace = True, drop=True)
136 | self.x_no_corr.drop(columns=['level_0','index'],inplace = True, errors='ignore') #reset_index may add either column; ignore whichever is absent
137 | print(f'new feature dataframe shape after outlier removal: {self.x_no_corr.shape}')
138 | top_corr_features = corr_matrix.index
139 | def create_model(self, neurons=32, learning_rate=0.001, dropout_rate=0.2, alpha=0.1):
140 | model = keras.Sequential([
141 | layers.Dense(neurons, input_shape=(self.x_no_corr.shape[1],)),
142 | layers.LeakyReLU(alpha=alpha),
143 | layers.Dropout(dropout_rate),
144 | layers.Dense(neurons),
145 | layers.LeakyReLU(alpha=alpha),
146 | layers.Dropout(dropout_rate),
147 | layers.Dense(neurons),
148 | layers.LeakyReLU(alpha=alpha),
149 | layers.Dropout(dropout_rate),
150 | layers.Dense(1) #linear output for regression; the sigmoid head was a leftover from the classifier
151 | ])
152 | optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
153 | model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_absolute_error'])
154 | return model
155 | def deep_learn(self):
156 | if exists('deep_learning_regressor.h5'):
157 | self.model = keras.models.load_model('deep_learning_regressor.h5')
158 | else:
159 | optimizer = keras.optimizers.Adam(learning_rate=0.001)
160 | self.model = keras.Sequential([
161 | layers.Dense(16, input_shape=(self.x_no_corr.shape[1],)),
162 | layers.LeakyReLU(alpha=0.1),
163 | layers.BatchNormalization(),
164 | layers.Dropout(0.2),
165 | layers.Dense(16),
166 | layers.LeakyReLU(alpha=0.1),
167 | layers.BatchNormalization(),
168 | layers.Dropout(0.2),
169 | layers.Dense(16),
170 | layers.LeakyReLU(alpha=0.1),
171 | layers.BatchNormalization(),
172 | layers.Dropout(0.2),
173 | layers.Dense(16),
174 | layers.LeakyReLU(alpha=0.1),
175 | layers.BatchNormalization(),
176 | layers.Dropout(0.2),
177 | layers.Dense(16),
178 | layers.LeakyReLU(alpha=0.1),
179 | layers.BatchNormalization(),
180 | layers.Dropout(0.2),
181 | layers.Dense(16),
182 | layers.LeakyReLU(alpha=0.1),
183 | layers.BatchNormalization(),
184 | layers.Dropout(0.2),
185 | layers.Dense(1)
186 | ])
187 | self.model.compile(optimizer=optimizer,
188 | loss='mean_squared_error',
189 | metrics=['mean_absolute_error'])
190 | history = self.model.fit(self.x_train, self.y_train,
191 | epochs=50, batch_size=32,
192 | validation_data=(self.x_test,self.y_test))
193 |
194 | self.model.save('deep_learning_regressor.h5')
195 | # plt.figure()
196 | # plt.plot(history.history['accuracy'], label='training accuracy')
197 | # plt.plot(history.history['val_accuracy'], label='validation accuracy')
198 | # plt.title('Accuracy History')
199 | # plt.xlabel('Epoch')
200 | # plt.ylabel('Accuracy')
201 | # plt.legend()
202 | # plt.savefig('Accuracy.png',dpi=300)
203 | # plt.close()
204 |
205 | # plot loss history
206 | plt.figure()
207 | plt.plot(history.history['loss'], label='training loss')
208 | plt.plot(history.history['val_loss'], label='validation loss')
209 | plt.title('Loss History')
210 | plt.xlabel('Epoch')
211 | plt.ylabel('Loss')
212 | plt.legend()
213 | plt.savefig('Loss.png',dpi=300)
214 | plt.close()
215 | def predict_two_teams(self):
216 | teams_sports_ref = read_csv('teams_sports_ref_format.csv')
217 | while True:
218 | # try:
219 | team_1 = input('team_1: ')
220 | if team_1 == 'exit':
221 | break
222 | team_2 = input('team_2: ')
223 | #Game location
224 | game_loc_team1 = int(input(f'{team_1} : home = 0, away = 1, neutral = 2: '))
225 | if game_loc_team1 == 0:
226 | game_loc_team2 = 1
227 | elif game_loc_team1 == 1:
228 | game_loc_team2 = 0
229 | elif game_loc_team1 == 2:
230 | game_loc_team2 = 2
231 | #Check to see if the team was spelled right
232 | team_1 = get_close_matches(team_1,teams_sports_ref['teams'].tolist(),n=1)[0]
233 | team_2 = get_close_matches(team_2,teams_sports_ref['teams'].tolist(),n=1)[0]
234 | #2023 data
235 | year = 2023
236 | sleep(4)
237 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs.html'
238 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs-advanced.html'
239 | team_1_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_1.lower(),year)
240 | sleep(4) #I get banned for a short period of time if I do not do this
241 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs.html'
242 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs-advanced.html'
243 | team_2_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_2.lower(),year)
244 | #Remove empty cells
245 | team_1_df2023['pts'].replace('', np.nan, inplace=True)
246 | team_1_df2023.replace('', np.nan, inplace=True)
247 | team_1_df2023.dropna(inplace=True)
248 | team_2_df2023['pts'].replace('', np.nan, inplace=True)
249 | team_2_df2023.replace('', np.nan, inplace=True)
250 | team_2_df2023.dropna(inplace=True)
251 | #Remove pts and game result
252 | # for col in team_1_df2023.columns:
253 | # if 'opp' in col:
254 | # team_1_df2023.drop(columns=col,inplace=True)
255 | # for col in team_2_df2023.columns:
256 | # if 'opp' in col:
257 | # team_2_df2023.drop(columns=col,inplace=True)
258 | #Drop the correlated features
259 | # self.drop_cols.remove('game_result')
260 | team_1_df2023.drop(columns=self.drop_cols, inplace=True)
261 | team_2_df2023.drop(columns=self.drop_cols, inplace=True)
262 | ma_range = np.arange(2,5,1) #spans 2-4; in testing, 2 was the most accurate span for the mean and 8 for the median
263 | team_1_count = 0
264 | team_2_count = 0
265 | team_1_count_mean = 0
266 | team_2_count_mean = 0
267 | team_1_ma_win = []
268 | team_1_ma_loss = []
269 | team_2_ma = []
270 | #get SRS
271 | team_srs = cbb_web_scraper.get_latest_srs(team_1)
272 | team_srs_2 = cbb_web_scraper.get_latest_srs(team_2)
273 | values_team1 = []
274 | values_team2 = []
275 | for ma in tqdm(ma_range):
276 | # data1_median = team_1_df2023.rolling(ma).median()
277 | # data1_median['game_loc'] = game_loc_team1
278 | # data2_median = team_2_df2023.rolling(ma).median()
279 | # data2_median['game_loc'] = game_loc_team2
280 | # data1_mean_old = team_1_df2023.rolling(ma).mean()
281 | # data2_mean_old = team_2_df2023.rolling(ma).mean()
282 | # TEAM 1
283 | data1_mean = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean()
284 | data1_mean['game_loc'] = game_loc_team1
285 | data2_mean = team_2_df2023.ewm(span=ma,min_periods=ma-1).mean()
286 | data2_mean['game_loc'] = game_loc_team2
287 | # team_1_predict_median = self.RandForclass.predict(data1_median.iloc[-1:])
288 | # team_2_predict_median = self.RandForclass.predict(data2_median.iloc[-1:])
289 | #Here replace opponent metrics with the features of the second team
290 | for col in team_1_df2023.columns:
291 | if "opp" in col:
292 | if col == 'opp_trb':
293 | # new_col = col.replace("opp_", "")
294 | data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board']
295 | else:
296 | new_col = col.replace("opp_", "")
297 | data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col]
298 | #get latest SRS value
299 | data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_srs
300 | # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1)# float(input(f'input {team_1} current simple rating system value: '))
301 | #TEAM 1 Prediction
302 | #Drop y-value
303 | data1_mean.drop(columns=['pts'],inplace=True)
304 | x_new = self.scaler.transform(data1_mean.iloc[-1:])
305 | prediction = self.model.predict(x_new)
306 | print(f'prediction: {prediction[0][0]}')
307 | values_team1.append(prediction)
308 | for ma in tqdm(ma_range):
309 | # data1_median = team_1_df2023.rolling(ma).median()
310 | # data1_median['game_loc'] = game_loc_team1
311 | # data2_median = team_2_df2023.rolling(ma).median()
312 | # data2_median['game_loc'] = game_loc_team2
313 | # data1_mean_old = team_1_df2023.rolling(ma).mean()
314 | # data2_mean_old = team_2_df2023.rolling(ma).mean()
315 | # TEAM 2
316 | data1_mean = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean()
317 | data1_mean['game_loc'] = game_loc_team1
318 | data2_mean = team_2_df2023.ewm(span=ma,min_periods=ma-1).mean()
319 | data2_mean['game_loc'] = game_loc_team2
320 | # team_1_predict_median = self.RandForclass.predict(data1_median.iloc[-1:])
321 | # team_2_predict_median = self.RandForclass.predict(data2_median.iloc[-1:])
322 | #Here replace opponent metrics with the features of the second team
323 | for col in team_2_df2023.columns: #both frames share the same columns
324 | if "opp" in col:
325 | if col == 'opp_trb':
326 | # new_col = col.replace("opp_", "")
327 | data2_mean.loc[data2_mean.index[-1], 'opp_trb'] = data1_mean.loc[data1_mean.index[-1], 'total_board']
328 | else:
329 | new_col = col.replace("opp_", "")
330 | data2_mean.loc[data2_mean.index[-1], col] = data1_mean.loc[data1_mean.index[-1], new_col]
331 | #get latest SRS value
332 | data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_srs_2
333 | # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1)# float(input(f'input {team_1} current simple rating system value: '))
334 | #TEAM 2 Prediction
335 | #Drop y-value
336 | data2_mean.drop(columns=['pts'],inplace=True)
337 | x_new = self.scaler.transform(data2_mean.iloc[-1:])
338 | prediction = self.model.predict(x_new)
339 | print(f'prediction: {prediction[0][0]}')
340 | values_team2.append(prediction)
341 | # if probability > 0.5:
342 | # team_1_count += 1
343 | # elif probability < 0.5:
344 | # team_2_count += 1
345 | # team_1_predict_mean = self.RandForclass.predict_proba(data1_mean.iloc[-1:])
346 | #TEAM
347 | # data1_mean_change = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean()
348 | # data1_mean_change['game_loc'] = game_loc_team1
349 | # data2_mean_change = team_2_df2023.ewm(span=ma,min_periods=ma-1).mean()
350 | # data2_mean_change['game_loc'] = game_loc_team2
351 | # x_new = self.scaler.transform(data2_mean_change.iloc[-1:])
352 | # prediction = self.model.predict(x_new)
353 | # probability = prediction[0]
354 | # team_1_predict_median = self.RandForclass.predict(data1_median.iloc[-1:])
355 | # team_2_predict_median = self.RandForclass.predict(data2_median.iloc[-1:])
356 | #Here replace opponent metrics with the features of the second team
357 | # for col in team_2_df2023.columns:
358 | # if "opp" in col:
359 | # if col == 'opp_trb':
360 | # # new_col = col.replace("opp_", "")
361 | # data2_mean_change.loc[data2_mean_change.index[-1], 'opp_trb'] = data1_mean_change.loc[data1_mean_change.index[-1], 'total_board']
362 | # else:
363 | # new_col = col.replace("opp_", "")
364 | # data2_mean_change.loc[data2_mean_change.index[-1], col] = data1_mean_change.loc[data1_mean_change.index[-1], new_col]
365 | # team_2_predict_mean = self.RandForclass.predict_proba(data2_mean_change.iloc[-1:])
366 | # team_2_ma.append(team_2_predict_mean[0][1])
367 | # print('===============================================================')
368 | # print(f'{team_1} win probability {round(np.mean(team_1_ma_win),4)*100}%')
369 | # print(f'{team_2} win probability {round(np.median(team_2_predict_mean),4)*100}%')
370 | # print(f'{team_2} winning: {np.mean(team_2_ma)}%')
371 | print('===============================================================')
372 | # if np.mean(team_1_ma_win) > np.mean(team_1_ma_loss):
373 | # print(f'{team_1} wins over {team_2}')
374 | # else:
375 | # print(f'{team_2} wins over {team_1}')
376 | print(f'{team_1} score {np.median(values_team1)} : {team_2} score {np.median(values_team2)}')
377 | # if team_1_count > team_2_count:
378 | # print(f'{team_1} wins over {team_2}')
379 | # elif team_1_count < team_2_count:
380 | # print(f'{team_2} wins over {team_1}')
381 | print('===============================================================')
382 | # except Exception as e:
383 | # print(f'The error: {e}')
384 | def run_analysis(self):
385 | self.get_teams()
386 | self.split()
387 | self.deep_learn()
388 | self.predict_two_teams()
389 | def main():
390 | cbbDeep().run_analysis()
391 | if __name__ == '__main__':
392 | main()
--------------------------------------------------------------------------------
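deep_learn_regressor.py's pre_process keeps one feature from every pair whose absolute correlation is at least 0.90, then drops rows that fall outside 2.5*IQR on any surviving column. A compact standalone sketch of the same pruning on synthetic data (column names are placeholders, not the scraper's):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
a = rng.normal(size=200)
x = pd.DataFrame({'a': a,
                  'b': a * 0.99 + rng.normal(scale=0.01, size=200),  # near-duplicate of 'a'
                  'c': rng.normal(size=200)})

corr = x.corr().abs()
# keep only the upper triangle so each pair is inspected once
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if (upper[col] >= 0.90).any()]
print('dropping:', to_drop)  # expected: ['b']
x_pruned = x.drop(columns=to_drop)

# row-wise IQR filter at 2.5*IQR (the scripts widen the usual 1.5 to keep more samples)
q1, q3 = x_pruned.quantile(0.25), x_pruned.quantile(0.75)
iqr = q3 - q1
inlier = ((x_pruned >= q1 - 2.5 * iqr) & (x_pruned <= q3 + 2.5 * iqr)).all(axis=1)
print(x_pruned.shape, '->', x_pruned[inlier].shape)
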
/deep_learn_MA.py:
--------------------------------------------------------------------------------
1 | #deep learning implementation
2 | import tensorflow as tf
3 | from tensorflow import keras
4 | from tensorflow.keras import layers
5 | from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
6 | from sklearn.preprocessing import StandardScaler
7 | import cbb_web_scraper
8 | from os import getcwd
9 | from os.path import join, exists
10 | import yaml
11 | from tqdm import tqdm
12 | from time import sleep
13 | from pandas import DataFrame, concat, read_csv, isnull
14 | from sklearn.model_selection import train_test_split
15 | from sklearn.model_selection import GridSearchCV
16 | # from sklearn.ensemble import RandomForestClassifier
17 | import numpy as np
18 | import matplotlib.pyplot as plt
19 | import seaborn as sns
20 | # from sys import argv
21 | import joblib
22 | from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
23 | from difflib import get_close_matches
24 | # from datetime import datetime, timedelta
25 | # from sklearn.metrics import roc_curve
26 | import seaborn as sns
27 |
28 | #TODO: Create a PCA method that removes correlated features and reduces the dimensionality so that the retained components explain 95% of the total variance
29 |
30 | class cbbDeep():
31 | def __init__(self):
32 | print('instantiate class cbbDeep')
33 | self.all_data = DataFrame()
34 | # if exists(join(getcwd(),'randomForestModelTuned.joblib')):
35 | # self.RandForRegressor=joblib.load("./randomForestModelTuned.joblib")
36 | def get_teams(self):
37 | year_list_find = []
38 | year_list = [2023,2022,2021,2019,2018,2017,2016,2015,2014,2013,2012] #,2014,2013,2012,2011,2010
39 | if exists(join(getcwd(),'year_count.yaml')):
40 | with open(join(getcwd(),'year_count.yaml')) as file:
41 | year_counts = yaml.load(file, Loader=yaml.FullLoader)
42 | else:
43 | year_counts = {'year':year_list_find}
44 | #Remove any years that have already been collected
45 | if year_counts['year']:
46 | year_list_check = year_counts['year']
47 | year_list_find = year_counts['year']
48 | year_list = [i for i in year_list if i not in year_list_check]
49 | print(f'Need data for year: {year_list}')
50 | #Collect data per year
51 | if year_list:
52 | for year in tqdm(year_list):
53 | all_teams = cbb_web_scraper.get_teams_year(year_list[-1],year_list[0])
54 | team_names = sorted(all_teams)
55 | final_list = []
56 | self.year_store = year
57 | for abv in tqdm(team_names):
58 | try:
59 | print() #tqdm things
60 | print(f'current team: {abv}, year: {year}')
61 | basic = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs.html'
62 | adv = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs-advanced.html'
63 | df_inst = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,abv,self.year_store)
64 | df_inst['pts'].replace('', np.nan, inplace=True)
65 | df_inst.dropna(inplace=True)
66 | final_list.append(df_inst)
67 | except Exception as e:
68 | print(e)
69 | print(f'{abv} data are not available')
70 | sleep(4) #I get banned for a short period of time if I do not do this
71 | final_data = concat(final_list)
72 | if exists(join(getcwd(),'all_data_regressor.csv')):
73 | self.all_data = read_csv(join(getcwd(),'all_data_regressor.csv'))
74 | self.all_data = concat([self.all_data, final_data.dropna()])
75 | #write (or overwrite) the aggregate CSV after every year so progress
76 | #is not lost if a later scrape fails
77 | self.all_data.to_csv(join(getcwd(),'all_data_regressor.csv'),index=False)
78 | year_list_find.append(year)
79 | print(f'year list after loop: {year_list_find}')
80 | with open(join(getcwd(),'year_count.yaml'), 'w') as write_file:
81 | yaml.dump(year_counts, write_file)
82 | print(f'writing {year} to yaml file')
83 | else:
84 | self.all_data = read_csv(join(getcwd(),'all_data_regressor.csv'))
85 | print('len data: ', len(self.all_data))
86 | self.all_data = self.all_data.drop_duplicates(keep='last')
87 | print(f'length of data after duplicates are dropped: {len(self.all_data)}')
88 | def convert_to_float(self):
89 | for col in self.all_data.columns:
90 | self.all_data[col].replace('', np.nan, inplace=True)
91 | self.all_data[col] = self.all_data[col].astype(float)
92 |
93 | def feature_engineering(self):
94 | for col in self.all_data.columns:
95 | if 'Unnamed' in col:
96 | self.all_data.drop(columns=col,inplace=True)
97 | range_ma = [2,3,4,5,10,12,14]
98 | temp_ma = DataFrame()
99 | for val in range_ma:
100 | for col in self.all_data.columns:
101 | if 'game_result' in col or 'simple_rating_system' in col or 'game_loc' in col:
102 | temp_ma[col] = self.all_data[col]
103 | else:
104 | dynamic_name = col + '_' + str(val)
105 | temp_ma[dynamic_name] = self.all_data[col].ewm(span=val,min_periods=0).mean()
106 | self.all_data = temp_ma
107 | def split(self):
108 | # self.delete_opp()
109 | for col in self.all_data.columns:
110 | if 'Unnamed' in col:
111 | self.all_data.drop(columns=col,inplace=True)
112 | self.convert_to_float()
113 | self.feature_engineering()
114 | self.y = self.all_data['game_result'].astype(int)
115 | self.x = self.all_data.drop(columns=['game_result'])
116 | # self.pre_process()
117 | self.x_no_corr = self.x
118 | #Dropna and remove all data from subsequent y data
119 | real_values = ~self.x_no_corr.isna().any(axis=1)
120 | self.x_no_corr.dropna(inplace=True)
121 | self.y = self.y.loc[real_values]
122 | self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_no_corr, self.y, train_size=0.8)
123 | # normalize data
124 | self.scaler = StandardScaler()
125 | self.x_train = self.scaler.fit_transform(self.x_train)
126 | self.x_test = self.scaler.transform(self.x_test)
127 | def pre_process(self):
128 | # Remove one of each pair of features with a correlation coefficient >= 0.90
129 | corr_matrix = np.abs(self.x.astype(float).corr())
130 | upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
131 | to_drop = [column for column in upper.columns if any(upper[column] >= 0.90)]
132 | self.drop_cols = to_drop
133 | self.x_no_corr = self.x.drop(columns=to_drop)
134 | cols = self.x_no_corr.columns
135 | print(f'Columns dropped >= 0.90: {to_drop}')
136 | #Drop samples that are outliers
137 | print(f'old feature dataframe shape before outlier removal: {self.x_no_corr.shape}')
138 | for col_name in cols:
139 | Q1 = np.percentile(self.x_no_corr[col_name], 25)
140 | Q3 = np.percentile(self.x_no_corr[col_name], 75)
141 | IQR = Q3 - Q1
142 | upper = np.where(self.x_no_corr[col_name] >= (Q3+2.5*IQR)) #1.5*IQR is the standard; 2.5 keeps more samples to see if that improves model performance
143 | lower = np.where(self.x_no_corr[col_name] <= (Q1-2.5*IQR))
144 | self.x_no_corr.drop(upper[0], inplace = True)
145 | self.x_no_corr.drop(lower[0], inplace = True)
146 | self.y.drop(upper[0], inplace = True)
147 | self.y.drop(lower[0], inplace = True)
148 | if 'level_0' in self.x_no_corr.columns:
149 | self.x_no_corr.drop(columns=['level_0'],inplace = True)
150 | self.x_no_corr.reset_index(inplace = True)
151 | self.y.reset_index(inplace = True, drop=True)
152 | self.x_no_corr.drop(columns=['level_0','index'],inplace = True, errors='ignore') #reset_index may add either column; ignore whichever is absent
153 | print(f'new feature dataframe shape after outlier removal: {self.x_no_corr.shape}')
154 | top_corr_features = corr_matrix.index
155 | def create_model(self, neurons=32, learning_rate=0.001, dropout_rate=0.2, alpha=0.1):
156 | model = keras.Sequential([
157 | layers.Dense(neurons, input_shape=(self.x_no_corr.shape[1],)),
158 | layers.LeakyReLU(alpha=alpha),
159 | layers.Dropout(dropout_rate),
160 | layers.Dense(neurons),
161 | layers.LeakyReLU(alpha=alpha),
162 | layers.Dropout(dropout_rate),
163 | layers.Dense(neurons),
164 | layers.LeakyReLU(alpha=alpha),
165 | layers.Dropout(dropout_rate),
166 | layers.Dense(1, activation='sigmoid')
167 | ])
168 | optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
169 | model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
170 | return model
171 | def deep_learn(self):
172 | if exists('deep_learning_MA.h5'):
173 | self.model = keras.models.load_model('deep_learning_MA.h5')
174 | else:
175 | #best params
176 | # Best: 0.999925 using {'alpha': 0.1, 'batch_size': 32, 'dropout_rate': 0.2,
177 | # 'learning_rate': 0.001, 'neurons': 16}
178 | optimizer = keras.optimizers.Adam(learning_rate=0.001)
179 | self.model = keras.Sequential([
180 | layers.Dense(16, input_shape=(self.x_no_corr.shape[1],)),
181 | layers.LeakyReLU(alpha=0.1),
182 | layers.BatchNormalization(),
183 | layers.Dropout(0.2),
184 | layers.Dense(16),
185 | layers.LeakyReLU(alpha=0.1),
186 | layers.BatchNormalization(),
187 | layers.Dropout(0.2),
188 | layers.Dense(16),
189 | layers.LeakyReLU(alpha=0.1),
190 | layers.BatchNormalization(),
191 | layers.Dropout(0.2),
192 | layers.Dense(16),
193 | layers.LeakyReLU(alpha=0.1),
194 | layers.BatchNormalization(),
195 | layers.Dropout(0.2),
196 | layers.Dense(16),
197 | layers.LeakyReLU(alpha=0.1),
198 | layers.BatchNormalization(),
199 | layers.Dropout(0.2),
200 | layers.Dense(16),
201 | layers.LeakyReLU(alpha=0.1),
202 | layers.BatchNormalization(),
203 | layers.Dropout(0.2),
204 | layers.Dense(1, activation='sigmoid')
205 | ])
206 | self.model.compile(optimizer=optimizer,
207 | loss='binary_crossentropy',
208 | metrics=['accuracy'])
209 | history = self.model.fit(self.x_train, self.y_train,
210 | epochs=75, batch_size=32,
211 | validation_data=(self.x_test,self.y_test))
212 | #validation_split=0.2)
213 | # param_grid = {
214 | # 'neurons': [16, 32, 64],
215 | # 'learning_rate': [0.01, 0.001, 0.0001],
216 | # 'dropout_rate': [0.1, 0.2, 0.3],
217 | # 'alpha': [0.01, 0.1, 0.2],
218 | # 'batch_size': [16, 32, 64]
219 | # }
220 | # param_grid = {
221 | # 'neurons': [16, 32],
222 | # 'learning_rate': [0.01, 0.001],
223 | # 'dropout_rate': [0.2],
224 | # 'alpha': [0.1],
225 | # 'batch_size': [32, 64]
226 | # }
227 | # model = KerasClassifier(build_fn=self.create_model,
228 | # epochs=50, batch_size=32, verbose=4)
229 | # grid = GridSearchCV(estimator=model,
230 | # param_grid=param_grid,
231 | # cv=3,
232 | # verbose=3)
233 | # self.grid_result = grid.fit(self.x_train, self.y_train)
234 | # print("Best: %f using %s" % (self.grid_result.best_score_, self.grid_result.best_params_))
235 | # self.model = self.grid_result
236 | # input()
237 | self.model.save('deep_learning_MA.h5')
238 | plt.figure()
239 | plt.plot(history.history['accuracy'], label='training accuracy')
240 | plt.plot(history.history['val_accuracy'], label='validation accuracy')
241 | plt.title('Accuracy History')
242 | plt.xlabel('Epoch')
243 | plt.ylabel('Accuracy')
244 | plt.legend()
245 | plt.savefig('Accuracy.png',dpi=300)
246 | plt.close()
247 |
248 | # plot loss history
249 | plt.figure()
250 | plt.plot(history.history['loss'], label='training loss')
251 | plt.plot(history.history['val_loss'], label='validation loss')
252 | plt.title('Loss History')
253 | plt.xlabel('Epoch')
254 | plt.ylabel('Loss')
255 | plt.legend()
256 | plt.savefig('Loss.png',dpi=300)
257 | plt.close()
258 | def predict_two_teams(self):
259 | teams_sports_ref = read_csv('teams_sports_ref_format.csv')
260 | while True:
261 | try:
262 | team_1 = input('team_1: ')
263 | if team_1 == 'exit':
264 | break
265 | team_2 = input('team_2: ')
266 | #Game location
267 | game_loc_team1 = int(input(f'{team_1} : home = 0, away = 1, neutral = 2: '))
268 | if game_loc_team1 == 0:
269 | game_loc_team2 = 1
270 | elif game_loc_team1 == 1:
271 | game_loc_team2 = 0
272 | elif game_loc_team1 == 2:
273 | game_loc_team2 = 2
274 | #Check to see if the team was spelled right
275 | team_1 = get_close_matches(team_1,teams_sports_ref['teams'].tolist(),n=1)[0]
276 | team_2 = get_close_matches(team_2,teams_sports_ref['teams'].tolist(),n=1)[0]
277 | #2023 data
278 | year = 2023
279 | sleep(4)
280 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs.html'
281 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs-advanced.html'
282 | team_1_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_1.lower(),year)
283 | sleep(4) #I get banned for a short period of time if I do not do this
284 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs.html'
285 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs-advanced.html'
286 | team_2_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_2.lower(),year)
287 | #Remove empty cells
288 | team_1_df2023['pts'].replace('', np.nan, inplace=True)
289 | team_1_df2023.replace('', np.nan, inplace=True)
290 | team_1_df2023.dropna(inplace=True)
291 | team_2_df2023['pts'].replace('', np.nan, inplace=True)
292 | team_2_df2023.replace('', np.nan, inplace=True)
293 | team_2_df2023.dropna(inplace=True)
294 | #Remove pts and game result
295 | # for col in team_1_df2023.columns:
296 | # if 'opp' in col:
297 | # team_1_df2023.drop(columns=col,inplace=True)
298 | # for col in team_2_df2023.columns:
299 | # if 'opp' in col:
300 | # team_2_df2023.drop(columns=col,inplace=True)
301 | team_1_df2023.drop(columns=['game_result'],inplace=True)
302 | team_2_df2023.drop(columns=['game_result'],inplace=True)
303 | #Range over all ranges data were trained on
304 | range_ma = [2,3,4,5,10,12,14]
305 | #Team 1
306 | data1_mean = DataFrame()
307 | for val in range_ma:
308 | for col in team_1_df2023.columns:
309 | if 'game_result' in col or 'simple_rating_system' in col or 'game_loc' in col:
310 | data1_mean[col] = team_1_df2023[col]
311 | else:
312 | dynamic_name = col + '_' + str(val)
313 | data1_mean[dynamic_name] = team_1_df2023[col].ewm(span=val,min_periods=0).mean()
314 | #Team 2
315 | data2_mean = DataFrame()
316 | for val in range_ma:
317 | for col in team_2_df2023.columns:
318 | if 'game_result' in col or 'simple_rating_system' in col or 'game_loc' in col:
319 | data2_mean[col] = team_2_df2023[col]
320 | else:
321 | dynamic_name = col + '_' + str(val)
322 | data2_mean[dynamic_name] = team_2_df2023[col].ewm(span=val,min_periods=0).mean()
323 | #replace team 1's opp data with team 2's data
324 | data1_mean_copy = data1_mean.copy() #must be a true copy; plain assignment aliases the frame, so team 2's swap below would read contaminated team 1 stats
325 | for col in data1_mean.columns:
326 | if "opp" in col:
327 | if "opp_trb" in col:
328 | new_col = col.replace("opp_trb", "total_board")
329 | data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col]
330 | else:
331 | new_col = col.replace("opp_", "")
332 | data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col]
333 | #Get SRS
334 | team_1_srs = cbb_web_scraper.get_latest_srs(team_1)
335 | data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs
336 | data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = cbb_web_scraper.get_latest_srs(team_2)
337 | data1_mean_copy.loc[data1_mean_copy.index[-1], 'simple_rating_system'] = team_1_srs
338 | #TEAM 1 Prediction
339 | x_new = self.scaler.transform(data1_mean.iloc[-1:])
340 | prediction_team_1 = self.model.predict(x_new)
341 | #replace team 2's opp data with team 1's data
342 | for col in data2_mean.columns:
343 | if "opp" in col:
344 | if "opp_trb" in col:
345 | new_col = col.replace("opp_trb", "total_board")
346 | data2_mean.loc[data2_mean.index[-1], col] = data1_mean_copy.loc[data1_mean_copy.index[-1], new_col]
347 | else:
348 | new_col = col.replace("opp_", "")
349 | data2_mean.loc[data2_mean.index[-1], col] = data1_mean_copy.loc[data1_mean_copy.index[-1], new_col]
350 | #TEAM 2 Prediction
351 | x_new = self.scaler.transform(data2_mean.iloc[-1:])
352 | prediction_team_2 = self.model.predict(x_new)
353 | print('===============================')
354 | print(f'prediction of {team_1} winning: {prediction_team_1[0][0]*100}%')
355 | print(f'prediction of {team_2} winning: {prediction_team_2[0][0]*100}%')
356 | print('===============================')
357 | #Drop the correlated features
358 | # team_1_df2023.drop(columns=self.drop_cols, inplace=True)
359 | # team_2_df2023.drop(columns=self.drop_cols, inplace=True)
360 | # ma_range = np.arange(2,5,1) #2 was the most correct value for mean and 8 was the best for the median; chose 9 for tiebreaking
361 | # team_1_count = 0
362 | # team_2_count = 0
363 | # team_1_count_mean = 0
364 | # team_2_count_mean = 0
365 | # team_1_ma_win = []
366 | # team_1_ma_loss = []
367 | # team_2_ma = []
368 | # #get SRS
369 | # team_srs = cbb_web_scraper.get_latest_srs(team_1)
370 | # for ma in tqdm(ma_range):
371 | # # data1_median = team_1_df2023.rolling(ma).median()
372 | # # data1_median['game_loc'] = game_loc_team1
373 | # # data2_median = team_2_df2023.rolling(ma).median()
374 | # # data2_median['game_loc'] = game_loc_team2
375 | # # data1_mean_old = team_1_df2023.rolling(ma).mean()
376 | # # data2_mean_old = team_2_df2023.rolling(ma).mean()
377 | # # TEAM 1
378 | # data1_mean = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean()
379 | # data1_mean['game_loc'] = game_loc_team1
380 | # data2_mean = team_2_df2023.ewm(span=ma,min_periods=ma-1).mean()
381 | # data2_mean['game_loc'] = game_loc_team2
382 | # # team_1_predict_median = self.RandForclass.predict(data1_median.iloc[-1:])
383 | # # team_2_predict_median = self.RandForclass.predict(data2_median.iloc[-1:])
384 | # #Here replace opponent metrics with the features of the second team
385 | # for col in team_1_df2023.columns:
386 | # if "opp" in col:
387 | # if col == 'opp_trb':
388 | # # new_col = col.replace("opp_", "")
389 | # data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board']
390 | # else:
391 | # new_col = col.replace("opp_", "")
392 | # data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col]
393 | # #get latest SRS value
394 | # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_srs
395 | # # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1)# float(input(f'input {team_1} current simple rating system value: '))
396 | # #TEAM 1 Prediction
397 | # x_new = self.scaler.transform(data1_mean.iloc[-1:])
398 | # prediction = self.model.predict(x_new)
399 | # print(f'prediction: {prediction[0]*100}%')
400 | # probability = prediction[0]
401 | # if probability > 0.5:
402 | # team_1_count += 1
403 | # elif probability < 0.5:
404 | # team_2_count += 1
405 | # # team_1_predict_mean = self.RandForclass.predict_proba(data1_mean.iloc[-1:])
406 | #TEAM
407 | # data1_mean_change = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean()
408 | # data1_mean_change['game_loc'] = game_loc_team1
409 | # data2_mean_change = team_2_df2023.ewm(span=ma,min_periods=ma-1).mean()
410 | # data2_mean_change['game_loc'] = game_loc_team2
411 | # x_new = self.scaler.transform(data2_mean_change.iloc[-1:])
412 | # prediction = self.model.predict(x_new)
413 | # probability = prediction[0]
414 | # team_1_predict_median = self.RandForclass.predict(data1_median.iloc[-1:])
415 | # team_2_predict_median = self.RandForclass.predict(data2_median.iloc[-1:])
416 | #Here replace opponent metrics with the features of the second team
417 | # for col in team_2_df2023.columns:
418 | # if "opp" in col:
419 | # if col == 'opp_trb':
420 | # # new_col = col.replace("opp_", "")
421 | # data2_mean_change.loc[data2_mean_change.index[-1], 'opp_trb'] = data1_mean_change.loc[data1_mean_change.index[-1], 'total_board']
422 | # else:
423 | # new_col = col.replace("opp_", "")
424 | # data2_mean_change.loc[data2_mean_change.index[-1], col] = data1_mean_change.loc[data1_mean_change.index[-1], new_col]
425 | # team_2_predict_mean = self.RandForclass.predict_proba(data2_mean_change.iloc[-1:])
426 | # team_2_ma.append(team_2_predict_mean[0][1])
427 | # print('===============================================================')
428 | # print(f'{team_1} win probability {round(np.mean(team_1_ma_win),4)*100}%')
429 | # print(f'{team_2} win probability {round(np.median(team_2_predict_mean),4)*100}%')
430 | # print(f'{team_2} winning: {np.mean(team_2_ma)}%')
431 | # print('===============================================================')
432 | # # if np.mean(team_1_ma_win) > np.mean(team_1_ma_loss):
433 | # # print(f'{team_1} wins over {team_2}')
434 | # # else:
435 | # # print(f'{team_2} wins over {team_1}')
436 | # if team_1_count > team_2_count:
437 | # print(f'{team_1} wins over {team_2}')
438 | # elif team_1_count < team_2_count:
439 | # print(f'{team_2} wins over {team_1}')
440 | # print('===============================================================')
441 | except Exception as e:
442 | print(f'The error: {e}')
443 | def run_analysis(self):
444 | self.get_teams()
445 | self.split()
446 | self.deep_learn()
447 | self.predict_two_teams()
448 | def main():
449 | cbbDeep().run_analysis()
450 | if __name__ == '__main__':
451 | main()
--------------------------------------------------------------------------------
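deep_learn_MA.py's feature_engineering widens the table instead of looping over spans at prediction time: every stat gets one EWM column per span (pts_2, pts_3, ..., pts_14), while game_result, simple_rating_system, and game_loc pass through unsmoothed. A minimal sketch of that widening, using a placeholder frame:

import pandas as pd

df = pd.DataFrame({'pts': [70, 75, 80, 78, 82], 'game_loc': [0, 1, 0, 2, 0]})
range_ma = [2, 3, 4, 5, 10, 12, 14]  # the same spans the script trains on

wide = pd.DataFrame()
for span in range_ma:
    for col in df.columns:
        if col in ('game_result', 'simple_rating_system', 'game_loc'):
            wide[col] = df[col]  # pass-through columns are copied as-is, not smoothed
        else:
            wide[f'{col}_{span}'] = df[col].ewm(span=span, min_periods=0).mean()

print(wide.columns.tolist())  # ['pts_2', 'game_loc', 'pts_3', ..., 'pts_14']
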
/cbb_regression.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | College Basketball Predictions
5 | @author: brianszekely
6 | """
7 | import cbb_web_scraper
8 | from os import getcwd
9 | from os.path import join, exists
10 | import yaml
11 | from tqdm import tqdm
12 | from time import sleep
13 | from pandas import DataFrame, concat, read_csv, isnull
14 | import numpy as np
15 | from sklearn.model_selection import train_test_split
16 | from sklearn.model_selection import GridSearchCV
17 | from sklearn.ensemble import RandomForestRegressor
18 | import matplotlib.pyplot as plt
19 | import seaborn as sns
20 | from sys import argv
21 | from sklearn.metrics import mean_squared_error, r2_score
22 | # from sklearn.model_selection import cross_val_score, KFold
23 | import pickle
24 | import joblib
25 | import sys
26 | import os
27 | from scipy.stats import variation
28 | from difflib import get_close_matches
29 | from datetime import datetime, timedelta
30 | class cbb_regressor():
31 | def __init__(self):
32 | print('initialize class cbb_regressor')
33 | self.all_data = DataFrame()
34 | def get_teams(self):
35 | year_list_find = []
36 | year_list = [2023,2022,2021,2019,2018,2017,2016,2015,2014,2013,2012] #,2014,2013,2012,2011,2010
37 | if exists(join(getcwd(),'year_count.yaml')):
38 | with open(join(getcwd(),'year_count.yaml')) as file:
39 | year_counts = yaml.load(file, Loader=yaml.FullLoader)
40 | else:
41 | year_counts = {'year':year_list_find}
42 | #Remove any years that have already been collected
43 | if year_counts['year']:
44 | year_list_check = year_counts['year']
45 | year_list_find = year_counts['year']
46 | year_list = [i for i in year_list if i not in year_list_check]
47 | print(f'Need data for year: {year_list}')
48 | #Collect data per year
49 | if year_list:
50 | for year in tqdm(year_list):
51 | all_teams = cbb_web_scraper.get_teams_year(year_list[-1],year_list[0])
52 | team_names = sorted(all_teams)
53 | final_list = []
54 | self.year_store = year
55 | for abv in tqdm(team_names):
56 | try:
57 | print() #tqdm things
58 | print(f'current team: {abv}, year: {year}')
59 | basic = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs.html'
60 | adv = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs-advanced.html'
61 | df_inst = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,abv,self.year_store)
62 | df_inst['pts'].replace('', np.nan, inplace=True)
63 | df_inst.dropna(inplace=True)
64 | final_list.append(df_inst)
65 | except Exception as e:
66 | print(e)
67 | print(f'{abv} data are not available')
68 | sleep(4) #I get banned for a short period of time if I do not do this
69 | final_data = concat(final_list)
70 | if exists(join(getcwd(),'all_data_regressor.csv')):
71 | self.all_data = read_csv(join(getcwd(),'all_data_regressor.csv'))
72 | self.all_data = concat([self.all_data, final_data.dropna()])
73 | #write (or overwrite) the aggregate CSV after every year so progress
74 | #is not lost if a later scrape fails
75 | self.all_data.to_csv(join(getcwd(),'all_data_regressor.csv'),index=False)
76 | year_list_find.append(year)
77 | print(f'year list after loop: {year_list_find}')
78 | with open(join(getcwd(),'year_count.yaml'), 'w') as write_file:
79 | yaml.dump(year_counts, write_file)
80 | print(f'writing {year} to yaml file')
81 | else:
82 | self.all_data = read_csv(join(getcwd(),'all_data_regressor.csv'))
83 | print('len data: ', len(self.all_data))
84 | self.all_data = self.all_data.drop_duplicates(keep='last')
85 | print(f'length of data after duplicates are dropped: {len(self.all_data)}')
86 | def convert_to_float(self):
87 | for col in self.all_data.columns:
88 | self.all_data[col].replace('', np.nan, inplace=True)
89 | self.all_data[col] = self.all_data[col].astype(float)
90 | def delete_opp(self):
91 | """
92 | Drop any opponent data, as it may not be helpful for prediction; opponent stats are hard to estimate with a running average
93 | """
94 | for col in self.all_data.columns:
95 | if 'opp' in col:
96 | self.all_data.drop(columns=col,inplace=True)
97 | def split(self):
98 | # self.delete_opp()
99 | for col in self.all_data.columns:
100 | if 'Unnamed' in col:
101 | self.all_data.drop(columns=col,inplace=True)
102 | self.convert_to_float()
103 | self.y = self.all_data['pts']
104 | self.x = self.all_data.drop(columns=['pts','game_result'])
105 | self.pre_process()
106 | #Dropna and remove all data from subsequent y data
107 | real_values = ~self.x_no_corr.isna().any(axis=1)
108 | self.x_no_corr.dropna(inplace=True)
109 | self.y = self.y.loc[real_values]
110 | self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_no_corr, self.y, train_size=0.8)
111 | def pre_process(self):
112 | # Remove one of each pair of features with a correlation coefficient >= 0.90
113 | corr_matrix = np.abs(self.x.astype(float).corr())
114 | upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
115 | to_drop = [column for column in upper.columns if any(upper[column] >= 0.90)]
116 | self.drop_cols = to_drop
117 | self.x_no_corr = self.x.drop(columns=to_drop)
118 | cols = self.x_no_corr.columns
119 | print(f'Columns dropped >= 0.90: {to_drop}')
120 | #Drop samples that are outliers
121 | print(f'old feature dataframe shape before outlier removal: {self.x_no_corr.shape}')
122 | for col_name in cols:
123 | Q1 = np.percentile(self.x_no_corr[col_name], 25)
124 | Q3 = np.percentile(self.x_no_corr[col_name], 75)
125 | IQR = Q3 - Q1
126 | upper = np.where(self.x_no_corr[col_name] >= (Q3+2.5*IQR)) #1.5*IQR is the standard; 2.5 keeps more samples to see if that improves model performance
127 | lower = np.where(self.x_no_corr[col_name] <= (Q1-2.5*IQR))
128 | self.x_no_corr.drop(upper[0], inplace = True)
129 | self.x_no_corr.drop(lower[0], inplace = True)
130 | self.y.drop(upper[0], inplace = True)
131 | self.y.drop(lower[0], inplace = True)
132 | if 'level_0' in self.x_no_corr.columns:
133 | self.x_no_corr.drop(columns=['level_0'],inplace = True)
134 | self.x_no_corr.reset_index(inplace = True)
135 | self.y.reset_index(inplace = True, drop=True)
136 | self.x_no_corr.drop(columns=['level_0','index'],inplace = True, errors='ignore') #reset_index may add either column; ignore whichever is absent
137 | print(f'new feature dataframe shape after outlier removal: {self.x_no_corr.shape}')
138 | top_corr_features = corr_matrix.index
139 | plt.figure(figsize=(20,20))
140 | sns.heatmap(corr_matrix[top_corr_features],annot=True,cmap="RdYlGn")
141 | plt.tight_layout()
142 | plt.savefig('correlations.png',dpi=250)
143 | plt.close()
144 | def random_forest_analysis(self):
145 | if argv[1] == 'tune':
146 | #RANDOM FOREST REGRESSOR
147 | RandForclass = RandomForestRegressor()
148 | #Use the number of features as a stopping criterion for depth
149 | rows, cols = self.x_train.shape
150 | cols = int(cols / 1.18) #try to avoid overfitting on depth
151 | #square root of the total number of features is a good limit
152 | # cols = int(np.sqrt(cols))
153 | #parameters to tune
154 | #increasing min_samples_leaf, this will reduce overfitting
155 | Rand_perm = {
156 | 'criterion' : ["squared_error", "poisson"], #absolute_error - takes forever to run
157 | 'n_estimators': range(300,500,100),
158 | # 'min_samples_split': np.arange(2, 5, 1, dtype=int),
159 | 'max_features' : [1, 'sqrt', 'log2'],
160 | 'max_depth': np.arange(2,cols,1),
161 | 'min_samples_leaf': np.arange(1,3,1)
162 | }
163 | #['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score',
164 | # average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance',
165 | # 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score',
166 | # 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples',
167 | # 'jaccard_weighted', 'matthews_corrcoef', 'max_error', 'mutual_info_score', 'neg_brier_score',
168 | # 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error',
169 | # 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error',
170 | # 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_root_mean_squared_error',
171 | # 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'roc_auc_ovo', 'roc_auc_ovo_weighted', 'roc_auc_ovr', 'roc_auc_ovr_weighted', 'top_k_accuracy', 'v_measure_score']
172 | clf_rand = GridSearchCV(RandForclass, Rand_perm,
173 | scoring=['neg_root_mean_squared_error','explained_variance'],
174 | cv=5,
175 | refit='neg_root_mean_squared_error',verbose=4, n_jobs=-1)
176 | #save
177 | search_rand = clf_rand.fit(self.x_train,self.y_train)
178 | #Write fitted and tuned model to file
179 | # with open('randomForestModelTuned.pkl','wb') as f:
180 | # pickle.dump(search_rand,f)
181 | joblib.dump(search_rand, "./randomForestModelTuned.joblib", compress=9)
182 | print('RandomForestRegressor - best params: ',search_rand.best_params_)
183 | self.RandForRegressor = search_rand
184 | self.rmse = mean_squared_error(self.RandForRegressor.predict(self.x_test),self.y_test,squared=False)
185 | print('RMSE: ',mean_squared_error(self.RandForRegressor.predict(self.x_test),self.y_test,squared=False))
186 | print('R2 score: ',r2_score(self.y_test,self.RandForRegressor.predict(self.x_test))) #r2_score expects (y_true, y_pred)
187 | else:
188 | print('Load tuned Random Forest Regressor')
189 | # load RandomForestModel
190 | # with open('randomForestModelTuned.pkl', 'rb') as f:
191 | # self.RandForRegressor = pickle.load(f)
192 | self.RandForRegressor=joblib.load("./randomForestModelTuned.joblib")
193 | print(f'Current RandomForestRegressor Parameters: {self.RandForRegressor.best_params_}')
194 | self.rmse = mean_squared_error(self.y_test, self.RandForRegressor.predict(self.x_test), squared=False)
195 | print('RMSE: ', self.rmse)
196 | print('R2 score: ', r2_score(self.y_test, self.RandForRegressor.predict(self.x_test))) #r2_score expects (y_true, y_pred)
197 | # self.RandForRegressor = RandomForestRegressor(criterion='squared_error',
198 | # max_depth=20,
199 | # max_features='log2',
200 | # n_estimators=300,
201 | # min_samples_leaf=3)
202 | # def multi_layer_perceptron(self):
203 | # pass
204 | # def keras_regressor_analysis(self):
205 | # pass
206 | def predict_two_teams(self):
207 | teams_sports_ref = read_csv('teams_sports_ref_format.csv')
208 | while True:
209 | try:
210 | team_1 = input('team_1: ')
211 | if team_1 == 'exit':
212 | break
213 | team_2 = input('team_2: ')
214 | #Game location
215 | game_loc_team1 = int(input(f'{team_1} : home = 0, away = 1, neutral = 2: '))
216 | if game_loc_team1 == 0:
217 | game_loc_team2 = 1
218 | elif game_loc_team1 == 1:
219 | game_loc_team2 = 0
220 | elif game_loc_team1 == 2:
221 | game_loc_team2 = 2
222 | #Check to see if the team was spelled right
223 | team_1 = get_close_matches(team_1,teams_sports_ref['teams'].tolist(),n=1)[0]
224 | team_2 = get_close_matches(team_2,teams_sports_ref['teams'].tolist(),n=1)[0]
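  | #illustrative: get_close_matches('purdue', teams, n=1) -> ['purdue'];
  | #an empty result (IndexError on the [0]) means no close spelling match exists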
225 | #2023 data
226 | year = 2023
227 | # sleep(4)
228 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs.html'
229 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs-advanced.html'
230 | team_1_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_1.lower(),year)
231 | sleep(4) #I can get banned for a short period of time if I do not do this
232 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs.html'
233 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs-advanced.html'
234 | team_2_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_2.lower(),year)
235 | #Remove empty cells
236 | team_1_df2023['pts'].replace('', np.nan, inplace=True)
237 | team_1_df2023.replace('', np.nan, inplace=True)
238 | team_1_df2023.dropna(inplace=True)
239 | team_2_df2023['pts'].replace('', np.nan, inplace=True)
240 | team_2_df2023.replace('', np.nan, inplace=True)
241 | team_2_df2023.dropna(inplace=True)
242 | #Save series of pts for visualizations
243 | self.pts_team_1 = team_1_df2023['pts'].astype(float)
244 | self.team_1_name = team_1
245 | self.pts_team_2 = team_2_df2023['pts'].astype(float)
246 | self.team_2_name = team_2
247 | #Remove pts and game result
248 | # for col in team_1_df2023.columns:
249 | # if 'opp' in col:
250 | # team_1_df2023.drop(columns=col,inplace=True)
251 | # for col in team_2_df2023.columns:
252 | # if 'opp' in col:
253 | # team_2_df2023.drop(columns=col,inplace=True)
254 | # team_1_df2023.drop(columns=['game_result','pts'],inplace=True)
255 | # team_2_df2023.drop(columns=['game_result','pts'],inplace=True)
256 | #Drop the correlated features
257 | team_1_df2023.drop(columns=self.drop_cols, inplace=True)
258 | team_2_df2023.drop(columns=self.drop_cols, inplace=True)
259 | # team_1_df2023.to_csv('team_1.csv')
260 | # team_2_df2023.to_csv('team_2.csv')
261 | # print(team_1_df2023)
262 | # print(team_2_df2023)
263 | #Clean up dataframe
264 | # for col in team_1_df2023.columns:
265 | # if 'Unnamed' in col:
266 | # team_1_df2023.drop(columns=col,inplace=True)
267 | # for col in team_2_df2023.columns:
268 | # if 'Unnamed' in col:
269 | # team_2_df2023.drop(columns=col,inplace=True)
270 | #Try to find the moving averages that work
271 | # ma_range = np.arange(2,len(team_2_df2023)-2,1)
272 | ma_range = np.arange(2,7,1) #windows of 2-6 games; historically 2 was the most correct value for the mean and 8 for the median
273 | team_1_count = 0
274 | team_2_count = 0
275 | team_1_count_mean = 0
276 | team_2_count_mean = 0
277 | team_1_ma = []
278 | team_2_ma = []
279 | team_1_median = []
280 | team_2_median = []
281 | num_pts_score_team_1= []
282 | num_pts_score_team_2 = []
283 | mean_team_1_var = []
284 | mean_team_2_var = []
285 | # Get the latest simple rating system for both teams
286 | team_1_srs = cbb_web_scraper.get_latest_srs(team_1)
287 | team_2_srs = cbb_web_scraper.get_latest_srs(team_2)
288 | for ma in tqdm(ma_range):
289 | data1_median = team_1_df2023.rolling(ma).median()
290 | data1_median['game_loc'] = game_loc_team1
291 | data2_median = team_2_df2023.rolling(ma).median()
292 | data2_median['game_loc'] = game_loc_team2
293 | # data1_mean_old = team_1_df2023.rolling(ma).mean()
294 | # data2_mean_old = team_2_df2023.rolling(ma).mean()
295 | data1_mean = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean()
296 | data1_mean['game_loc'] = game_loc_team1
297 | data2_mean = team_2_df2023.ewm(span=ma,min_periods=ma-1).mean()
298 | data2_mean['game_loc'] = game_loc_team2
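  | #pandas ewm with span s decays with alpha = 2 / (s + 1), e.g. span=4 -> alpha = 0.4,
  | #so recent games dominate the smoothed stats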
299 | for col in team_1_df2023.columns:
300 | if "opp" in col:
301 | if col == 'opp_trb':
302 | # new_col = col.replace("opp_", "")
303 | data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board']
304 | data2_mean.loc[data2_mean.index[-1], 'opp_trb'] = data1_mean.loc[data1_mean.index[-1], 'total_board']
305 |
306 | data1_median.loc[data1_median.index[-1], 'opp_trb'] = data2_median.loc[data2_median.index[-1], 'total_board']
307 | data2_median.loc[data2_median.index[-1], 'opp_trb'] = data1_median.loc[data1_median.index[-1], 'total_board']
308 | else:
309 | new_col = col.replace("opp_", "")
310 | data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col]
311 | data2_mean.loc[data2_mean.index[-1], col] = data1_mean.loc[data1_mean.index[-1], new_col]
312 |
313 | data1_median.loc[data1_median.index[-1], col] = data2_median.loc[data2_median.index[-1], new_col]
314 | data2_median.loc[data2_median.index[-1], col] = data1_median.loc[data1_median.index[-1], new_col]
315 |
316 | #Drop game result and points features
317 | data1_median.drop(columns=['game_result','pts'],inplace=True)
318 | data2_median.drop(columns=['game_result','pts'],inplace=True)
319 | data1_mean.drop(columns=['game_result','pts'],inplace=True)
320 | data2_mean.drop(columns=['game_result','pts'],inplace=True)
321 | #apply SRS
322 | data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs
323 | data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs
324 | data1_median.loc[data1_median.index[-1], 'simple_rating_system'] = team_1_srs
325 | data2_median.loc[data2_median.index[-1], 'simple_rating_system'] = team_2_srs
326 | #Get current predictions for both teams
327 | team_1_predict_median = self.RandForRegressor.predict(data1_median.iloc[-1:])
328 | team_2_predict_median = self.RandForRegressor.predict(data2_median.iloc[-1:])
329 | team_1_predict_mean = self.RandForRegressor.predict(data1_mean.iloc[-1:])
330 | team_2_predict_mean = self.RandForRegressor.predict(data2_mean.iloc[-1:])
331 | num_pts_score_team_1.append(team_1_predict_mean[0])
332 | num_pts_score_team_2.append(team_2_predict_mean[0])
333 | num_pts_score_team_1.append(team_1_predict_median[0])
334 | num_pts_score_team_2.append(team_2_predict_median[0])
335 | if team_1_predict_median > team_2_predict_median:
336 | team_1_count += 1
337 | team_1_median.append(ma)
338 | if team_1_predict_median < team_2_predict_median:
339 | team_2_count += 1
340 | team_2_median.append(ma)
341 | if team_1_predict_mean > team_2_predict_mean:
342 | team_1_count_mean += 1
343 | team_1_ma.append(ma)
344 | if team_1_predict_mean < team_2_predict_mean:
345 | team_2_count_mean += 1
346 | team_2_ma.append(ma)
347 | #check variability between fg and off_rtg
348 | mean_team_1_var.append(np.mean(data1_mean[['fg','off_rtg']].dropna().std()))
349 | mean_team_1_var.append(np.mean(data1_median[['fg','off_rtg']].dropna().std()))
350 | mean_team_2_var.append(np.mean(data2_mean[['fg','off_rtg']].dropna().std()))
351 | mean_team_2_var.append(np.mean(data2_median[['fg','off_rtg']].dropna().std()))
352 | print('===============================================================')
353 | print(f'{team_1} SRS data: {team_1_srs}')
354 | print(f'{team_2} SRS data: {team_2_srs}')
355 | print('===============================================================')
356 | print(f'Outcomes with a rolling median from 2-{ma_range[-1]} games')
357 | print(f'{team_1}: {team_1_count} | {team_1_median}')
358 | print(f'{team_2}: {team_2_count} | {team_2_median}')
359 | if team_1_count > team_2_count:
360 | print(f'======= {team_1} wins =======')
361 | elif team_1_count < team_2_count:
362 | print(f'======= {team_2} wins =======')
363 | print('===============================================================')
364 | print(f'Outcomes with an exponentially weighted mean from 2-{ma_range[-1]} games')
365 | print(f'{team_1}: {team_1_count_mean} | {team_1_ma}')
366 | print(f'{team_2}: {team_2_count_mean} | {team_2_ma}')
367 | if team_1_count_mean > team_2_count_mean:
368 | print(f'======= {team_1} wins =======')
369 | elif team_1_count_mean < team_2_count_mean:
370 | print(f'======= {team_2} wins =======')
371 | print('===============================================================')
372 | print(f'{team_1} number of pts score: {int(np.mean(num_pts_score_team_1))} +/- {np.std(num_pts_score_team_1)}')
373 | print(f'{team_2} number of pts score: {int(np.mean(num_pts_score_team_2))} +/- {np.std(num_pts_score_team_2)}')
374 | if abs(int(np.mean(num_pts_score_team_1)) - int(np.mean(num_pts_score_team_2))) < self.rmse: #use the model RMSE (hinted by the old inline note) rather than a hardcoded 3
375 | print('The point differential is less than the model RMSE, be cautious.')
376 | print('===============================================================')
377 | print(f'Mean variance of two best features {team_1}: {np.mean(mean_team_1_var)}')
378 | print(f'Mean variance of two best features {team_2}: {np.mean(mean_team_2_var)}')
379 | print('===============================================================')
380 | print(f'Standard deviation of points scored by {team_1}: {np.std(self.pts_team_1)}')
381 | print(f'Standard deviation of points scored by {team_2}: {np.std(self.pts_team_2)}')
382 | print('===============================================================')
383 | if "tod" in sys.argv[2]:
384 | date_today = str(datetime.now().date()).replace("-", "")
385 | elif "tom" in sys.argv[2]:
386 | date_today = str(datetime.now().date() + timedelta(days=1)).replace("-", "")
387 | URL = "https://www.espn.com/mens-college-basketball/schedule/_/date/" + date_today #sys argv????
388 | print(f'ESPN prediction: {cbb_web_scraper.get_espn(URL,team_1,team_2)}')
389 | print('===============================================================')
390 | if sys.argv[2] == "show":
391 | self.visualization(np.mean(num_pts_score_team_1),np.mean(num_pts_score_team_2))
392 | except Exception as e:
393 | print(f'The error: {e}')
394 | exc_type, exc_obj, exc_tb = sys.exc_info()
395 | fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
396 | print(exc_type,' File with the error: ', fname, ' Line number with error: ',exc_tb.tb_lineno)
397 | if exc_tb.tb_lineno == 226:
398 | print(f'{team_1} data could not be found. Check spelling or internet connection; some teams do not have data on Sports-Reference')
399 | elif exc_tb.tb_lineno == 229:
400 | print(f'{team_2} data could not be found. Check spelling or internet connection; some teams do not have data on Sports-Reference')
401 | def feature_importances_random_forest(self):
402 | importances = self.RandForRegressor.best_estimator_.feature_importances_
403 | indices = np.argsort(importances)
404 | plt.figure(figsize=(12,10))
405 | plt.title('Feature Importances Random Forest')
406 | # plt.barh(range(len(indices)), importances[indices], color='k', align='center')
407 | sns.barplot(x=importances[indices], y=[self.x_test.columns[i] for i in indices], color='k')
408 | plt.yticks(range(len(indices)), [self.x_test.columns[i] for i in indices])
409 | plt.xlabel('Relative Importance')
410 | plt.tight_layout()
411 | plt.savefig('feature_importance_random_forest.png',dpi=300)
412 | def visualization(self,pred_1,pred_2):
413 | games_1 = range(1,len(self.pts_team_1)+1,1)
414 | games_2 = range(1,len(self.pts_team_2)+1,1)
415 | team_1_pred = self.team_1_name + " prediction"
416 | team_2_pred = self.team_2_name + " prediction"
417 | plt.figure()
418 | plt.plot(games_1,self.pts_team_1,color='green',label=self.team_1_name)
419 | plt.plot(games_2,self.pts_team_2,color='blue',label=self.team_2_name)
420 | plt.scatter(len(self.pts_team_1)+2,pred_1,color='green',label=team_1_pred)
421 | plt.scatter(len(self.pts_team_2)+2,pred_2,color='blue',label=team_2_pred)
422 | plt.legend()
423 | plt.xlabel('Games')
424 | plt.ylabel('Points')
425 | plt.tight_layout()
426 | plt.show()
427 | def run_analysis(self):
428 | self.get_teams()
429 | self.split()
430 | self.random_forest_analysis()
431 | self.predict_two_teams()
432 | self.feature_importances_random_forest()
433 | def main():
434 | cbb_regressor().run_analysis()
435 | if __name__ == '__main__':
436 | main()
--------------------------------------------------------------------------------
/cbb_classification.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | College Basketball Predictions via classification and probability with ESPN
5 | @author: brianszekely
6 | """
7 | import cbb_web_scraper
8 | from os import getcwd
9 | from os.path import join, exists
10 | import yaml
11 | from tqdm import tqdm
12 | from time import sleep
13 | from pandas import DataFrame, concat, read_csv, isnull
14 | from sklearn.model_selection import train_test_split
15 | from sklearn.model_selection import GridSearchCV
16 | import numpy as np
17 | import matplotlib.pyplot as plt
18 | import seaborn as sns
19 | from sys import argv
20 | import joblib
21 | from sklearn.metrics import confusion_matrix, accuracy_score
22 | from difflib import get_close_matches
23 | from sklearn.metrics import roc_curve
25 | from tensorflow.keras.utils import to_categorical
26 | from sklearn.decomposition import PCA
27 | import xgboost as xgb
28 | from tensorflow.keras.layers import Dense, BatchNormalization
29 | from tensorflow.keras.models import Sequential
30 | from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
31 | from tensorflow.keras.optimizers import Adam, RMSprop
32 | from tensorflow.keras.regularizers import l1, l2
33 | from keras_tuner import RandomSearch
34 | from tensorflow.keras.losses import BinaryCrossentropy
35 | from tensorflow.keras.layers import Dropout
36 | from tensorflow.keras.models import load_model
37 | import os
38 | from colorama import Fore, Style
39 | from sklearn.preprocessing import StandardScaler, RobustScaler
40 | import shap
41 |
42 | """
43 | TODO:
44 | -adjust noise for better learning
45 | -may remove opp_pts and pts to enhance other features
46 | -feature engineer with rolling std or mean
47 | """
48 | def create_sequential_model(hp, n_features, n_outputs):
49 | model = Sequential()
50 | #Add hidden layers
51 | for i in range(hp.Int('num_layers', 1, 10)):
52 | if i == 0:
53 | # First hidden layer needs input shape
54 | model.add(Dense(units=hp.Int(f'units_{i}', min_value=8, max_value=128, step=8),
55 | activation=hp.Choice(f'activation_{i}', values=['relu', 'leaky_relu', 'tanh', 'linear']),
56 | kernel_regularizer=l2(hp.Float(f'regularizer_strength_{i}', min_value=1e-1, max_value=1, sampling='log')),
57 | input_shape=(n_features,)))
58 | else:
59 | model.add(Dense(units=hp.Int(f'units_{i}', min_value=8, max_value=128, step=8),
60 | activation=hp.Choice(f'activation_{i}', values=['relu', 'leaky_relu', 'tanh', 'linear']),
61 | kernel_regularizer=l2(hp.Float(f'regularizer_strength_{i}', min_value=1e-1, max_value=1, sampling='log'))))
62 | model.add(BatchNormalization())
63 | model.add(Dropout(rate=hp.Float(f'dropout_rate_{i}', min_value=0.3, max_value=0.6, step=0.1)))
64 |
65 | # Output layer
66 | model.add(Dense(n_outputs, activation='sigmoid')) # Binary classification
67 |
68 | # Compile model
69 | optimizer_choice = hp.Choice('optimizer', values=['adam', 'rmsprop']) #, 'sgd'
70 | if optimizer_choice == 'adam':
71 | optimizer = Adam(learning_rate=hp.Float('adam_learning_rate', min_value=0.0001, max_value=0.01, sampling='log'))
72 | else:
73 | optimizer = RMSprop(learning_rate=hp.Float('rmsprop_learning_rate', min_value=0.0001, max_value=0.01, sampling='log'))
74 |
75 | model.compile(optimizer=optimizer,
76 | loss=BinaryCrossentropy(),
77 | metrics=['accuracy'])
78 |
79 | return model
80 |
81 | class cbbClass():
82 | def __init__(self,pre_process):
83 | print('instantiate class cbbClass')
84 | self.all_data = DataFrame()
85 | self.which_analysis = pre_process # 'pca' or 'corr'
86 |
87 | def get_teams(self):
88 | year_list_find = []
89 | year_list = [2024,2023]#,2022,2021,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010]
90 | if exists(join(getcwd(),'year_count.yaml')):
91 | with open(join(getcwd(),'year_count.yaml')) as file:
92 | year_counts = yaml.load(file, Loader=yaml.FullLoader)
93 | else:
94 | year_counts = {'year':year_list_find}
95 | #Remove any years that have already been collected
96 | if year_counts['year']:
97 | year_list_check = year_counts['year']
98 | year_list_find = year_counts['year']
99 | year_list = [i for i in year_list if i not in year_list_check]
100 | print(f'Need data for year: {year_list}')
101 | #Collect data per year
102 | if year_list:
103 | for year in tqdm(year_list):
104 | all_teams = cbb_web_scraper.get_teams_year(year_list[-1],2024)
105 | team_names = sorted(all_teams)
106 | final_list = []
107 | self.year_store = year
108 | for abv in tqdm(team_names):
109 | try:
110 | print() #tqdm things
111 | print(f'current team: {abv}, year: {year}')
112 | basic = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs.html'
113 | adv = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs-advanced.html'
114 | df_inst = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,abv,self.year_store)
115 | print(df_inst)
116 | df_inst['pts'].replace('', np.nan, inplace=True)
117 | df_inst.dropna(inplace=True)
118 | final_list.append(df_inst)
119 | except Exception as e:
120 | print(e)
121 | print(f'{abv} data are not available')
122 | sleep(4) #I can get banned for a short period of time if I do not do this
123 | final_data = concat(final_list)
124 | if exists(join(getcwd(),'all_data.csv')):
125 | self.all_data = read_csv(join(getcwd(),'all_data.csv'))
126 | self.all_data = concat([self.all_data, final_data.dropna()])
127 | else:
128 | self.all_data = final_data.dropna() #without this the first run wrote an empty csv
129 | self.all_data.to_csv(join(getcwd(),'all_data.csv'),index=False)
130 | year_list_find.append(year)
131 | print(f'year list after loop: {year_list_find}')
132 | with open(join(getcwd(),'year_count.yaml'), 'w') as write_file:
133 | yaml.dump(year_counts, write_file)
134 | print(f'writing {year} to yaml file')
135 | else:
136 | self.all_data = read_csv(join(getcwd(),'all_data.csv'))
137 | print('dataset size: ', np.shape(self.all_data))
138 | self.all_data = self.all_data.drop_duplicates(keep='last')
139 | print(f'dataset size after duplicates are dropped: {np.shape(self.all_data)}')
140 |
141 | def pca_analysis(self):
142 | #scale first before pca
143 | self.scaler = StandardScaler()
144 | x_scale = self.scaler.fit_transform(self.x)
145 | self.pca = PCA(n_components=0.95) #explain 95% of the variance
146 | self.x_no_corr = self.pca.fit_transform(x_scale)
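  | #a float n_components keeps the smallest number of components whose cumulative
  | #explained variance ratio reaches 0.95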
147 |
148 | #Visualize PCA components
149 | plt.figure(figsize=(8, 6))
151 | plt.bar(range(self.pca.n_components_), self.pca.explained_variance_ratio_)
152 | plt.xlabel('Principal Component')
153 | plt.ylabel('Explained Variance Ratio')
154 | plt.title('Explained Variance Ratio of Principal Components')
155 | plt.savefig('pca_components.png',dpi=400)
156 | plt.close()
157 |
158 | def convert_to_float(self):
159 | for col in self.all_data.columns:
160 | self.all_data[col].replace('', np.nan, inplace=True)
161 | self.all_data[col] = self.all_data[col].astype(float)
162 | self.all_data.dropna(inplace=True)
163 |
164 | def delete_opp(self):
165 | """
166 | Drop any opponent data, as it may not be helpful for prediction; opponent stats are hard to estimate with a running average
167 | """
168 | for col in self.all_data.columns:
169 | if 'opp' in col:
170 | self.all_data.drop(columns=col,inplace=True)
171 | def split(self):
172 | # self.delete_opp()
173 | for col in self.all_data.columns:
174 | if 'Unnamed' in col:
175 | self.all_data.drop(columns=col,inplace=True)
176 | self.convert_to_float()
177 | #self.y = np.delete(self.y, np.where(np.isnan(self.x_no_corr)), axis=0)
178 | #self.x_no_corr = self.x_no_corr.dropna()
179 | self.y = self.all_data['game_result'].astype(int)
180 | result_counts = self.all_data['game_result'].value_counts()
181 | #plot the counts
182 | plt.figure(figsize=(8, 6))
183 | result_counts.plot(kind='bar')
184 | plt.xlabel('Game Result')
185 | plt.ylabel('Count')
186 | plt.title('Count of Labels')
187 | plt.savefig('class_label_count.png',dpi=400)
188 | plt.close()
189 |
190 | #onehot encode
191 | self.y = to_categorical(self.y)
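  | #e.g. to_categorical([0, 1, 1]) -> [[1., 0.], [0., 1.], [0., 1.]]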
192 | self.x = self.all_data.drop(columns=['game_result'])
193 |
194 | # #Dropna and remove all data from subsequent y data
195 | # real_values = ~self.x_no_corr.isna().any(axis=1)
196 | # self.x_no_corr.dropna(inplace=True)
197 | # self.y = self.y.loc[real_values]
198 |
199 |
200 | #pca data or no correlated data
201 | if self.which_analysis == 'pca':
202 | #pca
203 | self.pca_analysis()
204 | else:
205 | #correlational analysis and outlier removal
206 | self.pre_process_corr_out_remove()
207 | #75/15/10 split
208 | #Split data into training and the rest (75% training, 25% temporary)
209 | self.x_train, x_temp, self.y_train, y_temp = train_test_split(self.x_no_corr, self.y, train_size=0.75, random_state=42)
210 | #Split the rest into validation and test data (60% validation, 40% test)
211 | validation_ratio = 0.15 / (1 - 0.75) # Adjust ratio for the remaining part
212 | self.x_validation, self.x_test, self.y_validation, self.y_test = train_test_split(x_temp, y_temp, train_size=validation_ratio, random_state=42)
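  | #0.15 / 0.25 = 0.6, so the 25% remainder becomes 15% validation and 10% test overall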
213 |
214 | def pre_process_corr_out_remove(self):
215 | # Remove features with a correlation coef greater than 0.90
216 | corr_val = 0.9
217 | corr_matrix = np.abs(self.x.astype(float).corr())
218 | upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
219 | to_drop = [column for column in upper.columns if any(upper[column] >= corr_val)]
220 | self.drop_cols = to_drop
221 | self.drop_cols = self.drop_cols + ['opp_pts', 'pts','game_loc','simple_rating_system'] #remove these extra features
222 | self.x_no_corr = self.x.drop(columns=self.drop_cols)
223 | cols = self.x_no_corr.columns
224 | print(f'Columns dropped >= {corr_val}: {self.drop_cols}')
225 | #Drop samples that are outliers
226 | print(f'old feature dataframe shape before outlier removal: {self.x_no_corr.shape}')
227 | for col_name in cols:
228 | Q1 = np.percentile(self.x_no_corr[col_name], 5)
229 | Q3 = np.percentile(self.x_no_corr[col_name], 95)
230 | IQR = Q3 - Q1
231 | upper = np.where(self.x_no_corr[col_name] >= (Q3+2.5*IQR)) #1.5 is the standard multiplier; 2.5 keeps more data to see if that improves model performance
232 | lower = np.where(self.x_no_corr[col_name] <= (Q1-2.5*IQR))
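  | #worked example: Q1 = 40 and Q3 = 80 give IQR = 40, so the fences are
  | #80 + 2.5 * 40 = 180 and 40 - 2.5 * 40 = -60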
233 | self.x_no_corr.drop(upper[0], inplace = True)
234 | self.x_no_corr.drop(lower[0], inplace = True)
235 | self.y = np.delete(self.y, upper[0], axis=0)
236 | self.y = np.delete(self.y, lower[0], axis=0)
237 | # self.y.drop(upper[0], inplace = True)
238 | # self.y.drop(lower[0], inplace = True)
239 | if 'level_0' in self.x_no_corr.columns:
240 | self.x_no_corr.drop(columns=['level_0'],inplace = True)
241 | self.x_no_corr.reset_index(inplace = True)
242 | # self.y.reset_index(inplace = True, drop=True)
243 | self.x_no_corr.drop(columns=['level_0','index'],inplace = True, errors='ignore') #'level_0' may already be gone; ignore missing labels
244 | print(f'new feature dataframe shape after outlier removal: {self.x_no_corr.shape}')
245 | top_corr_features = corr_matrix.index
246 | plt.figure(figsize=(25,25))
247 | sns.heatmap(corr_matrix[top_corr_features],annot=True,cmap="RdYlGn")
248 | plt.tight_layout()
249 | plt.savefig('correlations_class.png',dpi=300)
250 | plt.close()
251 |
252 | #Extra preprocessing steps
253 | #standardize
254 | self.cols_save = self.x_no_corr.columns
255 | self.scaler = StandardScaler()
256 | self.x_no_corr = self.scaler.fit_transform(self.x_no_corr)
257 | #robust scaling (median/IQR), despite the min_max_scaler name
258 | self.min_max_scaler = RobustScaler()
259 | self.x_no_corr = self.min_max_scaler.fit_transform(self.x_no_corr)
260 | self.x_no_corr = DataFrame(self.x_no_corr,columns=self.cols_save)
261 | #Generate random noise with the same shape as the DataFrame
262 | noise = np.random.normal(loc=0, scale=0.175, size=self.x_no_corr.shape) #the larger the scale, the stronger the Gaussian noise used for augmentation
263 | self.x_no_corr = self.x_no_corr + noise
264 |
265 | # def random_forest_analysis(self):
266 | # if argv[1] == 'tune':
267 | # #RANDOM FOREST REGRESSOR
268 | # RandForclass = RandomForestClassifier()
269 | # #Use the number of features as a stopping criterion for depth
270 | # rows, cols = self.x_train.shape
271 | # cols = int(cols / 2.5) #try to avoid overfitting on depth
272 | # #square root of the total number of features is a good limit
273 | # # cols = int(np.sqrt(cols))
274 | # #parameters to tune
275 | # #increasing min_samples_leaf, this will reduce overfitting
276 | # Rand_perm = {
277 | # 'criterion' : ["gini","entropy"], #absolute_error - takes forever to run
278 | # 'n_estimators': range(300,500,100),
279 | # # 'min_samples_split': np.arange(2, 5, 1, dtype=int),
280 | # 'max_features' : [1, 'sqrt', 'log2'],
281 | # 'max_depth': np.arange(2,cols,1),
282 | # 'min_samples_leaf': np.arange(2,4,1)
283 | # }
284 | # clf_rand = GridSearchCV(RandForclass, Rand_perm,
285 | # scoring=['accuracy','f1'],
286 | # cv=5,
287 | # refit='accuracy',
288 | # verbose=4,
289 | # n_jobs=-1)
290 | # search_rand = clf_rand.fit(self.x_train,self.y_train)
291 | # #Write fitted and tuned model to file
292 | # # with open('randomForestModelTuned.pkl','wb') as f:
293 | # # pickle.dump(search_rand,f)
294 | # joblib.dump(search_rand, "./classifierModelTuned.joblib", compress=9)
295 | # print('RandomForestClassifier - best params: ',search_rand.best_params_)
296 | # self.RandForclass = search_rand
297 | # prediction = self.RandForclass.predict(self.x_test)
298 | # print(confusion_matrix(self.y_test, prediction))# Display accuracy score
299 | # print(f'Model accuracy: {accuracy_score(self.y_test, prediction)}')# Display F1 score
300 | # # print(f1_score(self.y_test, prediction))
301 | # else:
302 | # print('Load tuned Random Forest Classifier')
303 | # # load RandomForestModel
304 | # self.RandForclass=joblib.load("./classifierModelTuned.joblib")
305 | # prediction = self.RandForclass.predict(self.x_test)
306 | # print(confusion_matrix(self.y_test, prediction))# Display accuracy score
307 | # print(f'Model accuracy: {accuracy_score(self.y_test, prediction)}')# Display F1 score
308 | # # print(f1_score(self.y_test, prediction))
309 | # y_proba = self.RandForclass.predict_proba(self.x_test)[:, 1]
310 | # fpr, tpr, thresholds = roc_curve(self.y_test, y_proba)
311 | # plt.plot(fpr, tpr)
312 | # plt.xlabel('False Positive Rate')
313 | # plt.ylabel('True Positive Rate')
314 | # plt.title('ROC Curve')
315 | # plt.savefig('ROC_curve_class.png',dpi=300)
316 |
317 | def xgboost_analysis(self):
318 | if not os.path.exists('classifierModelTuned_xgb.joblib'):
319 | if self.which_analysis == 'pca':
320 | y_train_combined = np.concatenate([self.y_train, self.y_validation], axis=0)
321 | x_train_combined = np.concatenate([self.x_train, self.x_validation], axis=0)
322 | else:
323 | y_train_combined = np.concatenate([self.y_train, self.y_validation], axis=0)
324 | x_train_combined = concat([self.x_train, self.x_validation], axis=0)
325 | if argv[1] == 'tune':
326 | # XGBoost Classifier
327 | xgb_class = xgb.XGBClassifier()
328 |
329 | # Parameters to tune
330 | params = {
331 | 'learning_rate': [0.01, 0.1],
332 | 'n_estimators': range(100, 300, 100),
333 | 'max_depth': range(2, 4, 2),
334 | 'min_child_weight': [1, 5],
335 | 'gamma': [0, 0.2],
336 | 'subsample': [0.6, 1.0],
337 | 'colsample_bytree': [0.6, 1.0],
338 | 'reg_alpha': [0, 0.01],
339 | 'reg_lambda': [0, 0.01],
340 | 'scale_pos_weight': [1, 3]
341 | }
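  | #9 of these parameters have 2 candidate values and max_depth has 1, so the grid
  | #is 2**9 = 512 combinations, i.e. 2,560 fits with cv=5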
342 |
343 | clf_xgb = GridSearchCV(xgb_class, params,
344 | scoring=['accuracy'],
345 | cv=5,
346 | refit='accuracy',
347 | verbose=4)
348 | search_xgb = clf_xgb.fit(x_train_combined, y_train_combined)
349 |
350 | # Write fitted and tuned model to file
351 | joblib.dump(search_xgb, "./classifierModelTuned_xgb.joblib", compress=9)
352 | print('XGBoost Classifier - best params: ', search_xgb.best_params_)
353 | self.xgb_class = search_xgb
354 | prediction = self.xgb_class.predict(self.x_test)
355 | print('Confusion Matrix: \n',confusion_matrix(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1)))
356 | print(f'Model accuracy on test data: {accuracy_score(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))}')
357 |
358 | else:
359 | print('Load tuned XGBoost Classifier')
360 | # load XGBoost Model
361 | self.xgb_class = joblib.load("./classifierModelTuned_xgb.joblib")
362 | prediction = self.xgb_class.predict(self.x_test)
363 | print('Confusion Matrix on test data: \n',confusion_matrix(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1)))
364 | print(f'Model accuracy on test data: {accuracy_score(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))}')
365 | with open("output_xgb.txt", "w") as file:
366 | file.write('Confusion Matrix on test data: \n')
367 | file.write(str(confusion_matrix(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))))
368 | file.write('\n')
369 | file.write(f'Model accuracy on test data: {accuracy_score(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))}')
370 | file.write('\n')
371 | y_proba = self.xgb_class.predict_proba(self.x_test)
372 | fpr, tpr, thresholds = roc_curve(np.argmax(self.y_test, axis=1), y_proba[:, 1]) #ROC needs the positive-class probability, not an argmax
373 | plt.figure()
374 | plt.plot(fpr, tpr)
375 | plt.xlabel('False Positive Rate')
376 | plt.ylabel('True Positive Rate')
377 | plt.title('ROC Curve')
378 | plt.savefig('ROC_curve_class.png', dpi=300)
379 | plt.close()
380 | else:
381 | self.xgb_class = joblib.load("./classifierModelTuned_xgb.joblib")
382 |
383 |
384 | def deep_learn_analysis(self):
385 | if not os.path.exists('binary_keras_deep.h5'):
386 | tuner = RandomSearch(
387 | lambda hp: create_sequential_model(hp, self.x_train.shape[1], 2),
388 | objective='val_loss', #val_loss
389 | max_trials=10,
390 | directory='cbb_sequential_hp',
391 | project_name='sequential_hyperparameter_tuning',
392 | )
393 |
394 | early_stopping = EarlyStopping(monitor='val_loss', patience=9, restore_best_weights=True)
395 | reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6)
396 | tuner.search(x=self.x_train, y=self.y_train,
397 | epochs=200,
398 | validation_data=(self.x_validation, self.y_validation),
399 | callbacks=[early_stopping, reduce_lr])
400 |
401 | # best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
402 | best_model = tuner.get_best_models(num_models=1)[0]
403 |
404 | # Fit tuned model
405 | loss_final = float('inf')
406 | for i in tqdm(range(15)):
407 | best_model.fit(self.x_train, self.y_train,
408 | epochs=200,
409 | validation_data=(self.x_validation, self.y_validation),
410 | callbacks=[early_stopping, reduce_lr])
411 | loss, acc = best_model.evaluate(self.x_test, self.y_test)
412 | if loss < loss_final:
413 | loss_final = loss; self.final_model_deep = best_model #track the best loss so later, worse refits do not overwrite the chosen model
414 | loss, acc = self.final_model_deep.evaluate(self.x_test, self.y_test)
415 | print(f'Final model test loss {loss} and accuracy {acc}')
416 | with open("output_deep_learn.txt", "w") as file:
417 | file.write(f'Final model test loss {loss} and accuracy {acc}')
418 | file.write('\n')
419 | self.final_model_deep.save('binary_keras_deep.h5')
420 | else:
421 | self.final_model_deep = load_model('binary_keras_deep.h5')
422 |
423 | def predict_two_teams(self):
424 | teams_sports_ref = read_csv('teams_sports_ref_format.csv')
425 | while True:
426 | # try:
427 | team_1 = input('team_1: ')
428 | if team_1 == 'exit':
429 | break
430 | team_2 = input('team_2: ')
431 | #Game location
432 | game_loc_team1 = int(input(f'{team_1} : home = 0, away = 1, neutral = 2: '))
433 | if game_loc_team1 == 0:
434 | game_loc_team2 = 1
435 | elif game_loc_team1 == 1:
436 | game_loc_team2 = 0
437 | elif game_loc_team1 == 2:
438 | game_loc_team2 = 2
439 | #Check to see if the team was spelled right
440 | team_1 = get_close_matches(team_1,teams_sports_ref['teams'].tolist(),n=1)[0]
441 | team_2 = get_close_matches(team_2,teams_sports_ref['teams'].tolist(),n=1)[0]
442 | #2024 data
443 | year = 2024
444 | # sleep(4)
445 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs.html'
446 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs-advanced.html'
447 | team_1_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_1.lower(),year)
448 | sleep(4) #I can get banned for a short period of time if I do not do this
449 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs.html'
450 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs-advanced.html'
451 | team_2_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_2.lower(),year)
452 | #Remove empty cells
453 | team_1_df2023['pts'].replace('', np.nan, inplace=True)
454 | team_1_df2023.replace('', np.nan, inplace=True)
455 | team_1_df2023.dropna(inplace=True)
456 | team_2_df2023['pts'].replace('', np.nan, inplace=True)
457 | team_2_df2023.replace('', np.nan, inplace=True)
458 | team_2_df2023.dropna(inplace=True)
459 | for col in team_1_df2023.columns:
460 | team_1_df2023[col] = team_1_df2023[col].astype(float)
461 | for col in team_2_df2023.columns:
462 | team_2_df2023[col] = team_2_df2023[col].astype(float)
463 |
464 | #Combine dfs
465 | if len(team_1_df2023) > len(team_2_df2023):
466 | team_1_df2023 = team_1_df2023.tail(len(team_2_df2023))
467 | elif len(team_2_df2023) > len(team_1_df2023):
468 | team_2_df2023 = team_2_df2023.tail(len(team_1_df2023))
469 |
470 | team_1_df2023 = team_1_df2023.reset_index(drop=True)
471 | team_2_df2023 = team_2_df2023.reset_index(drop=True)
472 | team_1_df_copy = team_1_df2023.copy()
473 | team_2_df_copy = team_2_df2023.copy()
474 | #replace team 1 opp data with team 2
475 | for index, row in team_1_df2023.iterrows():
476 | for col in team_1_df2023.columns:
477 | if "opp" in col:
478 | if col == 'opp_trb':
479 | team_1_df2023.at[index, 'opp_trb'] = team_2_df2023.at[index, 'total_board']
480 | else:
481 | new_col = col.replace("opp_", "")
482 | team_1_df2023.at[index, col] = team_2_df2023.at[index, new_col]
483 |
484 | #replace team 2 opp data with team 1
485 | for index, row in team_2_df_copy.iterrows():
486 | for col in team_2_df_copy.columns:
487 | if "opp" in col:
488 | if col == 'opp_trb':
489 | team_2_df_copy.at[index, 'opp_trb'] = team_1_df_copy.at[index, 'total_board']
490 | else:
491 | new_col = col.replace("opp_", "")
492 | team_2_df_copy.at[index, col] = team_1_df_copy.at[index, new_col]
493 |
494 |
495 | #Remove pts and game result
496 | # for col in team_1_df2023.columns:
497 | # if 'opp' in col:
498 | # team_1_df2023.drop(columns=col,inplace=True)
499 | # for col in team_2_df2023.columns:
500 | # if 'opp' in col:
501 | # team_2_df2023.drop(columns=col,inplace=True)
502 | if self.which_analysis == 'pca':
503 | team_1_df2023.drop(columns=['game_result'],inplace=True)
504 | team_2_df_copy.drop(columns=['game_result'],inplace=True)
505 | team_1_df2023 = self.scaler.transform(team_1_df2023)
506 | team_2_df_copy = self.scaler.transform(team_2_df_copy)
507 | team_1_df2023 = self.pca.transform(team_1_df2023)
508 | team_2_df_copy = self.pca.transform(team_2_df_copy)
509 |
510 | #make df for other analysis
511 | team_1_df_separate = DataFrame(team_1_df2023).abs()
512 | team_2_df_separate = DataFrame(team_2_df_copy).abs()
513 | prop_1 = team_1_df_separate.std() / team_1_df_separate.mean()
514 | prop_2 = team_2_df_separate.std() / team_2_df_separate.mean()
515 | else:
516 | team_1_df2023.drop(columns=['game_result'],inplace=True)
517 | team_2_df_copy.drop(columns=['game_result'],inplace=True)
518 | #Drop the correlated features
519 | team_1_df2023.drop(columns=self.drop_cols, inplace=True)
520 | team_2_df_copy.drop(columns=self.drop_cols, inplace=True)
521 |
522 | team_1_df2023 = self.scaler.transform(team_1_df2023)
523 | team_2_df_copy = self.scaler.transform(team_2_df_copy)
524 |
525 | team_1_df2023 = self.min_max_scaler.transform(team_1_df2023)
526 | team_2_df_copy = self.min_max_scaler.transform(team_2_df_copy)
527 |
528 | team_1_df2023 = DataFrame(team_1_df2023,columns=self.cols_save)
529 | team_2_df_copy = DataFrame(team_2_df_copy,columns=self.cols_save)
  | #transform team_2_df_copy (the opp-swapped frame the code below samples from),
  | #not the raw team_2_df2023, and compute the variability ratios here too so the
  | #'corr' path does not raise NameError on prop_1/prop_2 further down
  | team_1_df_separate = team_1_df2023.abs()
  | team_2_df_separate = team_2_df_copy.abs()
  | prop_1 = team_1_df_separate.std() / team_1_df_separate.mean()
  | prop_2 = team_2_df_separate.std() / team_2_df_separate.mean()
530 |
531 | ma_range = np.arange(2,10,1) #2 was the most correct value for mean and 8 was the best for the median; chose 9 for tiebreaking
532 | # team_1_count = 0
533 | # team_2_count = 0
534 | # team_1_count_mean = 0
535 | # team_2_count_mean = 0
536 | team_1_ma_win = []
537 | team_2_ma_win = []
538 | random_pred_1, random_pred_2 = [], []
539 | random_pred_1_monte, random_pred_2_monte = [], []
540 | qt_best_team_1, qt_best_team_2 = [], []
541 | qt_worst_team_1, qt_worst_team_2 = [], []
542 | #get latest SRS value
543 | team_1_srs = cbb_web_scraper.get_latest_srs(team_1)
544 | team_2_srs = cbb_web_scraper.get_latest_srs(team_2)
545 |
546 | # #Monte carlo simulation
547 | num_simulations = 1000
548 | mean_1 = np.mean(team_1_df2023, axis=0)
549 | std_1 = np.std(team_1_df2023, axis=0)
550 | mean_2 = np.mean(team_2_df_copy, axis=0)
551 | std_2 = np.std(team_2_df_copy, axis=0)
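  | #each simulated game draws every stat independently from a normal distribution
  | #fit to that team's season, ignoring correlations between stats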
552 | for _ in tqdm(range(num_simulations)):
553 | random_stats_team_1 = np.random.normal(mean_1, std_1, size=(1,team_1_df_separate.shape[1]))
554 | random_stats_team_2 = np.random.normal(mean_2, std_2, size=(1,team_2_df_separate.shape[1]))
555 | random_stats_team_1 = random_stats_team_1[0]
556 | random_stats_team_2 = random_stats_team_2[0]
557 | outcome_team_1 = self.xgb_class.predict_proba([random_stats_team_1])
558 | outcome_deep_1 = self.final_model_deep.predict([np.expand_dims(random_stats_team_1, axis=0)])
559 | outcome_team_2 = self.xgb_class.predict_proba([random_stats_team_2])
560 | outcome_deep_2 = self.final_model_deep.predict([np.expand_dims(random_stats_team_2, axis=0)])
561 | random_pred_1_monte.append(outcome_team_1[0][1])
562 | random_pred_1_monte.append(outcome_deep_1[0][1])
563 | random_pred_2_monte.append(outcome_team_1[0][0])
564 | random_pred_2_monte.append(outcome_deep_1[0][0]) #was outcome_team_1 twice; use the deep model's lose probability
565 | random_pred_2_monte.append(outcome_team_2[0][1])
566 | random_pred_2_monte.append(outcome_deep_2[0][1])
567 | random_pred_1_monte.append(outcome_team_2[0][0])
568 | random_pred_1_monte.append(outcome_deep_2[0][0])
569 |
570 | #randomly sample one game from each team and pit them against each other (len * 30 draws)
571 | for _ in tqdm(range(len(team_1_df2023) * 30)):
572 | if self.which_analysis == 'pca':
573 | random_row_df1 = team_1_df2023[np.random.choice(len(team_1_df2023), size=1),:]
574 | random_row_df2 = team_2_df_copy[np.random.choice(len(team_2_df_copy), size=1),:]
575 | else:
576 | random_row_df1 = team_1_df2023.sample(n=1)
577 | random_row_df2 = team_2_df_copy.sample(n=1)
578 | # random_row_df2 = team_2_df2023.sample(n=1)
579 |
580 | # for col in random_row_df1.columns:
581 | # if "opp" in col:
582 | # if col == 'opp_trb':
583 | # random_row_df1.at[random_row_df1.index[0], 'opp_trb'] = random_row_df2.at[random_row_df2.index[0], 'total_board']
584 | # else:
585 | # new_col = col.replace("opp_", "")
586 | # random_row_df1.at[random_row_df1.index[0], col] = random_row_df2.at[random_row_df2.index[0], new_col]
587 | outcome_team_1 = self.xgb_class.predict_proba(random_row_df1)
588 | outcome_team_2 = self.xgb_class.predict_proba(random_row_df2)
589 | outcome_deep_1 = self.final_model_deep.predict(random_row_df1)
590 | outcome_deep_2 = self.final_model_deep.predict(random_row_df2)
591 |
592 | #team 1 win percentage [lose win]
593 | random_pred_1.append(outcome_team_1[0][1])
594 | random_pred_1.append(outcome_deep_1[0][1])
595 | random_pred_2.append(outcome_team_1[0][0])
596 | random_pred_2.append(outcome_deep_1[0][0]) #was outcome_team_1 twice; use the deep model's lose probability
597 | #team 2 win percentage [lose win]
598 | random_pred_2.append(outcome_team_2[0][1])
599 | random_pred_2.append(outcome_deep_2[0][1])
600 | random_pred_1.append(outcome_team_2[0][0])
601 | random_pred_1.append(outcome_deep_2[0][0])
602 |
603 | #rolling average predictions
604 | team_1_df2023 = DataFrame(team_1_df2023)
605 | team_2_df_copy = DataFrame(team_2_df_copy)
606 | for ma in tqdm(ma_range):
607 | # TEAM 1
608 | data1_mean = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean()
609 | data2_mean = team_2_df_copy.ewm(span=ma,min_periods=ma-1).mean()
610 |
611 | outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:].values)
612 | outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:].values)
613 | outcome2 = self.xgb_class.predict_proba(data2_mean.iloc[-1:].values)
614 | outcome_deep2 = self.final_model_deep.predict(data2_mean.iloc[-1:].values)
615 |
616 | team_1_ma_win.append(outcome[0][1])
617 | team_1_ma_win.append(outcome_deep[0][1])
618 | team_2_ma_win.append(outcome[0][0])
619 | team_2_ma_win.append(outcome_deep[0][0])
620 |
621 | team_1_ma_win.append(outcome2[0][0])
622 | team_1_ma_win.append(outcome_deep2[0][0])
623 | team_2_ma_win.append(outcome2[0][1])
624 | team_2_ma_win.append(outcome_deep2[0][1])
625 |
626 | #quantile predictions - both play at their bests
627 | for ma in tqdm(ma_range):
628 | # TEAM 1
629 | data1_mean = team_1_df2023.rolling(window=ma).quantile(0.75)
630 | # data1_mean['game_loc'] = game_loc_team1
631 | data2_mean = team_2_df_copy.rolling(window=ma).quantile(0.75)
632 | # data2_mean['game_loc'] = game_loc_team2
633 | #get latest SRS value
634 | # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs
635 | # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs
636 | outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:].values)
637 | outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:].values)
638 | outcome2 = self.xgb_class.predict_proba(data2_mean.iloc[-1:].values)
639 | outcome_deep2 = self.final_model_deep.predict(data2_mean.iloc[-1:].values)
640 |
641 | qt_best_team_1.append(outcome[0][1])
642 | qt_best_team_1.append(outcome_deep[0][1])
643 | qt_best_team_2.append(outcome[0][0])
644 | qt_best_team_2.append(outcome_deep[0][0])
645 |
646 | qt_best_team_1.append(outcome2[0][0])
647 | qt_best_team_1.append(outcome_deep2[0][0])
648 | qt_best_team_2.append(outcome2[0][1])
649 | qt_best_team_2.append(outcome_deep2[0][1])
650 |
651 | #quantile predictions - both play at their worsts
652 | for ma in tqdm(ma_range):
653 | # TEAM 1
654 | data1_mean = team_1_df2023.rolling(window=ma).quantile(0.25)
655 | # data1_mean['game_loc'] = game_loc_team1
656 | data2_mean = team_2_df_copy.rolling(window=ma).quantile(0.25)
657 | # data2_mean['game_loc'] = game_loc_team2
658 | #get latest SRS value
659 | # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs
660 | # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs
661 | # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1)
662 | outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:].values)
663 | outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:].values)
664 | outcome2 = self.xgb_class.predict_proba(data2_mean.iloc[-1:].values)
665 | outcome_deep2 = self.final_model_deep.predict(data2_mean.iloc[-1:].values)
666 |
667 | qt_worst_team_1.append(outcome[0][1])
668 | qt_worst_team_1.append(outcome_deep[0][1])
669 | qt_worst_team_2.append(outcome[0][0])
670 | qt_worst_team_2.append(outcome_deep[0][0])
671 |
672 | qt_worst_team_1.append(outcome2[0][0])
673 | qt_worst_team_1.append(outcome_deep2[0][0])
674 | qt_worst_team_2.append(outcome2[0][1])
675 | qt_worst_team_2.append(outcome_deep2[0][1])
676 |
677 | ###########TEAM 2 VS TEAM 1###################
678 | # temp = team_1_df2023
679 | # team_1_df2023 = team_2_df2023
680 | # team_2_df2023 = temp
681 |
682 | # if game_loc_team1 == 1:
683 | # game_loc_team1 = 0
684 | # elif game_loc_team1 == 0:
685 | # game_loc_team1 = 1
686 | # if game_loc_team2 == 0:
687 | # game_loc_team2 = 1
688 | # elif game_loc_team2 == 1:
689 | # game_loc_team2 = 0
690 |
691 | # #get latest SRS value - flip them
692 | # team_1_srs = cbb_web_scraper.get_latest_srs(team_2)
693 | # team_2_srs = cbb_web_scraper.get_latest_srs(team_1)
694 | # #every game of one team vs every game for other team
695 | # for _ in range(len(team_1_df2023) * 2):
696 | # random_row_df1 = team_1_df2023.sample(n=1)
697 | # random_row_df2 = team_2_df2023.sample(n=1)
698 |
699 | # for col in random_row_df1.columns:
700 | # if "opp" in col:
701 | # if col == 'opp_trb':
702 | # random_row_df1.at[random_row_df1.index[0], 'opp_trb'] = random_row_df2.at[random_row_df2.index[0], 'total_board']
703 | # else:
704 | # new_col = col.replace("opp_", "")
705 | # random_row_df1.at[random_row_df1.index[0], col] = random_row_df2.at[random_row_df2.index[0], new_col]
706 |
707 | # outcome = self.xgb_class.predict_proba(random_row_df1)
708 | # outcome_deep = self.final_model_deep.predict(random_row_df1)
709 |
710 | # random_pred_1.append(outcome[0][1])
711 | # random_pred_1.append(outcome_deep[0][1])
712 | # random_pred_2.append(outcome[0][0])
713 | # random_pred_2.append(outcome_deep[0][0])
714 |
715 | # #rolling average predictions
716 | # for ma in tqdm(ma_range):
717 | # # TEAM 1
718 | # data1_mean = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean()
719 | # # data1_mean['game_loc'] = game_loc_team1
720 | # data2_mean = team_2_df2023.ewm(span=ma,min_periods=ma-1).mean()
721 | # # data2_mean['game_loc'] = game_loc_team2
722 | # #Here replace opponent metrics with the features of the second team
723 | # for col in data1_mean.columns:
724 | # if "opp" in col:
725 | # if col == 'opp_trb':
726 | # # new_col = col.replace("opp_", "")
727 | # data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board']
728 | # else:
729 | # new_col = col.replace("opp_", "")
730 | # data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col]
731 | # #get latest SRS value
732 | # # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs
733 | # # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs
734 | # # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1)
735 | # outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:])
736 | # outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:])
737 |
738 | # team_1_ma_win.append(outcome[0][1])
739 | # team_1_ma_win.append(outcome_deep[0][1])
740 | # team_2_ma_win.append(outcome[0][0])
741 | # team_2_ma_win.append(outcome_deep[0][0])
742 | # #quantile predictions - both play at their bests
743 | # for ma in tqdm(ma_range):
744 | # # TEAM 1
745 | # data1_mean = team_1_df2023.rolling(window=ma).quantile(0.75).iloc[-1:]
746 | # # data1_mean['game_loc'] = game_loc_team1
747 | # data2_mean = team_2_df2023.rolling(window=ma).quantile(0.75).iloc[-1:]
748 | # # data2_mean['game_loc'] = game_loc_team2
749 | # #Here replace opponent metrics with the features of the second team
750 | # for col in data1_mean.columns:
751 | # if "opp" in col:
752 | # if col == 'opp_trb':
753 | # # new_col = col.replace("opp_", "")
754 | # data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board']
755 | # else:
756 | # new_col = col.replace("opp_", "")
757 | # data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col]
758 | # #get latest SRS value
759 | # # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs
760 | # # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs
761 | # # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1)
762 | # outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:])
763 | # outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:])
764 |
765 | # qt_best_team_1.append(outcome[0][1])
766 | # qt_best_team_1.append(outcome_deep[0][1])
767 | # qt_best_team_2.append(outcome[0][0])
768 | # qt_best_team_2.append(outcome_deep[0][0])
769 |
770 | # #quantile predictions - both play at their worsts
771 | # for ma in tqdm(ma_range):
772 | # # TEAM 1
773 | # data1_mean = team_1_df2023.rolling(window=ma).quantile(0.25).iloc[-1:]
774 | # # data1_mean['game_loc'] = game_loc_team1
775 | # data2_mean = team_2_df2023.rolling(window=ma).quantile(0.25).iloc[-1:]
776 | # # data2_mean['game_loc'] = game_loc_team2
777 | # #Here replace opponent metrics with the features of the second team
778 | # for col in data1_mean.columns:
779 | # if "opp" in col:
780 | # if col == 'opp_trb':
781 | # # new_col = col.replace("opp_", "")
782 | # data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board']
783 | # else:
784 | # new_col = col.replace("opp_", "")
785 | # data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col]
786 | # #get latest SRS value
787 | # # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs
788 | # # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs
789 | # # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1)
790 | # outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:])
791 | # outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:])
792 |
793 | # qt_worst_team_1.append(outcome[0][1])
794 | # qt_worst_team_1.append(outcome_deep[0][1])
795 | # qt_worst_team_2.append(outcome[0][0])
796 | # qt_worst_team_2.append(outcome_deep[0][0])
797 |
798 | # #reflip for printing
799 | # team_1_srs = cbb_web_scraper.get_latest_srs(team_1)
800 | # team_2_srs = cbb_web_scraper.get_latest_srs(team_2)
801 | print('===============================================================')
802 | if team_1_srs > team_2_srs:
803 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} SRS data: {team_1_srs}'+ Style.RESET_ALL)
804 | print(Fore.RED + Style.BRIGHT + f'{team_2} SRS data: {team_2_srs}'+ Style.RESET_ALL)
805 | else:
806 | print(Fore.RED + Style.BRIGHT + f'{team_1} SRS data: {team_1_srs}'+ Style.RESET_ALL)
807 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} SRS data: {team_2_srs}'+ Style.RESET_ALL)
808 | print('===============================================================')
809 | if prop_1.sum() < prop_2.sum():
810 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} summed variability: {prop_1.sum()}'+ Style.RESET_ALL)
811 | print(Fore.RED + Style.BRIGHT + f'{team_2} summed variability: {prop_2.sum()}'+ Style.RESET_ALL)
812 | else:
813 | print(Fore.RED + Style.BRIGHT + f'{team_1} summed variability: {prop_1.sum()}'+ Style.RESET_ALL)
814 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} summed variability: {prop_2.sum()}'+ Style.RESET_ALL)
815 | print('===============================================================')
816 | if np.mean(team_1_ma_win) > np.mean(team_2_ma_win):
817 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities: {np.mean(team_1_ma_win)}'+ Style.RESET_ALL)
818 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win probabilities: {np.mean(team_2_ma_win)}'+ Style.RESET_ALL)
819 | else:
820 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities: {np.mean(team_1_ma_win)}'+ Style.RESET_ALL)
821 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities: {np.mean(team_2_ma_win)}'+ Style.RESET_ALL)
822 | print('===============================================================')
823 | if np.mean(qt_best_team_1) > np.mean(qt_best_team_2):
824 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities if they play at their best: {np.mean(qt_best_team_1)}'+ Style.RESET_ALL)
825 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win probabilities if they play at their best: {np.mean(qt_best_team_2)}'+ Style.RESET_ALL)
826 | else:
827 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities if they play at their best: {np.mean(qt_best_team_1)}'+ Style.RESET_ALL)
828 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities if they play at their best: {np.mean(qt_best_team_2)}'+ Style.RESET_ALL)
829 | print('===============================================================')
830 | if np.mean(qt_worst_team_1) > np.mean(qt_worst_team_2):
831 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities if they play at their worst: {np.mean(qt_worst_team_1)}'+ Style.RESET_ALL)
832 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win probabilities if they play at their worst: {np.mean(qt_worst_team_2)}'+ Style.RESET_ALL)
833 | else:
834 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities if they play at their worst: {np.mean(qt_worst_team_1)}'+ Style.RESET_ALL)
835 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities if they play at their worst: {np.mean(qt_worst_team_2)}'+ Style.RESET_ALL)
836 | print('===============================================================')
837 | if np.mean(random_pred_1) > np.mean(random_pred_2):
838 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities randomly selecting games: {np.mean(random_pred_1)}'+ Style.RESET_ALL)
839 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win probabilities randomly selecting games: {np.mean(random_pred_2)}'+ Style.RESET_ALL)
840 | else:
841 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities randomly selecting games: {np.mean(random_pred_1)}'+ Style.RESET_ALL)
842 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities randomly selecting games: {np.mean(random_pred_2)}'+ Style.RESET_ALL)
843 | print('===============================================================')
844 | if np.mean(random_pred_1_monte) > np.mean(random_pred_2_monte):
845 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities Monte Carlo Simulation: {np.mean(random_pred_1_monte)}'+ Style.RESET_ALL)
846 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win probabilities Monte Carlo Simulation: {np.mean(random_pred_2_monte)}'+ Style.RESET_ALL)
847 | else:
848 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities Monte Carlo Simulation: {np.mean(random_pred_1_monte)}'+ Style.RESET_ALL)
849 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities Monte Carlo Simulation: {np.mean(random_pred_2_monte)}'+ Style.RESET_ALL)
850 |
851 | # if "tod" in sys.argv[2]:
852 | # date_today = str(datetime.now().date()).replace("-", "")
853 | # elif "tom" in sys.argv[2]:
854 | # date_today = str(datetime.now().date() + timedelta(days=1)).replace("-", "")
855 | # URL = "https://www.espn.com/mens-college-basketball/schedule/_/date/" + date_today #sys argv????
856 | # print(f'ESPN prediction: {cbb_web_scraper.get_espn(URL,team_1,team_2)}')
857 | print('===============================================================')
858 | # except Exception as e:
859 | # print(f'The error: {e}')
860 | def feature_importances_random_forest(self):
861 | importances = self.RandForclass.best_estimator_.feature_importances_
862 | indices = np.argsort(importances)
863 | plt.figure()
864 | plt.title('Feature Importances Random Forest - Classifier')
865 | plt.barh(range(len(indices)), importances[indices], color='k', align='center')
866 | plt.yticks(range(len(indices)), [self.x_test.columns[i] for i in indices])
867 | plt.xlabel('Relative Importance - explained variance')
868 | plt.tight_layout()
869 | plt.savefig('feature_importance_random_forest_classifier.png',dpi=300)
870 |
871 | def feature_importances_xgb(self):
872 | importances = self.xgb_class.best_estimator_.feature_importances_
873 | indices = np.argsort(importances)
874 | plt.figure(figsize=(10,8))
875 | plt.title('Feature Importances XGBoost - Classifier')
876 | plt.barh(range(len(indices)), importances[indices], color='k', align='center')
877 | plt.yticks(range(len(indices)), [self.x_test.columns[i] for i in indices])
878 | plt.xlabel('Relative Importance - explained variance')
879 | plt.tight_layout()
880 | plt.savefig('feature_importance_xgb_classifier.png',dpi=300)
881 | plt.close()
882 |
883 | def deep_learning_feature_importances(self):
884 | model = self.final_model_deep
885 | x_train_array = np.array(self.x_test)
886 | masker = shap.maskers.Independent(data=x_train_array)
887 | explainer = shap.Explainer(model, masker)
888 | shap_values = explainer.shap_values(x_train_array)
889 | feature_importances = np.mean(np.abs(shap_values),axis=0)
890 | shap.summary_plot(feature_importances.T,
891 | feature_names=self.cols_save,
892 | plot_type="bar",
893 | max_display=feature_importances.shape[0],
894 | show=False)
895 | plt.savefig('SHAP_feature_importances.png',dpi=400)
896 | plt.close()
897 |
898 | def run_analysis(self):
899 | self.get_teams()
900 | self.split()
901 | self.deep_learn_analysis()
902 | self.xgboost_analysis()
903 | self.predict_two_teams()
904 | if self.which_analysis != 'pca':
905 | self.feature_importances_xgb()
906 | self.deep_learning_feature_importances()
907 |
908 | def main():
909 | cbbClass('pca').run_analysis() # 'pca' or 'corr'
910 | if __name__ == '__main__':
911 | main()
--------------------------------------------------------------------------------