├── .gitignore ├── LICENSE ├── README.md ├── ROC_curve_class.png ├── SHAP_feature_importances.png ├── all_data.csv ├── all_teams_cbb.csv ├── binary_keras_deep.h5 ├── cbb.yaml ├── cbb_classification.py ├── cbb_regression.py ├── cbb_web_scraper.py ├── class_label_count.png ├── correlations_class.png ├── deep_learn.py ├── deep_learn_MA.py ├── deep_learn_regressor.py ├── extra ├── analyze_output.py └── exact_match.py ├── feature_importance_xgb_classifier.png ├── pca_components.png ├── teams_sports_ref_format.csv └── year_count.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | # key file 6 | key.txt 7 | errors.log 8 | #Compression 9 | randomForestModelTuned.pkl 10 | randomForestModelTuned.joblib 11 | classifierModelTuned.joblib 12 | classifierModelTuned_xgb.joblib 13 | #training directory 14 | cbb_sequential_hp/ 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | #Python Files 46 | github.py 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .nox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | *.py,cover 63 | .hypothesis/ 64 | .pytest_cache/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 107 | __pypackages__/ 108 | 109 | # Celery stuff 110 | celerybeat-schedule 111 | celerybeat.pid 112 | 113 | # SageMath parsed files 114 | *.sage.py 115 | 116 | # Environments 117 | .env 118 | .venv 119 | env/ 120 | venv/ 121 | ENV/ 122 | env.bak/ 123 | venv.bak/ 124 | 125 | # Spyder project settings 126 | .spyderproject 127 | .spyproject 128 | 129 | # Rope project settings 130 | .ropeproject 131 | 132 | # mkdocs documentation 133 | /site 134 | 135 | # mypy 136 | .mypy_cache/ 137 | .dmypy.json 138 | dmypy.json 139 | 140 | # Pyre type checker 141 | .pyre/ 142 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Brian Szekely 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# College Basketball Game Predictions

Machine learning that predicts the outcome of any Division I college basketball game. Training data cover the 2010-2024 seasons and are scraped from Sports-Reference.com.

## Usage

```bash
python cbb_classification.py tune    # re-run hyperparameter tuning
python cbb_classification.py notune  # load the previously tuned, saved models
```

```bash
Removed features (>=0.9 correlation): ['fta', 'fta_per_fga_pct', 'fg3a_per_fga_pct', 'ts_pct', 'stl_pct', 'blk_pct', 'efg_pct', 'tov_pct', 'orb_pct', 'ft_rate']
dataset shape: (27973 samples, 55 features)

### Current prediction accuracies - XGBoost
# After 5 fold cross validation and pre-processing
Current XGBoost Classifier - best params: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 1, 'n_estimators': 200, 'reg_alpha': 0.01, 'reg_lambda': 0.01, 'scale_pos_weight': 1, 'subsample': 1.0}

# Classification - XGBoost
Confusion Matrix:[[1316   46]
 [  31 1404]]
Model accuracy on test data: 0.9688952449052556

# Classification - DNN Keras
Final model test loss 0.07359004765748978 and accuracy 0.9760457873344421
```
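The `Removed features (>=0.9 correlation)` line above is produced by a pairwise-correlation filter (`pre_process_corr_out_remove` in `cbb_classification.py`): for every pair of features whose absolute Pearson correlation is 0.9 or higher, one member of the pair is dropped before training. A minimal standalone sketch of that step (assuming `x` is the numeric feature `DataFrame`):

```python
import numpy as np
import pandas as pd

def drop_correlated_features(x: pd.DataFrame, corr_val: float = 0.9) -> pd.DataFrame:
    """Drop one feature from every pair with |Pearson r| >= corr_val."""
    corr_matrix = x.corr().abs()
    # Keep only the upper triangle (k=1 excludes the diagonal) so each pair is checked once
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if (upper[col] >= corr_val).any()]
    return x.drop(columns=to_drop)
```

Scanning only the upper triangle of the correlation matrix guarantees each pair is evaluated exactly once, so one feature from every highly correlated pair survives.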
### Correlation Matrix
![Correlation matrix of the retained features](correlations_class.png)

### Feature Importances Classification
XGBoost
![XGBoost feature importances](feature_importance_xgb_classifier.png)
Deep Neural Network
![SHAP feature importances for the deep neural network](SHAP_feature_importances.png)

## Contributing
Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
-------------------------------------------------------------------------------- /ROC_curve_class.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bszek213/cbb_machine_learning/aab183383e3237c3c1007e476abcd66028c7604c/ROC_curve_class.png -------------------------------------------------------------------------------- /SHAP_feature_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bszek213/cbb_machine_learning/aab183383e3237c3c1007e476abcd66028c7604c/SHAP_feature_importances.png -------------------------------------------------------------------------------- /all_teams_cbb.csv: -------------------------------------------------------------------------------- 1 | School,From,To 2 | Abilene Christian,1971,2024 3 | Air Force,1958,2024 4 | Akron,1902,2024 5 | Alabama,1913,2024 6 | Alabama A&M,2000,2024 7 | Alabama State,1983,2024 8 | Albany (NY),2000,2024 9 | Alcorn State,1978,2024 10 | Allegheny Gators,1896,1916 11 | American,1967,2024 12 | Amherst Lord Jeffs,1901,1902 13 | Appalachian State,1974,2024 14 | Arizona,1905,2024 15 | Arizona State,1912,2024 16 | Arkansas,1924,2024 17 | Arkansas State,1971,2024 18 | Arkansas-Pine Bluff,1999,2024 19 | Armstrong Pirates,1987,1987 20 | Army,1903,2024 21 | Auburn,1906,2024 22 | Augusta Jaguars,1985,1991 23 | Augustana (IL) Vikings,1902,1917 24 | Austin Peay,1964,2024 25 | Baker University Wildcats,1903,1908 26 | Baldwin-Wallace Yellow Jackets,1948,1953 27 | Ball State,1972,2024 28 | Baltimore Super Bees,1979,1983 29 | Baylor,1907,2024 30 | Bellarmine,2021,2024 31 | Belmont,2000,2024 32 | Beloit Buccaneers,1911,1924 33 | Bethune-Cookman,1981,2024 34 | Binghamton,2002,2024 35 | Birmingham-Southern Panthers,1920,2006 36 | Bloomsburg Huskies,1896,1911 37 | Boise State,1972,2024 38 | Boston College,1946,2024 39 | Boston University,1916,2024 40 | Bowling Green State,1916,2024 41 | Bradley,1903,2024 42 | Brigham Young,1903,2024 43 | Brigham Young College,1908,1908 44 | Brooklyn Bulldogs,1934,1992 45 | Brown,1901,2024 46 | Bryant,2011,2024 47 | Bucknell,1896,2024 48 | Buffalo,1907,2024 49 | Butler,1897,2024 50 | Cal Poly,1995,2024 51 | Cal State Bakersfield,2011,2024 52 | Cal State Fullerton,1975,2024 53 | Cal State Los Angeles Golden Eagles,1971,1975 54 | Cal State Northridge,1991,2024 55 | California,1908,2024 56 | California Baptist,2019,2024 57 | Campbell,1978,2024 58 | Canisius,1904,2024 59 | Canterbury College,1931,1931 60 | Carleton College Knights,1910,1934 61 | Carnegie Mellon Tartans,1933,1939 62 | Case Western Reserve Spartans,1898,1955 63 | Catholic Cardinals,1913,1981 64 | Centenary (LA) Gents,1960,2011 65 | Central Arkansas,2011,2024 66 | Central Connecticut State,1987,2024 67 | Central Michigan,1974,2024 68 | Central Missouri Mules,1913,1937 69 | Central Pennsylvania College Knights,1896,1900 70 | Centre (KY) Colonels,1910,1919 71 | Charleston Southern,1975,2024 72 | Charlotte,1973,2024 73 | Chattanooga,1978,2024 74 | Cheyenne Business College,1903,1903 75 | Chicago Maroons,1896,1946 76 | Chicago State,1985,2024 77 | Cincinnati,1902,2024 78 | City College of New York Beavers,1906,1953 79 | Clemson,1912,2024 80 | Cleveland State,1973,2024 81 | Coastal Carolina,1987,2024 82 | Colgate,1901,2024 83 | College of Charleston,1992,2024 84 | College of New Jersey Lions,1900,1900 85 | Colorado,1902,2024 86 |
Colorado College Tigers,1915,1937 87 | Colorado School of Mines Orediggers,1908,1937 88 | Colorado State,1902,2024 89 | Columbia,1901,2024 90 | Concordia Seminary Preachers,1907,1923 91 | Connecticut,1901,2024 92 | Coppin State,1986,2024 93 | Cornell,1899,2024 94 | Cotner College,1910,1911 95 | Creighton,1912,2024 96 | Cumberland,1904,1904 97 | Dakota Wesleyan Tigers,1932,1932 98 | Dartmouth,1900,2024 99 | Davidson,1909,2024 100 | Dayton,1904,2024 101 | Delaware,1906,2024 102 | Delaware State,1974,2024 103 | Denison Big Red,1905,1944 104 | Denver,1904,2024 105 | DePaul,1924,2024 106 | DePauw Tigers,1916,1932 107 | Detroit Mercy,1910,2024 108 | Dickinson College Red Devils,1926,1947 109 | Drake,1907,2024 110 | Drexel,1895,2024 111 | Duke,1906,2024 112 | Duquesne,1914,2024 113 | East Carolina,1967,2024 114 | East Central Tigers,1929,1931 115 | East Tennessee State,1959,2024 116 | Eastern Illinois,1982,2024 117 | Eastern Kentucky,1948,2024 118 | Eastern Michigan,1933,2024 119 | Eastern Washington,1984,2024 120 | Elon,2000,2024 121 | Emporia State Hornets,1934,1934 122 | Ensign College,1903,1903 123 | Evansville,1925,2024 124 | Fairfield,1965,2024 125 | FDU,1968,2024 126 | Florida,1921,2024 127 | Florida A&M,1980,2024 128 | Florida Atlantic,1994,2024 129 | Florida Gulf Coast,2011,2024 130 | Florida International,1988,2024 131 | Florida State,1957,2024 132 | Fordham,1903,2024 133 | Franklin Grizzlies,1907,1925 134 | Fresno State,1956,2024 135 | Furman,1920,2024 136 | Gardner-Webb,2003,2024 137 | Geneva Golden Tornadoes,1893,1943 138 | George Mason,1979,2024 139 | George Washington,1913,2024 140 | Georgetown,1907,2024 141 | Georgia,1906,2024 142 | Georgia Southern,1972,2024 143 | Georgia State,1974,2024 144 | Georgia Tech,1920,2024 145 | Gettysburg Bullets,1901,1973 146 | Gonzaga,1944,2024 147 | Grambling,1978,2024 148 | Grand Canyon,2014,2024 149 | Green Bay,1982,2024 150 | Grinnell Pioneers,1901,1939 151 | Grove City Wolverines,1899,1925 152 | Hamline Pipers,1945,1948 153 | Hampton,1996,2024 154 | Hardin-Simmons Cowboys,1923,1990 155 | Hartford Hawks,1985,2023 156 | Harvard,1901,2024 157 | Haskell (KS) Fighting Indians,1903,1908 158 | Hawaii,1971,2024 159 | High Point,2000,2024 160 | Hiram Terriers,1894,1904 161 | Hofstra,1943,2024 162 | Holy Cross,1901,2024 163 | Hope Flying Dutchmen,1908,1913 164 | Houston,1951,2024 165 | Houston Christian,1974,2024 166 | Howard,1974,2024 167 | Idaho,1906,2024 168 | Idaho State,1959,2024 169 | Illinois,1906,2024 170 | Illinois State,1899,2024 171 | Illinois Wesleyan Titans,1928,1928 172 | Illinois-Chicago,1982,2024 173 | Incarnate Word,2014,2024 174 | Indiana,1901,2024 175 | Indiana State,1900,2024 176 | Iona,1954,2024 177 | Iowa,1893,2024 178 | Iowa State,1908,2024 179 | IUPUI,1999,2024 180 | Jackson State,1978,2024 181 | Jacksonville,1967,2024 182 | Jacksonville State,1996,2024 183 | James Madison,1977,2024 184 | John Carroll Blue Streaks,1948,1955 185 | Kalamazoo Hornets,1908,1923 186 | Kansas,1899,2024 187 | Missouri Kansas City,1990,2024 188 | Kansas State,1906,2024 189 | Kennesaw State,2010,2024 190 | Kent State,1914,2024 191 | Kentucky,1903,2024 192 | Kentucky Wesleyan Panthers,1957,1958 193 | La Salle,1932,2024 194 | Lafayette,1901,2024 195 | Lake Forest Foresters,1905,1916 196 | Lamar,1970,2024 197 | Lawrence Tech,1948,1948 198 | Le Moyne,2024,2024 199 | Lehigh,1902,2024 200 | Lewis Flyers,1905,1905 201 | Liberty,1989,2024 202 | Lindenwood,2023,2024 203 | Lipscomb,2004,2024 204 | Arkansas Little Rock,1979,2024 205 | Long Beach State,1970,2024 206 | 
Long Island University,1929,2024 207 | Longwood,2008,2024 208 | Louisiana Lafayette,1972,2024 209 | Louisiana State,1909,2024 210 | Louisiana Tech,1974,2024 211 | Louisiana-Monroe,1974,2024 212 | Louisville,1912,2024 213 | Loyola (IL),1921,2024 214 | Loyola (LA) Wolfpack,1952,1972 215 | Loyola (MD),1908,2024 216 | Loyola Marymount,1943,2024 217 | Macalester Scots,1896,1899 218 | Maine,1904,2024 219 | Manchester Spartans,1926,1926 220 | Manhattan,1905,2024 221 | Marietta Pioneers,1908,1920 222 | Marist,1982,2024 223 | Marquette,1917,2024 224 | Marshall,1919,2024 225 | Maryland,1924,2024 226 | Maryland-Baltimore County,1987,2024 227 | Maryland-Eastern Shore,1974,2024 228 | Massachusetts,1926,2024 229 | Massachusetts Institute of Technology Engineers,1909,1909 230 | Massachusetts-Lowell,1906,2024 231 | McNeese State,1974,2024 232 | Memphis,1956,2024 233 | Mercer,1974,2024 234 | Merchant Marine Mariners,1946,1947 235 | Merrimack,2020,2024 236 | Miami (FL),1949,2024 237 | Miami (OH),1906,2024 238 | Michigan,1918,2024 239 | Michigan State,1899,2024 240 | Middle Tennessee,1959,2024 241 | Millikin Big Blue,1910,1921 242 | Millsaps Majors,1911,1921 243 | Milwaukee,1974,2024 244 | Minnesota,1896,2024 245 | Minnesota A&M Aggies,1896,1903 246 | Mississippi,1909,2024 247 | Mississippi State,1909,2024 248 | Mississippi Valley State,1980,2024 249 | Missouri,1907,2024 250 | Missouri State,1983,2024 251 | Monmouth,1984,2024 252 | Montana,1912,2024 253 | Montana State,1902,2024 254 | Morehead State,1956,2024 255 | Morgan State,1985,2024 256 | Morris Brown Wolverines,2002,2003 257 | Mount St. Mary's,1989,2024 258 | Mount Union Purple Raiders,1896,1932 259 | Muhlenberg Mules,1901,1963 260 | Murray State,1954,2024 261 | Muskingum Fighting Muskies,1905,1927 262 | Navy,1908,2024 263 | North Carolina State,1913,2024 264 | Nebraska,1897,2024 265 | Nebraska Wesleyan Prairie Wolves,1906,1917 266 | Nevada,1913,2024 267 | Nevada-Las Vegas,1970,2024 268 | New Hampshire,1927,2024 269 | New Mexico,1900,2024 270 | New Mexico State,1905,2024 271 | New Orleans,1976,2024 272 | New York University Violets,1907,1971 273 | Newberry Wolves,1921,1921 274 | Niagara,1906,2024 275 | Nicholls State,1981,2024 276 | NJIT,2010,2024 277 | Norfolk State,1998,2024 278 | North Alabama,2019,2024 279 | North Carolina,1911,2024 280 | North Carolina A&T,1974,2024 281 | North Carolina Central,2011,2024 282 | North Central Cardinals,1911,1922 283 | North Dakota,1905,2024 284 | North Dakota State,1898,2024 285 | North Florida,2010,2024 286 | North Texas,1922,2024 287 | Northeastern,1938,2024 288 | Northeastern Illinois Golden Eagles,1991,1998 289 | Northern Arizona,1919,2024 290 | Northern Colorado,1911,2024 291 | Northern Illinois,1927,2024 292 | Northern Iowa,1981,2024 293 | Northern Kentucky,2013,2024 294 | Northwest Missouri State Bearcats,1930,1932 295 | Northwestern,1905,2024 296 | Northwestern State,1977,2024 297 | Notre Dame,1897,2024 298 | Oakland,2000,2024 299 | Oberlin Yeomen,1905,1921 300 | Ohio,1908,2024 301 | Ohio State,1899,2024 302 | Ohio Wesleyan Battling Bishops,1929,1935 303 | Oklahoma,1908,2024 304 | Oklahoma City Chiefs,1951,1985 305 | Oklahoma State,1908,2024 306 | Old Dominion,1977,2024 307 | Nebraska Omaha,2013,2024 308 | Oral Roberts,1972,2024 309 | Oregon,1903,2024 310 | Oregon State,1902,2024 311 | Pacific,1938,2024 312 | Penn State,1897,2024 313 | Pennsylvania,1897,2024 314 | Pepperdine,1944,2024 315 | Phillips Haymakers,1920,1920 316 | Pittsburg State Gorillas,1927,1931 317 | Pittsburgh,1906,2024 318 | 
Portland,1954,2024 319 | Portland State,1973,2024 320 | Prairie View,1981,2024 321 | Pratt Institute Cannoneers,1934,1934 322 | Presbyterian,2011,2024 323 | Princeton,1901,2024 324 | Providence,1929,2024 325 | Purdue,1897,2024 326 | Purdue Fort Wayne,2003,2024 327 | Queens (NC),2023,2024 328 | Quinnipiac,1999,2024 329 | Radford,1985,2024 330 | Regis (CO) Rangers,1962,1964 331 | Rensselaer Engineers,1901,1924 332 | Rhode Island,1904,2024 333 | Rice,1915,2024 334 | Richmond,1913,2024 335 | Rider,1929,2024 336 | Ripon Red Hawks,1902,1922 337 | Roanoke Maroons,1912,1919 338 | Robert Morris,1977,2024 339 | Rochester (NY) Yellowjackets,1910,1944 340 | Rose-Hulman Fightin' Engineers,1898,1898 341 | Rutgers,1914,2024 342 | Sacramento State,1992,2024 343 | Sacred Heart,2000,2024 344 | Saint Francis (PA),1956,2024 345 | Saint Joseph's,1910,2024 346 | Saint Louis,1916,2024 347 | Saint Mary's (CA),1910,2024 348 | Saint Peter's,1966,2024 349 | Sam Houston,1987,2024 350 | Samford,1973,2024 351 | San Diego,1980,2024 352 | San Diego State,1971,2024 353 | San Francisco,1924,2024 354 | San Jose State,1938,2024 355 | Santa Clara,1909,2024 356 | Savage School of Physical Education,1896,1898 357 | Savannah State Tigers,2003,2019 358 | Scranton Royals,1948,1948 359 | Seattle,1953,2024 360 | Seton Hall,1909,2024 361 | Sewanee Tigers,1923,1941 362 | Siena,1939,2024 363 | South Alabama,1972,2024 364 | South Carolina,1909,2024 365 | South Carolina State,1972,2024 366 | South Carolina Upstate,2011,2024 367 | South Dakota,2011,2024 368 | South Dakota State,2009,2024 369 | South Florida,1974,2024 370 | Southeast Missouri State,1992,2024 371 | Southeastern Louisiana,1981,2024 372 | Southern,1978,2024 373 | Southern California,1907,2024 374 | Southern Illinois,1968,2024 375 | Southern Illinois-Edwardsville,2011,2024 376 | Southern Indiana,2023,2024 377 | Southern Methodist,1917,2024 378 | Southern Mississippi,1973,2024 379 | Southern Utah,1989,2024 380 | Southwestern (KS) Moundbuilders,1905,1923 381 | Southwestern (TX) Pirates,1915,1916 382 | Springfield Pride,1897,1935 383 | St. Bonaventure,1920,2024 384 | St. Francis (NY) Terriers,1902,2023 385 | St. John's (NY),1908,2024 386 | St. John's College (OH),1921,1921 387 | St. Lawrence Saints,1902,1914 388 | St. Thomas,2022,2024 389 | Stanford,1914,2024 390 | Stephen F. Austin,1987,2024 391 | Stetson,1972,2024 392 | Stevens Institute Ducks,1917,1920 393 | Stonehill,2023,2024 394 | Stony Brook,2000,2024 395 | SUNY-Potsdam Bears,1910,1913 396 | Swarthmore Garnet,1906,1919 397 | Syracuse,1901,2024 398 | Tarleton State,2021,2024 399 | TCU,1914,2024 400 | Temple,1895,2024 401 | Tennessee,1909,2024 402 | Tennessee State,1978,2024 403 | Tennessee Tech,1944,2024 404 | Tennessee-Martin,1993,2024 405 | Texas,1906,2024 406 | Texas A&M,1913,2024 407 | Texas A&M-Commerce,2023,2024 408 | Texas A&M-Corpus Christi,2003,2024 409 | Texas Southern,1978,2024 410 | Texas State,1985,2024 411 | Texas Tech,1926,2024 412 | Texas Wesleyan Rams,1948,1948 413 | Texas-Rio Grande Valley,1969,2024 414 | The Citadel,1913,2024 415 | Toledo,1916,2024 416 | Towson,1980,2024 417 | Trinity (CT) Bantams,1897,1911 418 | Trinity (TX) Tigers,1971,1973 419 | Troy,1994,2024 420 | Tulane,1906,2024 421 | Tulsa,1914,2024 422 | U.S. 
International Gulls,1982,1991 423 | Alabama Birmingham,1980,2024 424 | California Davis,2008,2024 425 | California irvine,1978,2024 426 | California Riverside,2002,2024 427 | California San Diego,2021,2024 428 | California Santa Barbara,1964,2024 429 | Central Florida,1985,2024 430 | UCLA,1920,2024 431 | North Carolina Asheville,1987,2024 432 | North Carolina Greensboro,1992,2024 433 | North Carolina Wilmington,1977,2024 434 | Union (NY) Dutchmen,1907,1925 435 | Texas Arlington,1969,2024 436 | Utah,1909,2024 437 | Utah State,1904,2024 438 | Utah Tech,2021,2024 439 | Utah Valley,2010,2024 440 | University Texas El Paso,1923,2024 441 | Utica Pioneers,1982,1987 442 | University Texas San Antonio,1982,2024 443 | Valparaiso,1918,2024 444 | Vanderbilt,1901,2024 445 | Vermont,1921,2024 446 | Villanova,1921,2024 447 | Virginia,1906,2024 448 | Virginia Commonwealth,1974,2024 449 | Virginia Military Institute,1909,2024 450 | Virginia Tech,1909,2024 451 | Wabash Little Giants,1897,1925 452 | Wagner,1966,2024 453 | Wake Forest,1906,2024 454 | Washburn Ichabods,1906,1941 455 | Washington,1896,2024 456 | Washington & Jefferson Presidents,1913,1944 457 | Washington & Lee Generals,1907,1959 458 | Washington (MO) Bears,1905,1960 459 | Washington College Shoremen,1913,1925 460 | Washington State,1902,2024 461 | Wayne State (MI) Warriors,1928,1950 462 | Weber State,1964,2024 463 | Wesleyan (CT) Cardinals,1896,1913 464 | West Chester Golden Rams,1899,1982 465 | West Texas A&M Buffaloes,1921,1986 466 | West Virginia,1904,2024 467 | Western Carolina,1977,2024 468 | Western Colorado Mountaineers,1924,1937 469 | Western Illinois,1982,2024 470 | Western Kentucky,1922,2024 471 | Western Michigan,1914,2024 472 | Westminster (MO) Blue Jays,1920,1920 473 | Westminster (PA) Titans,1898,1935 474 | Wheaton (IL) Thunder,1902,1905 475 | Whittier Poets,1909,1915 476 | Wichita State,1906,2024 477 | Widener Pride,1899,1909 478 | William Mary,1906,2024 479 | Williams Ephs,1901,1911 480 | Winthrop,1987,2024 481 | Wisconsin,1899,2024 482 | Wisconsin-Stevens Point Pointers,1898,1918 483 | Wisconsin-Superior Yellowjackets,1900,1901 484 | Wittenberg Tigers,1931,1931 485 | Wofford,1996,2024 486 | Wooster Fighting Scots,1901,1931 487 | WPI Engineers,1920,1920 488 | Wright State,1988,2024 489 | Wyoming,1905,2024 490 | Xavier,1920,2024 491 | Yale,1896,2024 492 | Youngstown State,1948,2024 493 | -------------------------------------------------------------------------------- /binary_keras_deep.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bszek213/cbb_machine_learning/aab183383e3237c3c1007e476abcd66028c7604c/binary_keras_deep.h5 -------------------------------------------------------------------------------- /cbb.yaml: -------------------------------------------------------------------------------- 1 | name: cbb 2 | channels: 3 | - conda-forge 4 | - robostack 5 | - anaconda 6 | - intel 7 | - rapidsai 8 | dependencies: 9 | - hvplot 10 | - numpy 11 | - pandas 12 | - holoviews 13 | - scikit-learn 14 | - keras 15 | - cudatoolkit=11.2 16 | - cudnn=8.1.0 17 | - scipy 18 | - ipython 19 | - plotly 20 | - seaborn 21 | - ipywidgets 22 | - ipykernel 23 | - matplotlib 24 | - spyder 25 | - notebook 26 | - keyboard 27 | - eli5 28 | - pip 29 | - pip: 30 | - sportsipy 31 | - tensorflow 32 | - beautifulsoup4 33 | - eli5 34 | - cfbd 35 | -------------------------------------------------------------------------------- /cbb_classification.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | College Basketball Predictions via classification and probability with ESPN 5 | @author: brianszekely 6 | """ 7 | import cbb_web_scraper 8 | from os import getcwd 9 | from os.path import join, exists 10 | import yaml 11 | from tqdm import tqdm 12 | from time import sleep 13 | from pandas import DataFrame, concat, read_csv, isnull 14 | from sklearn.model_selection import train_test_split 15 | from sklearn.model_selection import GridSearchCV 16 | import numpy as np 17 | import matplotlib.pyplot as plt 18 | import seaborn as sns 19 | from sys import argv 20 | import joblib 21 | from sklearn.metrics import confusion_matrix, accuracy_score 22 | from difflib import get_close_matches 23 | from sklearn.metrics import roc_curve 24 | import seaborn as sns 25 | from tensorflow.keras.utils import to_categorical 26 | from sklearn.decomposition import PCA 27 | import xgboost as xgb 28 | from tensorflow.keras.layers import Dense, BatchNormalization 29 | from tensorflow.keras.models import Sequential 30 | from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau 31 | from tensorflow.keras.optimizers import Adam, RMSprop 32 | from tensorflow.keras.regularizers import l1, l2 33 | from keras_tuner import RandomSearch 34 | from tensorflow.keras.losses import BinaryCrossentropy 35 | from tensorflow.keras.layers import Dropout 36 | from tensorflow.keras.models import load_model 37 | import os 38 | from colorama import Fore, Style 39 | from sklearn.preprocessing import StandardScaler, RobustScaler 40 | import shap 41 | 42 | """ 43 | TODO: 44 | -adjust noise for better learning 45 | -may remove opp_pts and pts to enhance other features 46 | -feature engineer with rolling std or mean 47 | """ 48 | def create_sequential_model(hp, n_features, n_outputs): 49 | model = Sequential() 50 | #Add hidden layers 51 | for i in range(hp.Int('num_layers', 1, 10)): 52 | if i == 0: 53 | # First hidden layer needs input shape 54 | model.add(Dense(units=hp.Int(f'units_{i}', min_value=8, max_value=128, step=8), 55 | activation=hp.Choice(f'activation_{i}', values=['relu', 'leaky_relu', 'tanh', 'linear']), 56 | kernel_regularizer=l2(hp.Float(f'regularizer_strength_{i}', min_value=1e-1, max_value=1, sampling='log')), 57 | input_shape=(n_features,))) 58 | else: 59 | model.add(Dense(units=hp.Int(f'units_{i}', min_value=8, max_value=128, step=8), 60 | activation=hp.Choice(f'activation_{i}', values=['relu', 'leaky_relu', 'tanh', 'linear']), 61 | kernel_regularizer=l2(hp.Float(f'regularizer_strength_{i}', min_value=1e-1, max_value=1, sampling='log')))) 62 | model.add(BatchNormalization()) 63 | model.add(Dropout(rate=hp.Float(f'dropout_rate_{i}', min_value=0.3, max_value=0.6, step=0.1))) 64 | 65 | # Output layer 66 | model.add(Dense(n_outputs, activation='sigmoid')) # Binary classification 67 | 68 | # Compile model 69 | optimizer_choice = hp.Choice('optimizer', values=['adam', 'rmsprop']) #, 'sgd' 70 | if optimizer_choice == 'adam': 71 | optimizer = Adam(learning_rate=hp.Float('adam_learning_rate', min_value=0.0001, max_value=0.01, sampling='log')) 72 | else: 73 | optimizer = RMSprop(learning_rate=hp.Float('rmsprop_learning_rate', min_value=0.0001, max_value=0.01, sampling='log')) 74 | 75 | model.compile(optimizer=optimizer, 76 | loss=BinaryCrossentropy(), 77 | metrics=['accuracy']) 78 | 79 | return model 80 | 81 | class cbbClass(): 82 | def __init__(self,pre_process): 83 | 
print('instantiate class cbbClass') 84 | self.all_data = DataFrame() 85 | self.which_analysis = pre_process # 'pca' or 'corr' 86 | 87 | def get_teams(self): 88 | year_list_find = [] 89 | year_list = [2024,2023]#,2022,2021,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010] 90 | if exists(join(getcwd(),'year_count.yaml')): 91 | with open(join(getcwd(),'year_count.yaml')) as file: 92 | year_counts = yaml.load(file, Loader=yaml.FullLoader) 93 | else: 94 | year_counts = {'year':year_list_find} 95 | #Remove any years that have already been collected 96 | if year_counts['year']: 97 | year_list_check = year_counts['year'] 98 | year_list_find = year_counts['year'] 99 | year_list = [i for i in year_list if i not in year_list_check] 100 | print(f'Need data for year: {year_list}') 101 | #Collect data per year 102 | if year_list: 103 | for year in tqdm(year_list): 104 | all_teams = cbb_web_scraper.get_teams_year(year_list[-1],2024) 105 | team_names = sorted(all_teams) 106 | final_list = [] 107 | self.year_store = year 108 | for abv in tqdm(team_names): 109 | try: 110 | print() #tqdm things 111 | print(f'current team: {abv}, year: {year}') 112 | basic = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs.html' 113 | adv = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs-advanced.html' 114 | df_inst = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,abv,self.year_store) 115 | print(df_inst) 116 | df_inst['pts'].replace('', np.nan, inplace=True) 117 | df_inst.dropna(inplace=True) 118 | final_list.append(df_inst) 119 | except Exception as e: 120 | print(e) 121 | print(f'{abv} data are not available') 122 | sleep(4) #I can get banned for a small period of time if I do not do this 123 | final_data = concat(final_list) 124 | if exists(join(getcwd(),'all_data.csv')): 125 | self.all_data = read_csv(join(getcwd(),'all_data.csv')) 126 | self.all_data = concat([self.all_data, final_data.dropna()]) 127 | if not exists(join(getcwd(),'all_data.csv')): 128 | self.all_data.to_csv(join(getcwd(),'all_data.csv'),index=False) 129 | self.all_data.to_csv(join(getcwd(),'all_data.csv'),index=False) 130 | year_list_find.append(year) 131 | print(f'year list after loop: {year_list_find}') 132 | with open(join(getcwd(),'year_count.yaml'), 'w') as write_file: 133 | yaml.dump(year_counts, write_file) 134 | print(f'writing {year} to yaml file') 135 | else: 136 | self.all_data = read_csv(join(getcwd(),'all_data.csv')) 137 | print('dataset size: ', np.shape(self.all_data)) 138 | self.all_data = self.all_data.drop_duplicates(keep='last') 139 | print(f'dataset size after duplicates are dropped: {np.shape(self.all_data)}') 140 | 141 | def pca_analysis(self): 142 | #scale first before pca 143 | self.scaler = StandardScaler() 144 | x_scale = self.scaler.fit_transform(self.x) 145 | self.pca = PCA(n_components=0.95) #explain 95% of the variance 146 | self.x_no_corr = self.pca.fit_transform(x_scale) 147 | 148 | #Visualize PCA components 149 | 150 | plt.figure(figsize=(8, 6)) 151 | plt.bar(range(self.pca.n_components_), self.pca.explained_variance_ratio_) 152 | plt.xlabel('Principal Component') 153 | plt.ylabel('Explained Variance Ratio') 154 | plt.title('Explained Variance Ratio of Principal Components') 155 | plt.savefig('pca_components.png',dpi=400) 156 | plt.close() 157 | 158 | def convert_to_float(self): 159 | for col in self.all_data.columns: 160 | self.all_data[col].replace('', np.nan, inplace=True) 161 | self.all_data[col] =
self.all_data[col].astype(float) 162 | self.all_data.dropna(inplace=True) 163 | 164 | def delete_opp(self): 165 | """ 166 | Drop any opponent data, as it may not be helpful for prediction; opponent stats are hard to estimate with a running average 167 | """ 168 | for col in self.all_data.columns: 169 | if 'opp' in col: 170 | self.all_data.drop(columns=col,inplace=True) 171 | def split(self): 172 | # self.delete_opp() 173 | for col in self.all_data.columns: 174 | if 'Unnamed' in col: 175 | self.all_data.drop(columns=col,inplace=True) 176 | self.convert_to_float() 177 | #self.y = np.delete(self.y, np.where(np.isnan(self.x_no_corr)), axis=0) 178 | #self.x_no_corr = self.x_no_corr.dropna() 179 | self.y = self.all_data['game_result'].astype(int) 180 | result_counts = self.all_data['game_result'].value_counts() 181 | #plot the counts 182 | plt.figure(figsize=(8, 6)) 183 | result_counts.plot(kind='bar') 184 | plt.xlabel('Game Result') 185 | plt.ylabel('Count') 186 | plt.title('Count of Labels') 187 | plt.savefig('class_label_count.png',dpi=400) 188 | plt.close() 189 | 190 | #onehot encode 191 | self.y = to_categorical(self.y) 192 | self.x = self.all_data.drop(columns=['game_result']) 193 | 194 | # #Dropna and remove all data from subsequent y data 195 | # real_values = ~self.x_no_corr.isna().any(axis=1) 196 | # self.x_no_corr.dropna(inplace=True) 197 | # self.y = self.y.loc[real_values] 198 | 199 | 200 | #pca data or no correlated data 201 | if self.which_analysis == 'pca': 202 | #pca 203 | self.pca_analysis() 204 | else: 205 | #correlational analysis and outlier removal 206 | self.pre_process_corr_out_remove() 207 | #75/15/10 split 208 | #Split data into training and the rest (75% training, 25% temporary) 209 | self.x_train, x_temp, self.y_train, y_temp = train_test_split(self.x_no_corr, self.y, train_size=0.75, random_state=42) 210 | #Split the rest into validation and test data (60% validation, 40% test) 211 | validation_ratio = 0.15 / (1 - 0.75) # Adjust ratio for the remaining part 212 | self.x_validation, self.x_test, self.y_validation, self.y_test = train_test_split(x_temp, y_temp, train_size=validation_ratio, random_state=42) 213 | 214 | def pre_process_corr_out_remove(self): 215 | # Remove features with a correlation coef of 0.90 or higher 216 | corr_val = 0.9 217 | corr_matrix = np.abs(self.x.astype(float).corr()) 218 | upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) 219 | to_drop = [column for column in upper.columns if any(upper[column] >= corr_val)] 220 | self.drop_cols = to_drop 221 | self.drop_cols = self.drop_cols + ['opp_pts', 'pts','game_loc','simple_rating_system'] #remove these extra features 222 | self.x_no_corr = self.x.drop(columns=self.drop_cols) 223 | cols = self.x_no_corr.columns 224 | print(f'Columns dropped >= {corr_val}: {self.drop_cols}') 225 | #Drop samples that are outliers 226 | print(f'old feature dataframe shape before outlier removal: {self.x_no_corr.shape}') 227 | for col_name in cols: 228 | Q1 = np.percentile(self.x_no_corr[col_name], 5) #5th/95th percentiles, a wider fence than the usual quartiles 229 | Q3 = np.percentile(self.x_no_corr[col_name], 95) 230 | IQR = Q3 - Q1 231 | upper = np.where(self.x_no_corr[col_name] >= (Q3+2.5*IQR)) #1.5 is the standard; 2.5 keeps more data, to see if that helps improve model performance 232 | lower = np.where(self.x_no_corr[col_name] <= (Q1-2.5*IQR)) 233 | self.x_no_corr.drop(upper[0], inplace = True) 234 | self.x_no_corr.drop(lower[0], inplace = True) 235 | self.y = np.delete(self.y, upper[0], axis=0) 236 | self.y = np.delete(self.y, lower[0], axis=0) 237 | #
self.y.drop(upper[0], inplace = True) 238 | # self.y.drop(lower[0], inplace = True) 239 | if 'level_0' in self.x_no_corr.columns: 240 | self.x_no_corr.drop(columns=['level_0'],inplace = True) 241 | self.x_no_corr.reset_index(inplace = True) 242 | # self.y.reset_index(inplace = True, drop=True) 243 | self.x_no_corr.drop(columns=['level_0','index'],inplace = True) 244 | print(f'new feature dataframe shape after outlier removal: {self.x_no_corr.shape}') 245 | top_corr_features = corr_matrix.index 246 | plt.figure(figsize=(25,25)) 247 | sns.heatmap(corr_matrix[top_corr_features],annot=True,cmap="RdYlGn") 248 | plt.tight_layout() 249 | plt.savefig('correlations_class.png',dpi=300) 250 | plt.close() 251 | 252 | #Extra preprocessing steps 253 | #standardize (zero mean, unit variance) 254 | self.cols_save = self.x_no_corr.columns 255 | self.scaler = StandardScaler() 256 | self.x_no_corr = self.scaler.fit_transform(self.x_no_corr) 257 | #robust scaling by median/IQR (despite the variable name, RobustScaler is not min-max normalization) 258 | self.min_max_scaler = RobustScaler() 259 | self.x_no_corr = self.min_max_scaler.fit_transform(self.x_no_corr) 260 | self.x_no_corr = DataFrame(self.x_no_corr,columns=self.cols_save) 261 | #Generate random noise with the same shape as the DataFrame 262 | noise = np.random.normal(loc=0, scale=0.175, size=self.x_no_corr.shape) #the larger the scale, the stronger the Gaussian jitter added as regularization 263 | self.x_no_corr = self.x_no_corr + noise 264 | 265 | # def random_forest_analysis(self): 266 | # if argv[1] == 'tune': 267 | # #RANDOM FOREST REGRESSOR 268 | # RandForclass = RandomForestClassifier() 269 | # #Use the number of features as a stopping criterion for depth 270 | # rows, cols = self.x_train.shape 271 | # cols = int(cols / 2.5) #try to avoid overfitting on depth 272 | # #square root of the total number of features is a good limit 273 | # # cols = int(np.sqrt(cols)) 274 | # #parameters to tune 275 | # #increasing min_samples_leaf, this will reduce overfitting 276 | # Rand_perm = { 277 | # 'criterion' : ["gini","entropy"], #absolute_error - takes forever to run 278 | # 'n_estimators': range(300,500,100), 279 | # # 'min_samples_split': np.arange(2, 5, 1, dtype=int), 280 | # 'max_features' : [1, 'sqrt', 'log2'], 281 | # 'max_depth': np.arange(2,cols,1), 282 | # 'min_samples_leaf': np.arange(2,4,1) 283 | # } 284 | # clf_rand = GridSearchCV(RandForclass, Rand_perm, 285 | # scoring=['accuracy','f1'], 286 | # cv=5, 287 | # refit='accuracy', 288 | # verbose=4, 289 | # n_jobs=-1) 290 | # search_rand = clf_rand.fit(self.x_train,self.y_train) 291 | # #Write fitted and tuned model to file 292 | # # with open('randomForestModelTuned.pkl','wb') as f: 293 | # # pickle.dump(search_rand,f) 294 | # joblib.dump(search_rand, "./classifierModelTuned.joblib", compress=9) 295 | # print('RandomForestClassifier - best params: ',search_rand.best_params_) 296 | # self.RandForclass = search_rand 297 | # prediction = self.RandForclass.predict(self.x_test) 298 | # print(confusion_matrix(self.y_test, prediction))# Display confusion matrix 299 | # print(f'Model accuracy: {accuracy_score(self.y_test, prediction)}')# Display accuracy score 300 | # # print(f1_score(self.y_test, prediction)) 301 | # else: 302 | # print('Load tuned Random Forest Classifier') 303 | # # load RandomForestModel 304 | # self.RandForclass=joblib.load("./classifierModelTuned.joblib") 305 | # prediction = self.RandForclass.predict(self.x_test) 306 | # print(confusion_matrix(self.y_test, prediction))# Display confusion matrix 307 | # print(f'Model accuracy: {accuracy_score(self.y_test, prediction)}')# Display accuracy score 308 | # #
print(f1_score(self.y_test, prediction)) 309 | # y_proba = self.RandForclass.predict_proba(self.x_test)[:, 1] 310 | # fpr, tpr, thresholds = roc_curve(self.y_test, y_proba) 311 | # plt.plot(fpr, tpr) 312 | # plt.xlabel('False Positive Rate') 313 | # plt.ylabel('True Positive Rate') 314 | # plt.title('ROC Curve') 315 | # plt.savefig('ROC_curve_class.png',dpi=300) 316 | 317 | def xgboost_analysis(self): 318 | if not os.path.exists('classifierModelTuned_xgb.joblib'): 319 | if self.which_analysis == 'pca': 320 | y_train_combined = np.concatenate([self.y_train, self.y_validation], axis=0) 321 | x_train_combined = np.concatenate([self.x_train, self.x_validation], axis=0) 322 | else: 323 | y_train_combined = np.concatenate([self.y_train, self.y_validation], axis=0) 324 | x_train_combined = concat([self.x_train, self.x_validation], axis=0) 325 | if argv[1] == 'tune': 326 | # XGBoost Classifier 327 | xgb_class = xgb.XGBClassifier() 328 | 329 | # Parameters to tune 330 | params = { 331 | 'learning_rate': [0.01, 0.1], 332 | 'n_estimators': range(100, 300, 100), 333 | 'max_depth': range(2, 4, 2), 334 | 'min_child_weight': [1, 5], 335 | 'gamma': [0, 0.2], 336 | 'subsample': [0.6, 1.0], 337 | 'colsample_bytree': [0.6, 1.0], 338 | 'reg_alpha': [0, 0.01], 339 | 'reg_lambda': [0, 0.01], 340 | 'scale_pos_weight': [1, 3] 341 | } 342 | 343 | clf_xgb = GridSearchCV(xgb_class, params, 344 | scoring=['accuracy'], 345 | cv=5, 346 | refit='accuracy', 347 | verbose=4) 348 | search_xgb = clf_xgb.fit(x_train_combined, y_train_combined) 349 | 350 | # Write fitted and tuned model to file 351 | joblib.dump(search_xgb, "./classifierModelTuned_xgb.joblib", compress=9) 352 | print('XGBoost Classifier - best params: ', search_xgb.best_params_) 353 | self.xgb_class = search_xgb 354 | prediction = self.xgb_class.predict(self.x_test) 355 | print('Confusion Matrix: \n',confusion_matrix(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))) # Display confusion matrix 356 | print(f'Model accuracy on test data: {accuracy_score(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))}') # Display accuracy score 357 | 358 | else: 359 | print('Load tuned XGBoost Classifier') 360 | # load XGBoost Model 361 | self.xgb_class = joblib.load("./classifierModelTuned_xgb.joblib") 362 | prediction = self.xgb_class.predict(self.x_test) 363 | print('Confusion Matrix on test data: \n',confusion_matrix(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))) # Display confusion matrix 364 | print(f'Model accuracy on test data: {accuracy_score(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))}') # Display accuracy score 365 | with open("output_xgb.txt", "w") as file: 366 | file.write('Confusion Matrix on test data: \n') 367 | file.write(str(confusion_matrix(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1)))) 368 | file.write('\n') 369 | file.write(f'Model accuracy on test data: {accuracy_score(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))}') 370 | file.write('\n') 371 | y_proba = self.xgb_class.predict_proba(self.x_test) 372 | fpr, tpr, thresholds = roc_curve(np.argmax(self.y_test, axis=1), y_proba[:, 1]) #use the class-1 probability; argmax labels would collapse the ROC curve to a single point 373 | plt.figure() 374 | plt.plot(fpr, tpr) 375 | plt.xlabel('False Positive Rate') 376 | plt.ylabel('True Positive Rate') 377 | plt.title('ROC Curve') 378 | plt.savefig('ROC_curve_class.png', dpi=300) 379 | plt.close() 380 | else: 381 | self.xgb_class = joblib.load("./classifierModelTuned_xgb.joblib") 382 | 383 | 384 | def deep_learn_analysis(self): 385 | if not
os.path.exists('binary_keras_deep.h5'): 386 | tuner = RandomSearch( 387 | lambda hp: create_sequential_model(hp, self.x_train.shape[1], 2), 388 | objective='val_loss', 389 | max_trials=10, 390 | directory='cbb_sequential_hp', 391 | project_name='sequential_hyperparameter_tuning', 392 | ) 393 | 394 | early_stopping = EarlyStopping(monitor='val_loss', patience=9, restore_best_weights=True) 395 | reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6) 396 | tuner.search(x=self.x_train, y=self.y_train, 397 | epochs=200, 398 | validation_data=(self.x_validation, self.y_validation), 399 | callbacks=[early_stopping, reduce_lr]) 400 | 401 | # best_hps = tuner.get_best_hyperparameters(num_trials=1)[0] 402 | best_model = tuner.get_best_models(num_models=1)[0] 403 | 404 | # Fit tuned model 405 | loss_final = float(100) 406 | for i in tqdm(range(15)): 407 | best_model.fit(self.x_train, self.y_train, 408 | epochs=200, 409 | validation_data=(self.x_validation, self.y_validation), 410 | callbacks=[early_stopping, reduce_lr]) 411 | loss, acc = best_model.evaluate(self.x_test, self.y_test) 412 | if loss < loss_final: 413 | self.final_model_deep = best_model; loss_final = loss #update the best loss seen so far, otherwise every iteration below 100 overwrites the model 414 | loss, acc = self.final_model_deep.evaluate(self.x_test, self.y_test) 415 | print(f'Final model test loss {loss} and accuracy {acc}') 416 | with open("output_deep_learn.txt", "w") as file: 417 | file.write(f'Final model test loss {loss} and accuracy {acc}') 418 | file.write('\n') 419 | self.final_model_deep.save('binary_keras_deep.h5') 420 | else: 421 | self.final_model_deep = load_model('binary_keras_deep.h5') 422 | 423 | def predict_two_teams(self): 424 | teams_sports_ref = read_csv('teams_sports_ref_format.csv') 425 | while True: 426 | # try: 427 | team_1 = input('team_1: ') 428 | if team_1 == 'exit': 429 | break 430 | team_2 = input('team_2: ') 431 | #Game location 432 | game_loc_team1 = int(input(f'{team_1} : home = 0, away = 1, neutral = 2: ')) 433 | if game_loc_team1 == 0: 434 | game_loc_team2 = 1 435 | elif game_loc_team1 == 1: 436 | game_loc_team2 = 0 437 | elif game_loc_team1 == 2: 438 | game_loc_team2 = 2 439 | #Check to see if the team was spelled right 440 | team_1 = get_close_matches(team_1,teams_sports_ref['teams'].tolist(),n=1)[0] 441 | team_2 = get_close_matches(team_2,teams_sports_ref['teams'].tolist(),n=1)[0] 442 | #2024 data 443 | year = 2024 444 | # sleep(4) 445 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs.html' 446 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs-advanced.html' 447 | team_1_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_1.lower(),year) 448 | sleep(4) #I can get banned for a small period of time if I do not do this 449 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs.html' 450 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs-advanced.html' 451 | team_2_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_2.lower(),year) 452 | #Remove empty cells 453 | team_1_df2023['pts'].replace('', np.nan, inplace=True) 454 | team_1_df2023.replace('', np.nan, inplace=True) 455 | team_1_df2023.dropna(inplace=True) 456 | team_2_df2023['pts'].replace('', np.nan, inplace=True) 457 | team_2_df2023.replace('', np.nan, inplace=True) 458 | team_2_df2023.dropna(inplace=True) 459 | for col in team_1_df2023.columns: 460 | team_1_df2023[col]
= team_1_df2023[col].astype(float) 461 | for col in team_2_df2023.columns: 462 | team_2_df2023[col] = team_2_df2023[col].astype(float) 463 | 464 | #Align both teams' game logs to the same number of games 465 | if len(team_1_df2023) > len(team_2_df2023): 466 | team_1_df2023 = team_1_df2023.tail(len(team_2_df2023)) 467 | elif len(team_2_df2023) > len(team_1_df2023): 468 | team_2_df2023 = team_2_df2023.tail(len(team_1_df2023)) 469 | 470 | team_1_df2023 = team_1_df2023.reset_index(drop=True) 471 | team_2_df2023 = team_2_df2023.reset_index(drop=True) 472 | team_1_df_copy = team_1_df2023.copy() 473 | team_2_df_copy = team_2_df2023.copy() 474 | #replace team 1 opp data with team 2 475 | for index, row in team_1_df2023.iterrows(): 476 | for col in team_1_df2023.columns: 477 | if "opp" in col: 478 | if col == 'opp_trb': 479 | team_1_df2023.at[index, 'opp_trb'] = team_2_df2023.at[index, 'total_board'] 480 | else: 481 | new_col = col.replace("opp_", "") 482 | team_1_df2023.at[index, col] = team_2_df2023.at[index, new_col] 483 | 484 | #replace team 2 opp data with team 1 485 | for index, row in team_2_df_copy.iterrows(): 486 | for col in team_2_df_copy.columns: 487 | if "opp" in col: 488 | if col == 'opp_trb': 489 | team_2_df_copy.at[index, 'opp_trb'] = team_1_df_copy.at[index, 'total_board'] 490 | else: 491 | new_col = col.replace("opp_", "") 492 | team_2_df_copy.at[index, col] = team_1_df_copy.at[index, new_col] 493 | 494 | 495 | #Remove pts and game result 496 | # for col in team_1_df2023.columns: 497 | # if 'opp' in col: 498 | # team_1_df2023.drop(columns=col,inplace=True) 499 | # for col in team_2_df2023.columns: 500 | # if 'opp' in col: 501 | # team_2_df2023.drop(columns=col,inplace=True) 502 | if self.which_analysis == 'pca': 503 | team_1_df2023.drop(columns=['game_result'],inplace=True) 504 | team_2_df_copy.drop(columns=['game_result'],inplace=True) 505 | team_1_df2023 = self.scaler.transform(team_1_df2023) 506 | team_2_df_copy = self.scaler.transform(team_2_df_copy) 507 | team_1_df2023 = self.pca.transform(team_1_df2023) 508 | team_2_df_copy = self.pca.transform(team_2_df_copy) 509 | 510 | #make df for other analysis 511 | team_1_df_separate = DataFrame(team_1_df2023).abs() 512 | team_2_df_separate = DataFrame(team_2_df_copy).abs() 513 | prop_1 = team_1_df_separate.std() / team_1_df_separate.mean() 514 | prop_2 = team_2_df_separate.std() / team_2_df_separate.mean() 515 | else: 516 | team_1_df2023.drop(columns=['game_result'],inplace=True) 517 | team_2_df_copy.drop(columns=['game_result'],inplace=True) 518 | #Drop the correlated features 519 | team_1_df2023.drop(columns=self.drop_cols, inplace=True) 520 | team_2_df_copy.drop(columns=self.drop_cols, inplace=True) 521 | 522 | team_1_df2023 = self.scaler.transform(team_1_df2023) 523 | team_2_df_copy = self.scaler.transform(team_2_df_copy) 524 | 525 | team_1_df2023 = self.min_max_scaler.transform(team_1_df2023) 526 | team_2_df_copy = self.min_max_scaler.transform(team_2_df_copy) 527 | 528 | team_1_df2023 = DataFrame(team_1_df2023,columns=self.cols_save) 529 | team_2_df_copy = DataFrame(team_2_df_copy,columns=self.cols_save) 530 | prop_1 = team_1_df2023.abs().std() / team_1_df2023.abs().mean(); prop_2 = team_2_df_copy.abs().std() / team_2_df_copy.abs().mean() #mirror the pca branch: process team_2_df_copy (the opp-replaced frame) and define prop_1/prop_2 so the downstream code works on this path too 531 | ma_range = np.arange(2,10,1) #2 was the most correct value for mean and 8 was the best for the median; chose 9 for tiebreaking 532 | # team_1_count = 0 533 | # team_2_count = 0 534 | # team_1_count_mean = 0 535 | # team_2_count_mean = 0 536 | team_1_ma_win = [] 537 | team_2_ma_win = [] 538 | random_pred_1, random_pred_2 = [], [] 539 | random_pred_1_monte, random_pred_2_monte = [], [] 540 | qt_best_team_1, qt_best_team_2 = [], [] 541 | qt_worst_team_1, qt_worst_team_2
= [], [] 542 | #get latest SRS value 543 | team_1_srs = cbb_web_scraper.get_latest_srs(team_1) 544 | team_2_srs = cbb_web_scraper.get_latest_srs(team_2) 545 | 546 | #Monte Carlo simulation 547 | num_simulations = 1000 548 | mean_1 = np.mean(team_1_df2023, axis=0) 549 | std_1 = np.std(team_1_df2023, axis=0) 550 | mean_2 = np.mean(team_2_df_copy, axis=0) 551 | std_2 = np.std(team_2_df_copy, axis=0) 552 | for _ in tqdm(range(num_simulations)): 553 | random_stats_team_1 = np.random.normal(mean_1, std_1, size=(1,team_1_df2023.shape[1])) 554 | random_stats_team_2 = np.random.normal(mean_2, std_2, size=(1,team_2_df_copy.shape[1])) 555 | random_stats_team_1 = random_stats_team_1[0] 556 | random_stats_team_2 = random_stats_team_2[0] 557 | outcome_team_1 = self.xgb_class.predict_proba([random_stats_team_1]) 558 | outcome_deep_1 = self.final_model_deep.predict([np.expand_dims(random_stats_team_1, axis=0)]) 559 | outcome_team_2 = self.xgb_class.predict_proba([random_stats_team_2]) 560 | outcome_deep_2 = self.final_model_deep.predict([np.expand_dims(random_stats_team_2, axis=0)]) 561 | random_pred_1_monte.append(outcome_team_1[0][1]) 562 | random_pred_1_monte.append(outcome_deep_1[0][1]) 563 | random_pred_2_monte.append(outcome_team_1[0][0]) 564 | random_pred_2_monte.append(outcome_deep_1[0][0]) 565 | random_pred_2_monte.append(outcome_team_2[0][1]) 566 | random_pred_2_monte.append(outcome_deep_2[0][1]) 567 | random_pred_1_monte.append(outcome_team_2[0][0]) 568 | random_pred_1_monte.append(outcome_deep_2[0][0]) 569 | 570 | #every game of one team vs every game for other team 571 | for _ in tqdm(range(len(team_1_df2023) * 30)): 572 | if self.which_analysis == 'pca': 573 | random_row_df1 = team_1_df2023[np.random.choice(len(team_1_df2023), size=1),:] 574 | random_row_df2 = team_2_df_copy[np.random.choice(len(team_2_df_copy), size=1),:] 575 | else: 576 | random_row_df1 = team_1_df2023.sample(n=1) 577 | random_row_df2 = team_2_df_copy.sample(n=1) 578 | # random_row_df2 = team_2_df2023.sample(n=1) 579 | 580 | # for col in random_row_df1.columns: 581 | # if "opp" in col: 582 | # if col == 'opp_trb': 583 | # random_row_df1.at[random_row_df1.index[0], 'opp_trb'] = random_row_df2.at[random_row_df2.index[0], 'total_board'] 584 | # else: 585 | # new_col = col.replace("opp_", "") 586 | # random_row_df1.at[random_row_df1.index[0], col] = random_row_df2.at[random_row_df2.index[0], new_col] 587 | outcome_team_1 = self.xgb_class.predict_proba(random_row_df1) 588 | outcome_team_2 = self.xgb_class.predict_proba(random_row_df2) 589 | outcome_deep_1 = self.final_model_deep.predict(random_row_df1) 590 | outcome_deep_2 = self.final_model_deep.predict(random_row_df2) 591 | 592 | #team 1 win percentage [lose win] 593 | random_pred_1.append(outcome_team_1[0][1]) 594 | random_pred_1.append(outcome_deep_1[0][1]) 595 | random_pred_2.append(outcome_team_1[0][0]) 596 | random_pred_2.append(outcome_deep_1[0][0]) 597 | #team 2 win percentage [lose win] 598 | random_pred_2.append(outcome_team_2[0][1]) 599 | random_pred_2.append(outcome_deep_2[0][1]) 600 | random_pred_1.append(outcome_team_2[0][0]) 601 | random_pred_1.append(outcome_deep_2[0][0]) 602 | 603 | #rolling average predictions 604 | team_1_df2023 = DataFrame(team_1_df2023) 605 | team_2_df_copy = DataFrame(team_2_df_copy) 606 | for ma in tqdm(ma_range): 607 | # TEAM 1 608 | data1_mean = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean() 609 | data2_mean = team_2_df_copy.ewm(span=ma,min_periods=ma-1).mean() 610 | 611 | outcome =
self.xgb_class.predict_proba(data1_mean.iloc[-1:].values) 612 | outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:].values) 613 | outcome2 = self.xgb_class.predict_proba(data2_mean.iloc[-1:].values) 614 | outcome_deep2 = self.final_model_deep.predict(data2_mean.iloc[-1:].values) 615 | 616 | team_1_ma_win.append(outcome[0][1]) 617 | team_1_ma_win.append(outcome_deep[0][1]) 618 | team_2_ma_win.append(outcome[0][0]) 619 | team_2_ma_win.append(outcome_deep[0][0]) 620 | 621 | team_1_ma_win.append(outcome2[0][0]) 622 | team_1_ma_win.append(outcome_deep2[0][0]) 623 | team_2_ma_win.append(outcome2[0][1]) 624 | team_2_ma_win.append(outcome_deep2[0][1]) 625 | 626 | #quantile predictions - both play at their bests 627 | for ma in tqdm(ma_range): 628 | # TEAM 1 629 | data1_mean = team_1_df2023.rolling(window=ma).quantile(0.75) 630 | # data1_mean['game_loc'] = game_loc_team1 631 | data2_mean = team_2_df_copy.rolling(window=ma).quantile(0.75) 632 | # data2_mean['game_loc'] = game_loc_team2 633 | #get latest SRS value 634 | # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs 635 | # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs 636 | outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:].values) 637 | outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:].values) 638 | outcome2 = self.xgb_class.predict_proba(data2_mean.iloc[-1:].values) 639 | outcome_deep2 = self.final_model_deep.predict(data2_mean.iloc[-1:].values) 640 | 641 | qt_best_team_1.append(outcome[0][1]) 642 | qt_best_team_1.append(outcome_deep[0][1]) 643 | qt_best_team_2.append(outcome[0][0]) 644 | qt_best_team_2.append(outcome_deep[0][0]) 645 | 646 | qt_best_team_1.append(outcome2[0][0]) 647 | qt_best_team_1.append(outcome_deep2[0][0]) 648 | qt_best_team_2.append(outcome2[0][1]) 649 | qt_best_team_2.append(outcome_deep2[0][1]) 650 | 651 | #quantile predictions - both play at their worsts 652 | for ma in tqdm(ma_range): 653 | # TEAM 1 654 | data1_mean = team_1_df2023.rolling(window=ma).quantile(0.25) 655 | # data1_mean['game_loc'] = game_loc_team1 656 | data2_mean = team_2_df_copy.rolling(window=ma).quantile(0.25) 657 | # data2_mean['game_loc'] = game_loc_team2 658 | #get latest SRS value 659 | # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs 660 | # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs 661 | # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1) 662 | outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:].values) 663 | outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:].values) 664 | outcome2 = self.xgb_class.predict_proba(data2_mean.iloc[-1:].values) 665 | outcome_deep2 = self.final_model_deep.predict(data2_mean.iloc[-1:].values) 666 | 667 | qt_worst_team_1.append(outcome[0][1]) 668 | qt_worst_team_1.append(outcome_deep[0][1]) 669 | qt_worst_team_2.append(outcome[0][0]) 670 | qt_worst_team_2.append(outcome_deep[0][0]) 671 | 672 | qt_worst_team_1.append(outcome2[0][0]) 673 | qt_worst_team_1.append(outcome_deep2[0][0]) 674 | qt_worst_team_2.append(outcome2[0][1]) 675 | qt_worst_team_2.append(outcome_deep2[0][1]) 676 | 677 | ###########TEAM 2 VS TEAM 1################### 678 | # temp = team_1_df2023 679 | # team_1_df2023 = team_2_df2023 680 | # team_2_df2023 = temp 681 | 682 | # if game_loc_team1 == 1: 683 | # game_loc_team1 = 0 684 | # elif game_loc_team1 == 0: 685 | # game_loc_team1 = 1 686 | # if game_loc_team2 == 0: 687 | # game_loc_team2 = 1 688 
| # elif game_loc_team2 == 1: 689 | # game_loc_team2 = 0 690 | 691 | # #get latest SRS value - flip them 692 | # team_1_srs = cbb_web_scraper.get_latest_srs(team_2) 693 | # team_2_srs = cbb_web_scraper.get_latest_srs(team_1) 694 | # #every game of one team vs every game for other team 695 | # for _ in range(len(team_1_df2023) * 2): 696 | # random_row_df1 = team_1_df2023.sample(n=1) 697 | # random_row_df2 = team_2_df2023.sample(n=1) 698 | 699 | # for col in random_row_df1.columns: 700 | # if "opp" in col: 701 | # if col == 'opp_trb': 702 | # random_row_df1.at[random_row_df1.index[0], 'opp_trb'] = random_row_df2.at[random_row_df2.index[0], 'total_board'] 703 | # else: 704 | # new_col = col.replace("opp_", "") 705 | # random_row_df1.at[random_row_df1.index[0], col] = random_row_df2.at[random_row_df2.index[0], new_col] 706 | 707 | # outcome = self.xgb_class.predict_proba(random_row_df1) 708 | # outcome_deep = self.final_model_deep.predict(random_row_df1) 709 | 710 | # random_pred_1.append(outcome[0][1]) 711 | # random_pred_1.append(outcome_deep[0][1]) 712 | # random_pred_2.append(outcome[0][0]) 713 | # random_pred_2.append(outcome_deep[0][0]) 714 | 715 | # #rolling average predictions 716 | # for ma in tqdm(ma_range): 717 | # # TEAM 1 718 | # data1_mean = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean() 719 | # # data1_mean['game_loc'] = game_loc_team1 720 | # data2_mean = team_2_df2023.ewm(span=ma,min_periods=ma-1).mean() 721 | # # data2_mean['game_loc'] = game_loc_team2 722 | # #Here replace opponent metrics with the features of the second team 723 | # for col in data1_mean.columns: 724 | # if "opp" in col: 725 | # if col == 'opp_trb': 726 | # # new_col = col.replace("opp_", "") 727 | # data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board'] 728 | # else: 729 | # new_col = col.replace("opp_", "") 730 | # data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col] 731 | # #get latest SRS value 732 | # # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs 733 | # # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs 734 | # # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1) 735 | # outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:]) 736 | # outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:]) 737 | 738 | # team_1_ma_win.append(outcome[0][1]) 739 | # team_1_ma_win.append(outcome_deep[0][1]) 740 | # team_2_ma_win.append(outcome[0][0]) 741 | # team_2_ma_win.append(outcome_deep[0][0]) 742 | # #quantile predictions - both play at their bests 743 | # for ma in tqdm(ma_range): 744 | # # TEAM 1 745 | # data1_mean = team_1_df2023.rolling(window=ma).quantile(0.75).iloc[-1:] 746 | # # data1_mean['game_loc'] = game_loc_team1 747 | # data2_mean = team_2_df2023.rolling(window=ma).quantile(0.75).iloc[-1:] 748 | # # data2_mean['game_loc'] = game_loc_team2 749 | # #Here replace opponent metrics with the features of the second team 750 | # for col in data1_mean.columns: 751 | # if "opp" in col: 752 | # if col == 'opp_trb': 753 | # # new_col = col.replace("opp_", "") 754 | # data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board'] 755 | # else: 756 | # new_col = col.replace("opp_", "") 757 | # data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col] 758 | # #get latest SRS value 759 | # # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = 
team_1_srs 760 | # # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs 761 | # # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1) 762 | # outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:]) 763 | # outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:]) 764 | 765 | # qt_best_team_1.append(outcome[0][1]) 766 | # qt_best_team_1.append(outcome_deep[0][1]) 767 | # qt_best_team_2.append(outcome[0][0]) 768 | # qt_best_team_2.append(outcome_deep[0][0]) 769 | 770 | # #quantile predictions - both play at their worsts 771 | # for ma in tqdm(ma_range): 772 | # # TEAM 1 773 | # data1_mean = team_1_df2023.rolling(window=ma).quantile(0.25).iloc[-1:] 774 | # # data1_mean['game_loc'] = game_loc_team1 775 | # data2_mean = team_2_df2023.rolling(window=ma).quantile(0.25).iloc[-1:] 776 | # # data2_mean['game_loc'] = game_loc_team2 777 | # #Here replace opponent metrics with the features of the second team 778 | # for col in data1_mean.columns: 779 | # if "opp" in col: 780 | # if col == 'opp_trb': 781 | # # new_col = col.replace("opp_", "") 782 | # data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board'] 783 | # else: 784 | # new_col = col.replace("opp_", "") 785 | # data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col] 786 | # #get latest SRS value 787 | # # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs 788 | # # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs 789 | # # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1) 790 | # outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:]) 791 | # outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:]) 792 | 793 | # qt_worst_team_1.append(outcome[0][1]) 794 | # qt_worst_team_1.append(outcome_deep[0][1]) 795 | # qt_worst_team_2.append(outcome[0][0]) 796 | # qt_worst_team_2.append(outcome_deep[0][0]) 797 | 798 | # #reflip for printing 799 | # team_1_srs = cbb_web_scraper.get_latest_srs(team_1) 800 | # team_2_srs = cbb_web_scraper.get_latest_srs(team_2) 801 | print('===============================================================') 802 | if team_1_srs > team_2_srs: 803 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} SRS data: {team_1_srs}'+ Style.RESET_ALL) 804 | print(Fore.RED + Style.BRIGHT + f'{team_2} SRS data: {team_2_srs}'+ Style.RESET_ALL) 805 | else: 806 | print(Fore.RED + Style.BRIGHT + f'{team_1} SRS data: {team_1_srs}'+ Style.RESET_ALL) 807 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} SRS data: {team_2_srs}'+ Style.RESET_ALL) 808 | print('===============================================================') 809 | if np.mean(prop_1.sum()) < np.mean(prop_2.sum()): 810 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} summed variability: {prop_1.sum()}'+ Style.RESET_ALL) 811 | print(Fore.RED + Style.BRIGHT + f'{team_2} summed variability: {prop_2.sum()}'+ Style.RESET_ALL) 812 | else: 813 | print(Fore.RED + Style.BRIGHT + f'{team_1} summed variability: {prop_1.sum()}'+ Style.RESET_ALL) 814 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} summed variability: {prop_2.sum()}'+ Style.RESET_ALL) 815 | print('===============================================================') 816 | if np.mean(team_1_ma_win) > np.mean(team_2_ma_win): 817 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities: {np.mean(team_1_ma_win)}'+ Style.RESET_ALL) 818 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win 
probabilities: {np.mean(team_2_ma_win)}'+ Style.RESET_ALL) 819 | else: 820 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities: {np.mean(team_1_ma_win)}'+ Style.RESET_ALL) 821 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities: {np.mean(team_2_ma_win)}'+ Style.RESET_ALL) 822 | print('===============================================================') 823 | if np.mean(qt_best_team_1) > np.mean(qt_best_team_2): 824 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities if they play at their best: {np.mean(qt_best_team_1)}'+ Style.RESET_ALL) 825 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win probabilities if they play at their best: {np.mean(qt_best_team_2)}'+ Style.RESET_ALL) 826 | else: 827 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities if they play at their best: {np.mean(qt_best_team_1)}'+ Style.RESET_ALL) 828 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities if they play at their best: {np.mean(qt_best_team_2)}'+ Style.RESET_ALL) 829 | print('===============================================================') 830 | if np.mean(qt_worst_team_1) > np.mean(qt_worst_team_2): 831 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities if they play at their worst: {np.mean(qt_worst_team_1)}'+ Style.RESET_ALL) 832 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win probabilities if they play at their worst: {np.mean(qt_worst_team_2)}'+ Style.RESET_ALL) 833 | else: 834 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities if they play at their worst: {np.mean(qt_worst_team_1)}'+ Style.RESET_ALL) 835 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities if they play at their worst: {np.mean(qt_worst_team_2)}'+ Style.RESET_ALL) 836 | print('===============================================================') 837 | if np.mean(random_pred_1) > np.mean(random_pred_2): 838 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities randomly selecting games: {np.mean(random_pred_1)}'+ Style.RESET_ALL) 839 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win probabilities randomly selecting games: {np.mean(random_pred_2)}'+ Style.RESET_ALL) 840 | else: 841 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities randomly selecting games: {np.mean(random_pred_1)}'+ Style.RESET_ALL) 842 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities randomly selecting games: {np.mean(random_pred_2)}'+ Style.RESET_ALL) 843 | print('===============================================================') 844 | if np.mean(random_pred_1_monte) > np.mean(random_pred_2_monte): 845 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities Monte Carlo Simulation: {np.mean(random_pred_1_monte)}'+ Style.RESET_ALL) 846 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win probabilities Monte Carlo Simulation: {np.mean(random_pred_2_monte)}'+ Style.RESET_ALL) 847 | else: 848 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities Monte Carlo Simulation: {np.mean(random_pred_1_monte)}'+ Style.RESET_ALL) 849 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities Monte Carlo Simulation: {np.mean(random_pred_2_monte)}'+ Style.RESET_ALL) 850 | 851 | # if "tod" in sys.argv[2]: 852 | # date_today = str(datetime.now().date()).replace("-", "") 853 | # elif "tom" in sys.argv[2]: 854 | # date_today = str(datetime.now().date() + timedelta(days=1)).replace("-", "") 855 | # URL = 
"https://www.espn.com/mens-college-basketball/schedule/_/date/" + date_today #sys argv???? 856 | # print(f'ESPN prediction: {cbb_web_scraper.get_espn(URL,team_1,team_2)}') 857 | print('===============================================================') 858 | # except Exception as e: 859 | # print(f'The error: {e}') 860 | def feature_importances_random_forest(self): 861 | importances = self.RandForclass.best_estimator_.feature_importances_ 862 | indices = np.argsort(importances) 863 | plt.figure() 864 | plt.title('Feature Importances Random Forest - Classifier') 865 | plt.barh(range(len(indices)), importances[indices], color='k', align='center') 866 | plt.yticks(range(len(indices)), [self.x_test.columns[i] for i in indices]) 867 | plt.xlabel('Relative Importance - explained variance') 868 | plt.tight_layout() 869 | plt.savefig('feature_importance_random_forest_classifier.png',dpi=300) 870 | 871 | def feature_importances_xgb(self): 872 | importances = self.xgb_class.best_estimator_.feature_importances_ 873 | indices = np.argsort(importances) 874 | plt.figure(figsize=(10,8)) 875 | plt.title('Feature Importances XGBoost - Classifier') 876 | plt.barh(range(len(indices)), importances[indices], color='k', align='center') 877 | plt.yticks(range(len(indices)), [self.x_test.columns[i] for i in indices]) 878 | plt.xlabel('Relative Importance - explained variance') 879 | plt.tight_layout() 880 | plt.savefig('feature_importance_xgb_classifier.png',dpi=300) 881 | plt.close() 882 | 883 | def deep_learning_feature_importances(self): 884 | model = self.final_model_deep 885 | x_train_array = np.array(self.x_test) 886 | masker = shap.maskers.Independent(data=x_train_array) 887 | explainer = shap.Explainer(model, masker) 888 | shap_values = explainer.shap_values(x_train_array) 889 | feature_importances = np.mean(np.abs(shap_values),axis=0) 890 | shap.summary_plot(feature_importances.T, 891 | feature_names=self.cols_save, 892 | plot_type="bar", 893 | max_display=feature_importances.shape[0], 894 | show=False) 895 | plt.savefig('SHAP_feature_importances.png',dpi=400) 896 | plt.close() 897 | 898 | def run_analysis(self): 899 | self.get_teams() 900 | self.split() 901 | self.deep_learn_analysis() 902 | self.xgboost_analysis() 903 | self.predict_two_teams() 904 | if self.which_analysis != 'pca': 905 | self.feature_importances_xgb() 906 | self.deep_learning_feature_importances() 907 | 908 | def main(): 909 | cbbClass('pca').run_analysis() # 'pca' or 'corr' 910 | if __name__ == '__main__': 911 | main() -------------------------------------------------------------------------------- /cbb_regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | College Basketball Predictions 5 | @author: brianszekely 6 | """ 7 | import cbb_web_scraper 8 | from os import getcwd 9 | from os.path import join, exists 10 | import yaml 11 | from tqdm import tqdm 12 | from time import sleep 13 | from pandas import DataFrame, concat, read_csv, isnull 14 | import numpy as np 15 | from sklearn.model_selection import train_test_split 16 | from sklearn.model_selection import GridSearchCV 17 | from sklearn.ensemble import RandomForestRegressor 18 | import matplotlib.pyplot as plt 19 | import seaborn as sns 20 | from sys import argv 21 | from sklearn.metrics import mean_squared_error, r2_score 22 | # from sklearn.model_selection import cross_val_score, KFold 23 | import pickle 24 | import joblib 25 | import sys 26 | import os 27 | from scipy.stats 
import variation
28 | from difflib import get_close_matches
29 | from datetime import datetime, timedelta
30 | class cbb_regressor():
31 |     def __init__(self):
32 |         print('initialize class cbb_regressor')
33 |         self.all_data = DataFrame()
34 |     def get_teams(self):
35 |         year_list_find = []
36 |         year_list = [2023,2022,2021,2019,2018,2017,2016,2015,2014,2013,2012] #2020 not included
37 |         if exists(join(getcwd(),'year_count.yaml')):
38 |             with open(join(getcwd(),'year_count.yaml')) as file:
39 |                 year_counts = yaml.load(file, Loader=yaml.FullLoader)
40 |         else:
41 |             year_counts = {'year':year_list_find}
42 |         #Remove any years that have already been collected
43 |         if year_counts['year']:
44 |             year_list_check = year_counts['year']
45 |             year_list_find = year_counts['year']
46 |             year_list = [i for i in year_list if i not in year_list_check]
47 |             print(f'Need data for year: {year_list}')
48 |         #Collect data per year
49 |         if year_list:
50 |             for year in tqdm(year_list):
51 |                 all_teams = cbb_web_scraper.get_teams_year(year_list[-1],year_list[0])
52 |                 team_names = sorted(all_teams)
53 |                 final_list = []
54 |                 self.year_store = year
55 |                 for abv in tqdm(team_names):
56 |                     try:
57 |                         print() #newline so tqdm output stays readable
58 |                         print(f'current team: {abv}, year: {year}')
59 |                         basic = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs.html'
60 |                         adv = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs-advanced.html'
61 |                         df_inst = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,abv,self.year_store)
62 |                         df_inst['pts'].replace('', np.nan, inplace=True)
63 |                         df_inst.dropna(inplace=True)
64 |                         final_list.append(df_inst)
65 |                     except Exception as e:
66 |                         print(e)
67 |                         print(f'{abv} data are not available')
68 |                     sleep(4) #I can get banned for a short period of time if I do not do this
69 |                 final_data = concat(final_list)
70 |                 if exists(join(getcwd(),'all_data_regressor.csv')):
71 |                     self.all_data = read_csv(join(getcwd(),'all_data_regressor.csv'))
72 |                 self.all_data = concat([self.all_data, final_data.dropna()])
73 |                 if not exists(join(getcwd(),'all_data_regressor.csv')):
74 |                     self.all_data.to_csv(join(getcwd(),'all_data_regressor.csv'),index=False)
75 |                 self.all_data.to_csv(join(getcwd(),'all_data_regressor.csv'),index=False)
76 |                 year_list_find.append(year)
77 |                 print(f'year list after loop: {year_list_find}')
78 |                 with open(join(getcwd(),'year_count.yaml'), 'w') as write_file:
79 |                     yaml.dump(year_counts, write_file)
80 |                     print(f'writing {year} to yaml file')
81 |         else:
82 |             self.all_data = read_csv(join(getcwd(),'all_data_regressor.csv'))
83 |         print('len data: ', len(self.all_data))
84 |         self.all_data = self.all_data.drop_duplicates(keep='last')
85 |         print(f'length of data after duplicates are dropped: {len(self.all_data)}')
86 |     def convert_to_float(self):
87 |         for col in self.all_data.columns:
88 |             self.all_data[col].replace('', np.nan, inplace=True)
89 |             self.all_data[col] = self.all_data[col].astype(float)
90 |     def delete_opp(self):
91 |         """
92 |         Drop opponent columns: they are hard to estimate with a running average and may not help prediction.
93 |         """
94 |         for col in self.all_data.columns:
95 |             if 'opp' in col:
96 |                 self.all_data.drop(columns=col,inplace=True)
97 |     def split(self):
98 |         # self.delete_opp()
99 |         for col in self.all_data.columns:
100 |             if 'Unnamed' in col:
101 |                 self.all_data.drop(columns=col,inplace=True)
102 |         self.convert_to_float()
103 |         self.y = self.all_data['pts']
104 |         self.x = self.all_data.drop(columns=['pts','game_result'])
105 |         self.pre_process()
106 |         #Drop NaN rows and keep y aligned with the remaining samples
107 |         real_values = ~self.x_no_corr.isna().any(axis=1)
108 |         self.x_no_corr.dropna(inplace=True)
109 |         self.y = self.y.loc[real_values]
110 |         self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_no_corr, self.y, train_size=0.8)
111 |     def pre_process(self):
112 |         # Remove features with a correlation coefficient >= 0.90
113 |         corr_matrix = np.abs(self.x.astype(float).corr())
114 |         upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) #plain bool: np.bool was removed from modern NumPy
115 |         to_drop = [column for column in upper.columns if any(upper[column] >= 0.90)]
116 |         self.drop_cols = to_drop
117 |         self.x_no_corr = self.x.drop(columns=to_drop)
118 |         cols = self.x_no_corr.columns
119 |         print(f'Columns dropped >= 0.90: {to_drop}')
120 |         #Drop samples that are outliers
121 |         print(f'old feature dataframe shape before outlier removal: {self.x_no_corr.shape}')
122 |         for col_name in cols:
123 |             Q1 = np.percentile(self.x_no_corr[col_name], 25)
124 |             Q3 = np.percentile(self.x_no_corr[col_name], 75)
125 |             IQR = Q3 - Q1
126 |             #1.5*IQR is the standard fence; 2.5 keeps more samples to see if more data helps model performance
127 |             upper = self.x_no_corr.index[self.x_no_corr[col_name] >= (Q3+2.5*IQR)] #index labels, not positions, so drop() removes the right rows after earlier drops
128 |             lower = self.x_no_corr.index[self.x_no_corr[col_name] <= (Q1-2.5*IQR)]
129 |             self.x_no_corr.drop(upper, inplace = True)
130 |             self.x_no_corr.drop(lower, inplace = True)
131 |             self.y.drop(upper, inplace = True); self.y.drop(lower, inplace = True)
132 |         if 'level_0' in self.x_no_corr.columns:
133 |             self.x_no_corr.drop(columns=['level_0'],inplace = True)
134 |         self.x_no_corr.reset_index(inplace = True)
135 |         self.y.reset_index(inplace = True, drop=True)
136 |         self.x_no_corr.drop(columns=[c for c in ['level_0','index'] if c in self.x_no_corr.columns],inplace = True)
137 |         print(f'new feature dataframe shape after outlier removal: {self.x_no_corr.shape}')
138 |         top_corr_features = corr_matrix.index
139 |         plt.figure(figsize=(20,20))
140 |         sns.heatmap(corr_matrix[top_corr_features],annot=True,cmap="RdYlGn")
141 |         plt.tight_layout()
142 |         plt.savefig('correlations.png',dpi=250)
143 |         plt.close()
144 |     def random_forest_analysis(self):
145 |         if argv[1] == 'tune':
146 |             #RANDOM FOREST REGRESSOR
147 |             RandForclass = RandomForestRegressor()
148 |             #Use the number of features as a stopping criterion for depth
149 |             rows, cols = self.x_train.shape
150 |             cols = int(cols / 1.18) #cap max_depth below the feature count to limit overfitting
151 |             #square root of the total number of features is a good limit
152 |             # cols = int(np.sqrt(cols))
153 |             #parameters to tune
154 |             #increasing min_samples_leaf will reduce overfitting
155 |             Rand_perm = {
156 |                 'criterion' : ["squared_error", "poisson"], #absolute_error takes far longer to run
157 |                 'n_estimators': range(300,500,100),
158 |                 # 'min_samples_split': np.arange(2, 5, 1, dtype=int),
159 |                 'max_features' : [1, 'sqrt', 'log2'],
160 |                 'max_depth': np.arange(2,cols,1),
161 |                 'min_samples_leaf': np.arange(1,3,1)
162 |                 }
163 |             #Other scoring strings accepted by GridSearchCV: ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score',
164 |             # 'average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance',
165 |             # 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score',
166 |             # 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples',
167 |             # 'jaccard_weighted', 'matthews_corrcoef', 'max_error', 'mutual_info_score', 'neg_brier_score',
168 |             # 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error',
169 |             # 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error',
170 |             # 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_root_mean_squared_error',
171 |             # 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'roc_auc_ovo', 'roc_auc_ovo_weighted', 'roc_auc_ovr', 'roc_auc_ovr_weighted', 'top_k_accuracy', 'v_measure_score']
172 |             clf_rand = GridSearchCV(RandForclass, Rand_perm,
173 |                                     scoring=['neg_root_mean_squared_error','explained_variance'],
174 |                                     cv=5,
175 |                                     refit='neg_root_mean_squared_error',verbose=4, n_jobs=-1)
176 |             #fit and save
177 |             search_rand = clf_rand.fit(self.x_train,self.y_train)
178 |             #Write fitted and tuned model to file
179 |             # with open('randomForestModelTuned.pkl','wb') as f:
180 |             #     pickle.dump(search_rand,f)
181 |             joblib.dump(search_rand, "./randomForestModelTuned.joblib", compress=9)
182 |             print('RandomForestRegressor - best params: ',search_rand.best_params_)
183 |             self.RandForRegressor = search_rand
184 |             self.rmse = mean_squared_error(self.y_test,self.RandForRegressor.predict(self.x_test),squared=False)
185 |             print('RMSE: ',self.rmse)
186 |             print('R2 score: ',r2_score(self.y_test,self.RandForRegressor.predict(self.x_test))) #r2_score expects (y_true, y_pred)
187 |         else:
188 |             print('Load tuned Random Forest Regressor')
189 |             # load RandomForestModel
190 |             # with open('randomForestModelTuned.pkl', 'rb') as f:
191 |             #     self.RandForRegressor = pickle.load(f)
192 |             self.RandForRegressor=joblib.load("./randomForestModelTuned.joblib")
193 |             print(f'Current RandomForestRegressor Parameters: {self.RandForRegressor.best_params_}')
194 |             print('RMSE: ',mean_squared_error(self.y_test,self.RandForRegressor.predict(self.x_test),squared=False))
195 |             print('R2 score: ',r2_score(self.y_test,self.RandForRegressor.predict(self.x_test))) #r2_score expects (y_true, y_pred)
196 |             self.rmse = mean_squared_error(self.y_test,self.RandForRegressor.predict(self.x_test),squared=False)
197 |         # self.RandForRegressor = RandomForestRegressor(criterion='squared_error',
198 |         #                                               max_depth=20,
199 |         #                                               max_features='log2',
200 |         #                                               n_estimators=300,
201 |         #                                               min_samples_leaf=3)
202 |     # def multi_layer_perceptron(self):
203 |     #     pass
204 |     # def keras_regressor_analysis(self):
205 |     #     pass
206 |     def predict_two_teams(self):
207 |         teams_sports_ref = read_csv('teams_sports_ref_format.csv')
208 |         while True:
209 |             try:
210 |                 team_1 = input('team_1: ')
211 |                 if team_1 == 'exit':
212 |                     break
213 |                 team_2 = input('team_2: ')
214 |                 #Game location
215 |                 game_loc_team1 = int(input(f'{team_1} : home = 0, away = 1, neutral = 2: '))
216 |                 if game_loc_team1 == 0:
217 |                     game_loc_team2 = 1
218 |                 elif game_loc_team1 == 1:
219 |                     game_loc_team2 = 0
220 |                 elif game_loc_team1 == 2:
221 |                     game_loc_team2 = 2
222 |                 #Check to see if the team name was spelled right
223 |                 team_1 = get_close_matches(team_1,teams_sports_ref['teams'].tolist(),n=1)[0]
224 |                 team_2 = get_close_matches(team_2,teams_sports_ref['teams'].tolist(),n=1)[0]
225 |                 #2023 data
226 |                 year = 2023
227 |                 # sleep(4)
228 |                 basic = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs.html'
229 |                 adv = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs-advanced.html'
230 |                 team_1_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_1.lower(),year)
231 |                 sleep(4) #I can get banned for a short period of time if I do not do this
232 |                 basic = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs.html'
233 |                 adv = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs-advanced.html'
234 |                 team_2_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_2.lower(),year)
235 |                 #Remove empty cells
236 |                 team_1_df2023['pts'].replace('', np.nan, inplace=True)
237 |                 team_1_df2023.replace('', np.nan, inplace=True)
238 |                 team_1_df2023.dropna(inplace=True)
239 |                 team_2_df2023['pts'].replace('', np.nan, inplace=True)
240 |                 team_2_df2023.replace('', np.nan, inplace=True)
241 |                 team_2_df2023.dropna(inplace=True)
242 |                 #Save series of pts for visualizations
243 |                 self.pts_team_1 = team_1_df2023['pts'].astype(float)
244 |                 self.team_1_name = team_1
245 |                 self.pts_team_2 = team_2_df2023['pts'].astype(float)
246 |                 self.team_2_name = team_2
247 |                 #Remove pts and game result
248 |                 # for col in team_1_df2023.columns:
249 |                 #     if 'opp' in col:
250 |                 #         team_1_df2023.drop(columns=col,inplace=True)
251 |                 # for col in team_2_df2023.columns:
252 |                 #     if 'opp' in col:
253 |                 #         team_2_df2023.drop(columns=col,inplace=True)
254 |                 # team_1_df2023.drop(columns=['game_result','pts'],inplace=True)
255 |                 # team_2_df2023.drop(columns=['game_result','pts'],inplace=True)
256 |                 #Drop the correlated features
257 |                 team_1_df2023.drop(columns=self.drop_cols, inplace=True)
258 |                 team_2_df2023.drop(columns=self.drop_cols, inplace=True)
259 |                 # team_1_df2023.to_csv('team_1.csv')
260 |                 # team_2_df2023.to_csv('team_2.csv')
261 |                 # print(team_1_df2023)
262 |                 # print(team_2_df2023)
263 |                 #Clean up dataframe
264 |                 # for col in team_1_df2023.columns:
265 |                 #     if 'Unnamed' in col:
266 |                 #         team_1_df2023.drop(columns=col,inplace=True)
267 |                 # for col in team_2_df2023.columns:
268 |                 #     if 'Unnamed' in col:
269 |                 #         team_2_df2023.drop(columns=col,inplace=True)
270 |                 #Try to find the moving average windows that work
271 |                 # ma_range = np.arange(2,len(team_2_df2023)-2,1)
272 |                 ma_range = np.arange(2,7,1) #windows of 2-6 games; in testing, 2 worked best for the mean and 8 for the median
273 |                 team_1_count = 0
274 |                 team_2_count = 0
275 |                 team_1_count_mean = 0
276 |                 team_2_count_mean = 0
277 |                 team_1_ma = []
278 |                 team_2_ma = []
279 |                 team_1_median = []
280 |                 team_2_median = []
281 |                 num_pts_score_team_1= []
282 |                 num_pts_score_team_2 = []
283 |                 mean_team_1_var = []
284 |                 mean_team_2_var = []
285 |                 # Get the latest simple rating system for both teams
286 |                 team_1_srs = cbb_web_scraper.get_latest_srs(team_1)
287 |                 team_2_srs = cbb_web_scraper.get_latest_srs(team_2)
288 |                 for ma in tqdm(ma_range):
289 |                     data1_median = team_1_df2023.rolling(ma).median()
290 |                     data1_median['game_loc'] = game_loc_team1
291 |                     data2_median = team_2_df2023.rolling(ma).median()
292 |                     data2_median['game_loc'] = game_loc_team2
293 |                     # data1_mean_old = team_1_df2023.rolling(ma).mean()
294 |                     # data2_mean_old = team_2_df2023.rolling(ma).mean()
295 |                     data1_mean =
team_1_df2023.ewm(span=ma,min_periods=ma-1).mean() 296 | data1_mean['game_loc'] = game_loc_team1 297 | data2_mean = team_2_df2023.ewm(span=ma,min_periods=ma-1).mean() 298 | data2_mean['game_loc'] = game_loc_team2 299 | for col in team_1_df2023.columns: 300 | if "opp" in col: 301 | if col == 'opp_trb': 302 | # new_col = col.replace("opp_", "") 303 | data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board'] 304 | data2_mean.loc[data2_mean.index[-1], 'opp_trb'] = data1_mean.loc[data1_mean.index[-1], 'total_board'] 305 | 306 | data1_median.loc[data1_median.index[-1], 'opp_trb'] = data2_median.loc[data2_median.index[-1], 'total_board'] 307 | data2_median.loc[data2_median.index[-1], 'opp_trb'] = data1_median.loc[data1_median.index[-1], 'total_board'] 308 | else: 309 | new_col = col.replace("opp_", "") 310 | data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col] 311 | data2_mean.loc[data2_mean.index[-1], col] = data1_mean.loc[data1_mean.index[-1], new_col] 312 | 313 | data1_median.loc[data1_median.index[-1], col] = data2_median.loc[data2_median.index[-1], new_col] 314 | data2_median.loc[data2_median.index[-1], col] = data1_median.loc[data1_median.index[-1], new_col] 315 | 316 | #Drop game result and points features 317 | data1_median.drop(columns=['game_result','pts'],inplace=True) 318 | data2_median.drop(columns=['game_result','pts'],inplace=True) 319 | data1_mean.drop(columns=['game_result','pts'],inplace=True) 320 | data2_mean.drop(columns=['game_result','pts'],inplace=True) 321 | #apply SRS 322 | data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs 323 | data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs 324 | data1_median.loc[data1_median.index[-1], 'simple_rating_system'] = team_1_srs 325 | data2_median.loc[data2_median.index[-1], 'simple_rating_system'] = team_2_srs 326 | #Get current predictions for both teams 327 | team_1_predict_median = self.RandForRegressor.predict(data1_median.iloc[-1:]) 328 | team_2_predict_median = self.RandForRegressor.predict(data2_median.iloc[-1:]) 329 | team_1_predict_mean = self.RandForRegressor.predict(data1_mean.iloc[-1:]) 330 | team_2_predict_mean = self.RandForRegressor.predict(data2_mean.iloc[-1:]) 331 | num_pts_score_team_1.append(team_1_predict_mean[0]) 332 | num_pts_score_team_2.append(team_2_predict_mean[0]) 333 | num_pts_score_team_1.append(team_1_predict_median[0]) 334 | num_pts_score_team_2.append(team_2_predict_median[0]) 335 | if team_1_predict_median > team_2_predict_median: 336 | team_1_count += 1 337 | team_1_median.append(ma) 338 | if team_1_predict_median < team_2_predict_median: 339 | team_2_count += 1 340 | team_2_median.append(ma) 341 | if team_1_predict_mean > team_2_predict_mean: 342 | team_1_count_mean += 1 343 | team_1_ma.append(ma) 344 | if team_1_predict_mean < team_2_predict_mean: 345 | team_2_count_mean += 1 346 | team_2_ma.append(ma) 347 | #check variability between fg and off_ftg 348 | mean_team_1_var.append(np.mean(data1_mean[['fg','off_rtg']].dropna().std())) 349 | mean_team_1_var.append(np.mean(data1_median[['fg','off_rtg']].dropna().std())) 350 | mean_team_2_var.append(np.mean(data2_mean[['fg','off_rtg']].dropna().std())) 351 | mean_team_2_var.append(np.mean(data2_median[['fg','off_rtg']].dropna().std())) 352 | print('===============================================================') 353 | print(f'{team_1} SRS data: {team_1_srs}') 354 | print(f'{team_2} SRS data: {team_2_srs}') 355 | 
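                #How the loop above scores the matchup, in brief: each window size ma yields one
                #smoothed "current form" row per team (EWM mean and rolling median, with the
                #opponent columns swapped in from the other team and the latest SRS applied), and
                #the regressor predicts points from that row. A minimal sketch of the idea
                #(illustrative only; 'form' and 'pts_hat' are hypothetical names, not part of this file):
                #    form = team_1_df2023.ewm(span=ma, min_periods=ma-1).mean().iloc[-1:]
                #    pts_hat = self.RandForRegressor.predict(form)
                #The winner is whichever team out-scores the other across the majority of window
                #sizes, with the mean- and median-smoothed tallies kept separately.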
print('===============================================================')
356 |                 print(f'Outcomes with a rolling median from 2-{ma_range[-1]} games')
357 |                 print(f'{team_1}: {team_1_count} | {team_1_median}')
358 |                 print(f'{team_2}: {team_2_count} | {team_2_median}')
359 |                 if team_1_count > team_2_count:
360 |                     print(f'======= {team_1} wins =======')
361 |                 elif team_1_count < team_2_count:
362 |                     print(f'======= {team_2} wins =======')
363 |                 print('===============================================================')
364 |                 print(f'Outcomes with an exponentially weighted mean over spans of 2-{ma_range[-1]} games')
365 |                 print(f'{team_1}: {team_1_count_mean} | {team_1_ma}')
366 |                 print(f'{team_2}: {team_2_count_mean} | {team_2_ma}')
367 |                 if team_1_count_mean > team_2_count_mean:
368 |                     print(f'======= {team_1} wins =======')
369 |                 elif team_1_count_mean < team_2_count_mean:
370 |                     print(f'======= {team_2} wins =======')
371 |                 print('===============================================================')
372 |                 print(f'{team_1} predicted points scored: {int(np.mean(num_pts_score_team_1))} +/- {np.std(num_pts_score_team_1)}')
373 |                 print(f'{team_2} predicted points scored: {int(np.mean(num_pts_score_team_2))} +/- {np.std(num_pts_score_team_2)}')
374 |                 if abs(int(np.mean(num_pts_score_team_1)) - int(np.mean(num_pts_score_team_2))) < 3: #3 points is roughly self.rmse
375 |                     print('The point differential is less than the model RMSE (~3 points), be cautious.')
376 |                 print('===============================================================')
377 |                 print(f'Mean standard deviation of the two most important features for {team_1}: {np.mean(mean_team_1_var)}')
378 |                 print(f'Mean standard deviation of the two most important features for {team_2}: {np.mean(mean_team_2_var)}')
379 |                 print('===============================================================')
380 |                 print(f'Standard deviation of points scored by {team_1}: {np.std(self.pts_team_1)}')
381 |                 print(f'Standard deviation of points scored by {team_2}: {np.std(self.pts_team_2)}')
382 |                 print('===============================================================')
383 |                 if "tod" in sys.argv[2]:
384 |                     date_today = str(datetime.now().date()).replace("-", "")
385 |                 elif "tom" in sys.argv[2]:
386 |                     date_today = str(datetime.now().date() + timedelta(days=1)).replace("-", "")
387 |                 URL = "https://www.espn.com/mens-college-basketball/schedule/_/date/" + date_today #date comes from sys.argv[2]: 'tod' = today, 'tom' = tomorrow
388 |                 print(f'ESPN prediction: {cbb_web_scraper.get_espn(URL,team_1,team_2)}')
389 |                 print('===============================================================')
390 |                 if sys.argv[2] == "show":
391 |                     self.visualization(np.mean(num_pts_score_team_1),np.mean(num_pts_score_team_2))
392 |             except Exception as e:
393 |                 print(f'The error: {e}')
394 |                 exc_type, exc_obj, exc_tb = sys.exc_info()
395 |                 fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
396 |                 print(exc_type,' File with the error: ', fname, ' Line number with error: ',exc_tb.tb_lineno)
397 |                 if exc_tb.tb_lineno == 226:
398 |                     print(f'{team_1} data could not be found. Check spelling or internet connection. Some teams do not have data on SportsReference')
399 |                 elif exc_tb.tb_lineno == 229:
400 |                     print(f'{team_2} data could not be found. Check spelling or internet connection.
Some teams do not have data on SportsReference') 401 | def feature_importances_random_forest(self): 402 | importances = self.RandForRegressor.best_estimator_.feature_importances_ 403 | indices = np.argsort(importances) 404 | plt.figure(figsize=(12,10)) 405 | plt.title('Feature Importances Random Forest') 406 | # plt.barh(range(len(indices)), importances[indices], color='k', align='center') 407 | sns.barplot(x=importances[indices], y=[self.x_test.columns[i] for i in indices], color='k') 408 | plt.yticks(range(len(indices)), [self.x_test.columns[i] for i in indices]) 409 | plt.xlabel('Relative Importance') 410 | plt.tight_layout() 411 | plt.savefig('feature_importance_random_forest.png',dpi=300) 412 | def visualization(self,pred_1,pred_2): 413 | games_1 = range(1,len(self.pts_team_1)+1,1) 414 | games_2 = range(1,len(self.pts_team_2)+1,1) 415 | team_1_pred = self.team_1_name + " prediction" 416 | team_2_pred = self.team_2_name + " prediction" 417 | plt.figure() 418 | plt.plot(games_1,self.pts_team_1,color='green',label=self.team_1_name) 419 | plt.plot(games_2,self.pts_team_2,color='blue',label=self.team_2_name) 420 | plt.scatter(len(self.pts_team_1)+2,pred_1,color='green',label=team_1_pred) 421 | plt.scatter(len(self.pts_team_2)+2,pred_2,color='blue',label=team_2_pred) 422 | plt.legend() 423 | plt.xlabel('Games') 424 | plt.ylabel('Points') 425 | plt.tight_layout() 426 | plt.show() 427 | def run_analysis(self): 428 | self.get_teams() 429 | self.split() 430 | self.random_forest_analysis() 431 | self.predict_two_teams() 432 | self.feature_importances_random_forest() 433 | def main(): 434 | cbb_regressor().run_analysis() 435 | if __name__ == '__main__': 436 | main() -------------------------------------------------------------------------------- /cbb_web_scraper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | html parse code - college basketball 5 | @author: brianszekely 6 | """ 7 | import requests 8 | from bs4 import BeautifulSoup 9 | from pandas import DataFrame 10 | from numpy import nan 11 | from time import sleep 12 | from os.path import join, exists 13 | from os import getcwd 14 | from urllib import request 15 | from urllib.request import Request, urlopen 16 | from pandas import read_csv 17 | from numpy import where 18 | from re import search 19 | from difflib import get_close_matches 20 | from datetime import datetime 21 | from numpy import nan 22 | #TODO: CREATE A FEATURE OF opp_simple_rating_system 23 | 24 | def get_teams_year(year_min,year_max): 25 | #Try to redo this when 429 is not an issue 26 | # URL = 'https://www.sports-reference.com/cbb/schools/' 27 | # hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"} 28 | # req = Request(URL,headers=hdr) 29 | # html = request.urlopen(req) 30 | # soup = BeautifulSoup(html, "html.parser") 31 | # table = soup.find(class_="table_container is_setup") 32 | # print(soup) 33 | # input() 34 | #Read in from csv 35 | teams_save = [] 36 | teams = read_csv('all_teams_cbb.csv') 37 | teams_with_year = where((teams['From'] <= year_min) & (teams['To'] == year_max))[0] 38 | for team in teams['School'].iloc[teams_with_year]: 39 | team = team.replace(' ', '-').lower() 40 | if '.' 
in team:
41 |             team = team.replace(".", "")
42 |         if 'the' in team:
43 |             team = team.replace("the-", "")
44 |         if '&' in team:
45 |             team = team.replace("&", "")
46 |         if '(' in team and ')' in team:
47 |             team = team.replace("(", "")
48 |             team = team.replace(")", "")
49 |         if "'" in team:
50 |             team = team.replace("'", "")
51 |         teams_save.append(team)
52 |     return teams_save
53 | 
54 | def alter_string(team): #normalize a single team name (same rules as get_teams_year)
55 |     team = team.replace(' ', '-').lower()
56 |     if '.' in team:
57 |         team = team.replace(".", "")
58 |     if 'the' in team:
59 |         team = team.replace("the-", "")
60 |     if '&' in team:
61 |         team = team.replace("&", "")
62 |     if '(' in team and ')' in team:
63 |         team = team.replace("(", "")
64 |         team = team.replace(")", "")
65 |     if "'" in team:
66 |         team = team.replace("'", "")
67 |     return team
68 | def get_latest_srs(team):
69 |     sleep(4)
70 |     url_srs = f'https://www.sports-reference.com/cbb/schools/{team}/men/2024-schedule.html'
71 |     hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
72 |     req_1 = Request(url_srs,headers=hdr)
73 |     html_1 = request.urlopen(req_1)
74 |     soup_3 = BeautifulSoup(html_1, "html.parser")
75 |     table3 = soup_3.find(id='div_schedule')
76 |     tbody2 = table3.find('tbody')
77 |     tr_body2 = tbody2.find_all('tr')
78 |     srs = []
79 |     for trb in tr_body2:
80 |         for td in trb.find_all('td'):
81 |             if td.get('data-stat') == "srs" and td.get_text() != '': #skip rows with no SRS value yet
82 |                 srs.append(td.get_text())
83 |     return float(srs[-1])
84 | 
85 | def get_adv_opp_variables(team,parsed_date):
86 |     date_without_time = parsed_date.strftime('%Y-%m-%d')
87 |     sleep(3)
88 |     url ='https://www.sports-reference.com/cbb/schools/' + team + '/' + str(2024) + '-gamelogs-advanced.html'
89 |     hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
90 |     req_1 = Request(url,headers=hdr)
91 |     html_1 = request.urlopen(req_1)
92 |     soup_2 = BeautifulSoup(html_1, "html.parser")
93 |     table2 = soup_2.find(id="all_sgl-advanced")
94 |     tbody2 = table2.find('tbody')
95 |     tr_body2 = tbody2.find_all('tr')
96 |     # off_rtg, def_rtg = [], []
97 |     efg_pct = None
98 |     print(f'team they played: {team}')
99 |     for trb in tr_body2: #only the row whose date matches parsed_date is read; other rows are skipped
100 |         for td in trb.find_all('td'):
101 |             if td.get('data-stat') == 'date':
102 |                 if td.get_text() == date_without_time:
103 |                     continue
104 |                 else:
105 |                     break
106 |             if td.get('data-stat') == "efg_pct":
107 |                 efg_pct = td.get_text()
108 |     return efg_pct
109 | 
110 | def html_to_df_web_scrape_cbb(URL,URL1,team,year):
111 |     #URL = Basic data ; URL1 = Advanced stats
112 |     url_srs = f'https://www.sports-reference.com/cbb/schools/{team}/men/{year}-schedule.html'
113 |     hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
114 |     req_1 = Request(URL,headers=hdr)
115 |     html_1 = request.urlopen(req_1)
116 |     sleep(4)
117 |     req_2 = Request(URL1,headers=hdr)
118 |     html_2 = request.urlopen(req_2)
119 |     sleep(4)
120 |     req_3 = Request(url_srs,headers=hdr)
121 |     html_3 = request.urlopen(req_3)
122 |     # while True:
123 |     #     try:
124 |     soup_1 = BeautifulSoup(html_1, "html.parser")
125 |     soup_2 = BeautifulSoup(html_2, "html.parser")
126 |     soup_3 = BeautifulSoup(html_3, "html.parser")
127 |     # page = requests.get(URL)
128 |     # soup = BeautifulSoup(page.content, "html.parser")
129 |     # page1 = requests.get(URL1)
130 |     # soup1 = BeautifulSoup(page1.content, "html.parser")
131 |     #         break
132 |     #     except:
133 |     #
print('HTTPSConnectionPool(host="www.sports-reference.com", port=443): Max retries exceeded. Retry in 10 seconds') 134 | # sleep(10) 135 | # table = soup_1.find(id="all_sgl-basic") 136 | table = soup_1.select_one('table[id^="sgl-basic"]') 137 | table1 = soup_2.find(id="all_sgl-advanced") 138 | table3 = soup_3.find(id='div_schedule') 139 | tbody = table.find('tbody') 140 | tbody1 = table1.find('tbody') 141 | tbody2 = table3.find('tbody') 142 | tr_body = tbody.find_all('tr') 143 | tr_body1 = tbody1.find_all('tr') 144 | tr_body2 = tbody2.find_all('tr') 145 | # game_season = [] 146 | # date_game = [] 147 | # game_location = [] 148 | # opp_id= [] 149 | # BASIC STATS 150 | game_result= [] 151 | pts= [] 152 | opp_pts= [] 153 | fg= [] 154 | fga= [] 155 | fg_pct= [] 156 | fg3= [] 157 | fg3a= [] 158 | fg3_pct= [] 159 | ft= [] 160 | fta= [] 161 | ft_pct= [] 162 | orb= [] 163 | total_board= [] 164 | ast= [] 165 | stl= [] 166 | blk= [] 167 | tov= [] 168 | pf= [] 169 | opp_fg = [] 170 | opp_fga= [] 171 | opp_fg_pct= [] 172 | opp_fg3= [] 173 | opp_fg3a= [] 174 | opp_fg3_pct= [] 175 | opp_ft= [] 176 | opp_fta= [] 177 | opp_ft_pct= [] 178 | opp_orb= [] 179 | opp_trb= [] 180 | opp_ast= [] 181 | opp_stl= [] 182 | opp_blk= [] 183 | opp_tov= [] 184 | opp_pf= [] 185 | game_loc = [] 186 | srs = [] 187 | date_save = [] 188 | efg_percent_opp = [] 189 | # opp_srs = [] 190 | #SIMPLE RATING SYSTEM 191 | # teams_sports_ref = read_csv('teams_sports_ref_format.csv') 192 | for trb in tr_body2: 193 | for td in trb.find_all('td'): 194 | # if td.get('data-stat') == 'opp_name': 195 | # get_close_matches(td.get_text(),teams_sports_ref['teams'].tolist(),n=1)[0] 196 | # print(td.get_text()) 197 | if td.get('data-stat') == "srs": 198 | if td.get_text() == '': 199 | srs.append(nan) 200 | else: 201 | srs.append(td.get_text()) 202 | #SIMPLE RATING SYSTEM - OPPONENT ? 203 | #BASIC STATS - change td.get_text() to float(td.get_text()) ? 
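    #The loop below walks every <td> in the basic game log and routes each cell's text
    #into the matching list via its data-stat attribute. A more compact, behavior-equivalent
    #pattern would be a dispatch dict (sketch only; 'stat_lists' is a hypothetical name):
    #    stat_lists = {'pts': pts, 'opp_pts': opp_pts, 'fg': fg, 'fga': fga}
    #    for td in trb.find_all('td'):
    #        stat = td.get('data-stat')
    #        if stat in stat_lists:
    #            stat_lists[stat].append(td.get_text())
    #game_location, game_result, and date would still need their special-case branches.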
204 | for trb in tr_body: 205 | for td in trb.find_all('td'): 206 | if td.get('data-stat') == "game_location": 207 | #home = 0, away = 1, N = 2 208 | if td.get_text() == 'N': 209 | game_loc.append(2) 210 | elif td.get_text() == '@': 211 | game_loc.append(1) 212 | elif td.get_text() == '': 213 | game_loc.append(0) 214 | if td.get('data-stat') == "game_result": 215 | if 'W' in td.get_text(): 216 | game_result.append(1) 217 | else: 218 | game_result.append(0) 219 | if td.get('data-stat') == "date": 220 | parsed_date = datetime.strptime(td.get_text(), '%Y-%m-%d') 221 | month = parsed_date.month 222 | day = parsed_date.day 223 | date_save.append(float(f'{month}.{day}')) 224 | #TODO: FIX THIS IN THE FUTURE TO ADD OPPONENT VARIABLES 225 | # if td.get('data-stat') == "opp_team_id": 226 | # opp_name = alter_string(td.get_text()) 227 | # try: 228 | # efg_percent_opp.append(get_adv_opp_variables(opp_name,parsed_date)) 229 | # except: 230 | # print(f'no advanced data for {opp_name}, advanced opponent variables are None') 231 | # efg_percent_opp.append(nan) 232 | if td.get('data-stat') == "pts": 233 | pts.append(td.get_text()) 234 | if td.get('data-stat') == "opp_pts": 235 | opp_pts.append(td.get_text()) 236 | if td.get('data-stat') == "fg": 237 | fg.append(td.get_text()) 238 | if td.get('data-stat') == "fga": 239 | fga.append(td.get_text()) 240 | if td.get('data-stat') == "fg_pct": 241 | fg_pct.append(td.get_text()) 242 | if td.get('data-stat') == "fg3": 243 | fg3.append(td.get_text()) 244 | if td.get('data-stat') == "fg3a": 245 | fg3a.append(td.get_text()) 246 | if td.get('data-stat') == "fg3_pct": 247 | fg3_pct.append(td.get_text()) 248 | if td.get('data-stat') == "ft": 249 | ft.append(td.get_text()) 250 | if td.get('data-stat') == "fta": 251 | fta.append(td.get_text()) 252 | if td.get('data-stat') == "ft_pct": 253 | ft_pct.append(td.get_text()) 254 | if td.get('data-stat') == "orb": 255 | orb.append(td.get_text()) 256 | if td.get('data-stat') == "trb": 257 | total_board.append(td.get_text()) 258 | if td.get('data-stat') == "ast": 259 | ast.append(td.get_text()) 260 | if td.get('data-stat') == "stl": 261 | stl.append(td.get_text()) 262 | if td.get('data-stat') == "blk": 263 | blk.append(td.get_text()) 264 | if td.get('data-stat') == "tov": 265 | tov.append(td.get_text()) 266 | if td.get('data-stat') == "pf": 267 | pf.append(td.get_text()) 268 | if td.get('data-stat') == "opp_fg": 269 | opp_fg.append(td.get_text()) 270 | if td.get('data-stat') == "opp_fga": 271 | opp_fga.append(td.get_text()) 272 | if td.get('data-stat') == "opp_fg_pct": 273 | opp_fg_pct.append(td.get_text()) 274 | if td.get('data-stat') == "opp_fg3": 275 | opp_fg3.append(td.get_text()) 276 | if td.get('data-stat') == "opp_fg3a": 277 | opp_fg3a.append(td.get_text()) 278 | if td.get('data-stat') == "opp_fg3_pct": 279 | opp_fg3_pct.append(td.get_text()) 280 | if td.get('data-stat') == "opp_ft": 281 | opp_ft.append(td.get_text()) 282 | if td.get('data-stat') == "opp_fta": 283 | opp_fta.append(td.get_text()) 284 | if td.get('data-stat') == "opp_ft_pct": 285 | opp_ft_pct.append(td.get_text()) 286 | if td.get('data-stat') == "opp_orb": 287 | opp_orb.append(td.get_text()) 288 | if td.get('data-stat') == "opp_trb": 289 | opp_trb.append(td.get_text()) 290 | if td.get('data-stat') == "opp_ast": 291 | opp_ast.append(td.get_text()) 292 | if td.get('data-stat') == "opp_stl": 293 | opp_stl.append(td.get_text()) 294 | if td.get('data-stat') == "opp_blk": 295 | opp_blk.append(td.get_text()) 296 | if td.get('data-stat') == "opp_tov": 297 | 
opp_tov.append(td.get_text()) 298 | if td.get('data-stat') == "opp_pf": 299 | opp_pf.append(td.get_text()) 300 | #ADVANCED STATS 301 | off_rtg = [] 302 | def_rtg = [] 303 | off_rtg_opp = [] 304 | def_rtg_opp = [] 305 | pace = [] 306 | fta_per_fga_pct = [] 307 | fg3a_per_fga_pct = [] 308 | ts_pct = [] 309 | trb_pct = [] 310 | ast_pct = [] 311 | stl_pct = [] 312 | blk_pct = [] 313 | efg_pct = [] 314 | tov_pct = [] 315 | orb_pct = [] 316 | ft_rate = [] 317 | opp_efg_pct= [] 318 | opp_tov_pct = [] 319 | drb_pct = [] 320 | opp_ft_rate = [] 321 | for trb in tr_body1: 322 | for td in trb.find_all('td'): 323 | if td.get('data-stat') == "off_rtg": 324 | off_rtg.append(td.get_text()) 325 | def_rtg_opp.append(td.get_text()) 326 | if td.get('data-stat') == "def_rtg": 327 | off_rtg_opp.append(td.get_text()) 328 | def_rtg.append(td.get_text()) 329 | if td.get('data-stat') == "pace": 330 | pace.append(td.get_text()) 331 | if td.get('data-stat') == "fta_per_fga_pct": 332 | fta_per_fga_pct.append(td.get_text()) 333 | if td.get('data-stat') == "fg3a_per_fga_pct": 334 | fg3a_per_fga_pct.append(td.get_text()) 335 | if td.get('data-stat') == "ts_pct": 336 | ts_pct.append(td.get_text()) 337 | if td.get('data-stat') == "trb_pct": 338 | trb_pct.append(td.get_text()) 339 | if td.get('data-stat') == "ast_pct": 340 | ast_pct.append(td.get_text()) 341 | if td.get('data-stat') == "stl_pct": 342 | stl_pct.append(td.get_text()) 343 | if td.get('data-stat') == "blk_pct": 344 | blk_pct.append(td.get_text()) 345 | if td.get('data-stat') == "efg_pct": 346 | efg_pct.append(td.get_text()) 347 | if td.get('data-stat') == "tov_pct": 348 | tov_pct.append(td.get_text()) 349 | if td.get('data-stat') == "orb_pct": 350 | orb_pct.append(td.get_text()) 351 | if td.get('data-stat') == "ft_rate": 352 | ft_rate.append(td.get_text()) 353 | if td.get('data-stat') == "opp_efg_pct": 354 | opp_efg_pct.append(td.get_text()) 355 | if td.get('data-stat') == "opp_tov_pct": 356 | opp_tov_pct.append(td.get_text()) 357 | if td.get('data-stat') == "drb_pct": 358 | drb_pct.append(td.get_text()) 359 | if td.get('data-stat') == "opp_ft_rate": 360 | opp_ft_rate.append(td.get_text()) 361 | return DataFrame(list(zip(game_result,pts,opp_pts,fg,fga, 362 | fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,total_board,ast, 363 | stl,blk,tov,pf,opp_fg,opp_fga,opp_fg_pct,opp_fg3,opp_fg3a,opp_fg3_pct, 364 | opp_ft,opp_fta,opp_ft_pct,opp_orb,opp_trb,opp_ast,opp_stl,opp_blk,opp_tov, 365 | opp_pf, off_rtg,def_rtg,pace,fta_per_fga_pct,fg3a_per_fga_pct,ts_pct, 366 | trb_pct,ast_pct,stl_pct,blk_pct,efg_pct,tov_pct,orb_pct,ft_rate,opp_efg_pct, 367 | opp_tov_pct,drb_pct,opp_ft_rate,game_loc,srs,date_save, 368 | off_rtg_opp,def_rtg_opp)), 369 | columns =['game_result','pts','opp_pts','fg','fga', 370 | 'fg_pct','fg3','fg3a','fg3_pct','ft','fta','ft_pct','orb','total_board','ast', 371 | 'stl','blk','tov','pf','opp_fg','opp_fga','opp_fg_pct','opp_fg3','opp_fg3a','opp_fg3_pct', 372 | 'opp_ft','opp_fta','opp_ft_pct','opp_orb','opp_trb','opp_ast','opp_stl','opp_blk','opp_tov', 373 | 'opp_pf','off_rtg','def_rtg','pace','fta_per_fga_pct','fg3a_per_fga_pct','ts_pct', 374 | 'trb_pct','ast_pct','stl_pct','blk_pct','efg_pct','tov_pct','orb_pct','ft_rate','opp_efg_pct', 375 | 'opp_tov_pct','drb_pct','opp_ft_rate','game_loc','simple_rating_system','date_played', 376 | 'opp_off_rtg','opp_def_rtg']) 377 | 378 | def get_espn(URL,team_1,team_2): 379 | team_1 = create_acr(team_1) 380 | team_2 = create_acr(team_2) 381 | # URL = "https://www.espn.com/mens-college-basketball/schedule/_/date/20230131" 
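    #Flow of get_espn: fetch the ESPN schedule page for the given date, locate the table row
    #whose team link contains team_1's acronym, follow that row's game link, then read
    #ESPN's matchup-predictor percentage for each side out of the game page's HTML
    #(the "left-0 top-0" element is the away team).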
382 | hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"} 383 | req_1 = Request(URL,headers=hdr) 384 | html_1 = request.urlopen(req_1) 385 | soup_1 = BeautifulSoup(html_1, "html.parser") 386 | table = soup_1.find(class_="ResponsiveTable") 387 | table1 = table.find(class_="Table__Scroller") 388 | table2 = table.find(class_="Table") 389 | table3 = table.find(class_="Table__TBODY") 390 | for td in table3.find_all(class_="Table__TR Table__TR--sm Table__even"): 391 | try: 392 | #Get team names 393 | inst = td.find(class_="events__col Table__TD") 394 | href_team = inst.find(class_="AnchorLink").get("href") 395 | if team_1 in href_team: 396 | #Get game link 397 | inst = td.find(class_="date__col Table__TD") 398 | href_val = inst.find(class_="AnchorLink").get("href") 399 | game = "https://www.espn.com" + href_val 400 | req_second = Request(game,headers=hdr) 401 | html_second = request.urlopen(req_second) 402 | soup_second = BeautifulSoup(html_second, "html.parser") 403 | #Team 1 - left-0 top-0 = Away 404 | team_1_predict = soup_second.find(class_="matchupPredictor__teamValue matchupPredictor__teamValue--b left-0 top-0 flex items-baseline absolute copy") 405 | start = '>' 406 | end = "