├── .gitignore ├── LICENSE ├── README.md ├── ROC_curve_class.png ├── SHAP_feature_importances.png ├── all_data.csv ├── all_teams_cbb.csv ├── binary_keras_deep.h5 ├── cbb.yaml ├── cbb_classification.py ├── cbb_regression.py ├── cbb_web_scraper.py ├── class_label_count.png ├── correlations_class.png ├── deep_learn.py ├── deep_learn_MA.py ├── deep_learn_regressor.py ├── extra ├── analyze_output.py └── exact_match.py ├── feature_importance_xgb_classifier.png ├── pca_components.png ├── teams_sports_ref_format.csv └── year_count.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | # key file 6 | key.txt 7 | errors.log 8 | #Compression 9 | randomForestModelTuned.pkl 10 | randomForestModelTuned.joblib 11 | classifierModelTuned.joblib 12 | classifierModelTuned_xgb.joblib 13 | #training directory 14 | cbb_sequential_hp/ 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | #Python Files 46 | github.py 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .nox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | *.py,cover 63 | .hypothesis/ 64 | .pytest_cache/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 107 | __pypackages__/ 108 | 109 | # Celery stuff 110 | celerybeat-schedule 111 | celerybeat.pid 112 | 113 | # SageMath parsed files 114 | *.sage.py 115 | 116 | # Environments 117 | .env 118 | .venv 119 | env/ 120 | venv/ 121 | ENV/ 122 | env.bak/ 123 | venv.bak/ 124 | 125 | # Spyder project settings 126 | .spyderproject 127 | .spyproject 128 | 129 | # Rope project settings 130 | .ropeproject 131 | 132 | # mkdocs documentation 133 | /site 134 | 135 | # mypy 136 | .mypy_cache/ 137 | .dmypy.json 138 | dmypy.json 139 | 140 | # Pyre type checker 141 | .pyre/ 142 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Brian Szekely 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# College Basketball Game Predictions

Machine learning that predicts the outcome of any Division I college basketball game. Training data cover the 2010-2024 seasons and are scraped from Sports-Reference.com.

## Usage

```bash
python cbb_classification.py tune    # re-run hyperparameter tuning
python cbb_classification.py notune  # load the previously tuned, saved models
```

```bash
Removed features (>=0.9 correlation): ['fta', 'fta_per_fga_pct', 'fg3a_per_fga_pct', 'ts_pct', 'stl_pct', 'blk_pct', 'efg_pct', 'tov_pct', 'orb_pct', 'ft_rate']
dataset shape: (27973 samples, 55 features)

### Current prediction accuracies - XGBoost
# After 5 fold cross validation and pre-processing
Current XGBoost Classifier - best params: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 1, 'n_estimators': 200, 'reg_alpha': 0.01, 'reg_lambda': 0.01, 'scale_pos_weight': 1, 'subsample': 1.0}

# Classification - XGBoost
Confusion Matrix:[[1316   46]
 [  31 1404]]
Model accuracy on test data: 0.9688952449052556

# Classification - DNN Keras
Final model test loss 0.07359004765748978 and accuracy 0.9760457873344421
```
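The `Removed features (>=0.9 correlation)` line above is produced by a pairwise-correlation filter (`pre_process_corr_out_remove` in `cbb_classification.py`): for every pair of features whose absolute Pearson correlation is 0.9 or higher, one member of the pair is dropped before training. A minimal standalone sketch of that step (assuming `x` is the numeric feature `DataFrame`):

```python
import numpy as np
import pandas as pd

def drop_correlated_features(x: pd.DataFrame, corr_val: float = 0.9) -> pd.DataFrame:
    """Drop one feature from every pair with |Pearson r| >= corr_val."""
    corr_matrix = x.corr().abs()
    # Keep only the upper triangle (k=1 excludes the diagonal) so each pair is checked once
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if (upper[col] >= corr_val).any()]
    return x.drop(columns=to_drop)
```

Scanning only the upper triangle of the correlation matrix guarantees each pair is evaluated exactly once, so one feature from every highly correlated pair survives.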
### Correlation Matrix
![Correlation matrix of the retained features](correlations_class.png)

### Feature Importances Classification
XGBoost
![XGBoost feature importances](feature_importance_xgb_classifier.png)
Deep Neural Network
![SHAP feature importances for the deep neural network](SHAP_feature_importances.png)

## Contributing
Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
-------------------------------------------------------------------------------- /ROC_curve_class.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bszek213/cbb_machine_learning/aab183383e3237c3c1007e476abcd66028c7604c/ROC_curve_class.png -------------------------------------------------------------------------------- /SHAP_feature_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bszek213/cbb_machine_learning/aab183383e3237c3c1007e476abcd66028c7604c/SHAP_feature_importances.png -------------------------------------------------------------------------------- /all_teams_cbb.csv: -------------------------------------------------------------------------------- 1 | School,From,To 2 | Abilene Christian,1971,2024 3 | Air Force,1958,2024 4 | Akron,1902,2024 5 | Alabama,1913,2024 6 | Alabama A&M,2000,2024 7 | Alabama State,1983,2024 8 | Albany (NY),2000,2024 9 | Alcorn State,1978,2024 10 | Allegheny Gators,1896,1916 11 | American,1967,2024 12 | Amherst Lord Jeffs,1901,1902 13 | Appalachian State,1974,2024 14 | Arizona,1905,2024 15 | Arizona State,1912,2024 16 | Arkansas,1924,2024 17 | Arkansas State,1971,2024 18 | Arkansas-Pine Bluff,1999,2024 19 | Armstrong Pirates,1987,1987 20 | Army,1903,2024 21 | Auburn,1906,2024 22 | Augusta Jaguars,1985,1991 23 | Augustana (IL) Vikings,1902,1917 24 | Austin Peay,1964,2024 25 | Baker University Wildcats,1903,1908 26 | Baldwin-Wallace Yellow Jackets,1948,1953 27 | Ball State,1972,2024 28 | Baltimore Super Bees,1979,1983 29 | Baylor,1907,2024 30 | Bellarmine,2021,2024 31 | Belmont,2000,2024 32 | Beloit Buccaneers,1911,1924 33 | Bethune-Cookman,1981,2024 34 | Binghamton,2002,2024 35 | Birmingham-Southern Panthers,1920,2006 36 | Bloomsburg Huskies,1896,1911 37 | Boise State,1972,2024 38 | Boston College,1946,2024 39 | Boston University,1916,2024 40 | Bowling Green State,1916,2024 41 | Bradley,1903,2024 42 | Brigham Young,1903,2024 43 | Brigham Young College,1908,1908 44 | Brooklyn Bulldogs,1934,1992 45 | Brown,1901,2024 46 | Bryant,2011,2024 47 | Bucknell,1896,2024 48 | Buffalo,1907,2024 49 | Butler,1897,2024 50 | Cal Poly,1995,2024 51 | Cal State Bakersfield,2011,2024 52 | Cal State Fullerton,1975,2024 53 | Cal State Los Angeles Golden Eagles,1971,1975 54 | Cal State Northridge,1991,2024 55 | California,1908,2024 56 | California Baptist,2019,2024 57 | Campbell,1978,2024 58 | Canisius,1904,2024 59 | Canterbury College,1931,1931 60 | Carleton College Knights,1910,1934 61 | Carnegie Mellon Tartans,1933,1939 62 | Case Western Reserve Spartans,1898,1955 63 | Catholic Cardinals,1913,1981 64 | Centenary (LA) Gents,1960,2011 65 | Central Arkansas,2011,2024 66 | Central Connecticut State,1987,2024 67 | Central Michigan,1974,2024 68 | Central Missouri Mules,1913,1937 69 | Central Pennsylvania College Knights,1896,1900 70 | Centre (KY) Colonels,1910,1919 71 | Charleston Southern,1975,2024 72 | Charlotte,1973,2024 73 | Chattanooga,1978,2024 74 | Cheyenne Business College,1903,1903 75 | Chicago Maroons,1896,1946 76 | Chicago State,1985,2024 77 | Cincinnati,1902,2024 78 | City College of New York Beavers,1906,1953 79 | Clemson,1912,2024 80 | Cleveland State,1973,2024 81 | Coastal Carolina,1987,2024 82 | Colgate,1901,2024 83 | College of Charleston,1992,2024 84 | College of New Jersey Lions,1900,1900 85 | Colorado,1902,2024 86 |
Colorado College Tigers,1915,1937 87 | Colorado School of Mines Orediggers,1908,1937 88 | Colorado State,1902,2024 89 | Columbia,1901,2024 90 | Concordia Seminary Preachers,1907,1923 91 | Connecticut,1901,2024 92 | Coppin State,1986,2024 93 | Cornell,1899,2024 94 | Cotner College,1910,1911 95 | Creighton,1912,2024 96 | Cumberland,1904,1904 97 | Dakota Wesleyan Tigers,1932,1932 98 | Dartmouth,1900,2024 99 | Davidson,1909,2024 100 | Dayton,1904,2024 101 | Delaware,1906,2024 102 | Delaware State,1974,2024 103 | Denison Big Red,1905,1944 104 | Denver,1904,2024 105 | DePaul,1924,2024 106 | DePauw Tigers,1916,1932 107 | Detroit Mercy,1910,2024 108 | Dickinson College Red Devils,1926,1947 109 | Drake,1907,2024 110 | Drexel,1895,2024 111 | Duke,1906,2024 112 | Duquesne,1914,2024 113 | East Carolina,1967,2024 114 | East Central Tigers,1929,1931 115 | East Tennessee State,1959,2024 116 | Eastern Illinois,1982,2024 117 | Eastern Kentucky,1948,2024 118 | Eastern Michigan,1933,2024 119 | Eastern Washington,1984,2024 120 | Elon,2000,2024 121 | Emporia State Hornets,1934,1934 122 | Ensign College,1903,1903 123 | Evansville,1925,2024 124 | Fairfield,1965,2024 125 | FDU,1968,2024 126 | Florida,1921,2024 127 | Florida A&M,1980,2024 128 | Florida Atlantic,1994,2024 129 | Florida Gulf Coast,2011,2024 130 | Florida International,1988,2024 131 | Florida State,1957,2024 132 | Fordham,1903,2024 133 | Franklin Grizzlies,1907,1925 134 | Fresno State,1956,2024 135 | Furman,1920,2024 136 | Gardner-Webb,2003,2024 137 | Geneva Golden Tornadoes,1893,1943 138 | George Mason,1979,2024 139 | George Washington,1913,2024 140 | Georgetown,1907,2024 141 | Georgia,1906,2024 142 | Georgia Southern,1972,2024 143 | Georgia State,1974,2024 144 | Georgia Tech,1920,2024 145 | Gettysburg Bullets,1901,1973 146 | Gonzaga,1944,2024 147 | Grambling,1978,2024 148 | Grand Canyon,2014,2024 149 | Green Bay,1982,2024 150 | Grinnell Pioneers,1901,1939 151 | Grove City Wolverines,1899,1925 152 | Hamline Pipers,1945,1948 153 | Hampton,1996,2024 154 | Hardin-Simmons Cowboys,1923,1990 155 | Hartford Hawks,1985,2023 156 | Harvard,1901,2024 157 | Haskell (KS) Fighting Indians,1903,1908 158 | Hawaii,1971,2024 159 | High Point,2000,2024 160 | Hiram Terriers,1894,1904 161 | Hofstra,1943,2024 162 | Holy Cross,1901,2024 163 | Hope Flying Dutchmen,1908,1913 164 | Houston,1951,2024 165 | Houston Christian,1974,2024 166 | Howard,1974,2024 167 | Idaho,1906,2024 168 | Idaho State,1959,2024 169 | Illinois,1906,2024 170 | Illinois State,1899,2024 171 | Illinois Wesleyan Titans,1928,1928 172 | Illinois-Chicago,1982,2024 173 | Incarnate Word,2014,2024 174 | Indiana,1901,2024 175 | Indiana State,1900,2024 176 | Iona,1954,2024 177 | Iowa,1893,2024 178 | Iowa State,1908,2024 179 | IUPUI,1999,2024 180 | Jackson State,1978,2024 181 | Jacksonville,1967,2024 182 | Jacksonville State,1996,2024 183 | James Madison,1977,2024 184 | John Carroll Blue Streaks,1948,1955 185 | Kalamazoo Hornets,1908,1923 186 | Kansas,1899,2024 187 | Missouri Kansas City,1990,2024 188 | Kansas State,1906,2024 189 | Kennesaw State,2010,2024 190 | Kent State,1914,2024 191 | Kentucky,1903,2024 192 | Kentucky Wesleyan Panthers,1957,1958 193 | La Salle,1932,2024 194 | Lafayette,1901,2024 195 | Lake Forest Foresters,1905,1916 196 | Lamar,1970,2024 197 | Lawrence Tech,1948,1948 198 | Le Moyne,2024,2024 199 | Lehigh,1902,2024 200 | Lewis Flyers,1905,1905 201 | Liberty,1989,2024 202 | Lindenwood,2023,2024 203 | Lipscomb,2004,2024 204 | Arkansas Little Rock,1979,2024 205 | Long Beach State,1970,2024 206 | 
Long Island University,1929,2024 207 | Longwood,2008,2024 208 | Louisiana Lafayette,1972,2024 209 | Louisiana State,1909,2024 210 | Louisiana Tech,1974,2024 211 | Louisiana-Monroe,1974,2024 212 | Louisville,1912,2024 213 | Loyola (IL),1921,2024 214 | Loyola (LA) Wolfpack,1952,1972 215 | Loyola (MD),1908,2024 216 | Loyola Marymount,1943,2024 217 | Macalester Scots,1896,1899 218 | Maine,1904,2024 219 | Manchester Spartans,1926,1926 220 | Manhattan,1905,2024 221 | Marietta Pioneers,1908,1920 222 | Marist,1982,2024 223 | Marquette,1917,2024 224 | Marshall,1919,2024 225 | Maryland,1924,2024 226 | Maryland-Baltimore County,1987,2024 227 | Maryland-Eastern Shore,1974,2024 228 | Massachusetts,1926,2024 229 | Massachusetts Institute of Technology Engineers,1909,1909 230 | Massachusetts-Lowell,1906,2024 231 | McNeese State,1974,2024 232 | Memphis,1956,2024 233 | Mercer,1974,2024 234 | Merchant Marine Mariners,1946,1947 235 | Merrimack,2020,2024 236 | Miami (FL),1949,2024 237 | Miami (OH),1906,2024 238 | Michigan,1918,2024 239 | Michigan State,1899,2024 240 | Middle Tennessee,1959,2024 241 | Millikin Big Blue,1910,1921 242 | Millsaps Majors,1911,1921 243 | Milwaukee,1974,2024 244 | Minnesota,1896,2024 245 | Minnesota A&M Aggies,1896,1903 246 | Mississippi,1909,2024 247 | Mississippi State,1909,2024 248 | Mississippi Valley State,1980,2024 249 | Missouri,1907,2024 250 | Missouri State,1983,2024 251 | Monmouth,1984,2024 252 | Montana,1912,2024 253 | Montana State,1902,2024 254 | Morehead State,1956,2024 255 | Morgan State,1985,2024 256 | Morris Brown Wolverines,2002,2003 257 | Mount St. Mary's,1989,2024 258 | Mount Union Purple Raiders,1896,1932 259 | Muhlenberg Mules,1901,1963 260 | Murray State,1954,2024 261 | Muskingum Fighting Muskies,1905,1927 262 | Navy,1908,2024 263 | North Carolina State,1913,2024 264 | Nebraska,1897,2024 265 | Nebraska Wesleyan Prairie Wolves,1906,1917 266 | Nevada,1913,2024 267 | Nevada-Las Vegas,1970,2024 268 | New Hampshire,1927,2024 269 | New Mexico,1900,2024 270 | New Mexico State,1905,2024 271 | New Orleans,1976,2024 272 | New York University Violets,1907,1971 273 | Newberry Wolves,1921,1921 274 | Niagara,1906,2024 275 | Nicholls State,1981,2024 276 | NJIT,2010,2024 277 | Norfolk State,1998,2024 278 | North Alabama,2019,2024 279 | North Carolina,1911,2024 280 | North Carolina A&T,1974,2024 281 | North Carolina Central,2011,2024 282 | North Central Cardinals,1911,1922 283 | North Dakota,1905,2024 284 | North Dakota State,1898,2024 285 | North Florida,2010,2024 286 | North Texas,1922,2024 287 | Northeastern,1938,2024 288 | Northeastern Illinois Golden Eagles,1991,1998 289 | Northern Arizona,1919,2024 290 | Northern Colorado,1911,2024 291 | Northern Illinois,1927,2024 292 | Northern Iowa,1981,2024 293 | Northern Kentucky,2013,2024 294 | Northwest Missouri State Bearcats,1930,1932 295 | Northwestern,1905,2024 296 | Northwestern State,1977,2024 297 | Notre Dame,1897,2024 298 | Oakland,2000,2024 299 | Oberlin Yeomen,1905,1921 300 | Ohio,1908,2024 301 | Ohio State,1899,2024 302 | Ohio Wesleyan Battling Bishops,1929,1935 303 | Oklahoma,1908,2024 304 | Oklahoma City Chiefs,1951,1985 305 | Oklahoma State,1908,2024 306 | Old Dominion,1977,2024 307 | Nebraska Omaha,2013,2024 308 | Oral Roberts,1972,2024 309 | Oregon,1903,2024 310 | Oregon State,1902,2024 311 | Pacific,1938,2024 312 | Penn State,1897,2024 313 | Pennsylvania,1897,2024 314 | Pepperdine,1944,2024 315 | Phillips Haymakers,1920,1920 316 | Pittsburg State Gorillas,1927,1931 317 | Pittsburgh,1906,2024 318 | 
Portland,1954,2024 319 | Portland State,1973,2024 320 | Prairie View,1981,2024 321 | Pratt Institute Cannoneers,1934,1934 322 | Presbyterian,2011,2024 323 | Princeton,1901,2024 324 | Providence,1929,2024 325 | Purdue,1897,2024 326 | Purdue Fort Wayne,2003,2024 327 | Queens (NC),2023,2024 328 | Quinnipiac,1999,2024 329 | Radford,1985,2024 330 | Regis (CO) Rangers,1962,1964 331 | Rensselaer Engineers,1901,1924 332 | Rhode Island,1904,2024 333 | Rice,1915,2024 334 | Richmond,1913,2024 335 | Rider,1929,2024 336 | Ripon Red Hawks,1902,1922 337 | Roanoke Maroons,1912,1919 338 | Robert Morris,1977,2024 339 | Rochester (NY) Yellowjackets,1910,1944 340 | Rose-Hulman Fightin' Engineers,1898,1898 341 | Rutgers,1914,2024 342 | Sacramento State,1992,2024 343 | Sacred Heart,2000,2024 344 | Saint Francis (PA),1956,2024 345 | Saint Joseph's,1910,2024 346 | Saint Louis,1916,2024 347 | Saint Mary's (CA),1910,2024 348 | Saint Peter's,1966,2024 349 | Sam Houston,1987,2024 350 | Samford,1973,2024 351 | San Diego,1980,2024 352 | San Diego State,1971,2024 353 | San Francisco,1924,2024 354 | San Jose State,1938,2024 355 | Santa Clara,1909,2024 356 | Savage School of Physical Education,1896,1898 357 | Savannah State Tigers,2003,2019 358 | Scranton Royals,1948,1948 359 | Seattle,1953,2024 360 | Seton Hall,1909,2024 361 | Sewanee Tigers,1923,1941 362 | Siena,1939,2024 363 | South Alabama,1972,2024 364 | South Carolina,1909,2024 365 | South Carolina State,1972,2024 366 | South Carolina Upstate,2011,2024 367 | South Dakota,2011,2024 368 | South Dakota State,2009,2024 369 | South Florida,1974,2024 370 | Southeast Missouri State,1992,2024 371 | Southeastern Louisiana,1981,2024 372 | Southern,1978,2024 373 | Southern California,1907,2024 374 | Southern Illinois,1968,2024 375 | Southern Illinois-Edwardsville,2011,2024 376 | Southern Indiana,2023,2024 377 | Southern Methodist,1917,2024 378 | Southern Mississippi,1973,2024 379 | Southern Utah,1989,2024 380 | Southwestern (KS) Moundbuilders,1905,1923 381 | Southwestern (TX) Pirates,1915,1916 382 | Springfield Pride,1897,1935 383 | St. Bonaventure,1920,2024 384 | St. Francis (NY) Terriers,1902,2023 385 | St. John's (NY),1908,2024 386 | St. John's College (OH),1921,1921 387 | St. Lawrence Saints,1902,1914 388 | St. Thomas,2022,2024 389 | Stanford,1914,2024 390 | Stephen F. Austin,1987,2024 391 | Stetson,1972,2024 392 | Stevens Institute Ducks,1917,1920 393 | Stonehill,2023,2024 394 | Stony Brook,2000,2024 395 | SUNY-Potsdam Bears,1910,1913 396 | Swarthmore Garnet,1906,1919 397 | Syracuse,1901,2024 398 | Tarleton State,2021,2024 399 | TCU,1914,2024 400 | Temple,1895,2024 401 | Tennessee,1909,2024 402 | Tennessee State,1978,2024 403 | Tennessee Tech,1944,2024 404 | Tennessee-Martin,1993,2024 405 | Texas,1906,2024 406 | Texas A&M,1913,2024 407 | Texas A&M-Commerce,2023,2024 408 | Texas A&M-Corpus Christi,2003,2024 409 | Texas Southern,1978,2024 410 | Texas State,1985,2024 411 | Texas Tech,1926,2024 412 | Texas Wesleyan Rams,1948,1948 413 | Texas-Rio Grande Valley,1969,2024 414 | The Citadel,1913,2024 415 | Toledo,1916,2024 416 | Towson,1980,2024 417 | Trinity (CT) Bantams,1897,1911 418 | Trinity (TX) Tigers,1971,1973 419 | Troy,1994,2024 420 | Tulane,1906,2024 421 | Tulsa,1914,2024 422 | U.S. 
International Gulls,1982,1991 423 | Alabama Birmingham,1980,2024 424 | California Davis,2008,2024 425 | California irvine,1978,2024 426 | California Riverside,2002,2024 427 | California San Diego,2021,2024 428 | California Santa Barbara,1964,2024 429 | Central Florida,1985,2024 430 | UCLA,1920,2024 431 | North Carolina Asheville,1987,2024 432 | North Carolina Greensboro,1992,2024 433 | North Carolina Wilmington,1977,2024 434 | Union (NY) Dutchmen,1907,1925 435 | Texas Arlington,1969,2024 436 | Utah,1909,2024 437 | Utah State,1904,2024 438 | Utah Tech,2021,2024 439 | Utah Valley,2010,2024 440 | University Texas El Paso,1923,2024 441 | Utica Pioneers,1982,1987 442 | University Texas San Antonio,1982,2024 443 | Valparaiso,1918,2024 444 | Vanderbilt,1901,2024 445 | Vermont,1921,2024 446 | Villanova,1921,2024 447 | Virginia,1906,2024 448 | Virginia Commonwealth,1974,2024 449 | Virginia Military Institute,1909,2024 450 | Virginia Tech,1909,2024 451 | Wabash Little Giants,1897,1925 452 | Wagner,1966,2024 453 | Wake Forest,1906,2024 454 | Washburn Ichabods,1906,1941 455 | Washington,1896,2024 456 | Washington & Jefferson Presidents,1913,1944 457 | Washington & Lee Generals,1907,1959 458 | Washington (MO) Bears,1905,1960 459 | Washington College Shoremen,1913,1925 460 | Washington State,1902,2024 461 | Wayne State (MI) Warriors,1928,1950 462 | Weber State,1964,2024 463 | Wesleyan (CT) Cardinals,1896,1913 464 | West Chester Golden Rams,1899,1982 465 | West Texas A&M Buffaloes,1921,1986 466 | West Virginia,1904,2024 467 | Western Carolina,1977,2024 468 | Western Colorado Mountaineers,1924,1937 469 | Western Illinois,1982,2024 470 | Western Kentucky,1922,2024 471 | Western Michigan,1914,2024 472 | Westminster (MO) Blue Jays,1920,1920 473 | Westminster (PA) Titans,1898,1935 474 | Wheaton (IL) Thunder,1902,1905 475 | Whittier Poets,1909,1915 476 | Wichita State,1906,2024 477 | Widener Pride,1899,1909 478 | William Mary,1906,2024 479 | Williams Ephs,1901,1911 480 | Winthrop,1987,2024 481 | Wisconsin,1899,2024 482 | Wisconsin-Stevens Point Pointers,1898,1918 483 | Wisconsin-Superior Yellowjackets,1900,1901 484 | Wittenberg Tigers,1931,1931 485 | Wofford,1996,2024 486 | Wooster Fighting Scots,1901,1931 487 | WPI Engineers,1920,1920 488 | Wright State,1988,2024 489 | Wyoming,1905,2024 490 | Xavier,1920,2024 491 | Yale,1896,2024 492 | Youngstown State,1948,2024 493 | -------------------------------------------------------------------------------- /binary_keras_deep.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bszek213/cbb_machine_learning/aab183383e3237c3c1007e476abcd66028c7604c/binary_keras_deep.h5 -------------------------------------------------------------------------------- /cbb.yaml: -------------------------------------------------------------------------------- 1 | name: cbb 2 | channels: 3 | - conda-forge 4 | - robostack 5 | - anaconda 6 | - intel 7 | - rapidsai 8 | dependencies: 9 | - hvplot 10 | - numpy 11 | - pandas 12 | - holoviews 13 | - scikit-learn 14 | - keras 15 | - cudatoolkit=11.2 16 | - cudnn=8.1.0 17 | - scipy 18 | - ipython 19 | - plotly 20 | - seaborn 21 | - ipywidgets 22 | - ipykernel 23 | - matplotlib 24 | - spyder 25 | - notebook 26 | - keyboard 27 | - eli5 28 | - pip 29 | - pip: 30 | - sportsipy 31 | - tensorflow 32 | - beautifulsoup4 33 | - eli5 34 | - cfbd 35 | -------------------------------------------------------------------------------- /cbb_classification.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | College Basketball Predictions via classification and probability with ESPN 5 | @author: brianszekely 6 | """ 7 | import cbb_web_scraper 8 | from os import getcwd 9 | from os.path import join, exists 10 | import yaml 11 | from tqdm import tqdm 12 | from time import sleep 13 | from pandas import DataFrame, concat, read_csv, isnull 14 | from sklearn.model_selection import train_test_split 15 | from sklearn.model_selection import GridSearchCV 16 | import numpy as np 17 | import matplotlib.pyplot as plt 18 | import seaborn as sns 19 | from sys import argv 20 | import joblib 21 | from sklearn.metrics import confusion_matrix, accuracy_score 22 | from difflib import get_close_matches 23 | from sklearn.metrics import roc_curve 24 | import seaborn as sns 25 | from tensorflow.keras.utils import to_categorical 26 | from sklearn.decomposition import PCA 27 | import xgboost as xgb 28 | from tensorflow.keras.layers import Dense, BatchNormalization 29 | from tensorflow.keras.models import Sequential 30 | from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau 31 | from tensorflow.keras.optimizers import Adam, RMSprop 32 | from tensorflow.keras.regularizers import l1, l2 33 | from keras_tuner import RandomSearch 34 | from tensorflow.keras.losses import BinaryCrossentropy 35 | from tensorflow.keras.layers import Dropout 36 | from tensorflow.keras.models import load_model 37 | import os 38 | from colorama import Fore, Style 39 | from sklearn.preprocessing import StandardScaler, RobustScaler 40 | import shap 41 | 42 | """ 43 | TODO: 44 | -adjust noise for better learning 45 | -may remove opp_pts and pts to enhance other features 46 | -feature engineer with rolling std or mean 47 | """ 48 | def create_sequential_model(hp, n_features, n_outputs): 49 | model = Sequential() 50 | #Add hidden layers 51 | for i in range(hp.Int('num_layers', 1, 10)): 52 | if i == 0: 53 | # First hidden layer needs input shape 54 | model.add(Dense(units=hp.Int(f'units_{i}', min_value=8, max_value=128, step=8), 55 | activation=hp.Choice(f'activation_{i}', values=['relu', 'leaky_relu', 'tanh', 'linear']), 56 | kernel_regularizer=l2(hp.Float(f'regularizer_strength_{i}', min_value=1e-1, max_value=1, sampling='log')), 57 | input_shape=(n_features,))) 58 | else: 59 | model.add(Dense(units=hp.Int(f'units_{i}', min_value=8, max_value=128, step=8), 60 | activation=hp.Choice(f'activation_{i}', values=['relu', 'leaky_relu', 'tanh', 'linear']), 61 | kernel_regularizer=l2(hp.Float(f'regularizer_strength_{i}', min_value=1e-1, max_value=1, sampling='log')))) 62 | model.add(BatchNormalization()) 63 | model.add(Dropout(rate=hp.Float(f'dropout_rate_{i}', min_value=0.3, max_value=0.6, step=0.1))) 64 | 65 | # Output layer 66 | model.add(Dense(n_outputs, activation='sigmoid')) # Binary classification 67 | 68 | # Compile model 69 | optimizer_choice = hp.Choice('optimizer', values=['adam', 'rmsprop']) #, 'sgd' 70 | if optimizer_choice == 'adam': 71 | optimizer = Adam(learning_rate=hp.Float('adam_learning_rate', min_value=0.0001, max_value=0.01, sampling='log')) 72 | else: 73 | optimizer = RMSprop(learning_rate=hp.Float('rmsprop_learning_rate', min_value=0.0001, max_value=0.01, sampling='log')) 74 | 75 | model.compile(optimizer=optimizer, 76 | loss=BinaryCrossentropy(), 77 | metrics=['accuracy']) 78 | 79 | return model 80 | 81 | class cbbClass(): 82 | def __init__(self,pre_process): 83 | 
print('instantiate class cbbClass') 84 | self.all_data = DataFrame() 85 | self.which_analysis = pre_process # 'pca' or 'corr' 86 | 87 | def get_teams(self): 88 | year_list_find = [] 89 | year_list = [2024,2023]#,2022,2021,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010] 90 | if exists(join(getcwd(),'year_count.yaml')): 91 | with open(join(getcwd(),'year_count.yaml')) as file: 92 | year_counts = yaml.load(file, Loader=yaml.FullLoader) 93 | else: 94 | year_counts = {'year':year_list_find} 95 | #Remove any years that have already been collected 96 | if year_counts['year']: 97 | year_list_check = year_counts['year'] 98 | year_list_find = year_counts['year'] 99 | year_list = [i for i in year_list if i not in year_list_check] 100 | print(f'Need data for year: {year_list}') 101 | #Collect data per year 102 | if year_list: 103 | for year in tqdm(year_list): 104 | all_teams = cbb_web_scraper.get_teams_year(year_list[-1],2024) 105 | team_names = sorted(all_teams) 106 | final_list = [] 107 | self.year_store = year 108 | for abv in tqdm(team_names): 109 | try: 110 | print() #tqdm things 111 | print(f'current team: {abv}, year: {year}') 112 | basic = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs.html' 113 | adv = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs-advanced.html' 114 | df_inst = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,abv,self.year_store) 115 | print(df_inst) 116 | df_inst['pts'].replace('', np.nan, inplace=True) 117 | df_inst.dropna(inplace=True) 118 | final_list.append(df_inst) 119 | except Exception as e: 120 | print(e) 121 | print(f'{abv} data are not available') 122 | sleep(4) #I can get banned for a small period of time if I do not do this 123 | final_data = concat(final_list) 124 | if exists(join(getcwd(),'all_data.csv')): 125 | self.all_data = read_csv(join(getcwd(),'all_data.csv')) 126 | self.all_data = concat([self.all_data, final_data.dropna()]) 127 | if not exists(join(getcwd(),'all_data.csv')): 128 | self.all_data.to_csv(join(getcwd(),'all_data.csv'),index=False) 129 | self.all_data.to_csv(join(getcwd(),'all_data.csv'),index=False) 130 | year_list_find.append(year) 131 | print(f'year list after loop: {year_list_find}') 132 | with open(join(getcwd(),'year_count.yaml'), 'w') as write_file: 133 | yaml.dump(year_counts, write_file) 134 | print(f'writing {year} to yaml file') 135 | else: 136 | self.all_data = read_csv(join(getcwd(),'all_data.csv')) 137 | print('dataset size: ', np.shape(self.all_data)) 138 | self.all_data = self.all_data.drop_duplicates(keep='last') 139 | print(f'dataset size after duplicates are dropped: {np.shape(self.all_data)}') 140 | 141 | def pca_analysis(self): 142 | #scale first before pca 143 | self.scaler = StandardScaler() 144 | x_scale = self.scaler.fit_transform(self.x) 145 | self.pca = PCA(n_components=0.95) #explain 95% of the variance 146 | self.x_no_corr = self.pca.fit_transform(x_scale) 147 | 148 | #Visualize PCA components 149 | 150 | plt.figure(figsize=(8, 6)) 151 | plt.bar(range(self.pca.n_components_), self.pca.explained_variance_ratio_) 152 | plt.xlabel('Principal Component') 153 | plt.ylabel('Explained Variance Ratio') 154 | plt.title('Explained Variance Ratio of Principal Components') 155 | plt.savefig('pca_components.png',dpi=400) 156 | plt.close() 157 | 158 | def convert_to_float(self): 159 | for col in self.all_data.columns: 160 | self.all_data[col].replace('', np.nan, inplace=True) 161 | self.all_data[col] =
self.all_data[col].astype(float) 162 | self.all_data.dropna(inplace=True) 163 | 164 | def delete_opp(self): 165 | """ 166 | Drop any opponent data, as it may not be helpful for prediction; opponent stats are hard to estimate with a running average 167 | """ 168 | for col in self.all_data.columns: 169 | if 'opp' in col: 170 | self.all_data.drop(columns=col,inplace=True) 171 | def split(self): 172 | # self.delete_opp() 173 | for col in self.all_data.columns: 174 | if 'Unnamed' in col: 175 | self.all_data.drop(columns=col,inplace=True) 176 | self.convert_to_float() 177 | #self.y = np.delete(self.y, np.where(np.isnan(self.x_no_corr)), axis=0) 178 | #self.x_no_corr = self.x_no_corr.dropna() 179 | self.y = self.all_data['game_result'].astype(int) 180 | result_counts = self.all_data['game_result'].value_counts() 181 | #plot the counts 182 | plt.figure(figsize=(8, 6)) 183 | result_counts.plot(kind='bar') 184 | plt.xlabel('Game Result') 185 | plt.ylabel('Count') 186 | plt.title('Count of Labels') 187 | plt.savefig('class_label_count.png',dpi=400) 188 | plt.close() 189 | 190 | #onehot encode 191 | self.y = to_categorical(self.y) 192 | self.x = self.all_data.drop(columns=['game_result']) 193 | 194 | # #Dropna and remove all data from subsequent y data 195 | # real_values = ~self.x_no_corr.isna().any(axis=1) 196 | # self.x_no_corr.dropna(inplace=True) 197 | # self.y = self.y.loc[real_values] 198 | 199 | 200 | #pca data or no correlated data 201 | if self.which_analysis == 'pca': 202 | #pca 203 | self.pca_analysis() 204 | else: 205 | #correlational analysis and outlier removal 206 | self.pre_process_corr_out_remove() 207 | #75/15/10 split 208 | #Split data into training and the rest (75% training, 25% temporary) 209 | self.x_train, x_temp, self.y_train, y_temp = train_test_split(self.x_no_corr, self.y, train_size=0.75, random_state=42) 210 | #Split the rest into validation and test data (60% validation, 40% test) 211 | validation_ratio = 0.15 / (1 - 0.75) # Adjust ratio for the remaining part 212 | self.x_validation, self.x_test, self.y_validation, self.y_test = train_test_split(x_temp, y_temp, train_size=validation_ratio, random_state=42) 213 | 214 | def pre_process_corr_out_remove(self): 215 | # Remove features with a correlation coef of 0.90 or higher 216 | corr_val = 0.9 217 | corr_matrix = np.abs(self.x.astype(float).corr()) 218 | upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) 219 | to_drop = [column for column in upper.columns if any(upper[column] >= corr_val)] 220 | self.drop_cols = to_drop 221 | self.drop_cols = self.drop_cols + ['opp_pts', 'pts','game_loc','simple_rating_system'] #remove these extra features 222 | self.x_no_corr = self.x.drop(columns=self.drop_cols) 223 | cols = self.x_no_corr.columns 224 | print(f'Columns dropped >= {corr_val}: {self.drop_cols}') 225 | #Drop samples that are outliers 226 | print(f'old feature dataframe shape before outlier removal: {self.x_no_corr.shape}') 227 | for col_name in cols: 228 | Q1 = np.percentile(self.x_no_corr[col_name], 5) #5th/95th percentiles, a wider fence than the usual quartiles 229 | Q3 = np.percentile(self.x_no_corr[col_name], 95) 230 | IQR = Q3 - Q1 231 | upper = np.where(self.x_no_corr[col_name] >= (Q3+2.5*IQR)) #1.5 is the standard; 2.5 keeps more data, to see if that helps improve model performance 232 | lower = np.where(self.x_no_corr[col_name] <= (Q1-2.5*IQR)) 233 | self.x_no_corr.drop(upper[0], inplace = True) 234 | self.x_no_corr.drop(lower[0], inplace = True) 235 | self.y = np.delete(self.y, upper[0], axis=0) 236 | self.y = np.delete(self.y, lower[0], axis=0) 237 | #
self.y.drop(upper[0], inplace = True) 238 | # self.y.drop(lower[0], inplace = True) 239 | if 'level_0' in self.x_no_corr.columns: 240 | self.x_no_corr.drop(columns=['level_0'],inplace = True) 241 | self.x_no_corr.reset_index(inplace = True) 242 | # self.y.reset_index(inplace = True, drop=True) 243 | self.x_no_corr.drop(columns=['level_0','index'],inplace = True) 244 | print(f'new feature dataframe shape after outlier removal: {self.x_no_corr.shape}') 245 | top_corr_features = corr_matrix.index 246 | plt.figure(figsize=(25,25)) 247 | sns.heatmap(corr_matrix[top_corr_features],annot=True,cmap="RdYlGn") 248 | plt.tight_layout() 249 | plt.savefig('correlations_class.png',dpi=300) 250 | plt.close() 251 | 252 | #Extra preprocessing steps 253 | #standardize (zero mean, unit variance) 254 | self.cols_save = self.x_no_corr.columns 255 | self.scaler = StandardScaler() 256 | self.x_no_corr = self.scaler.fit_transform(self.x_no_corr) 257 | #robust scaling by median/IQR (despite the variable name, RobustScaler is not min-max normalization) 258 | self.min_max_scaler = RobustScaler() 259 | self.x_no_corr = self.min_max_scaler.fit_transform(self.x_no_corr) 260 | self.x_no_corr = DataFrame(self.x_no_corr,columns=self.cols_save) 261 | #Generate random noise with the same shape as the DataFrame 262 | noise = np.random.normal(loc=0, scale=0.175, size=self.x_no_corr.shape) #the larger the scale, the stronger the Gaussian jitter added as regularization 263 | self.x_no_corr = self.x_no_corr + noise 264 | 265 | # def random_forest_analysis(self): 266 | # if argv[1] == 'tune': 267 | # #RANDOM FOREST REGRESSOR 268 | # RandForclass = RandomForestClassifier() 269 | # #Use the number of features as a stopping criterion for depth 270 | # rows, cols = self.x_train.shape 271 | # cols = int(cols / 2.5) #try to avoid overfitting on depth 272 | # #square root of the total number of features is a good limit 273 | # # cols = int(np.sqrt(cols)) 274 | # #parameters to tune 275 | # #increasing min_samples_leaf, this will reduce overfitting 276 | # Rand_perm = { 277 | # 'criterion' : ["gini","entropy"], #absolute_error - takes forever to run 278 | # 'n_estimators': range(300,500,100), 279 | # # 'min_samples_split': np.arange(2, 5, 1, dtype=int), 280 | # 'max_features' : [1, 'sqrt', 'log2'], 281 | # 'max_depth': np.arange(2,cols,1), 282 | # 'min_samples_leaf': np.arange(2,4,1) 283 | # } 284 | # clf_rand = GridSearchCV(RandForclass, Rand_perm, 285 | # scoring=['accuracy','f1'], 286 | # cv=5, 287 | # refit='accuracy', 288 | # verbose=4, 289 | # n_jobs=-1) 290 | # search_rand = clf_rand.fit(self.x_train,self.y_train) 291 | # #Write fitted and tuned model to file 292 | # # with open('randomForestModelTuned.pkl','wb') as f: 293 | # # pickle.dump(search_rand,f) 294 | # joblib.dump(search_rand, "./classifierModelTuned.joblib", compress=9) 295 | # print('RandomForestClassifier - best params: ',search_rand.best_params_) 296 | # self.RandForclass = search_rand 297 | # prediction = self.RandForclass.predict(self.x_test) 298 | # print(confusion_matrix(self.y_test, prediction))# Display confusion matrix 299 | # print(f'Model accuracy: {accuracy_score(self.y_test, prediction)}')# Display accuracy score 300 | # # print(f1_score(self.y_test, prediction)) 301 | # else: 302 | # print('Load tuned Random Forest Classifier') 303 | # # load RandomForestModel 304 | # self.RandForclass=joblib.load("./classifierModelTuned.joblib") 305 | # prediction = self.RandForclass.predict(self.x_test) 306 | # print(confusion_matrix(self.y_test, prediction))# Display confusion matrix 307 | # print(f'Model accuracy: {accuracy_score(self.y_test, prediction)}')# Display accuracy score 308 | # #
print(f1_score(self.y_test, prediction)) 309 | # y_proba = self.RandForclass.predict_proba(self.x_test)[:, 1] 310 | # fpr, tpr, thresholds = roc_curve(self.y_test, y_proba) 311 | # plt.plot(fpr, tpr) 312 | # plt.xlabel('False Positive Rate') 313 | # plt.ylabel('True Positive Rate') 314 | # plt.title('ROC Curve') 315 | # plt.savefig('ROC_curve_class.png',dpi=300) 316 | 317 | def xgboost_analysis(self): 318 | if not os.path.exists('classifierModelTuned_xgb.joblib'): 319 | if self.which_analysis == 'pca': 320 | y_train_combined = np.concatenate([self.y_train, self.y_validation], axis=0) 321 | x_train_combined = np.concatenate([self.x_train, self.x_validation], axis=0) 322 | else: 323 | y_train_combined = np.concatenate([self.y_train, self.y_validation], axis=0) 324 | x_train_combined = concat([self.x_train, self.x_validation], axis=0) 325 | if argv[1] == 'tune': 326 | # XGBoost Classifier 327 | xgb_class = xgb.XGBClassifier() 328 | 329 | # Parameters to tune 330 | params = { 331 | 'learning_rate': [0.01, 0.1], 332 | 'n_estimators': range(100, 300, 100), 333 | 'max_depth': range(2, 4, 2), 334 | 'min_child_weight': [1, 5], 335 | 'gamma': [0, 0.2], 336 | 'subsample': [0.6, 1.0], 337 | 'colsample_bytree': [0.6, 1.0], 338 | 'reg_alpha': [0, 0.01], 339 | 'reg_lambda': [0, 0.01], 340 | 'scale_pos_weight': [1, 3] 341 | } 342 | 343 | clf_xgb = GridSearchCV(xgb_class, params, 344 | scoring=['accuracy'], 345 | cv=5, 346 | refit='accuracy', 347 | verbose=4) 348 | search_xgb = clf_xgb.fit(x_train_combined, y_train_combined) 349 | 350 | # Write fitted and tuned model to file 351 | joblib.dump(search_xgb, "./classifierModelTuned_xgb.joblib", compress=9) 352 | print('XGBoost Classifier - best params: ', search_xgb.best_params_) 353 | self.xgb_class = search_xgb 354 | prediction = self.xgb_class.predict(self.x_test) 355 | print('Confusion Matrix: \n',confusion_matrix(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))) # Display confusion matrix 356 | print(f'Model accuracy on test data: {accuracy_score(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))}') # Display accuracy score 357 | 358 | else: 359 | print('Load tuned XGBoost Classifier') 360 | # load XGBoost Model 361 | self.xgb_class = joblib.load("./classifierModelTuned_xgb.joblib") 362 | prediction = self.xgb_class.predict(self.x_test) 363 | print('Confusion Matrix on test data: \n',confusion_matrix(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))) # Display confusion matrix 364 | print(f'Model accuracy on test data: {accuracy_score(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))}') # Display accuracy score 365 | with open("output_xgb.txt", "w") as file: 366 | file.write('Confusion Matrix on test data: \n') 367 | file.write(str(confusion_matrix(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1)))) 368 | file.write('\n') 369 | file.write(f'Model accuracy on test data: {accuracy_score(np.argmax(self.y_test, axis=1), np.argmax(prediction, axis=1))}') 370 | file.write('\n') 371 | y_proba = self.xgb_class.predict_proba(self.x_test) 372 | fpr, tpr, thresholds = roc_curve(np.argmax(self.y_test, axis=1), y_proba[:, 1]) #use the class-1 probability; argmax labels would collapse the ROC curve to a single point 373 | plt.figure() 374 | plt.plot(fpr, tpr) 375 | plt.xlabel('False Positive Rate') 376 | plt.ylabel('True Positive Rate') 377 | plt.title('ROC Curve') 378 | plt.savefig('ROC_curve_class.png', dpi=300) 379 | plt.close() 380 | else: 381 | self.xgb_class = joblib.load("./classifierModelTuned_xgb.joblib") 382 | 383 | 384 | def deep_learn_analysis(self): 385 | if not
os.path.exists('binary_keras_deep.h5'): 386 | tuner = RandomSearch( 387 | lambda hp: create_sequential_model(hp, self.x_train.shape[1], 2), 388 | objective='val_loss', 389 | max_trials=10, 390 | directory='cbb_sequential_hp', 391 | project_name='sequential_hyperparameter_tuning', 392 | ) 393 | 394 | early_stopping = EarlyStopping(monitor='val_loss', patience=9, restore_best_weights=True) 395 | reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6) 396 | tuner.search(x=self.x_train, y=self.y_train, 397 | epochs=200, 398 | validation_data=(self.x_validation, self.y_validation), 399 | callbacks=[early_stopping, reduce_lr]) 400 | 401 | # best_hps = tuner.get_best_hyperparameters(num_trials=1)[0] 402 | best_model = tuner.get_best_models(num_models=1)[0] 403 | 404 | # Fit tuned model 405 | loss_final = float(100) 406 | for i in tqdm(range(15)): 407 | best_model.fit(self.x_train, self.y_train, 408 | epochs=200, 409 | validation_data=(self.x_validation, self.y_validation), 410 | callbacks=[early_stopping, reduce_lr]) 411 | loss, acc = best_model.evaluate(self.x_test, self.y_test) 412 | if loss < loss_final: 413 | self.final_model_deep = best_model; loss_final = loss #update the best loss seen so far, otherwise every iteration below 100 overwrites the model 414 | loss, acc = self.final_model_deep.evaluate(self.x_test, self.y_test) 415 | print(f'Final model test loss {loss} and accuracy {acc}') 416 | with open("output_deep_learn.txt", "w") as file: 417 | file.write(f'Final model test loss {loss} and accuracy {acc}') 418 | file.write('\n') 419 | self.final_model_deep.save('binary_keras_deep.h5') 420 | else: 421 | self.final_model_deep = load_model('binary_keras_deep.h5') 422 | 423 | def predict_two_teams(self): 424 | teams_sports_ref = read_csv('teams_sports_ref_format.csv') 425 | while True: 426 | # try: 427 | team_1 = input('team_1: ') 428 | if team_1 == 'exit': 429 | break 430 | team_2 = input('team_2: ') 431 | #Game location 432 | game_loc_team1 = int(input(f'{team_1} : home = 0, away = 1, neutral = 2: ')) 433 | if game_loc_team1 == 0: 434 | game_loc_team2 = 1 435 | elif game_loc_team1 == 1: 436 | game_loc_team2 = 0 437 | elif game_loc_team1 == 2: 438 | game_loc_team2 = 2 439 | #Check to see if the team was spelled right 440 | team_1 = get_close_matches(team_1,teams_sports_ref['teams'].tolist(),n=1)[0] 441 | team_2 = get_close_matches(team_2,teams_sports_ref['teams'].tolist(),n=1)[0] 442 | #2024 data 443 | year = 2024 444 | # sleep(4) 445 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs.html' 446 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs-advanced.html' 447 | team_1_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_1.lower(),year) 448 | sleep(4) #I can get banned for a small period of time if I do not do this 449 | basic = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs.html' 450 | adv = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs-advanced.html' 451 | team_2_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_2.lower(),year) 452 | #Remove empty cells 453 | team_1_df2023['pts'].replace('', np.nan, inplace=True) 454 | team_1_df2023.replace('', np.nan, inplace=True) 455 | team_1_df2023.dropna(inplace=True) 456 | team_2_df2023['pts'].replace('', np.nan, inplace=True) 457 | team_2_df2023.replace('', np.nan, inplace=True) 458 | team_2_df2023.dropna(inplace=True) 459 | for col in team_1_df2023.columns: 460 | team_1_df2023[col]
= team_1_df2023[col].astype(float) 461 | for col in team_2_df2023.columns: 462 | team_2_df2023[col] = team_2_df2023[col].astype(float) 463 | 464 | #Align both teams' game logs to the same number of games 465 | if len(team_1_df2023) > len(team_2_df2023): 466 | team_1_df2023 = team_1_df2023.tail(len(team_2_df2023)) 467 | elif len(team_2_df2023) > len(team_1_df2023): 468 | team_2_df2023 = team_2_df2023.tail(len(team_1_df2023)) 469 | 470 | team_1_df2023 = team_1_df2023.reset_index(drop=True) 471 | team_2_df2023 = team_2_df2023.reset_index(drop=True) 472 | team_1_df_copy = team_1_df2023.copy() 473 | team_2_df_copy = team_2_df2023.copy() 474 | #replace team 1 opp data with team 2 475 | for index, row in team_1_df2023.iterrows(): 476 | for col in team_1_df2023.columns: 477 | if "opp" in col: 478 | if col == 'opp_trb': 479 | team_1_df2023.at[index, 'opp_trb'] = team_2_df2023.at[index, 'total_board'] 480 | else: 481 | new_col = col.replace("opp_", "") 482 | team_1_df2023.at[index, col] = team_2_df2023.at[index, new_col] 483 | 484 | #replace team 2 opp data with team 1 485 | for index, row in team_2_df_copy.iterrows(): 486 | for col in team_2_df_copy.columns: 487 | if "opp" in col: 488 | if col == 'opp_trb': 489 | team_2_df_copy.at[index, 'opp_trb'] = team_1_df_copy.at[index, 'total_board'] 490 | else: 491 | new_col = col.replace("opp_", "") 492 | team_2_df_copy.at[index, col] = team_1_df_copy.at[index, new_col] 493 | 494 | 495 | #Remove pts and game result 496 | # for col in team_1_df2023.columns: 497 | # if 'opp' in col: 498 | # team_1_df2023.drop(columns=col,inplace=True) 499 | # for col in team_2_df2023.columns: 500 | # if 'opp' in col: 501 | # team_2_df2023.drop(columns=col,inplace=True) 502 | if self.which_analysis == 'pca': 503 | team_1_df2023.drop(columns=['game_result'],inplace=True) 504 | team_2_df_copy.drop(columns=['game_result'],inplace=True) 505 | team_1_df2023 = self.scaler.transform(team_1_df2023) 506 | team_2_df_copy = self.scaler.transform(team_2_df_copy) 507 | team_1_df2023 = self.pca.transform(team_1_df2023) 508 | team_2_df_copy = self.pca.transform(team_2_df_copy) 509 | 510 | #make df for other analysis 511 | team_1_df_separate = DataFrame(team_1_df2023).abs() 512 | team_2_df_separate = DataFrame(team_2_df_copy).abs() 513 | prop_1 = team_1_df_separate.std() / team_1_df_separate.mean() 514 | prop_2 = team_2_df_separate.std() / team_2_df_separate.mean() 515 | else: 516 | team_1_df2023.drop(columns=['game_result'],inplace=True) 517 | team_2_df_copy.drop(columns=['game_result'],inplace=True) 518 | #Drop the correlated features 519 | team_1_df2023.drop(columns=self.drop_cols, inplace=True) 520 | team_2_df_copy.drop(columns=self.drop_cols, inplace=True) 521 | 522 | team_1_df2023 = self.scaler.transform(team_1_df2023) 523 | team_2_df_copy = self.scaler.transform(team_2_df_copy) 524 | 525 | team_1_df2023 = self.min_max_scaler.transform(team_1_df2023) 526 | team_2_df_copy = self.min_max_scaler.transform(team_2_df_copy) 527 | 528 | team_1_df2023 = DataFrame(team_1_df2023,columns=self.cols_save) 529 | team_2_df_copy = DataFrame(team_2_df_copy,columns=self.cols_save) 530 | prop_1 = team_1_df2023.abs().std() / team_1_df2023.abs().mean(); prop_2 = team_2_df_copy.abs().std() / team_2_df_copy.abs().mean() #mirror the pca branch: process team_2_df_copy (the opp-replaced frame) and define prop_1/prop_2 so the downstream code works on this path too 531 | ma_range = np.arange(2,10,1) #2 was the most correct value for mean and 8 was the best for the median; chose 9 for tiebreaking 532 | # team_1_count = 0 533 | # team_2_count = 0 534 | # team_1_count_mean = 0 535 | # team_2_count_mean = 0 536 | team_1_ma_win = [] 537 | team_2_ma_win = [] 538 | random_pred_1, random_pred_2 = [], [] 539 | random_pred_1_monte, random_pred_2_monte = [], [] 540 | qt_best_team_1, qt_best_team_2 = [], [] 541 | qt_worst_team_1, qt_worst_team_2
= [], [] 542 | #get latest SRS value 543 | team_1_srs = cbb_web_scraper.get_latest_srs(team_1) 544 | team_2_srs = cbb_web_scraper.get_latest_srs(team_2) 545 | 546 | #Monte Carlo simulation 547 | num_simulations = 1000 548 | mean_1 = np.mean(team_1_df2023, axis=0) 549 | std_1 = np.std(team_1_df2023, axis=0) 550 | mean_2 = np.mean(team_2_df_copy, axis=0) 551 | std_2 = np.std(team_2_df_copy, axis=0) 552 | for _ in tqdm(range(num_simulations)): 553 | random_stats_team_1 = np.random.normal(mean_1, std_1, size=(1,team_1_df2023.shape[1])) 554 | random_stats_team_2 = np.random.normal(mean_2, std_2, size=(1,team_2_df_copy.shape[1])) 555 | random_stats_team_1 = random_stats_team_1[0] 556 | random_stats_team_2 = random_stats_team_2[0] 557 | outcome_team_1 = self.xgb_class.predict_proba([random_stats_team_1]) 558 | outcome_deep_1 = self.final_model_deep.predict([np.expand_dims(random_stats_team_1, axis=0)]) 559 | outcome_team_2 = self.xgb_class.predict_proba([random_stats_team_2]) 560 | outcome_deep_2 = self.final_model_deep.predict([np.expand_dims(random_stats_team_2, axis=0)]) 561 | random_pred_1_monte.append(outcome_team_1[0][1]) 562 | random_pred_1_monte.append(outcome_deep_1[0][1]) 563 | random_pred_2_monte.append(outcome_team_1[0][0]) 564 | random_pred_2_monte.append(outcome_deep_1[0][0]) 565 | random_pred_2_monte.append(outcome_team_2[0][1]) 566 | random_pred_2_monte.append(outcome_deep_2[0][1]) 567 | random_pred_1_monte.append(outcome_team_2[0][0]) 568 | random_pred_1_monte.append(outcome_deep_2[0][0]) 569 | 570 | #every game of one team vs every game for other team 571 | for _ in tqdm(range(len(team_1_df2023) * 30)): 572 | if self.which_analysis == 'pca': 573 | random_row_df1 = team_1_df2023[np.random.choice(len(team_1_df2023), size=1),:] 574 | random_row_df2 = team_2_df_copy[np.random.choice(len(team_2_df_copy), size=1),:] 575 | else: 576 | random_row_df1 = team_1_df2023.sample(n=1) 577 | random_row_df2 = team_2_df_copy.sample(n=1) 578 | # random_row_df2 = team_2_df2023.sample(n=1) 579 | 580 | # for col in random_row_df1.columns: 581 | # if "opp" in col: 582 | # if col == 'opp_trb': 583 | # random_row_df1.at[random_row_df1.index[0], 'opp_trb'] = random_row_df2.at[random_row_df2.index[0], 'total_board'] 584 | # else: 585 | # new_col = col.replace("opp_", "") 586 | # random_row_df1.at[random_row_df1.index[0], col] = random_row_df2.at[random_row_df2.index[0], new_col] 587 | outcome_team_1 = self.xgb_class.predict_proba(random_row_df1) 588 | outcome_team_2 = self.xgb_class.predict_proba(random_row_df2) 589 | outcome_deep_1 = self.final_model_deep.predict(random_row_df1) 590 | outcome_deep_2 = self.final_model_deep.predict(random_row_df2) 591 | 592 | #team 1 win percentage [lose win] 593 | random_pred_1.append(outcome_team_1[0][1]) 594 | random_pred_1.append(outcome_deep_1[0][1]) 595 | random_pred_2.append(outcome_team_1[0][0]) 596 | random_pred_2.append(outcome_deep_1[0][0]) 597 | #team 2 win percentage [lose win] 598 | random_pred_2.append(outcome_team_2[0][1]) 599 | random_pred_2.append(outcome_deep_2[0][1]) 600 | random_pred_1.append(outcome_team_2[0][0]) 601 | random_pred_1.append(outcome_deep_2[0][0]) 602 | 603 | #rolling average predictions 604 | team_1_df2023 = DataFrame(team_1_df2023) 605 | team_2_df_copy = DataFrame(team_2_df_copy) 606 | for ma in tqdm(ma_range): 607 | # TEAM 1 608 | data1_mean = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean() 609 | data2_mean = team_2_df_copy.ewm(span=ma,min_periods=ma-1).mean() 610 | 611 | outcome =
self.xgb_class.predict_proba(data1_mean.iloc[-1:].values) 612 | outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:].values) 613 | outcome2 = self.xgb_class.predict_proba(data2_mean.iloc[-1:].values) 614 | outcome_deep2 = self.final_model_deep.predict(data2_mean.iloc[-1:].values) 615 | 616 | team_1_ma_win.append(outcome[0][1]) 617 | team_1_ma_win.append(outcome_deep[0][1]) 618 | team_2_ma_win.append(outcome[0][0]) 619 | team_2_ma_win.append(outcome_deep[0][0]) 620 | 621 | team_1_ma_win.append(outcome2[0][0]) 622 | team_1_ma_win.append(outcome_deep2[0][0]) 623 | team_2_ma_win.append(outcome2[0][1]) 624 | team_2_ma_win.append(outcome_deep2[0][1]) 625 | 626 | #quantile predictions - both play at their bests 627 | for ma in tqdm(ma_range): 628 | # TEAM 1 629 | data1_mean = team_1_df2023.rolling(window=ma).quantile(0.75) 630 | # data1_mean['game_loc'] = game_loc_team1 631 | data2_mean = team_2_df_copy.rolling(window=ma).quantile(0.75) 632 | # data2_mean['game_loc'] = game_loc_team2 633 | #get latest SRS value 634 | # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs 635 | # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs 636 | outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:].values) 637 | outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:].values) 638 | outcome2 = self.xgb_class.predict_proba(data2_mean.iloc[-1:].values) 639 | outcome_deep2 = self.final_model_deep.predict(data2_mean.iloc[-1:].values) 640 | 641 | qt_best_team_1.append(outcome[0][1]) 642 | qt_best_team_1.append(outcome_deep[0][1]) 643 | qt_best_team_2.append(outcome[0][0]) 644 | qt_best_team_2.append(outcome_deep[0][0]) 645 | 646 | qt_best_team_1.append(outcome2[0][0]) 647 | qt_best_team_1.append(outcome_deep2[0][0]) 648 | qt_best_team_2.append(outcome2[0][1]) 649 | qt_best_team_2.append(outcome_deep2[0][1]) 650 | 651 | #quantile predictions - both play at their worsts 652 | for ma in tqdm(ma_range): 653 | # TEAM 1 654 | data1_mean = team_1_df2023.rolling(window=ma).quantile(0.25) 655 | # data1_mean['game_loc'] = game_loc_team1 656 | data2_mean = team_2_df_copy.rolling(window=ma).quantile(0.25) 657 | # data2_mean['game_loc'] = game_loc_team2 658 | #get latest SRS value 659 | # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs 660 | # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs 661 | # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1) 662 | outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:].values) 663 | outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:].values) 664 | outcome2 = self.xgb_class.predict_proba(data2_mean.iloc[-1:].values) 665 | outcome_deep2 = self.final_model_deep.predict(data2_mean.iloc[-1:].values) 666 | 667 | qt_worst_team_1.append(outcome[0][1]) 668 | qt_worst_team_1.append(outcome_deep[0][1]) 669 | qt_worst_team_2.append(outcome[0][0]) 670 | qt_worst_team_2.append(outcome_deep[0][0]) 671 | 672 | qt_worst_team_1.append(outcome2[0][0]) 673 | qt_worst_team_1.append(outcome_deep2[0][0]) 674 | qt_worst_team_2.append(outcome2[0][1]) 675 | qt_worst_team_2.append(outcome_deep2[0][1]) 676 | 677 | ###########TEAM 2 VS TEAM 1################### 678 | # temp = team_1_df2023 679 | # team_1_df2023 = team_2_df2023 680 | # team_2_df2023 = temp 681 | 682 | # if game_loc_team1 == 1: 683 | # game_loc_team1 = 0 684 | # elif game_loc_team1 == 0: 685 | # game_loc_team1 = 1 686 | # if game_loc_team2 == 0: 687 | # game_loc_team2 = 1 688 
| # elif game_loc_team2 == 1: 689 | # game_loc_team2 = 0 690 | 691 | # #get latest SRS value - flip them 692 | # team_1_srs = cbb_web_scraper.get_latest_srs(team_2) 693 | # team_2_srs = cbb_web_scraper.get_latest_srs(team_1) 694 | # #every game of one team vs every game for other team 695 | # for _ in range(len(team_1_df2023) * 2): 696 | # random_row_df1 = team_1_df2023.sample(n=1) 697 | # random_row_df2 = team_2_df2023.sample(n=1) 698 | 699 | # for col in random_row_df1.columns: 700 | # if "opp" in col: 701 | # if col == 'opp_trb': 702 | # random_row_df1.at[random_row_df1.index[0], 'opp_trb'] = random_row_df2.at[random_row_df2.index[0], 'total_board'] 703 | # else: 704 | # new_col = col.replace("opp_", "") 705 | # random_row_df1.at[random_row_df1.index[0], col] = random_row_df2.at[random_row_df2.index[0], new_col] 706 | 707 | # outcome = self.xgb_class.predict_proba(random_row_df1) 708 | # outcome_deep = self.final_model_deep.predict(random_row_df1) 709 | 710 | # random_pred_1.append(outcome[0][1]) 711 | # random_pred_1.append(outcome_deep[0][1]) 712 | # random_pred_2.append(outcome[0][0]) 713 | # random_pred_2.append(outcome_deep[0][0]) 714 | 715 | # #rolling average predictions 716 | # for ma in tqdm(ma_range): 717 | # # TEAM 1 718 | # data1_mean = team_1_df2023.ewm(span=ma,min_periods=ma-1).mean() 719 | # # data1_mean['game_loc'] = game_loc_team1 720 | # data2_mean = team_2_df2023.ewm(span=ma,min_periods=ma-1).mean() 721 | # # data2_mean['game_loc'] = game_loc_team2 722 | # #Here replace opponent metrics with the features of the second team 723 | # for col in data1_mean.columns: 724 | # if "opp" in col: 725 | # if col == 'opp_trb': 726 | # # new_col = col.replace("opp_", "") 727 | # data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board'] 728 | # else: 729 | # new_col = col.replace("opp_", "") 730 | # data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col] 731 | # #get latest SRS value 732 | # # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs 733 | # # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs 734 | # # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1) 735 | # outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:]) 736 | # outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:]) 737 | 738 | # team_1_ma_win.append(outcome[0][1]) 739 | # team_1_ma_win.append(outcome_deep[0][1]) 740 | # team_2_ma_win.append(outcome[0][0]) 741 | # team_2_ma_win.append(outcome_deep[0][0]) 742 | # #quantile predictions - both play at their bests 743 | # for ma in tqdm(ma_range): 744 | # # TEAM 1 745 | # data1_mean = team_1_df2023.rolling(window=ma).quantile(0.75).iloc[-1:] 746 | # # data1_mean['game_loc'] = game_loc_team1 747 | # data2_mean = team_2_df2023.rolling(window=ma).quantile(0.75).iloc[-1:] 748 | # # data2_mean['game_loc'] = game_loc_team2 749 | # #Here replace opponent metrics with the features of the second team 750 | # for col in data1_mean.columns: 751 | # if "opp" in col: 752 | # if col == 'opp_trb': 753 | # # new_col = col.replace("opp_", "") 754 | # data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board'] 755 | # else: 756 | # new_col = col.replace("opp_", "") 757 | # data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col] 758 | # #get latest SRS value 759 | # # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = 
team_1_srs 760 | # # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs 761 | # # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1) 762 | # outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:]) 763 | # outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:]) 764 | 765 | # qt_best_team_1.append(outcome[0][1]) 766 | # qt_best_team_1.append(outcome_deep[0][1]) 767 | # qt_best_team_2.append(outcome[0][0]) 768 | # qt_best_team_2.append(outcome_deep[0][0]) 769 | 770 | # #quantile predictions - both play at their worsts 771 | # for ma in tqdm(ma_range): 772 | # # TEAM 1 773 | # data1_mean = team_1_df2023.rolling(window=ma).quantile(0.25).iloc[-1:] 774 | # # data1_mean['game_loc'] = game_loc_team1 775 | # data2_mean = team_2_df2023.rolling(window=ma).quantile(0.25).iloc[-1:] 776 | # # data2_mean['game_loc'] = game_loc_team2 777 | # #Here replace opponent metrics with the features of the second team 778 | # for col in data1_mean.columns: 779 | # if "opp" in col: 780 | # if col == 'opp_trb': 781 | # # new_col = col.replace("opp_", "") 782 | # data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board'] 783 | # else: 784 | # new_col = col.replace("opp_", "") 785 | # data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col] 786 | # #get latest SRS value 787 | # # data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs 788 | # # data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs 789 | # # data1_mean['simple_rating_system'].iloc[-1] = cbb_web_scraper.get_latest_srs(team_1) 790 | # outcome = self.xgb_class.predict_proba(data1_mean.iloc[-1:]) 791 | # outcome_deep = self.final_model_deep.predict(data1_mean.iloc[-1:]) 792 | 793 | # qt_worst_team_1.append(outcome[0][1]) 794 | # qt_worst_team_1.append(outcome_deep[0][1]) 795 | # qt_worst_team_2.append(outcome[0][0]) 796 | # qt_worst_team_2.append(outcome_deep[0][0]) 797 | 798 | # #reflip for printing 799 | # team_1_srs = cbb_web_scraper.get_latest_srs(team_1) 800 | # team_2_srs = cbb_web_scraper.get_latest_srs(team_2) 801 | print('===============================================================') 802 | if team_1_srs > team_2_srs: 803 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} SRS data: {team_1_srs}'+ Style.RESET_ALL) 804 | print(Fore.RED + Style.BRIGHT + f'{team_2} SRS data: {team_2_srs}'+ Style.RESET_ALL) 805 | else: 806 | print(Fore.RED + Style.BRIGHT + f'{team_1} SRS data: {team_1_srs}'+ Style.RESET_ALL) 807 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} SRS data: {team_2_srs}'+ Style.RESET_ALL) 808 | print('===============================================================') 809 | if np.mean(prop_1.sum()) < np.mean(prop_2.sum()): 810 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} summed variability: {prop_1.sum()}'+ Style.RESET_ALL) 811 | print(Fore.RED + Style.BRIGHT + f'{team_2} summed variability: {prop_2.sum()}'+ Style.RESET_ALL) 812 | else: 813 | print(Fore.RED + Style.BRIGHT + f'{team_1} summed variability: {prop_1.sum()}'+ Style.RESET_ALL) 814 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} summed variability: {prop_2.sum()}'+ Style.RESET_ALL) 815 | print('===============================================================') 816 | if np.mean(team_1_ma_win) > np.mean(team_2_ma_win): 817 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities: {np.mean(team_1_ma_win)}'+ Style.RESET_ALL) 818 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win 
probabilities: {np.mean(team_2_ma_win)}'+ Style.RESET_ALL) 819 | else: 820 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities: {np.mean(team_1_ma_win)}'+ Style.RESET_ALL) 821 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities: {np.mean(team_2_ma_win)}'+ Style.RESET_ALL) 822 | print('===============================================================') 823 | if np.mean(qt_best_team_1) > np.mean(qt_best_team_2): 824 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities if they play at their best: {np.mean(qt_best_team_1)}'+ Style.RESET_ALL) 825 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win probabilities if they play at their best: {np.mean(qt_best_team_2)}'+ Style.RESET_ALL) 826 | else: 827 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities if they play at their best: {np.mean(qt_best_team_1)}'+ Style.RESET_ALL) 828 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities if they play at their best: {np.mean(qt_best_team_2)}'+ Style.RESET_ALL) 829 | print('===============================================================') 830 | if np.mean(qt_worst_team_1) > np.mean(qt_worst_team_2): 831 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities if they play at their worst: {np.mean(qt_worst_team_1)}'+ Style.RESET_ALL) 832 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win probabilities if they play at their worst: {np.mean(qt_worst_team_2)}'+ Style.RESET_ALL) 833 | else: 834 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities if they play at their worst: {np.mean(qt_worst_team_1)}'+ Style.RESET_ALL) 835 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities if they play at their worst: {np.mean(qt_worst_team_2)}'+ Style.RESET_ALL) 836 | print('===============================================================') 837 | if np.mean(random_pred_1) > np.mean(random_pred_2): 838 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities randomly selecting games: {np.mean(random_pred_1)}'+ Style.RESET_ALL) 839 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win probabilities randomly selecting games: {np.mean(random_pred_2)}'+ Style.RESET_ALL) 840 | else: 841 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities randomly selecting games: {np.mean(random_pred_1)}'+ Style.RESET_ALL) 842 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities randomly selecting games: {np.mean(random_pred_2)}'+ Style.RESET_ALL) 843 | print('===============================================================') 844 | if np.mean(random_pred_1_monte) > np.mean(random_pred_2_monte): 845 | print(Fore.GREEN + Style.BRIGHT + f'{team_1} average win probabilities Monte Carlo Simulation: {np.mean(random_pred_1_monte)}'+ Style.RESET_ALL) 846 | print(Fore.RED + Style.BRIGHT + f'{team_2} average win probabilities Monte Carlo Simulation: {np.mean(random_pred_2_monte)}'+ Style.RESET_ALL) 847 | else: 848 | print(Fore.RED + Style.BRIGHT + f'{team_1} average win probabilities Monte Carlo Simulation: {np.mean(random_pred_1_monte)}'+ Style.RESET_ALL) 849 | print(Fore.GREEN + Style.BRIGHT + f'{team_2} average win probabilities Monte Carlo Simulation: {np.mean(random_pred_2_monte)}'+ Style.RESET_ALL) 850 | 851 | # if "tod" in sys.argv[2]: 852 | # date_today = str(datetime.now().date()).replace("-", "") 853 | # elif "tom" in sys.argv[2]: 854 | # date_today = str(datetime.now().date() + timedelta(days=1)).replace("-", "") 855 | # URL = 
"https://www.espn.com/mens-college-basketball/schedule/_/date/" + date_today #sys argv???? 856 | # print(f'ESPN prediction: {cbb_web_scraper.get_espn(URL,team_1,team_2)}') 857 | print('===============================================================') 858 | # except Exception as e: 859 | # print(f'The error: {e}') 860 | def feature_importances_random_forest(self): 861 | importances = self.RandForclass.best_estimator_.feature_importances_ 862 | indices = np.argsort(importances) 863 | plt.figure() 864 | plt.title('Feature Importances Random Forest - Classifier') 865 | plt.barh(range(len(indices)), importances[indices], color='k', align='center') 866 | plt.yticks(range(len(indices)), [self.x_test.columns[i] for i in indices]) 867 | plt.xlabel('Relative Importance - explained variance') 868 | plt.tight_layout() 869 | plt.savefig('feature_importance_random_forest_classifier.png',dpi=300) 870 | 871 | def feature_importances_xgb(self): 872 | importances = self.xgb_class.best_estimator_.feature_importances_ 873 | indices = np.argsort(importances) 874 | plt.figure(figsize=(10,8)) 875 | plt.title('Feature Importances XGBoost - Classifier') 876 | plt.barh(range(len(indices)), importances[indices], color='k', align='center') 877 | plt.yticks(range(len(indices)), [self.x_test.columns[i] for i in indices]) 878 | plt.xlabel('Relative Importance - explained variance') 879 | plt.tight_layout() 880 | plt.savefig('feature_importance_xgb_classifier.png',dpi=300) 881 | plt.close() 882 | 883 | def deep_learning_feature_importances(self): 884 | model = self.final_model_deep 885 | x_train_array = np.array(self.x_test) 886 | masker = shap.maskers.Independent(data=x_train_array) 887 | explainer = shap.Explainer(model, masker) 888 | shap_values = explainer.shap_values(x_train_array) 889 | feature_importances = np.mean(np.abs(shap_values),axis=0) 890 | shap.summary_plot(feature_importances.T, 891 | feature_names=self.cols_save, 892 | plot_type="bar", 893 | max_display=feature_importances.shape[0], 894 | show=False) 895 | plt.savefig('SHAP_feature_importances.png',dpi=400) 896 | plt.close() 897 | 898 | def run_analysis(self): 899 | self.get_teams() 900 | self.split() 901 | self.deep_learn_analysis() 902 | self.xgboost_analysis() 903 | self.predict_two_teams() 904 | if self.which_analysis != 'pca': 905 | self.feature_importances_xgb() 906 | self.deep_learning_feature_importances() 907 | 908 | def main(): 909 | cbbClass('pca').run_analysis() # 'pca' or 'corr' 910 | if __name__ == '__main__': 911 | main() -------------------------------------------------------------------------------- /cbb_regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | College Basketball Predictions 5 | @author: brianszekely 6 | """ 7 | import cbb_web_scraper 8 | from os import getcwd 9 | from os.path import join, exists 10 | import yaml 11 | from tqdm import tqdm 12 | from time import sleep 13 | from pandas import DataFrame, concat, read_csv, isnull 14 | import numpy as np 15 | from sklearn.model_selection import train_test_split 16 | from sklearn.model_selection import GridSearchCV 17 | from sklearn.ensemble import RandomForestRegressor 18 | import matplotlib.pyplot as plt 19 | import seaborn as sns 20 | from sys import argv 21 | from sklearn.metrics import mean_squared_error, r2_score 22 | # from sklearn.model_selection import cross_val_score, KFold 23 | import pickle 24 | import joblib 25 | import sys 26 | import os 27 | from scipy.stats 
import variation
28 | from difflib import get_close_matches
29 | from datetime import datetime, timedelta
30 | class cbb_regressor():
31 |     def __init__(self):
32 |         print('initialize class cbb_regressor')
33 |         self.all_data = DataFrame()
34 |     def get_teams(self):
35 |         year_list_find = []
36 |         year_list = [2023,2022,2021,2019,2018,2017,2016,2015,2014,2013,2012] #2020 not included
37 |         if exists(join(getcwd(),'year_count.yaml')):
38 |             with open(join(getcwd(),'year_count.yaml')) as file:
39 |                 year_counts = yaml.load(file, Loader=yaml.FullLoader)
40 |         else:
41 |             year_counts = {'year':year_list_find}
42 |         #Remove any years that have already been collected
43 |         if year_counts['year']:
44 |             year_list_check = year_counts['year']
45 |             year_list_find = year_counts['year']
46 |             year_list = [i for i in year_list if i not in year_list_check]
47 |             print(f'Need data for year: {year_list}')
48 |         #Collect data per year
49 |         if year_list:
50 |             for year in tqdm(year_list):
51 |                 all_teams = cbb_web_scraper.get_teams_year(year_list[-1],year_list[0])
52 |                 team_names = sorted(all_teams)
53 |                 final_list = []
54 |                 self.year_store = year
55 |                 for abv in tqdm(team_names):
56 |                     try:
57 |                         print() #newline so tqdm output stays readable
58 |                         print(f'current team: {abv}, year: {year}')
59 |                         basic = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs.html'
60 |                         adv = 'https://www.sports-reference.com/cbb/schools/' + abv + '/' + str(self.year_store) + '-gamelogs-advanced.html'
61 |                         df_inst = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,abv,self.year_store)
62 |                         df_inst['pts'].replace('', np.nan, inplace=True)
63 |                         df_inst.dropna(inplace=True)
64 |                         final_list.append(df_inst)
65 |                     except Exception as e:
66 |                         print(e)
67 |                         print(f'{abv} data are not available')
68 |                     sleep(4) #I can get banned for a short period of time if I do not do this
69 |                 final_data = concat(final_list)
70 |                 if exists(join(getcwd(),'all_data_regressor.csv')):
71 |                     self.all_data = read_csv(join(getcwd(),'all_data_regressor.csv'))
72 |                 self.all_data = concat([self.all_data, final_data.dropna()])
73 |                 if not exists(join(getcwd(),'all_data_regressor.csv')):
74 |                     self.all_data.to_csv(join(getcwd(),'all_data_regressor.csv'),index=False)
75 |                 self.all_data.to_csv(join(getcwd(),'all_data_regressor.csv'),index=False)
76 |                 year_list_find.append(year)
77 |                 print(f'year list after loop: {year_list_find}')
78 |                 with open(join(getcwd(),'year_count.yaml'), 'w') as write_file:
79 |                     yaml.dump(year_counts, write_file)
80 |                     print(f'writing {year} to yaml file')
81 |         else:
82 |             self.all_data = read_csv(join(getcwd(),'all_data_regressor.csv'))
83 |         print('len data: ', len(self.all_data))
84 |         self.all_data = self.all_data.drop_duplicates(keep='last')
85 |         print(f'length of data after duplicates are dropped: {len(self.all_data)}')
86 |     def convert_to_float(self):
87 |         for col in self.all_data.columns:
88 |             self.all_data[col].replace('', np.nan, inplace=True)
89 |             self.all_data[col] = self.all_data[col].astype(float)
90 |     def delete_opp(self):
91 |         """
92 |         Drop opponent columns: they are hard to estimate with a running average and may not help prediction.
93 |         """
94 |         for col in self.all_data.columns:
95 |             if 'opp' in col:
96 |                 self.all_data.drop(columns=col,inplace=True)
97 |     def split(self):
98 |         # self.delete_opp()
99 |         for col in self.all_data.columns:
100 |             if 'Unnamed' in col:
101 |                 self.all_data.drop(columns=col,inplace=True)
102 |         self.convert_to_float()
103 |         self.y = self.all_data['pts']
104 |         self.x = self.all_data.drop(columns=['pts','game_result'])
105 |         self.pre_process()
106 |         #Drop NaN rows and keep y aligned with the remaining samples
107 |         real_values = ~self.x_no_corr.isna().any(axis=1)
108 |         self.x_no_corr.dropna(inplace=True)
109 |         self.y = self.y.loc[real_values]
110 |         self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_no_corr, self.y, train_size=0.8)
111 |     def pre_process(self):
112 |         # Remove features with a correlation coefficient >= 0.90
113 |         corr_matrix = np.abs(self.x.astype(float).corr())
114 |         upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) #plain bool: np.bool was removed from modern NumPy
115 |         to_drop = [column for column in upper.columns if any(upper[column] >= 0.90)]
116 |         self.drop_cols = to_drop
117 |         self.x_no_corr = self.x.drop(columns=to_drop)
118 |         cols = self.x_no_corr.columns
119 |         print(f'Columns dropped >= 0.90: {to_drop}')
120 |         #Drop samples that are outliers
121 |         print(f'old feature dataframe shape before outlier removal: {self.x_no_corr.shape}')
122 |         for col_name in cols:
123 |             Q1 = np.percentile(self.x_no_corr[col_name], 25)
124 |             Q3 = np.percentile(self.x_no_corr[col_name], 75)
125 |             IQR = Q3 - Q1
126 |             #1.5*IQR is the standard fence; 2.5 keeps more samples to see if more data helps model performance
127 |             upper = self.x_no_corr.index[self.x_no_corr[col_name] >= (Q3+2.5*IQR)] #index labels, not positions, so drop() removes the right rows after earlier drops
128 |             lower = self.x_no_corr.index[self.x_no_corr[col_name] <= (Q1-2.5*IQR)]
129 |             self.x_no_corr.drop(upper, inplace = True)
130 |             self.x_no_corr.drop(lower, inplace = True)
131 |             self.y.drop(upper, inplace = True); self.y.drop(lower, inplace = True)
132 |         if 'level_0' in self.x_no_corr.columns:
133 |             self.x_no_corr.drop(columns=['level_0'],inplace = True)
134 |         self.x_no_corr.reset_index(inplace = True)
135 |         self.y.reset_index(inplace = True, drop=True)
136 |         self.x_no_corr.drop(columns=[c for c in ['level_0','index'] if c in self.x_no_corr.columns],inplace = True)
137 |         print(f'new feature dataframe shape after outlier removal: {self.x_no_corr.shape}')
138 |         top_corr_features = corr_matrix.index
139 |         plt.figure(figsize=(20,20))
140 |         sns.heatmap(corr_matrix[top_corr_features],annot=True,cmap="RdYlGn")
141 |         plt.tight_layout()
142 |         plt.savefig('correlations.png',dpi=250)
143 |         plt.close()
144 |     def random_forest_analysis(self):
145 |         if argv[1] == 'tune':
146 |             #RANDOM FOREST REGRESSOR
147 |             RandForclass = RandomForestRegressor()
148 |             #Use the number of features as a stopping criterion for depth
149 |             rows, cols = self.x_train.shape
150 |             cols = int(cols / 1.18) #cap max_depth below the feature count to limit overfitting
151 |             #square root of the total number of features is a good limit
152 |             # cols = int(np.sqrt(cols))
153 |             #parameters to tune
154 |             #increasing min_samples_leaf will reduce overfitting
155 |             Rand_perm = {
156 |                 'criterion' : ["squared_error", "poisson"], #absolute_error takes far longer to run
157 |                 'n_estimators': range(300,500,100),
158 |                 # 'min_samples_split': np.arange(2, 5, 1, dtype=int),
159 |                 'max_features' : [1, 'sqrt', 'log2'],
160 |                 'max_depth': np.arange(2,cols,1),
161 |                 'min_samples_leaf': np.arange(1,3,1)
162 |                 }
163 |             #Other scoring strings accepted by GridSearchCV: ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score',
164 |             # 'average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance',
165 |             # 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score',
166 |             # 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples',
167 |             # 'jaccard_weighted', 'matthews_corrcoef', 'max_error', 'mutual_info_score', 'neg_brier_score',
168 |             # 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error',
169 |             # 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error',
170 |             # 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_root_mean_squared_error',
171 |             # 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'roc_auc_ovo', 'roc_auc_ovo_weighted', 'roc_auc_ovr', 'roc_auc_ovr_weighted', 'top_k_accuracy', 'v_measure_score']
172 |             clf_rand = GridSearchCV(RandForclass, Rand_perm,
173 |                                     scoring=['neg_root_mean_squared_error','explained_variance'],
174 |                                     cv=5,
175 |                                     refit='neg_root_mean_squared_error',verbose=4, n_jobs=-1)
176 |             #fit and save
177 |             search_rand = clf_rand.fit(self.x_train,self.y_train)
178 |             #Write fitted and tuned model to file
179 |             # with open('randomForestModelTuned.pkl','wb') as f:
180 |             #     pickle.dump(search_rand,f)
181 |             joblib.dump(search_rand, "./randomForestModelTuned.joblib", compress=9)
182 |             print('RandomForestRegressor - best params: ',search_rand.best_params_)
183 |             self.RandForRegressor = search_rand
184 |             self.rmse = mean_squared_error(self.y_test,self.RandForRegressor.predict(self.x_test),squared=False)
185 |             print('RMSE: ',self.rmse)
186 |             print('R2 score: ',r2_score(self.y_test,self.RandForRegressor.predict(self.x_test))) #r2_score expects (y_true, y_pred)
187 |         else:
188 |             print('Load tuned Random Forest Regressor')
189 |             # load RandomForestModel
190 |             # with open('randomForestModelTuned.pkl', 'rb') as f:
191 |             #     self.RandForRegressor = pickle.load(f)
192 |             self.RandForRegressor=joblib.load("./randomForestModelTuned.joblib")
193 |             print(f'Current RandomForestRegressor Parameters: {self.RandForRegressor.best_params_}')
194 |             print('RMSE: ',mean_squared_error(self.y_test,self.RandForRegressor.predict(self.x_test),squared=False))
195 |             print('R2 score: ',r2_score(self.y_test,self.RandForRegressor.predict(self.x_test))) #r2_score expects (y_true, y_pred)
196 |             self.rmse = mean_squared_error(self.y_test,self.RandForRegressor.predict(self.x_test),squared=False)
197 |         # self.RandForRegressor = RandomForestRegressor(criterion='squared_error',
198 |         #                                               max_depth=20,
199 |         #                                               max_features='log2',
200 |         #                                               n_estimators=300,
201 |         #                                               min_samples_leaf=3)
202 |     # def multi_layer_perceptron(self):
203 |     #     pass
204 |     # def keras_regressor_analysis(self):
205 |     #     pass
206 |     def predict_two_teams(self):
207 |         teams_sports_ref = read_csv('teams_sports_ref_format.csv')
208 |         while True:
209 |             try:
210 |                 team_1 = input('team_1: ')
211 |                 if team_1 == 'exit':
212 |                     break
213 |                 team_2 = input('team_2: ')
214 |                 #Game location
215 |                 game_loc_team1 = int(input(f'{team_1} : home = 0, away = 1, neutral = 2: '))
216 |                 if game_loc_team1 == 0:
217 |                     game_loc_team2 = 1
218 |                 elif game_loc_team1 == 1:
219 |                     game_loc_team2 = 0
220 |                 elif game_loc_team1 == 2:
221 |                     game_loc_team2 = 2
222 |                 #Check to see if the team name was spelled right
223 |                 team_1 = get_close_matches(team_1,teams_sports_ref['teams'].tolist(),n=1)[0]
224 |                 team_2 = get_close_matches(team_2,teams_sports_ref['teams'].tolist(),n=1)[0]
225 |                 #2023 data
226 |                 year = 2023
227 |                 # sleep(4)
228 |                 basic = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs.html'
229 |                 adv = 'https://www.sports-reference.com/cbb/schools/' + team_1.lower() + '/' + str(year) + '-gamelogs-advanced.html'
230 |                 team_1_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_1.lower(),year)
231 |                 sleep(4) #I can get banned for a short period of time if I do not do this
232 |                 basic = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs.html'
233 |                 adv = 'https://www.sports-reference.com/cbb/schools/' + team_2.lower() + '/' + str(year) + '-gamelogs-advanced.html'
234 |                 team_2_df2023 = cbb_web_scraper.html_to_df_web_scrape_cbb(basic,adv,team_2.lower(),year)
235 |                 #Remove empty cells
236 |                 team_1_df2023['pts'].replace('', np.nan, inplace=True)
237 |                 team_1_df2023.replace('', np.nan, inplace=True)
238 |                 team_1_df2023.dropna(inplace=True)
239 |                 team_2_df2023['pts'].replace('', np.nan, inplace=True)
240 |                 team_2_df2023.replace('', np.nan, inplace=True)
241 |                 team_2_df2023.dropna(inplace=True)
242 |                 #Save series of pts for visualizations
243 |                 self.pts_team_1 = team_1_df2023['pts'].astype(float)
244 |                 self.team_1_name = team_1
245 |                 self.pts_team_2 = team_2_df2023['pts'].astype(float)
246 |                 self.team_2_name = team_2
247 |                 #Remove pts and game result
248 |                 # for col in team_1_df2023.columns:
249 |                 #     if 'opp' in col:
250 |                 #         team_1_df2023.drop(columns=col,inplace=True)
251 |                 # for col in team_2_df2023.columns:
252 |                 #     if 'opp' in col:
253 |                 #         team_2_df2023.drop(columns=col,inplace=True)
254 |                 # team_1_df2023.drop(columns=['game_result','pts'],inplace=True)
255 |                 # team_2_df2023.drop(columns=['game_result','pts'],inplace=True)
256 |                 #Drop the correlated features
257 |                 team_1_df2023.drop(columns=self.drop_cols, inplace=True)
258 |                 team_2_df2023.drop(columns=self.drop_cols, inplace=True)
259 |                 # team_1_df2023.to_csv('team_1.csv')
260 |                 # team_2_df2023.to_csv('team_2.csv')
261 |                 # print(team_1_df2023)
262 |                 # print(team_2_df2023)
263 |                 #Clean up dataframe
264 |                 # for col in team_1_df2023.columns:
265 |                 #     if 'Unnamed' in col:
266 |                 #         team_1_df2023.drop(columns=col,inplace=True)
267 |                 # for col in team_2_df2023.columns:
268 |                 #     if 'Unnamed' in col:
269 |                 #         team_2_df2023.drop(columns=col,inplace=True)
270 |                 #Try to find the moving average windows that work
271 |                 # ma_range = np.arange(2,len(team_2_df2023)-2,1)
272 |                 ma_range = np.arange(2,7,1) #windows of 2-6 games; in testing, 2 worked best for the mean and 8 for the median
273 |                 team_1_count = 0
274 |                 team_2_count = 0
275 |                 team_1_count_mean = 0
276 |                 team_2_count_mean = 0
277 |                 team_1_ma = []
278 |                 team_2_ma = []
279 |                 team_1_median = []
280 |                 team_2_median = []
281 |                 num_pts_score_team_1= []
282 |                 num_pts_score_team_2 = []
283 |                 mean_team_1_var = []
284 |                 mean_team_2_var = []
285 |                 # Get the latest simple rating system for both teams
286 |                 team_1_srs = cbb_web_scraper.get_latest_srs(team_1)
287 |                 team_2_srs = cbb_web_scraper.get_latest_srs(team_2)
288 |                 for ma in tqdm(ma_range):
289 |                     data1_median = team_1_df2023.rolling(ma).median()
290 |                     data1_median['game_loc'] = game_loc_team1
291 |                     data2_median = team_2_df2023.rolling(ma).median()
292 |                     data2_median['game_loc'] = game_loc_team2
293 |                     # data1_mean_old = team_1_df2023.rolling(ma).mean()
294 |                     # data2_mean_old = team_2_df2023.rolling(ma).mean()
295 |                     data1_mean =
team_1_df2023.ewm(span=ma,min_periods=ma-1).mean() 296 | data1_mean['game_loc'] = game_loc_team1 297 | data2_mean = team_2_df2023.ewm(span=ma,min_periods=ma-1).mean() 298 | data2_mean['game_loc'] = game_loc_team2 299 | for col in team_1_df2023.columns: 300 | if "opp" in col: 301 | if col == 'opp_trb': 302 | # new_col = col.replace("opp_", "") 303 | data1_mean.loc[data1_mean.index[-1], 'opp_trb'] = data2_mean.loc[data2_mean.index[-1], 'total_board'] 304 | data2_mean.loc[data2_mean.index[-1], 'opp_trb'] = data1_mean.loc[data1_mean.index[-1], 'total_board'] 305 | 306 | data1_median.loc[data1_median.index[-1], 'opp_trb'] = data2_median.loc[data2_median.index[-1], 'total_board'] 307 | data2_median.loc[data2_median.index[-1], 'opp_trb'] = data1_median.loc[data1_median.index[-1], 'total_board'] 308 | else: 309 | new_col = col.replace("opp_", "") 310 | data1_mean.loc[data1_mean.index[-1], col] = data2_mean.loc[data2_mean.index[-1], new_col] 311 | data2_mean.loc[data2_mean.index[-1], col] = data1_mean.loc[data1_mean.index[-1], new_col] 312 | 313 | data1_median.loc[data1_median.index[-1], col] = data2_median.loc[data2_median.index[-1], new_col] 314 | data2_median.loc[data2_median.index[-1], col] = data1_median.loc[data1_median.index[-1], new_col] 315 | 316 | #Drop game result and points features 317 | data1_median.drop(columns=['game_result','pts'],inplace=True) 318 | data2_median.drop(columns=['game_result','pts'],inplace=True) 319 | data1_mean.drop(columns=['game_result','pts'],inplace=True) 320 | data2_mean.drop(columns=['game_result','pts'],inplace=True) 321 | #apply SRS 322 | data1_mean.loc[data1_mean.index[-1], 'simple_rating_system'] = team_1_srs 323 | data2_mean.loc[data2_mean.index[-1], 'simple_rating_system'] = team_2_srs 324 | data1_median.loc[data1_median.index[-1], 'simple_rating_system'] = team_1_srs 325 | data2_median.loc[data2_median.index[-1], 'simple_rating_system'] = team_2_srs 326 | #Get current predictions for both teams 327 | team_1_predict_median = self.RandForRegressor.predict(data1_median.iloc[-1:]) 328 | team_2_predict_median = self.RandForRegressor.predict(data2_median.iloc[-1:]) 329 | team_1_predict_mean = self.RandForRegressor.predict(data1_mean.iloc[-1:]) 330 | team_2_predict_mean = self.RandForRegressor.predict(data2_mean.iloc[-1:]) 331 | num_pts_score_team_1.append(team_1_predict_mean[0]) 332 | num_pts_score_team_2.append(team_2_predict_mean[0]) 333 | num_pts_score_team_1.append(team_1_predict_median[0]) 334 | num_pts_score_team_2.append(team_2_predict_median[0]) 335 | if team_1_predict_median > team_2_predict_median: 336 | team_1_count += 1 337 | team_1_median.append(ma) 338 | if team_1_predict_median < team_2_predict_median: 339 | team_2_count += 1 340 | team_2_median.append(ma) 341 | if team_1_predict_mean > team_2_predict_mean: 342 | team_1_count_mean += 1 343 | team_1_ma.append(ma) 344 | if team_1_predict_mean < team_2_predict_mean: 345 | team_2_count_mean += 1 346 | team_2_ma.append(ma) 347 | #check variability between fg and off_ftg 348 | mean_team_1_var.append(np.mean(data1_mean[['fg','off_rtg']].dropna().std())) 349 | mean_team_1_var.append(np.mean(data1_median[['fg','off_rtg']].dropna().std())) 350 | mean_team_2_var.append(np.mean(data2_mean[['fg','off_rtg']].dropna().std())) 351 | mean_team_2_var.append(np.mean(data2_median[['fg','off_rtg']].dropna().std())) 352 | print('===============================================================') 353 | print(f'{team_1} SRS data: {team_1_srs}') 354 | print(f'{team_2} SRS data: {team_2_srs}') 355 | 
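                #How the loop above scores the matchup, in brief: each window size ma yields one
                #smoothed "current form" row per team (EWM mean and rolling median, with the
                #opponent columns swapped in from the other team and the latest SRS applied), and
                #the regressor predicts points from that row. A minimal sketch of the idea
                #(illustrative only; 'form' and 'pts_hat' are hypothetical names, not part of this file):
                #    form = team_1_df2023.ewm(span=ma, min_periods=ma-1).mean().iloc[-1:]
                #    pts_hat = self.RandForRegressor.predict(form)
                #The winner is whichever team out-scores the other across the majority of window
                #sizes, with the mean- and median-smoothed tallies kept separately.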
print('===============================================================')
356 |                 print(f'Outcomes with a rolling median from 2-{ma_range[-1]} games')
357 |                 print(f'{team_1}: {team_1_count} | {team_1_median}')
358 |                 print(f'{team_2}: {team_2_count} | {team_2_median}')
359 |                 if team_1_count > team_2_count:
360 |                     print(f'======= {team_1} wins =======')
361 |                 elif team_1_count < team_2_count:
362 |                     print(f'======= {team_2} wins =======')
363 |                 print('===============================================================')
364 |                 print(f'Outcomes with an exponentially weighted mean over spans of 2-{ma_range[-1]} games')
365 |                 print(f'{team_1}: {team_1_count_mean} | {team_1_ma}')
366 |                 print(f'{team_2}: {team_2_count_mean} | {team_2_ma}')
367 |                 if team_1_count_mean > team_2_count_mean:
368 |                     print(f'======= {team_1} wins =======')
369 |                 elif team_1_count_mean < team_2_count_mean:
370 |                     print(f'======= {team_2} wins =======')
371 |                 print('===============================================================')
372 |                 print(f'{team_1} predicted points scored: {int(np.mean(num_pts_score_team_1))} +/- {np.std(num_pts_score_team_1)}')
373 |                 print(f'{team_2} predicted points scored: {int(np.mean(num_pts_score_team_2))} +/- {np.std(num_pts_score_team_2)}')
374 |                 if abs(int(np.mean(num_pts_score_team_1)) - int(np.mean(num_pts_score_team_2))) < 3: #3 points is roughly self.rmse
375 |                     print('The point differential is less than the model RMSE (~3 points), be cautious.')
376 |                 print('===============================================================')
377 |                 print(f'Mean standard deviation of the two most important features for {team_1}: {np.mean(mean_team_1_var)}')
378 |                 print(f'Mean standard deviation of the two most important features for {team_2}: {np.mean(mean_team_2_var)}')
379 |                 print('===============================================================')
380 |                 print(f'Standard deviation of points scored by {team_1}: {np.std(self.pts_team_1)}')
381 |                 print(f'Standard deviation of points scored by {team_2}: {np.std(self.pts_team_2)}')
382 |                 print('===============================================================')
383 |                 if "tod" in sys.argv[2]:
384 |                     date_today = str(datetime.now().date()).replace("-", "")
385 |                 elif "tom" in sys.argv[2]:
386 |                     date_today = str(datetime.now().date() + timedelta(days=1)).replace("-", "")
387 |                 URL = "https://www.espn.com/mens-college-basketball/schedule/_/date/" + date_today #date comes from sys.argv[2]: 'tod' = today, 'tom' = tomorrow
388 |                 print(f'ESPN prediction: {cbb_web_scraper.get_espn(URL,team_1,team_2)}')
389 |                 print('===============================================================')
390 |                 if sys.argv[2] == "show":
391 |                     self.visualization(np.mean(num_pts_score_team_1),np.mean(num_pts_score_team_2))
392 |             except Exception as e:
393 |                 print(f'The error: {e}')
394 |                 exc_type, exc_obj, exc_tb = sys.exc_info()
395 |                 fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
396 |                 print(exc_type,' File with the error: ', fname, ' Line number with error: ',exc_tb.tb_lineno)
397 |                 if exc_tb.tb_lineno == 226:
398 |                     print(f'{team_1} data could not be found. Check spelling or internet connection. Some teams do not have data on SportsReference')
399 |                 elif exc_tb.tb_lineno == 229:
400 |                     print(f'{team_2} data could not be found. Check spelling or internet connection.
Some teams do not have data on SportsReference') 401 | def feature_importances_random_forest(self): 402 | importances = self.RandForRegressor.best_estimator_.feature_importances_ 403 | indices = np.argsort(importances) 404 | plt.figure(figsize=(12,10)) 405 | plt.title('Feature Importances Random Forest') 406 | # plt.barh(range(len(indices)), importances[indices], color='k', align='center') 407 | sns.barplot(x=importances[indices], y=[self.x_test.columns[i] for i in indices], color='k') 408 | plt.yticks(range(len(indices)), [self.x_test.columns[i] for i in indices]) 409 | plt.xlabel('Relative Importance') 410 | plt.tight_layout() 411 | plt.savefig('feature_importance_random_forest.png',dpi=300) 412 | def visualization(self,pred_1,pred_2): 413 | games_1 = range(1,len(self.pts_team_1)+1,1) 414 | games_2 = range(1,len(self.pts_team_2)+1,1) 415 | team_1_pred = self.team_1_name + " prediction" 416 | team_2_pred = self.team_2_name + " prediction" 417 | plt.figure() 418 | plt.plot(games_1,self.pts_team_1,color='green',label=self.team_1_name) 419 | plt.plot(games_2,self.pts_team_2,color='blue',label=self.team_2_name) 420 | plt.scatter(len(self.pts_team_1)+2,pred_1,color='green',label=team_1_pred) 421 | plt.scatter(len(self.pts_team_2)+2,pred_2,color='blue',label=team_2_pred) 422 | plt.legend() 423 | plt.xlabel('Games') 424 | plt.ylabel('Points') 425 | plt.tight_layout() 426 | plt.show() 427 | def run_analysis(self): 428 | self.get_teams() 429 | self.split() 430 | self.random_forest_analysis() 431 | self.predict_two_teams() 432 | self.feature_importances_random_forest() 433 | def main(): 434 | cbb_regressor().run_analysis() 435 | if __name__ == '__main__': 436 | main() -------------------------------------------------------------------------------- /cbb_web_scraper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | html parse code - college basketball 5 | @author: brianszekely 6 | """ 7 | import requests 8 | from bs4 import BeautifulSoup 9 | from pandas import DataFrame 10 | from numpy import nan 11 | from time import sleep 12 | from os.path import join, exists 13 | from os import getcwd 14 | from urllib import request 15 | from urllib.request import Request, urlopen 16 | from pandas import read_csv 17 | from numpy import where 18 | from re import search 19 | from difflib import get_close_matches 20 | from datetime import datetime 21 | from numpy import nan 22 | #TODO: CREATE A FEATURE OF opp_simple_rating_system 23 | 24 | def get_teams_year(year_min,year_max): 25 | #Try to redo this when 429 is not an issue 26 | # URL = 'https://www.sports-reference.com/cbb/schools/' 27 | # hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"} 28 | # req = Request(URL,headers=hdr) 29 | # html = request.urlopen(req) 30 | # soup = BeautifulSoup(html, "html.parser") 31 | # table = soup.find(class_="table_container is_setup") 32 | # print(soup) 33 | # input() 34 | #Read in from csv 35 | teams_save = [] 36 | teams = read_csv('all_teams_cbb.csv') 37 | teams_with_year = where((teams['From'] <= year_min) & (teams['To'] == year_max))[0] 38 | for team in teams['School'].iloc[teams_with_year]: 39 | team = team.replace(' ', '-').lower() 40 | if '.' 
in team:
41 |             team = team.replace(".", "")
42 |         if 'the' in team:
43 |             team = team.replace("the-", "")
44 |         if '&' in team:
45 |             team = team.replace("&", "")
46 |         if '(' in team and ')' in team:
47 |             team = team.replace("(", "")
48 |             team = team.replace(")", "")
49 |         if "'" in team:
50 |             team = team.replace("'", "")
51 |         teams_save.append(team)
52 |     return teams_save
53 | 
54 | def alter_string(team): #normalize a single team name (same rules as get_teams_year)
55 |     team = team.replace(' ', '-').lower()
56 |     if '.' in team:
57 |         team = team.replace(".", "")
58 |     if 'the' in team:
59 |         team = team.replace("the-", "")
60 |     if '&' in team:
61 |         team = team.replace("&", "")
62 |     if '(' in team and ')' in team:
63 |         team = team.replace("(", "")
64 |         team = team.replace(")", "")
65 |     if "'" in team:
66 |         team = team.replace("'", "")
67 |     return team
68 | def get_latest_srs(team):
69 |     sleep(4)
70 |     url_srs = f'https://www.sports-reference.com/cbb/schools/{team}/men/2024-schedule.html'
71 |     hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
72 |     req_1 = Request(url_srs,headers=hdr)
73 |     html_1 = request.urlopen(req_1)
74 |     soup_3 = BeautifulSoup(html_1, "html.parser")
75 |     table3 = soup_3.find(id='div_schedule')
76 |     tbody2 = table3.find('tbody')
77 |     tr_body2 = tbody2.find_all('tr')
78 |     srs = []
79 |     for trb in tr_body2:
80 |         for td in trb.find_all('td'):
81 |             if td.get('data-stat') == "srs" and td.get_text() != '': #skip rows with no SRS value yet
82 |                 srs.append(td.get_text())
83 |     return float(srs[-1])
84 | 
85 | def get_adv_opp_variables(team,parsed_date):
86 |     date_without_time = parsed_date.strftime('%Y-%m-%d')
87 |     sleep(3)
88 |     url ='https://www.sports-reference.com/cbb/schools/' + team + '/' + str(2024) + '-gamelogs-advanced.html'
89 |     hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
90 |     req_1 = Request(url,headers=hdr)
91 |     html_1 = request.urlopen(req_1)
92 |     soup_2 = BeautifulSoup(html_1, "html.parser")
93 |     table2 = soup_2.find(id="all_sgl-advanced")
94 |     tbody2 = table2.find('tbody')
95 |     tr_body2 = tbody2.find_all('tr')
96 |     # off_rtg, def_rtg = [], []
97 |     efg_pct = None
98 |     print(f'team they played: {team}')
99 |     for trb in tr_body2: #only the row whose date matches parsed_date is read; other rows are skipped
100 |         for td in trb.find_all('td'):
101 |             if td.get('data-stat') == 'date':
102 |                 if td.get_text() == date_without_time:
103 |                     continue
104 |                 else:
105 |                     break
106 |             if td.get('data-stat') == "efg_pct":
107 |                 efg_pct = td.get_text()
108 |     return efg_pct
109 | 
110 | def html_to_df_web_scrape_cbb(URL,URL1,team,year):
111 |     #URL = Basic data ; URL1 = Advanced stats
112 |     url_srs = f'https://www.sports-reference.com/cbb/schools/{team}/men/{year}-schedule.html'
113 |     hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
114 |     req_1 = Request(URL,headers=hdr)
115 |     html_1 = request.urlopen(req_1)
116 |     sleep(4)
117 |     req_2 = Request(URL1,headers=hdr)
118 |     html_2 = request.urlopen(req_2)
119 |     sleep(4)
120 |     req_3 = Request(url_srs,headers=hdr)
121 |     html_3 = request.urlopen(req_3)
122 |     # while True:
123 |     #     try:
124 |     soup_1 = BeautifulSoup(html_1, "html.parser")
125 |     soup_2 = BeautifulSoup(html_2, "html.parser")
126 |     soup_3 = BeautifulSoup(html_3, "html.parser")
127 |     # page = requests.get(URL)
128 |     # soup = BeautifulSoup(page.content, "html.parser")
129 |     # page1 = requests.get(URL1)
130 |     # soup1 = BeautifulSoup(page1.content, "html.parser")
131 |     #         break
132 |     #     except:
133 |     #
print('HTTPSConnectionPool(host="www.sports-reference.com", port=443): Max retries exceeded. Retry in 10 seconds') 134 | # sleep(10) 135 | # table = soup_1.find(id="all_sgl-basic") 136 | table = soup_1.select_one('table[id^="sgl-basic"]') 137 | table1 = soup_2.find(id="all_sgl-advanced") 138 | table3 = soup_3.find(id='div_schedule') 139 | tbody = table.find('tbody') 140 | tbody1 = table1.find('tbody') 141 | tbody2 = table3.find('tbody') 142 | tr_body = tbody.find_all('tr') 143 | tr_body1 = tbody1.find_all('tr') 144 | tr_body2 = tbody2.find_all('tr') 145 | # game_season = [] 146 | # date_game = [] 147 | # game_location = [] 148 | # opp_id= [] 149 | # BASIC STATS 150 | game_result= [] 151 | pts= [] 152 | opp_pts= [] 153 | fg= [] 154 | fga= [] 155 | fg_pct= [] 156 | fg3= [] 157 | fg3a= [] 158 | fg3_pct= [] 159 | ft= [] 160 | fta= [] 161 | ft_pct= [] 162 | orb= [] 163 | total_board= [] 164 | ast= [] 165 | stl= [] 166 | blk= [] 167 | tov= [] 168 | pf= [] 169 | opp_fg = [] 170 | opp_fga= [] 171 | opp_fg_pct= [] 172 | opp_fg3= [] 173 | opp_fg3a= [] 174 | opp_fg3_pct= [] 175 | opp_ft= [] 176 | opp_fta= [] 177 | opp_ft_pct= [] 178 | opp_orb= [] 179 | opp_trb= [] 180 | opp_ast= [] 181 | opp_stl= [] 182 | opp_blk= [] 183 | opp_tov= [] 184 | opp_pf= [] 185 | game_loc = [] 186 | srs = [] 187 | date_save = [] 188 | efg_percent_opp = [] 189 | # opp_srs = [] 190 | #SIMPLE RATING SYSTEM 191 | # teams_sports_ref = read_csv('teams_sports_ref_format.csv') 192 | for trb in tr_body2: 193 | for td in trb.find_all('td'): 194 | # if td.get('data-stat') == 'opp_name': 195 | # get_close_matches(td.get_text(),teams_sports_ref['teams'].tolist(),n=1)[0] 196 | # print(td.get_text()) 197 | if td.get('data-stat') == "srs": 198 | if td.get_text() == '': 199 | srs.append(nan) 200 | else: 201 | srs.append(td.get_text()) 202 | #SIMPLE RATING SYSTEM - OPPONENT ? 203 | #BASIC STATS - change td.get_text() to float(td.get_text()) ? 
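    #The loop below walks every <td> in the basic game log and routes each cell's text
    #into the matching list via its data-stat attribute. A more compact, behavior-equivalent
    #pattern would be a dispatch dict (sketch only; 'stat_lists' is a hypothetical name):
    #    stat_lists = {'pts': pts, 'opp_pts': opp_pts, 'fg': fg, 'fga': fga}
    #    for td in trb.find_all('td'):
    #        stat = td.get('data-stat')
    #        if stat in stat_lists:
    #            stat_lists[stat].append(td.get_text())
    #game_location, game_result, and date would still need their special-case branches.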
204 | for trb in tr_body: 205 | for td in trb.find_all('td'): 206 | if td.get('data-stat') == "game_location": 207 | #home = 0, away = 1, N = 2 208 | if td.get_text() == 'N': 209 | game_loc.append(2) 210 | elif td.get_text() == '@': 211 | game_loc.append(1) 212 | elif td.get_text() == '': 213 | game_loc.append(0) 214 | if td.get('data-stat') == "game_result": 215 | if 'W' in td.get_text(): 216 | game_result.append(1) 217 | else: 218 | game_result.append(0) 219 | if td.get('data-stat') == "date": 220 | parsed_date = datetime.strptime(td.get_text(), '%Y-%m-%d') 221 | month = parsed_date.month 222 | day = parsed_date.day 223 | date_save.append(float(f'{month}.{day}')) 224 | #TODO: FIX THIS IN THE FUTURE TO ADD OPPONENT VARIABLES 225 | # if td.get('data-stat') == "opp_team_id": 226 | # opp_name = alter_string(td.get_text()) 227 | # try: 228 | # efg_percent_opp.append(get_adv_opp_variables(opp_name,parsed_date)) 229 | # except: 230 | # print(f'no advanced data for {opp_name}, advanced opponent variables are None') 231 | # efg_percent_opp.append(nan) 232 | if td.get('data-stat') == "pts": 233 | pts.append(td.get_text()) 234 | if td.get('data-stat') == "opp_pts": 235 | opp_pts.append(td.get_text()) 236 | if td.get('data-stat') == "fg": 237 | fg.append(td.get_text()) 238 | if td.get('data-stat') == "fga": 239 | fga.append(td.get_text()) 240 | if td.get('data-stat') == "fg_pct": 241 | fg_pct.append(td.get_text()) 242 | if td.get('data-stat') == "fg3": 243 | fg3.append(td.get_text()) 244 | if td.get('data-stat') == "fg3a": 245 | fg3a.append(td.get_text()) 246 | if td.get('data-stat') == "fg3_pct": 247 | fg3_pct.append(td.get_text()) 248 | if td.get('data-stat') == "ft": 249 | ft.append(td.get_text()) 250 | if td.get('data-stat') == "fta": 251 | fta.append(td.get_text()) 252 | if td.get('data-stat') == "ft_pct": 253 | ft_pct.append(td.get_text()) 254 | if td.get('data-stat') == "orb": 255 | orb.append(td.get_text()) 256 | if td.get('data-stat') == "trb": 257 | total_board.append(td.get_text()) 258 | if td.get('data-stat') == "ast": 259 | ast.append(td.get_text()) 260 | if td.get('data-stat') == "stl": 261 | stl.append(td.get_text()) 262 | if td.get('data-stat') == "blk": 263 | blk.append(td.get_text()) 264 | if td.get('data-stat') == "tov": 265 | tov.append(td.get_text()) 266 | if td.get('data-stat') == "pf": 267 | pf.append(td.get_text()) 268 | if td.get('data-stat') == "opp_fg": 269 | opp_fg.append(td.get_text()) 270 | if td.get('data-stat') == "opp_fga": 271 | opp_fga.append(td.get_text()) 272 | if td.get('data-stat') == "opp_fg_pct": 273 | opp_fg_pct.append(td.get_text()) 274 | if td.get('data-stat') == "opp_fg3": 275 | opp_fg3.append(td.get_text()) 276 | if td.get('data-stat') == "opp_fg3a": 277 | opp_fg3a.append(td.get_text()) 278 | if td.get('data-stat') == "opp_fg3_pct": 279 | opp_fg3_pct.append(td.get_text()) 280 | if td.get('data-stat') == "opp_ft": 281 | opp_ft.append(td.get_text()) 282 | if td.get('data-stat') == "opp_fta": 283 | opp_fta.append(td.get_text()) 284 | if td.get('data-stat') == "opp_ft_pct": 285 | opp_ft_pct.append(td.get_text()) 286 | if td.get('data-stat') == "opp_orb": 287 | opp_orb.append(td.get_text()) 288 | if td.get('data-stat') == "opp_trb": 289 | opp_trb.append(td.get_text()) 290 | if td.get('data-stat') == "opp_ast": 291 | opp_ast.append(td.get_text()) 292 | if td.get('data-stat') == "opp_stl": 293 | opp_stl.append(td.get_text()) 294 | if td.get('data-stat') == "opp_blk": 295 | opp_blk.append(td.get_text()) 296 | if td.get('data-stat') == "opp_tov": 297 | 
opp_tov.append(td.get_text()) 298 | if td.get('data-stat') == "opp_pf": 299 | opp_pf.append(td.get_text()) 300 | #ADVANCED STATS 301 | off_rtg = [] 302 | def_rtg = [] 303 | off_rtg_opp = [] 304 | def_rtg_opp = [] 305 | pace = [] 306 | fta_per_fga_pct = [] 307 | fg3a_per_fga_pct = [] 308 | ts_pct = [] 309 | trb_pct = [] 310 | ast_pct = [] 311 | stl_pct = [] 312 | blk_pct = [] 313 | efg_pct = [] 314 | tov_pct = [] 315 | orb_pct = [] 316 | ft_rate = [] 317 | opp_efg_pct= [] 318 | opp_tov_pct = [] 319 | drb_pct = [] 320 | opp_ft_rate = [] 321 | for trb in tr_body1: 322 | for td in trb.find_all('td'): 323 | if td.get('data-stat') == "off_rtg": 324 | off_rtg.append(td.get_text()) 325 | def_rtg_opp.append(td.get_text()) 326 | if td.get('data-stat') == "def_rtg": 327 | off_rtg_opp.append(td.get_text()) 328 | def_rtg.append(td.get_text()) 329 | if td.get('data-stat') == "pace": 330 | pace.append(td.get_text()) 331 | if td.get('data-stat') == "fta_per_fga_pct": 332 | fta_per_fga_pct.append(td.get_text()) 333 | if td.get('data-stat') == "fg3a_per_fga_pct": 334 | fg3a_per_fga_pct.append(td.get_text()) 335 | if td.get('data-stat') == "ts_pct": 336 | ts_pct.append(td.get_text()) 337 | if td.get('data-stat') == "trb_pct": 338 | trb_pct.append(td.get_text()) 339 | if td.get('data-stat') == "ast_pct": 340 | ast_pct.append(td.get_text()) 341 | if td.get('data-stat') == "stl_pct": 342 | stl_pct.append(td.get_text()) 343 | if td.get('data-stat') == "blk_pct": 344 | blk_pct.append(td.get_text()) 345 | if td.get('data-stat') == "efg_pct": 346 | efg_pct.append(td.get_text()) 347 | if td.get('data-stat') == "tov_pct": 348 | tov_pct.append(td.get_text()) 349 | if td.get('data-stat') == "orb_pct": 350 | orb_pct.append(td.get_text()) 351 | if td.get('data-stat') == "ft_rate": 352 | ft_rate.append(td.get_text()) 353 | if td.get('data-stat') == "opp_efg_pct": 354 | opp_efg_pct.append(td.get_text()) 355 | if td.get('data-stat') == "opp_tov_pct": 356 | opp_tov_pct.append(td.get_text()) 357 | if td.get('data-stat') == "drb_pct": 358 | drb_pct.append(td.get_text()) 359 | if td.get('data-stat') == "opp_ft_rate": 360 | opp_ft_rate.append(td.get_text()) 361 | return DataFrame(list(zip(game_result,pts,opp_pts,fg,fga, 362 | fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,total_board,ast, 363 | stl,blk,tov,pf,opp_fg,opp_fga,opp_fg_pct,opp_fg3,opp_fg3a,opp_fg3_pct, 364 | opp_ft,opp_fta,opp_ft_pct,opp_orb,opp_trb,opp_ast,opp_stl,opp_blk,opp_tov, 365 | opp_pf, off_rtg,def_rtg,pace,fta_per_fga_pct,fg3a_per_fga_pct,ts_pct, 366 | trb_pct,ast_pct,stl_pct,blk_pct,efg_pct,tov_pct,orb_pct,ft_rate,opp_efg_pct, 367 | opp_tov_pct,drb_pct,opp_ft_rate,game_loc,srs,date_save, 368 | off_rtg_opp,def_rtg_opp)), 369 | columns =['game_result','pts','opp_pts','fg','fga', 370 | 'fg_pct','fg3','fg3a','fg3_pct','ft','fta','ft_pct','orb','total_board','ast', 371 | 'stl','blk','tov','pf','opp_fg','opp_fga','opp_fg_pct','opp_fg3','opp_fg3a','opp_fg3_pct', 372 | 'opp_ft','opp_fta','opp_ft_pct','opp_orb','opp_trb','opp_ast','opp_stl','opp_blk','opp_tov', 373 | 'opp_pf','off_rtg','def_rtg','pace','fta_per_fga_pct','fg3a_per_fga_pct','ts_pct', 374 | 'trb_pct','ast_pct','stl_pct','blk_pct','efg_pct','tov_pct','orb_pct','ft_rate','opp_efg_pct', 375 | 'opp_tov_pct','drb_pct','opp_ft_rate','game_loc','simple_rating_system','date_played', 376 | 'opp_off_rtg','opp_def_rtg']) 377 | 378 | def get_espn(URL,team_1,team_2): 379 | team_1 = create_acr(team_1) 380 | team_2 = create_acr(team_2) 381 | # URL = "https://www.espn.com/mens-college-basketball/schedule/_/date/20230131" 
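    #Flow of get_espn: fetch the ESPN schedule page for the given date, locate the table row
    #whose team link contains team_1's acronym, follow that row's game link, then read
    #ESPN's matchup-predictor percentage for each side out of the game page's HTML
    #(the "left-0 top-0" element is the away team).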
382 | hdr = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"} 383 | req_1 = Request(URL,headers=hdr) 384 | html_1 = request.urlopen(req_1) 385 | soup_1 = BeautifulSoup(html_1, "html.parser") 386 | table = soup_1.find(class_="ResponsiveTable") 387 | table1 = table.find(class_="Table__Scroller") 388 | table2 = table.find(class_="Table") 389 | table3 = table.find(class_="Table__TBODY") 390 | for td in table3.find_all(class_="Table__TR Table__TR--sm Table__even"): 391 | try: 392 | #Get team names 393 | inst = td.find(class_="events__col Table__TD") 394 | href_team = inst.find(class_="AnchorLink").get("href") 395 | if team_1 in href_team: 396 | #Get game link 397 | inst = td.find(class_="date__col Table__TD") 398 | href_val = inst.find(class_="AnchorLink").get("href") 399 | game = "https://www.espn.com" + href_val 400 | req_second = Request(game,headers=hdr) 401 | html_second = request.urlopen(req_second) 402 | soup_second = BeautifulSoup(html_second, "html.parser") 403 | #Team 1 - left-0 top-0 = Away 404 | team_1_predict = soup_second.find(class_="matchupPredictor__teamValue matchupPredictor__teamValue--b left-0 top-0 flex items-baseline absolute copy") 405 | start = '>' 406 | end = "