├── .idea ├── encodings.xml ├── vcs.xml ├── .gitignore ├── inspectionProfiles │ └── profiles_settings.xml ├── modules.xml ├── KenPom.iml └── misc.xml ├── README.md └── base.py /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/KenPom.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | ApexVCS 8 | 9 | 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # KenPom 2 | 3 | A quick and dirty way to pull 
various cuts of data from the incredible NCAA Basketball advanced stats website KenPom. 4 | 5 | What does it do? 6 | 7 | (1) Quickly scrape the entire main page at KenPom into a Pandas DataFrame. 8 | ``` 9 | kenny = KenPom(start=2002, end=2003) 10 | kenny.df 11 | 12 | idx Rk Team Conf Record ... OppO OppD NCOS AdjEM Season 13 | 0 1.0 Kentucky SEC 32-4 ... 108.6 97.4 6.77 2003 14 | 1 2.0 Kansas B12 30-8 ... 108.6 96.8 6.07 2003 15 | 2 3.0 Pittsburgh BE 28-5 ... 105.5 98.4 -8.24 2003 16 | 3 4.0 Arizona P10 28-4 ... 107.2 98.5 8.19 2003 17 | 4 5.0 Illinois B10 25-7 ... 105.8 98.5 -4.18 2003 18 | ``` 19 | 20 | (2) Pull a specific team's historical main page stats 21 | 22 | ``` 23 | kenny = KenPom(start=2002, end=2003) 24 | kenny.team('Duke') 25 | 26 | idx Rk Team Conf Record AdjEM ... SoS AdjEM OppO OppD NCOS AdjEM Season 27 | 5 6.0 Duke ACC 26-7 23.75 ... 8.85 107.1 98.3 0.64 2003 28 | 0 1.0 Duke ACC 31-4 34.19 ... 9.87 109.1 99.2 6.66 2002 29 | ``` 30 | 31 | (3) Pull a specific conference's historical main page stats 32 | 33 | ``` 34 | kenny = KenPom(start=2002, end=2003) 35 | kenny.conference('WAC') 36 | 37 | idx Rk Team Conf Record ... OppO OppD NCOS AdjEM Season 38 | 52 51.0 Tulsa WAC 23-10 ... 102.7 101.7 4.16 2003 39 | 86 83.0 Nevada WAC 17-14 ... 103.2 100.9 4.98 2003 40 | 101 98.0 Fresno St. WAC 20-8 ... 101.3 102.1 -1.87 2003 41 | 115 112.0 Rice WAC 17-10 ... 101.0 101.0 -0.21 2003 42 | 118 115.0 Hawaii WAC 19-12 ... 103.4 102.1 -1.42 2003 43 | ``` 44 | 45 | (4) Hone in on a single season across all teams 46 | 47 | ``` 48 | kenny = KenPom(start=2002, end=2020) 49 | kenny.season(2002) 50 | 51 | idx Rk Team Conf Record ... OppO OppD NCOS AdjEM Season 52 | 0 1.0 Duke ACC 31-4 ... 109.1 99.2 6.66 2002 53 | 1 2.0 Cincinnati CUSA 31-4 ... 106.3 99.7 3.48 2002 54 | 2 3.0 Maryland ACC 32-4 ... 109.1 99.3 1.62 2002 55 | 3 4.0 Kansas B12 33-4 ... 110.3 99.6 8.32 2002 56 | 4 5.0 Oklahoma B12 31-5 ... 
import re

import numpy
import pandas as pd


class KenPom:
    """Scrape KenPom's main ratings table for a span of seasons and query it.

    Constructing an instance downloads one page per season from ``start``
    through ``end`` (both inclusive) and stores the combined result in
    ``self.df``. The remaining public methods are filters over that frame.

    Example::

        kenny = KenPom(start=2002, end=2003)
        kenny.team('Duke')
    """

    BASE_URL = 'https://kenpom.com/index.php?y='

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 30

    # Names given to the 21 cells of every data row. The single-letter names
    # are placeholders for cells that are discarded right after parsing.
    RAW_COLUMNS = ["Rk", "Team", "Conf", "Record", "AdjEM", "AdjO", "a", "AdjD", "b", "AdjT", "c", "Luck", "d",
                   "SoS AdjEM", "e", "OppO", "f", "OppD", "g", "NCOS AdjEM", "h"]
    FILLER_COLUMNS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']

    # Columns converted from scraped text to float.
    NUMERIC_COLUMNS = ['Rk', 'AdjEM', 'AdjO', 'AdjD', 'AdjT', 'Luck',
                       'SoS AdjEM', 'OppO', 'OppD', 'NCOS AdjEM']

    def __init__(self, start=2002, end=2003):
        """Store the season range and scrape it immediately.

        :param start: first season to include
        :param end: last season to include (must not be earlier than ``start``)
        :raises ValueError: if ``start`` is greater than ``end``
        """
        self.start = start
        self.end = end
        self.df = self.scrape_ken()

    def scrape_ken(self):
        """
        :return: DataFrame of KenPom's complete College Rankings / Advanced Stats,
                 one row per team per season, newest season first
        :raises ValueError: if ``self.start`` is greater than ``self.end`` or a
                 page contains no table
        """
        # Fail before any network traffic; otherwise an empty range would only
        # surface as pandas' "No objects to concatenate".
        if self.start > self.end:
            raise ValueError(
                'start ({}) must not be greater than end ({})'.format(self.start, self.end)
            )

        main_pom = [
            self._build_frame(self._fetch_rows(season), season)
            for season in range(self.end, self.start - 1, -1)
        ]

        final_ken = pd.concat(main_pom)
        # Table rows that did not yield a full set of <td> cells come through
        # with missing values and are removed here.
        final_ken.dropna(axis=0, inplace=True)

        # Strip digits from the team name (presumably a tournament seed
        # appended to the name on the page -- confirm against the site).
        final_ken['Team'] = [re.sub(r'\d+', '', x).strip() for x in final_ken['Team']]

        return final_ken

    def _fetch_rows(self, season):
        """Download one season's page and return the cell texts of each table row."""
        # Imported lazily: only scraping needs these third-party packages, so
        # the filter helpers stay usable without them installed.
        import requests
        from bs4 import BeautifulSoup

        url = self.BASE_URL + str(season)
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Surface HTTP errors directly instead of trying to parse an error page.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'lxml')
        table = soup.table
        if table is None:
            raise ValueError('No table found at ' + url)

        return [[cell.text for cell in tr.find_all('td')] for tr in table.find_all('tr')]

    @classmethod
    def _build_frame(cls, team_rows, season):
        """Turn one season's raw row texts into a typed DataFrame tagged with the season."""
        # The first two rows are skipped (presumably header rows -- confirm
        # against the page layout).
        ken_pom = pd.DataFrame(team_rows[2:], columns=cls.RAW_COLUMNS)
        ken_pom.drop(cls.FILLER_COLUMNS, axis=1, inplace=True)

        for column in cls.NUMERIC_COLUMNS:
            ken_pom[column] = ken_pom[column].astype('float')
        ken_pom['Season'] = season

        return ken_pom

    def team(self, team):
        """Return the rows of ``self.df`` whose ``Team`` equals ``str(team)``."""
        return self.df[self.df['Team'] == str(team)]

    def team_record(self, team):
        """Return a team's wins and losses per season.

        :param team: team name, matched exactly against the ``Team`` column
        :return: DataFrame with integer columns ``Season``, ``Wins`` and
                 ``Losses``, parsed from the ``Record`` strings (``"W-L"``);
                 empty if the team is not present
        """
        df = self.team(team)

        raw_record = [x.split("-") for x in df['Record']]

        return pd.DataFrame({
            'Season': [int(x) for x in df['Season']],
            'Wins': [int(x[0]) for x in raw_record],
            'Losses': [int(x[1]) for x in raw_record],
        })

    def conference(self, conference):
        """Return the rows of ``self.df`` whose ``Conf`` equals ``str(conference)``."""
        return self.df[self.df['Conf'] == str(conference)]

    def season(self, season):
        """Return the rows of ``self.df`` whose ``Season`` equals ``int(season)``."""
        return self.df[self.df['Season'] == int(season)]