├── .idea
├── encodings.xml
├── vcs.xml
├── .gitignore
├── inspectionProfiles
│ └── profiles_settings.xml
├── modules.xml
├── KenPom.iml
└── misc.xml
├── README.md
└── base.py
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/KenPom.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | ApexVCS
8 |
9 |
10 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # KenPom
2 |
3 | A quick and dirty way to pull various cuts of data from the incredible NCAA Basketball advanced stats website KenPom.
4 |
5 | What does it do?
6 |
7 | (1) Quickly scrape the entire main page at KenPom into a Pandas DataFrame.
8 | ```
9 | kenny = KenPom(start=2002, end=2003)
10 | kenny.df
11 |
12 | idx Rk Team Conf Record ... OppO OppD NCOS AdjEM Season
13 | 0 1.0 Kentucky SEC 32-4 ... 108.6 97.4 6.77 2003
14 | 1 2.0 Kansas B12 30-8 ... 108.6 96.8 6.07 2003
15 | 2 3.0 Pittsburgh BE 28-5 ... 105.5 98.4 -8.24 2003
16 | 3 4.0 Arizona P10 28-4 ... 107.2 98.5 8.19 2003
17 | 4 5.0 Illinois B10 25-7 ... 105.8 98.5 -4.18 2003
18 | ```
19 |
20 | (2) Pull a specific team's historical main page stats
21 |
22 | ```
23 | kenny = KenPom(start=2002, end=2003)
24 | kenny.team('Duke')
25 |
26 | idx Rk Team Conf Record AdjEM ... SoS AdjEM OppO OppD NCOS AdjEM Season
27 | 5 6.0 Duke ACC 26-7 23.75 ... 8.85 107.1 98.3 0.64 2003
28 | 0 1.0 Duke ACC 31-4 34.19 ... 9.87 109.1 99.2 6.66 2002
29 | ```
30 |
31 | (3) Pull a specific conference's historical main page stats
32 |
33 | ```
34 | kenny = KenPom(start=2002, end=2003)
35 | kenny.conference('WAC')
36 |
37 | idx Rk Team Conf Record ... OppO OppD NCOS AdjEM Season
38 | 52 51.0 Tulsa WAC 23-10 ... 102.7 101.7 4.16 2003
39 | 86 83.0 Nevada WAC 17-14 ... 103.2 100.9 4.98 2003
40 | 101 98.0 Fresno St. WAC 20-8 ... 101.3 102.1 -1.87 2003
41 | 115 112.0 Rice WAC 17-10 ... 101.0 101.0 -0.21 2003
42 | 118 115.0 Hawaii WAC 19-12 ... 103.4 102.1 -1.42 2003
43 | ```
44 |
45 | (4) Home in on a single season across all teams
46 |
47 | ```
48 | kenny = KenPom(start=2002, end=2020)
49 | kenny.season(2002)
50 |
51 | idx Rk Team Conf Record ... OppO OppD NCOS AdjEM Season
52 | 0 1.0 Duke ACC 31-4 ... 109.1 99.2 6.66 2002
53 | 1 2.0 Cincinnati CUSA 31-4 ... 106.3 99.7 3.48 2002
54 | 2 3.0 Maryland ACC 32-4 ... 109.1 99.3 1.62 2002
55 | 3 4.0 Kansas B12 33-4 ... 110.3 99.6 8.32 2002
56 | 4 5.0 Oklahoma B12 31-5 ... 109.0 100.2 -0.45 2002
57 | ```
--------------------------------------------------------------------------------
/base.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import re
4 | import numpy
5 | import pandas as pd
6 |
7 |
class KenPom:
    """
    Scrape the KenPom main ratings page for a range of seasons.

    One HTTP request is made per season at construction time. The combined
    table is stored on ``self.df``; the remaining public methods are filters
    over that DataFrame.
    """

    # Names assigned to the 21 cells of each table row. The single-letter
    # names are placeholders for the extra cell that follows each stat; those
    # columns are discarded right after parsing.
    _RAW_COLUMNS = ["Rk", "Team", "Conf", "Record", "AdjEM", "AdjO", "a", "AdjD", "b", "AdjT", "c", "Luck", "d",
                    "SoS AdjEM", "e", "OppO", "f", "OppD", "g", "NCOS AdjEM", "h"]
    _PLACEHOLDER_COLUMNS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
    _NUMERIC_COLUMNS = ['Rk', 'AdjEM', 'AdjO', 'AdjD', 'AdjT', 'Luck',
                        'SoS AdjEM', 'OppO', 'OppD', 'NCOS AdjEM']

    def __init__(self, start=2002, end=2003, timeout=30):
        """
        :param start: first season to scrape (inclusive)
        :param end: last season to scrape (inclusive)
        :param timeout: seconds to wait on each HTTP request before giving up
        :raises ValueError: if ``start`` is greater than ``end``
        """
        self.start = start
        self.end = end
        self.timeout = timeout
        self.df = self.scrape_ken()

    def scrape_ken(self):
        """
        Download and combine the ratings table of every season in
        ``[self.start, self.end]``, newest season first.

        :return: DataFrame of KenPom's complete College Rankings / Advanced Stats
        :raises ValueError: if ``self.start`` is greater than ``self.end``
        """
        # Fail early with a clear message; otherwise pd.concat would be handed
        # an empty list and raise a much less helpful ValueError.
        if self.start > self.end:
            raise ValueError(
                'start season (%s) must not be greater than end season (%s)' % (self.start, self.end)
            )

        main_pom = [self._scrape_season(season)
                    for season in range(self.end, self.start - 1, -1)]

        final_ken = pd.concat(main_pom)
        # Rows that produced no <td> cells come out as all-NaN; drop them.
        final_ken.dropna(axis=0, inplace=True)

        # Remove digits embedded in the team cell, then surrounding whitespace.
        final_ken['Team'] = [re.sub(r'\d+', '', x).strip() for x in final_ken['Team']]

        return final_ken

    def _scrape_season(self, season):
        """
        Fetch and parse the ratings table of a single season.

        :param season: season year, used in the URL and stored in the ``Season`` column
        :return: DataFrame with the stat columns cast to float plus ``Season``
        :raises requests.HTTPError: if the server answers with an error status
        :raises ValueError: if the page contains no ``<table>`` element
        """
        url = 'https://kenpom.com/index.php?y=' + str(season)
        # A timeout keeps a stalled connection from blocking forever.
        r = requests.get(url, timeout=self.timeout)
        # Do not try to parse an error page as if it were the ratings page.
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'lxml')

        table = soup.table
        if table is None:
            raise ValueError('no ratings table found at ' + url)

        team_rows = [[cell.text for cell in tr.find_all('td')]
                     for tr in table.find_all('tr')]

        # The first two rows are skipped, as in the original implementation
        # (presumably header rows -- confirm against the live page).
        ken_pom = pd.DataFrame(team_rows[2:], columns=self._RAW_COLUMNS)
        ken_pom.drop(self._PLACEHOLDER_COLUMNS, axis=1, inplace=True)

        for col in self._NUMERIC_COLUMNS:
            ken_pom[col] = ken_pom[col].astype('float')
        ken_pom['Season'] = season

        return ken_pom

    def team(self, team):
        """
        :param team: team name, compared exactly against the ``Team`` column
        :return: rows of ``self.df`` belonging to that team
        """
        return self.df[self.df['Team'] == str(team)]

    def team_record(self, team):
        """
        Split a team's ``Record`` strings into separate win and loss counts.

        :param team: team name, compared exactly against the ``Team`` column
        :return: DataFrame with integer columns ``Season``, ``Wins``, ``Losses``,
                 one row per matching row of ``self.df`` (empty if none match)
        """
        rows = self.df[self.df['Team'] == str(team)]

        # Record is formatted "<wins>-<losses>".
        raw_record = [x.split("-") for x in rows['Record']]

        return pd.DataFrame({
            'Season': [int(x) for x in rows['Season']],
            'Wins': [int(x[0]) for x in raw_record],
            'Losses': [int(x[1]) for x in raw_record],
        })

    def conference(self, conference):
        """
        :param conference: conference code, compared exactly against the ``Conf`` column
        :return: rows of ``self.df`` belonging to that conference
        """
        return self.df[self.df['Conf'] == str(conference)]

    def season(self, season):
        """
        :param season: season year (anything ``int()`` accepts)
        :return: rows of ``self.df`` for that season
        """
        return self.df[self.df['Season'] == int(season)]
88 |
--------------------------------------------------------------------------------