├── .gitignore
├── requirements.txt
├── README.md
└── scraper.py

/.gitignore:
--------------------------------------------------------------------------------
/__pycache__
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy==1.23.5
pandas==1.5.3
matplotlib==3.7.0
seaborn==0.12.1
kaggle==1.5.13
statsmodels==0.13.5
scikit-learn==1.2.1
requests
beautifulsoup4
lxml
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# What makes a good midfielder?

## About
I started this analysis to find out which stats matter most for a midfielder. It turned out to be less straightforward than expected: first I needed an objective way to quantify "good" players. I settled on ratings and market values, but since I could not get my hands on player-rating data, I ended up using market values only. I explored the relationship between market value and age and used it to build a normalised metric: market value corrected for age. With this metric, I then set out to find which stats matter most for a midfielder.

## Methods
I used polynomial regression to model the relationship between market value and age, and turned the results into a normalised, age-corrected metric. I then used multiple linear regression to relate the stats to this metric, controlled the false discovery rate (FDR), and kept the most significant stats. Because of collinearity among the stats, I repeated the regression one last time with only a handful of them. A rough sketch of the pipeline is shown below.
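A minimal sketch of that pipeline (illustrative only, not the notebook's exact code; `df`, `age`, `market_value`, `stat_cols` and `value_vs_age` are hypothetical names, and the polynomial degree of 2 is just an example):
```
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests

# 1. Polynomial regression of market value on age (degree 2 as an illustration)
age_design = np.vander(df['age'].to_numpy(), N=3, increasing=True)  # columns: 1, age, age^2
age_fit = sm.OLS(df['market_value'], age_design).fit()

# 2. Age-corrected metric: how far a player's value sits above or below the age curve
df['value_vs_age'] = df['market_value'] - age_fit.predict(age_design)

# 3. Multiple linear regression of the metric on the stats
X = sm.add_constant(df[stat_cols])
fit = sm.OLS(df['value_vs_age'], X).fit()

# 4. Benjamini-Hochberg FDR control over the coefficient p-values
reject, _, _, _ = multipletests(fit.pvalues.drop('const'), alpha=0.05, method='fdr_bh')
significant = [c for c, r in zip(stat_cols, reject) if r]
```
Using the residual from the age curve as the metric means each player is compared with the expected market value for their age, rather than with the pool as a whole.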
## Data
I used market values from [Transfermarkt](https://www.transfermarkt.com/), acquired from [Kaggle](https://www.kaggle.com/datasets/davidcariboo/player-scores), and player stats from [FBref](https://fbref.com), using a [scraper](https://github.com/NikosKont/FBref-Scraper) I wrote myself.
The data do not need to be downloaded in advance: the scraper is included in the notebook and the data are fetched automatically if they don't already exist.

## Requirements
This notebook was written in Python 3.9.7.
The required packages and versions I used are:
```
numpy==1.23.5
pandas==1.5.3
matplotlib==3.7.0
seaborn==0.12.1
kaggle==1.5.13
statsmodels==0.13.5
scikit-learn==1.2.1
```
(also included in requirements.txt, together with `requests`, `beautifulsoup4` and `lxml`, which the scraper needs)
--------------------------------------------------------------------------------
/scraper.py:
--------------------------------------------------------------------------------
import re

import requests
import pandas as pd
from bs4 import BeautifulSoup as bs


stats_list = ('standard', 'keepers', 'keepersadv', 'shooting', 'passing', 'passing_types',
              'gca', 'defense', 'possession', 'playingtime', 'misc')


def available_stats():
    """ Returns the available stats to scrape """
    return stats_list


def get_player_stats_from_URL(url: str, stat: str):
    """ Get player stats from FBref.com, using the given URL
    url: the url to get the stats from
    stat: the stat to get, must be one of the available stats

    returns: pandas dataframe of the stats
    """
    if stat not in stats_list:
        raise ValueError(f'stat must be one of {stats_list}')

    table = _get_table_from_URL(url, stat)
    df = _get_dataframe(table)
    return df


def get_player_stats(stat: str, compid: str):
    """ Get player stats from FBref.com, URL is derived from the arguments
    stat: the stat to get, must be one of the available stats
    compid: the competition id, can be found in the url of the competition

    returns: pandas dataframe of the stats
    """
    if stat == 'standard':
        url = f'https://fbref.com/en/comps/{compid}/stats/'
    else:
        url = f'https://fbref.com/en/comps/{compid}/{stat}/'

    if compid == 'Big5':
        url += 'players/Big-5-European-Leagues-Stats/'

    df = get_player_stats_from_URL(url, stat)

    return df


def _get_table_from_URL(url, stat):
    print(f'Getting data from {url}...')
    res = requests.get(url, timeout=10)
    res.raise_for_status()

    # FBref hides some tables inside HTML comments; strip the comment
    # markers so BeautifulSoup can see them
    comm = re.compile('<!--|-->')
    soup = bs(comm.sub('', res.text), 'lxml')
    table = soup.find('div', {'id': f'div_stats_{stat}'})
    print('Done.')
    return table


def _get_dataframe(table):
    df = pd.read_html(str(table))
    df = df[0]

    # delete the first and last column (Rk, Matches)
    df = df.iloc[:, 1:-1]

    # the table has a two-level header; keep only the second level
    df.columns = [h[1] for h in df.columns]

    # 'Nation' comes as e.g. 'eng ENG'; keep only the part after the space
    df['Nation'] = df['Nation'].apply(lambda x: str(x).rsplit(' ', maxsplit=1)[-1])

    # delete the repeated header rows embedded in the table body
    df = df[df[df.columns[0]] != df.columns[0]]
    df.reset_index(drop=True, inplace=True)

    # convert all numeric columns to numeric
    # (errors='ignore' works on the pinned pandas 1.5.3, but is deprecated in pandas >= 2)
    df = df.apply(pd.to_numeric, errors='ignore')

    return df
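
# A quick usage sketch (added for illustration; not part of the original module).
# 'Big5' makes get_player_stats() build the Big-5-European-Leagues URL handled above.
if __name__ == '__main__':
    passing = get_player_stats('passing', 'Big5')
    print(passing.head())
--------------------------------------------------------------------------------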