# NOTE: the lines replaced by this block come from a flattened repository
# dump. The non-Python content that shared those lines (directory tree,
# .gitignore, head of the next file) is preserved verbatim as comments.
#
# ├── .gitignore
# ├── FCPython.py
# ├── GW_38_Ratings.py
# ├── GW_38_Ratings_evaluation.py
# ├── KPI_functions.py
# ├── README.md
# ├── __pycache__
#     ├── FCPython.cpython-38.pyc
#     ├── KPI_functions.cpython-38.pyc
#     ├── fitting_functions.cpython-38.pyc
#     └── percentile_functions.cpython-38.pyc
# ├── create_KPI_dataframe.py
# ├── create_KPI_dataframe_EDIT.py
# ├── create_events_df_eu.py
# ├── fitting_functions.py
# ├── minutes_played.py
# ├── the_match_ranking.py
# ├── validation_vs_WhoScored.py
# └── xG_model_evaluation.py
#
# /.gitignore:
# --------------------------------------------------------------------------------
#  1 | # Ignore Mac system files
#  2 | .DS_store
#  3 |
#  4 | # Ignore node_modules folder
#  5 | #node_modules
#  6 |
#  7 | # Ignore files related to API keys
#  8 | #.env
#  9 |
# 10 | # Ignore SASS config files
# 11 | #.sass-cache
#
# --------------------------------------------------------------------------------
# /FCPython.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 25 17:32:00 2020

@author: davsu428
"""
import matplotlib.pyplot as plt
from matplotlib.patches import Arc


# Pitch markings per unit system.
#   penalty_depth    distance the penalty area extends into the pitch, and the
#                    distance from each goalpost to its side edge
#   goal_area_depth  same two distances for the goal area
#   goal_width       distance between the goalposts
#   circle_radius    radius of the centre circle and of the penalty arcs
#   penalty_spot     distance from the goal line to the penalty spot
#   arc_half_angle   half opening angle (degrees) of the penalty arc, i.e.
#                    roughly acos((penalty_depth - penalty_spot) / circle_radius)
_METER_MARKINGS = {
    "penalty_depth": 16.5,
    "goal_area_depth": 5.5,
    "goal_width": 7.32,
    "circle_radius": 9.15,
    "penalty_spot": 11,
    "arc_half_angle": 52,
}
_YARD_MARKINGS = {
    "penalty_depth": 18,
    "goal_area_depth": 6,
    "goal_width": 8,
    "circle_radius": 10,
    "penalty_spot": 12,
    "arc_half_angle": 53,
}
# Rounded values used by the legacy fixed-size pitch (40 wide penalty area,
# 18 wide goal area, which corresponds to a nominal goal width of 7).
_LEGACY_MARKINGS = {
    "penalty_depth": 16.5,
    "goal_area_depth": 5.5,
    "goal_width": 7,
    "circle_radius": 9.15,
    "penalty_spot": 11,
    "arc_half_angle": 50,
}


def _drawPitch(length, width, markings, linecolor):
    """Create a figure and draw a full pitch with the given markings.

    Parameters
    ----------
    length, width : pitch size (goal to goal, sideline to sideline).
    markings : dict with the keys described next to ``_METER_MARKINGS``.
    linecolor : matplotlib colour used for every line and patch.

    Returns
    -------
    (fig, ax) : the new figure and its single axes, with the axis hidden.
    """
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    mid_x = length / 2
    mid_y = width / 2

    # Pitch outline & centre line
    ax.plot([0, 0], [0, width], color=linecolor)
    ax.plot([0, length], [width, width], color=linecolor)
    ax.plot([length, length], [width, 0], color=linecolor)
    ax.plot([length, 0], [0, 0], color=linecolor)
    ax.plot([mid_x, mid_x], [0, width], color=linecolor)

    # Penalty areas, then goal areas. Both are three-sided boxes that start
    # on the goal line and reach `depth` into the pitch; their side edges lie
    # `depth` outside each goalpost, hence half-height = goal_width/2 + depth.
    half_goal = markings["goal_width"] / 2
    for depth in (markings["penalty_depth"], markings["goal_area_depth"]):
        half = half_goal + depth
        for goal_x, front_x in ((0, depth), (length, length - depth)):
            ax.plot([goal_x, front_x], [mid_y + half, mid_y + half], color=linecolor)
            ax.plot([front_x, front_x], [mid_y + half, mid_y - half], color=linecolor)
            ax.plot([front_x, goal_x], [mid_y - half, mid_y - half], color=linecolor)

    # Centre circle, centre spot and penalty spots
    radius = markings["circle_radius"]
    spot = markings["penalty_spot"]
    ax.add_patch(plt.Circle((mid_x, mid_y), radius, color=linecolor, fill=False))
    ax.add_patch(plt.Circle((mid_x, mid_y), 0.8, color=linecolor))
    ax.add_patch(plt.Circle((spot, mid_y), 0.8, color=linecolor))
    ax.add_patch(plt.Circle((length - spot, mid_y), 0.8, color=linecolor))

    # Penalty arcs: centred on the penalty spots, only the part outside the
    # penalty area is drawn.
    half_angle = markings["arc_half_angle"]
    ax.add_patch(Arc((spot, mid_y), height=2 * radius, width=2 * radius, angle=0,
                     theta1=360 - half_angle, theta2=half_angle, color=linecolor))
    ax.add_patch(Arc((length - spot, mid_y), height=2 * radius, width=2 * radius, angle=0,
                     theta1=180 - half_angle, theta2=180 + half_angle, color=linecolor))

    # Tidy axes
    ax.axis('off')

    return fig, ax


def createPitch(length,width, unity,linecolor): # in meters
    # Code by @JPJ_dejong
    """
    Create a plot of a football pitch.

    'length' is the length of the pitch (goal to goal) and 'width' is the
    width of the pitch (sideline to sideline). 'unity' selects the unit of
    both, either "meters" or "yards". 'linecolor' is the matplotlib colour
    used for all markings.

    Returns (fig, ax) on success. For dimensions outside the accepted range
    a human-readable error string is returned instead (behaviour kept for
    backward compatibility), so callers should check the return type.

    Raises ValueError if 'unity' is neither "meters" nor "yards"
    (previously this failed with an UnboundLocalError on `fig`).
    """
    if unity == "meters":
        # Set boundaries
        if length >= 120.5 or width >= 75.5:
            return ("Field dimensions are too big for meters as unity, didn't you mean yards as unity? "
                    "Otherwise the maximum length is 120 meters and the maximum width is 75 meters. "
                    "Please try again")
        return _drawPitch(length, width, _METER_MARKINGS, linecolor)

    if unity == "yards":
        # Check boundaries
        if length <= 95:
            return "Didn't you mean meters as unity?"
        if length >= 131 or width >= 101:
            return "Field dimensions are too big. Maximum length is 130, maximum width is 100"
        return _drawPitch(length, width, _YARD_MARKINGS, linecolor)

    raise ValueError("unity must be 'meters' or 'yards', got %r" % (unity,))


def createPitchOld(linecolor='black'):
    #Taken from FC Python
    """
    Create the legacy fixed-size pitch (130 long, 90 wide).

    'linecolor' is the matplotlib colour of the markings. It used to be read
    from an undefined global (NameError on every call); it is now an optional
    parameter defaulting to 'black'.

    Returns (fig, ax).
    """
    return _drawPitch(130, 90, _LEGACY_MARKINGS, linecolor)


def createGoalMouth():
    #Adopted from FC Python
    """
    Create a plot of one goal mouth: a 65 wide, 50 deep part of the pitch
    with the goal on the bottom edge (y = 0), drawn in black.

    Returns (fig, ax).
    """
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1)

    linecolor = 'black'

    # Goal line and the two sidelines
    ax.plot([0, 65], [0, 0], color=linecolor)
    ax.plot([65, 65], [50, 0], color=linecolor)
    ax.plot([0, 0], [50, 0], color=linecolor)

    # Penalty area
    ax.plot([12.5, 52.5], [16.5, 16.5], color=linecolor)
    ax.plot([52.5, 52.5], [16.5, 0], color=linecolor)
    ax.plot([12.5, 12.5], [0, 16.5], color=linecolor)

    # Goal area
    ax.plot([41.5, 41.5], [5.5, 0], color=linecolor)
    ax.plot([23.5, 41.5], [5.5, 5.5], color=linecolor)
    ax.plot([23.5, 23.5], [0, 5.5], color=linecolor)

    # Goal, drawn 2 units behind the goal line
    ax.plot([41.5-5.34, 41.5-5.34], [-2, 0], color=linecolor)
    ax.plot([23.5+5.34, 41.5-5.34], [-2, -2], color=linecolor)
    ax.plot([23.5+5.34, 23.5+5.34], [0, -2], color=linecolor)

    # Penalty spot
    ax.add_patch(plt.Circle((65/2, 11), 0.8, color=linecolor))

    # Penalty arc (only the part outside the penalty area)
    ax.add_patch(Arc((32.5, 11), height=18.3, width=18.3, angle=0,
                     theta1=38, theta2=142, color=linecolor))

    # Tidy axes
    ax.axis('off')

    return fig, ax


# --------------------------------------------------------------------------------
# /GW_38_Ratings.py:
# --------------------------------------------------------------------------------
#  1 | #!/usr/bin/env python3
#  2 | # -*- coding: utf-8 -*-
#  3 | """
#  4 | Created on Tue Sep 14 16:41:04 2021
#  5 |
#  6 | @author: emildanielsson & JakobEP
#  7 |
#  8 | Program description:
#  9 |     Find ratings of all players in the last round.
# (continuation of the GW_38_Ratings.py module docstring from the dump)
# 10 |
# 11 | Algorithm:
# 12 |
# 13 | """
#
# Script overview: loads per-match player KPIs, fits an offensive
# (team_xG) and a defensive (opponent_xG) model per position group on the
# non-English leagues, applies both to the English data, and exports
# ratings. Gameweek 38 is held out; gameweeks 1-37 supply the percentiles.
# NOTE(review): the fitting/rating helpers live in fitting_functions (ff),
# which is not visible here - descriptions of what they do are assumptions.


# The basics
import pandas as pd
import numpy as np
import json

# Plotting
import matplotlib.pyplot as plt
from mplsoccer import FontManager

# Import other functions
import fitting_functions as ff

# Statistical fitting of models
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

# For tables
from tabulate import tabulate

# Ignore Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


#%%
# - Read in KPI data
"---------------------------------------------------------------------------"

# Test to load in and store as dataframe; per_90 doesn't have all columns yet
# with open('Json_files/KPI_per_90_All.json') as f:
#     data_kpi = json.load(f)

# Paths are relative to the current working directory.
with open('../Json_files/KPI_tot_All_v2.json') as f:
    data_kpi = json.load(f)

df_KPI = pd.DataFrame(data_kpi)


# Create match dataframes
df_England_matches = pd.read_json('../../Wyscout/matches/matches_England.json', encoding="unicode_escape")


#%%
# - Read in minutes played data
"---------------------------------------------------------------------------"

with open('../Json_files/minutes_played_All.json') as f:
    data_minutes = json.load(f)

df_minutes = pd.DataFrame(data_minutes)


################################################
# - Load Fonts
"----------------------------------------------"

# FontManager is given GitHub URLs, so this step presumably needs network
# access - TODO confirm caching behaviour of mplsoccer.
URL1 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/'
        'fonts/SourceSerifPro-Regular.ttf?raw=true')
serif_regular = FontManager(URL1)
URL2 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/'
        'fonts/SourceSerifPro-ExtraLight.ttf?raw=true')
serif_extra_light = FontManager(URL2)
URL3 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/fonts/'
        'SourceSerifPro-Bold.ttf?raw=true')
serif_bold = FontManager(URL3)



#%%
# - Set filter and scaler variables
"---------------------------------------------------------------------------"

# Now we want to filter out those who have not played at least
# 10 matches with 20 minutes in each match (can change)
# NOTE(review): only the 20-minute threshold is set here; the match-count
# argument passed to ff.filter_dataframe below is 1, not 10 - confirm.
min_minutes = 20

# Choose method for normalization
scaler = MinMaxScaler()
#scaler = preprocessing.QuantileTransformer(random_state=0)
#scaler = RobustScaler()


#%%
# - Create test and train dataset and preprocess data
"---------------------------------------------------------------------------"

# Separate df_KPI between PL and the rest of the leagues
mask_PL = df_KPI.league == "England"
df_KPI_PL = df_KPI.loc[mask_PL]
df_KPI_EU_train = df_KPI.loc[~mask_PL]


#%%
# - Rank the players
"---------------------------------------------------------------------------"

# Positions to fit for (each inner list is fitted as one group)
positions_fitting = [['LB', 'RB'], ['CB'], ['LM', 'RM'], ['CM'], ['LW', 'RW'], ['ST']]

# Initiate rating and info dataframe (accumulates gameweek 38, all positions)
df_final_rating = pd.DataFrame(columns = ['matchId', 'teamName', 'playerId',
                                          'shortName', 'position', 'tot_rating',
                                          'match_events_rating', 'fitting_rating_off',
                                          'fitting_rating_def',
                                          'final_rating', 'match_info',
                                          'gameweek'])

# Initiate rating and info dataframe (accumulates gameweeks 1-37, all positions)
df_final_rating2 = pd.DataFrame(columns = ['matchId', 'teamName', 'playerId',
                                           'shortName', 'position', 'tot_rating',
                                           'match_events_rating', 'fitting_rating_off',
                                           'fitting_rating_def',
                                           'final_rating', 'match_info',
                                           'gameweek'])


# Do fitting for all the positions
for position in positions_fitting:

    ################################################
    # - Kpis
    "----------------------------------------------"

    # All Kpis (every column that gets normalised for the PL data)
    list_kpi_all = ['passing%',
                    'completed_passes',
                    'fouls',
                    'aerial%',
                    'aerial_wins',
                    'shots',
                    'dribbles%',
                    'succesful_dribbles',
                    'key_passes',
                    'succesful_through_passes',
                    'events_in_box',
                    'passes_to_box',
                    'creative_passes',
                    'succesful_def_actions',
                    'progressive_carries',
                    'red_card',
                    'own_goals',
                    'yellow_cards',
                    'danger_ball_loses',
                    'def_actions%',
                    'p_adj_succ_def_actions'
                    ]

    # KPIs to fit for when using dep_var "team_xG"
    list_kpi_off = ['passing%',
                    'completed_passes',
                    'fouls',
                    #'aerial%',
                    #'aerial_wins',
                    'shots',
                    'dribbles%',
                    #'succesful_dribbles',
                    'key_passes',
                    #'succesful_through_passes',
                    'events_in_box',
                    'passes_to_box',
                    #'creative_passes',
                    #'succesful_def_actions',
                    #'progressive_carries',
                    'red_card',
                    'own_goals',
                    'yellow_cards',
                    'danger_ball_loses',
                    #'def_actions%',
                    'p_adj_succ_def_actions'
                    ]

    # KPIs to fit for when using dep_var "opponent_xG"
    list_kpi_def = ['passing%',
                    'completed_passes',
                    'fouls',
                    #'aerial%',
                    #'aerial_wins',
                    #'shots',
                    'dribbles%',
                    #'succesful_dribbles',
                    #'key_passes',
                    #'succesful_through_passes',
                    #'events_in_box',
                    #'passes_to_box',
                    #'creative_passes',
                    #'succesful_def_actions',
                    #'progressive_carries',
                    'red_card',
                    'own_goals',
                    'yellow_cards',
                    'danger_ball_loses',
                    #'def_actions%',
                    'p_adj_succ_def_actions'
                    ]

    ################################################
    # - Find model coefficients, r-squared and statistically significant kpis
    "----------------------------------------------"
    # Call to fitting function to find coefficients and independent variables
    # (offensive model, fitted on the non-English leagues).
    # NOTE(review): three return values are unpacked here; the evaluation
    # script unpacks four from the same function - confirm which is current.
    dep_var_off = 'team_xG'
    model_coef_off, r_squared_off, list_kpi_off_fitting = ff.KPI_fitting(df_KPI_EU_train, scaler,
                                                                         list_kpi_off, dep_var_off,
                                                                         position, min_minutes)

    # Call to fitting function to find coefficients and independent variables
    # (defensive model, fitted on the non-English leagues)
    dep_var_def = 'opponent_xG'
    model_coef_def, r_squared_def, list_kpi_def_fitting = ff.KPI_fitting(df_KPI_EU_train, scaler,
                                                                         list_kpi_def, dep_var_def,
                                                                         position, min_minutes)


    ################################################
    # - Use the coefficients from EU to compute percentiles
    # in the PL gameweek 1-37, filtered PL training data
    "----------------------------------------------"

    # Filter and normalise the PL data (including GW 38).
    # The shared scaler is re-fitted here on the PL rows of this position.
    df_filtered_PL = ff.filter_dataframe(df_KPI_PL, position, list_kpi_all, min_minutes, 1)
    df_filtered_PL[list_kpi_all] = scaler.fit_transform(df_filtered_PL[list_kpi_all])

    # Separate gameweek 38 from PL
    test_gameweek = 38
    df_PL_gameweek_38 = df_England_matches.loc[df_England_matches.gameweek == test_gameweek]
    list_gameweek_38_matchId = df_PL_gameweek_38['wyId'].unique().tolist()
    mask_last_gameweeks = df_filtered_PL.matchId.isin(list_gameweek_38_matchId)

    # KPIs GW 1-37
    df_KPI_PL_train = df_filtered_PL.loc[~mask_last_gameweeks]

    # Initiate rating dataframe for GW 1-37
    df_ratings = pd.DataFrame()

    # Loop through players in gameweek 1-37 (one row per player and match);
    # rows keep the index labels of df_KPI_PL_train.
    for i, player in df_KPI_PL_train.iterrows():

        # Add some info to dataframe
        df_ratings.loc[i, 'matchId'] = player['matchId']
        df_ratings.loc[i, 'teamName'] = player['teamName']
        df_ratings.loc[i, 'playerId'] = player['playerId']
        df_ratings.loc[i, 'shortName'] = player['shortName']

        ################################################
        # - xG-Fit
        "----------------------------------------------"

        # Find the fitted xG
        xG_fitting_rating_off = ff.compute_fitting_ratings(player, model_coef_off, list_kpi_off_fitting)

        # Multiply the fitted value with r_squared, how good the fit was
        xG_fitting_rating_off = xG_fitting_rating_off * r_squared_off

        # Add to df
        df_ratings.loc[i, 'fitting_rating_off'] = xG_fitting_rating_off

        ################################################
        # - opponent_xG-Fit (xGC)
        "----------------------------------------------"
        # Find the fitted opponent xG (xGC)
        xGC_fitting_rating_def = ff.compute_fitting_ratings(player, model_coef_def, list_kpi_def_fitting)

        # Multiply the fitted value with r_squared, how good the fit was
        xGC_fitting_rating_def = xGC_fitting_rating_def * r_squared_def

        # Add to df
        df_ratings.loc[i, 'fitting_rating_def'] = xGC_fitting_rating_def

        ################################################
        # - Match event-rating
        "----------------------------------------------"

        # Find the event rating and add to dataframe
        match_event_rating = ff.compute_events_rating(player, position, df_KPI)
        df_ratings.loc[i, 'match_events_rating'] = match_event_rating

        # Regression-based rating: offensive fit minus defensive fit
        # (a high fitted opponent xG lowers the rating)
        tot_fit_rating = xG_fitting_rating_off - xGC_fitting_rating_def
        df_ratings.loc[i, 'tot_fit_rating'] = tot_fit_rating



    # Find percentiles (1st to 99th) from the ratings in gameweek 1-37 PL
    percentiles = np.arange(0.01, 1, 0.01)
    percentiles_fit = df_ratings['tot_fit_rating'].quantile(percentiles)
    percentiles_events = df_ratings['match_events_rating'].quantile(percentiles)

    ################################################
    # - Compute the rankings of gameweek 38 for the position
    "----------------------------------------------"
    # KPIs GW 38
    df_KPI_PL_gameweek_38 = df_filtered_PL.loc[mask_last_gameweeks]

    # Initiate rating dataframe for GW 38
    df_ratings_test = pd.DataFrame()

    # Loop through players in gameweek 38 (same steps as for GW 1-37 above)
    for i, player in df_KPI_PL_gameweek_38.iterrows():

        # Add some info to dataframe
        df_ratings_test.loc[i, 'matchId'] = player['matchId']
        df_ratings_test.loc[i, 'teamName'] = player['teamName']
        df_ratings_test.loc[i, 'playerId'] = player['playerId']
        df_ratings_test.loc[i, 'shortName'] = player['shortName']

        ################################################
        # - xG-Fit
        "----------------------------------------------"

        # Find the fitted xG
        xG_fitting_rating_off = ff.compute_fitting_ratings(player, model_coef_off, list_kpi_off_fitting)

        # Multiply the fitted value with r_squared, how good the fit was
        xG_fitting_rating_off = xG_fitting_rating_off * r_squared_off

        # Add to df
        df_ratings_test.loc[i, 'fitting_rating_off'] = xG_fitting_rating_off

        ################################################
        # - opponent_xG-Fit (xGC)
        "----------------------------------------------"

        # Find the fitted opponent xG (xGC)
        xGC_fitting_rating_def = ff.compute_fitting_ratings(player, model_coef_def, list_kpi_def_fitting)

        # Multiply the fitted value with r_squared, how good the fit was
        xGC_fitting_rating_def = xGC_fitting_rating_def * r_squared_def

        # Add to df
        df_ratings_test.loc[i, 'fitting_rating_def'] = xGC_fitting_rating_def

        ################################################
        # - Match event-rating
        "----------------------------------------------"

        # Find the event rating and add to dataframe
        match_event_rating = ff.compute_events_rating(player, position, df_KPI)
        df_ratings_test.loc[i, 'match_events_rating'] = match_event_rating

        # Sum fitting rating and add to dataframe
        tot_fit_rating = xG_fitting_rating_off - xGC_fitting_rating_def
        df_ratings_test.loc[i, 'tot_fit_rating'] = tot_fit_rating

    # Modify the df_ratings_test dataframe and the gameweek 38 dataframe.
    # The return value is not used, so the helper presumably fills in the
    # remaining rating columns in place - TODO confirm in fitting_functions.
    ff.create_rating_dataframe(df_ratings_test, df_KPI_PL, df_KPI_PL_gameweek_38,
                               percentiles_fit, percentiles_events, df_England_matches)

    # Modify the rating dataframe from gameweek 1-37
    # NOTE(review): this call passes df_KPI where the GW 38 call passes
    # df_KPI_PL - confirm the difference is intended.
    ff.create_rating_dataframe(df_ratings, df_KPI, df_KPI_PL_train,
                               percentiles_fit, percentiles_events, df_England_matches)


    # Merge the rating dataframe GW 38
    frames = [df_final_rating, df_ratings_test]
    df_final_rating = pd.concat(frames)

    # Merge the rating dataframe [GW1-37]
    frames = [df_final_rating2, df_ratings]
    df_final_rating2 = pd.concat(frames)


#%%
# Check the mean and sum rating from gameweek 1-37
# (the mean is grouped per player and team, the sum per player name only)
df_mean_rating = df_final_rating2.groupby(['shortName', 'teamName'], as_index=False)["final_rating"].mean()
df_sum_rating = df_final_rating2.groupby(['shortName'], as_index=False)["final_rating"].sum()

# Save to Excel file. mode="a" appends to the workbook at this path;
# if_sheet_exists="new" adds a new sheet rather than overwriting one.
with pd.ExcelWriter("../Gameweek_38.xlsx", mode="a", engine="openpyxl", if_sheet_exists = "new") as writer:
    df_mean_rating.to_excel(writer, sheet_name="mean_rating",
                            #columns=['shortName', 'position', 'teamName', 'final_rating'],
                            header=True, index=False)

# Save to Excel file
with pd.ExcelWriter("../Gameweek_38.xlsx", mode="a", engine="openpyxl", if_sheet_exists = "new") as writer:
    df_sum_rating.to_excel(writer, sheet_name="sum_rating",
                           #columns=['shortName', 'position', 'teamName', 'final_rating'],
                           header=True, index=False)


#%%
397 | # - Print and save the ratings to use for validation_vs_WhoScored 398 | "---------------------------------------------------------------------------" 399 | # Print matches from last gameweek ratings 400 | df_gameweek_38 = df_final_rating.loc[df_final_rating.gameweek == 38] 401 | rated_matches = df_gameweek_38['matchId'].unique().tolist() 402 | 403 | # Print the rated matches 404 | for match in rated_matches: 405 | the_match = df_final_rating.loc[df_final_rating['matchId'] == match] 406 | print(the_match.match_info.values[0]) 407 | table = the_match[['teamName', 'shortName', 'position', 'final_rating']] 408 | print(tabulate(table)) 409 | 410 | 411 | print("Adding results Gameweek_38.xlsx, choose filename:\n") 412 | file_name = input() 413 | 414 | # # Save to Excel file to use for validation 415 | with pd.ExcelWriter("../Gameweek_38.xlsx", mode="a", engine="openpyxl", if_sheet_exists = "new") as writer: 416 | df_gameweek_38.to_excel(writer, sheet_name=file_name, 417 | columns=['teamName', 'shortName', 'position', 'final_rating'], 418 | header=True, index=False) 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | -------------------------------------------------------------------------------- /GW_38_Ratings_evaluation.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env python3 3 | # -*- coding: utf-8 -*- 4 | """ 5 | Created on Tue Sep 14 16:41:04 2021 6 | 7 | @author: emildanielsson & JakobEP 8 | 9 | Program description: 10 | Find ratings of all players in the last round 11 | 12 | Algorithm: 13 | 14 | """ 15 | 16 | 17 | # The basics 18 | import pandas as pd 19 | import numpy as np 20 | import json 21 | 22 | # Plotting 23 | import matplotlib.pyplot as plt 24 | from mplsoccer import FontManager 25 | 26 | # Import other functions 27 | import percentile_functions as pf 28 | import fitting_functions as ff 29 | 30 | # Statistical fitting of models 31 | import statsmodels.api as sm 32 | import 
statsmodels.formula.api as smf 33 | from sklearn import preprocessing 34 | from sklearn.preprocessing import MinMaxScaler 35 | from sklearn.preprocessing import RobustScaler 36 | import statistics 37 | 38 | # For tables 39 | from tabulate import tabulate 40 | 41 | # Ignore Future Warnings 42 | import warnings 43 | warnings.simplefilter(action='ignore', category=FutureWarning) 44 | 45 | 46 | #%% 47 | # - Read in data KPI data 48 | "---------------------------------------------------------------------------" 49 | 50 | # Test to load in and store as dataframe per_90 dont have all collumns yet 51 | # with open('Json_files/KPI_per_90_All.json') as f: 52 | # data_kpi = json.load(f) 53 | 54 | with open('Json_files/KPI_tot_All_v2.json') as f: 55 | data_kpi = json.load(f) 56 | 57 | df_KPI = pd.DataFrame(data_kpi) 58 | 59 | 60 | # Create match dataframes 61 | df_England_matches = pd.read_json('../Wyscout/matches/matches_England.json', encoding="unicode_escape") 62 | 63 | 64 | #%% 65 | # - Read in minutes played data 66 | "---------------------------------------------------------------------------" 67 | 68 | with open('Json_files/minutes_played_All.json') as f: 69 | data_minutes = json.load(f) 70 | 71 | df_minutes = pd.DataFrame(data_minutes) 72 | 73 | 74 | ################################################ 75 | # - Load Fonts 76 | "----------------------------------------------" 77 | 78 | URL1 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/' 79 | 'fonts/SourceSerifPro-Regular.ttf?raw=true') 80 | serif_regular = FontManager(URL1) 81 | URL2 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/' 82 | 'fonts/SourceSerifPro-ExtraLight.ttf?raw=true') 83 | serif_extra_light = FontManager(URL2) 84 | URL3 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/fonts/' 85 | 'SourceSerifPro-Bold.ttf?raw=true') 86 | serif_bold = FontManager(URL3) 87 | 88 | 89 | 90 | #%% 91 | # - Set filter and scaler varables 92 | 
"---------------------------------------------------------------------------" 93 | 94 | # Now we want to filter out those who have not played at least 95 | # 10 matches with 20 minutes in each match (can change) 96 | min_minutes = 20 97 | 98 | # Choose method for normalizaion 99 | scaler = MinMaxScaler() 100 | #scaler = preprocessing.QuantileTransformer(random_state=0) 101 | #scaler = RobustScaler() 102 | 103 | 104 | #%% 105 | # - Create test and train dataset and preprocess data 106 | "---------------------------------------------------------------------------" 107 | 108 | # Seperate df_KPI beteween PL and the rest of the legaues 109 | mask_PL = df_KPI.league == "England" 110 | df_KPI_PL = df_KPI.loc[mask_PL] 111 | df_KPI_EU_train = df_KPI.loc[~mask_PL] 112 | 113 | 114 | #%% 115 | # - Rank the players 116 | "---------------------------------------------------------------------------" 117 | 118 | # Positions to fit for 119 | #positions_fitting = [['LB', 'RB'], ['CB'], ['LM', 'RM'], ['CM'], ['LW', 'RW'], ['ST']] 120 | positions_fitting = [['ST']] 121 | #positions_fitting = [['CB']] 122 | 123 | 124 | # Do fitting for all the positins 125 | for position in positions_fitting: 126 | # print(position) 127 | 128 | ################################################ 129 | # - Kpis to fit for 130 | "----------------------------------------------" 131 | 132 | list_kpi_all = ['passing%', 133 | 'completed_passes', 134 | 'fouls', 135 | 'aerial%', 136 | 'aerial_wins', 137 | 'shots', 138 | 'dribbles%', 139 | 'succesful_dribbles', 140 | 'key_passes', 141 | 'succesful_through_passes', 142 | 'events_in_box', 143 | 'passes_to_box', 144 | 'creative_passes', 145 | 'succesful_def_actions', 146 | 'progressive_carries', 147 | 'red_card', 148 | 'own_goals', 149 | 'yellow_cards', 150 | 'danger_ball_loses', 151 | 'def_actions%', 152 | 'p_adj_succ_def_actions', 153 | 'team_xG', 154 | 'opponent_xG' 155 | ] 156 | 157 | # KPIs when using KPI_tot_All 158 | list_kpi_off = ['passing%', 159 | 
'completed_passes', 160 | 'fouls', 161 | #'aerial%', 162 | #'aerial_wins', 163 | 'shots', 164 | 'dribbles%', 165 | #'succesful_dribbles', 166 | 'key_passes', 167 | #'succesful_through_passes', 168 | 'events_in_box', 169 | 'passes_to_box', 170 | #'creative_passes', 171 | #'succesful_def_actions', 172 | #'progressive_carries', 173 | 'red_card', 174 | 'own_goals', 175 | 'yellow_cards', 176 | 'danger_ball_loses', 177 | #'def_actions%', 178 | 'p_adj_succ_def_actions' 179 | ] 180 | 181 | list_kpi_def = ['passing%', 182 | 'completed_passes', 183 | 'fouls', 184 | #'aerial%', 185 | #'aerial_wins', 186 | #'shots', 187 | 'dribbles%', 188 | #'succesful_dribbles', 189 | #'key_passes', 190 | #'succesful_through_passes', 191 | #'events_in_box', 192 | #'passes_to_box', 193 | #'creative_passes', 194 | #'succesful_def_actions', 195 | #'progressive_carries', 196 | 'red_card', 197 | 'own_goals', 198 | 'yellow_cards', 199 | 'danger_ball_loses', 200 | #'def_actions%', 201 | 'p_adj_succ_def_actions' 202 | ] 203 | 204 | ################################################ 205 | # - Find model coeficients, r-squared and statisticly significant kpis 206 | "----------------------------------------------" 207 | # Call to fitting function to find coeficient and independent variables 208 | dep_var_off = 'team_xG' 209 | model_coef_off, r_squared_off, list_kpi_off_fitting, model_off = ff.KPI_fitting(df_KPI_EU_train, scaler, 210 | list_kpi_off, dep_var_off, 211 | position, min_minutes) 212 | 213 | # Call to fitting function to find coeficient and independent variables 214 | dep_var_def = 'opponent_xG' 215 | model_coef_def, r_squared_def, list_kpi_def_fitting, model_def = ff.KPI_fitting(df_KPI_EU_train, scaler, 216 | list_kpi_def, dep_var_def, 217 | position, min_minutes) 218 | 219 | 220 | ################################################ 221 | # - Use the coefficients from EU to compute percentiles 222 | # in the PL gameweek 1-37, filtered PL training data 223 | 
"----------------------------------------------" 224 | 225 | # Filter and normalise the PL data (including GW 38) 226 | df_filtered_PL = pf.filter_dataframe(df_KPI_PL, position, list_kpi_all, min_minutes, 1) 227 | df_filtered_PL[list_kpi_all[:-2]] = scaler.fit_transform(df_filtered_PL[list_kpi_all[:-2]]) 228 | 229 | # Seperate gameweek 38 from PL 230 | test_gameweek = 38 231 | df_PL_gameweek_38 = df_England_matches.loc[df_England_matches.gameweek == test_gameweek] 232 | list_gameweek_38_matchId = df_PL_gameweek_38['wyId'].unique().tolist() 233 | mask_last_gameweeks = df_filtered_PL.matchId.isin(list_gameweek_38_matchId) 234 | 235 | # KPIs GW 1-37 236 | df_KPI_PL_test = df_filtered_PL.loc[~mask_last_gameweeks] 237 | 238 | # Find test data 239 | X_test_off = df_KPI_PL_test[list_kpi_off_fitting[:-1]] 240 | X_test_def = df_KPI_PL_test[list_kpi_def_fitting[:-1]] 241 | 242 | # Add constant to test data 243 | X_test_off = sm.add_constant(X_test_off) 244 | X_test_def = sm.add_constant(X_test_def) 245 | 246 | # Loop through players in gameweek 1-37 247 | #for i, player in df_KPI_PL_test.iterrows(): 248 | 249 | 250 | 251 | 252 | 253 | #%% 254 | # - Evaluate fitting 255 | "---------------------------------------------------------------------------" 256 | 257 | # Out of sample prediction 258 | y_pred_off = model_off.predict(X_test_off) 259 | y_pred_def = model_def.predict(X_test_def) 260 | 261 | 262 | 263 | #%% 264 | # - Plot fitted values and computed team xG-values 265 | "---------------------------------------------------------------------------" 266 | 267 | x_plot = np.arange(len(y_pred_off)) 268 | y_plot = df_KPI_PL_test['team_xG'].copy() 269 | y_pred_plot = y_pred_off 270 | 271 | y_diff = abs(y_plot - y_pred_plot) 272 | 273 | 274 | # Create figure and axes 275 | fig1, ax1 = plt.subplots(figsize=(12, 6)) 276 | 277 | width = 0.35 # the width of the bars 278 | 279 | rects1 = ax1.bar(x_plot[0:30] - width/2, y_plot[0:30], width, label='xG-team actual') 280 | rects2 = 
ax1.bar(x_plot[0:30] + width/2, y_pred_plot[0:30], width, label='xG-team predicted') 281 | 282 | #plt.bar(x_plot[0:50], y_plot[0:50], color='purple', label='xG-team actual') 283 | #plt.bar(x_plot[0:50], y_pred_plot[0:50], color='orange', label='xG-team predicted') 284 | #ax1.plot(x_plot[0:50], y_diff[0:50], '--', color='red', label='xG-team difference') 285 | 286 | # x and y labels 287 | ax1.set_xlabel('matches', fontweight='bold', fontsize=20, fontproperties=serif_bold.prop) 288 | ax1.set_ylabel('xG', fontweight='bold', fontsize=20, fontproperties=serif_bold.prop) 289 | 290 | # Adding title and subtitle 291 | fig1.text(0.05, 1, f"Actual and predicted xG-team values for position: {positions_fitting[0][0]} \n", fontsize=22, 292 | fontproperties=serif_bold.prop) 293 | fig1.text(0.05, 1, 'First 30 matches in PL season 2017/18', fontsize=18, 294 | fontproperties=serif_regular.prop) 295 | 296 | # Add legend 297 | ax1.legend(loc='best', prop={"family": "Times New Roman", 'size': 14}) 298 | 299 | # Add grid and zorder 300 | ax1.grid(ls="dotted", lw=0.3, color="grey", alpha=1, zorder=1) 301 | 302 | # The tight_layout() function in pyplot module of matplotlib library is used 303 | # to automatically adjust subplot parameters to give specified padding. 
plt.tight_layout()
plt.show()

#%%
# - Statistics
"---------------------------------------------------------------------------"

# Absolute prediction error of the offensive model: mean, variance, std-dev
y_diff_mean, y_diff_var, y_diff_stdvar = (
    y_diff.mean(),
    statistics.variance(y_diff),
    statistics.stdev(y_diff),
)
#y_diff_covar = statistics.covariance(y_plot, y_pred_plot)

# Same three figures for the actual xG-team values
y_plot_mean, y_plot_var, y_plot_stdvar = (
    y_plot.mean(),
    statistics.variance(y_plot),
    statistics.stdev(y_plot),
)

# ... and for the predicted xG-team values
y_pred_plot_mean, y_pred_plot_var, y_pred_plot_stdvar = (
    y_pred_plot.mean(),
    statistics.variance(y_pred_plot),
    statistics.stdev(y_pred_plot),
)


#%%
# - Print stats
"---------------------------------------------------------------------------"
# One report section per series: header line, then mean / variance / std-dev
for _header, _mean, _var, _std in (
    ('=============== y_diff statistics: ================ ',
     y_diff_mean, y_diff_var, y_diff_stdvar),
    ('=============== Actual xG-team statistics: ================ ',
     y_plot_mean, y_plot_var, y_plot_stdvar),
    ('=============== Predicted xG-team statistics: ================ ',
     y_pred_plot_mean, y_pred_plot_var, y_pred_plot_stdvar),
):
    print('\n')
    print(_header)
    print(f"Mean: {_mean}")
    print(f"Variance: {_var}")
    print(f"Standard deviation: {_std}")


#%%
# - Plot fitted values and computed team xGC-values
"---------------------------------------------------------------------------"

# Actual vs. predicted opponent xG (xGC) on the PL test rows
y_pred_plot2 = y_pred_def
y_plot2 = df_KPI_PL_test['opponent_xG'].copy()
x_plot2 = np.arange(len(y_pred_def))

# Absolute error between actual and predicted values
y_diff2 = abs(y_plot2 - y_pred_plot2)

# Create figure and axes
fig2, ax2 = plt.subplots(figsize=(12, 6))

width = 0.35  # the width of the bars

# Only the first 30 entries are drawn, as paired bars around each x position
_first_30 = slice(0, 30)
rects1 = ax2.bar(x_plot2[_first_30] - width/2, y_plot2[_first_30], width,
                 label='xGC-team actual')
rects2 = ax2.bar(x_plot2[_first_30] + width/2, y_pred_plot2[_first_30], width,
                 label='xGC-team predicted')

#plt.bar(x_plot[0:50], y_plot[0:50], color='purple', label='xG-team actual')
#plt.bar(x_plot[0:50], y_pred_plot[0:50], color='orange', label='xG-team predicted')
#ax1.plot(x_plot[0:50], y_diff[0:50], '--', color='red', label='xG-team difference')

# x and y labels share the same font settings
_axis_label_style = {'fontweight': 'bold', 'fontsize': 20,
                     'fontproperties': serif_bold.prop}
ax2.set_xlabel('matches', **_axis_label_style)
ax2.set_ylabel('xGC', **_axis_label_style)

# Add legend
ax2.legend(loc='best', prop={"family": "Times New Roman", 'size': 14})

# Add grid and zorder
ax2.grid(ls="dotted", lw=0.3, color="grey", alpha=1, zorder=1)

# Adding title and subtitle (the title's trailing newline lifts it above the
# subtitle, which is anchored at the same figure coordinates)
fig2.text(0.05, 1,
          f"Actual and predicted xGC-team values for position: {positions_fitting[0][0]} \n",
          fontsize=22, fontproperties=serif_bold.prop)
fig2.text(0.05, 1, 'First 30 matches in PL season 2017/18',
          fontsize=18, fontproperties=serif_regular.prop)

# The tight_layout() function in pyplot module of matplotlib library is used
# to automatically adjust subplot parameters to give specified padding.
390 | plt.tight_layout() 391 | plt.show() 392 | 393 | #%% 394 | # - Statistics 395 | "---------------------------------------------------------------------------" 396 | 397 | # Difference 398 | y_diff2_mean = y_diff2.mean() 399 | y_diff2_var = statistics.variance(y_diff2) 400 | #y_diff_covar = statistics.covariance(y_plot, y_pred_plot) 401 | y_diff2_stdvar = statistics.stdev(y_diff2) 402 | 403 | # Actual xGC-team 404 | y_plot2_mean = y_plot2.mean() 405 | y_plot2_var = statistics.variance(y_plot2) 406 | y_plot2_stdvar = statistics.stdev(y_plot2) 407 | 408 | # Predicted xGC-team 409 | y_pred_plot2_mean = y_pred_plot2.mean() 410 | y_pred_plot2_var = statistics.variance(y_pred_plot2) 411 | y_pred_plot2_stdvar = statistics.stdev(y_pred_plot2) 412 | 413 | 414 | #%% 415 | # - Print stats 416 | "---------------------------------------------------------------------------" 417 | print('\n') 418 | print('=============== y_diff2 statistics: ================ ') 419 | print(f"Mean: {y_diff2_mean}") 420 | print(f"Variance: {y_diff2_var}") 421 | print(f"Standard deviation: {y_diff2_stdvar}") 422 | 423 | print('\n') 424 | print('=============== Actual xGC-team statistics: ================ ') 425 | print(f"Mean: {y_plot2_mean}") 426 | print(f"Variance: {y_plot2_var}") 427 | print(f"Standard deviation: {y_plot2_stdvar}") 428 | 429 | print('\n') 430 | print('=============== Predicted xGC-team statistics: ================ ') 431 | print(f"Mean: {y_pred_plot2_mean}") 432 | print(f"Variance: {y_pred_plot2_var}") 433 | print(f"Standard deviation: {y_pred_plot2_stdvar}") 434 | 435 | 436 | 437 | 438 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Player_Rating_Project 2 | ============================== 3 | 4 | Instruction of how to run the files, and what needs to be downloaded beforehand, for the Player_Rating_Project. 
The project was carried out at Uppsala University for the course "Advanced Course on Topics in Scientific Computing I", HT2021 period 1. 5 | 6 | Python Packages Needed 7 | ------------ 8 | - `pandas` 9 | - `numpy` 10 | - `json` 11 | - `matplotlib` 12 | - `seaborn` 13 | - `mplsoccer` 14 | - `sklearn` 15 | - `statsmodels` 16 | - `tabulate` 17 | 18 | Downloads 19 | ------------ 20 | Make sure to have Python3 downloaded, along with the needed packages listed above. 21 | 22 | Get the Wyscout data from: https://figshare.com/collections/Soccer_match_event_dataset/4415000/2 23 | 24 | The following data sets from Wyscout are needed: "events.json", "matches.json", "players.json" and "teams.json". 25 | 26 | Place the downloaded Wyscout data in a folder named: `Wyscout`, placed two levels above the Python code (see below). 27 | 28 | Download the folder 'Json_files' from https://drive.google.com/drive/folders/1Yhta6-kl6Z9sn_Uy2JpMC9UiNObn6VFz?usp=sharing and place it one level above the Python code (see below). The files in this folder can also be generated, if the Wyscout data is downloaded, by running the following programmes in order: 29 | 30 | 1. create_events_df_eu.py 31 | 32 | 2. minutes_played.py 33 | 34 | 3. create_KPI_dataframe.py 35 | 36 | (4.) create_KPI_dataframe_EDIT.py (needs some modifications, see comments) 37 | 38 | This is not recommended, though, since it takes quite a lot of time to run create_KPI_dataframe.py. 39 | 40 | Also download the Excel sheet `Gameweek_38.xlsx` from https://docs.google.com/spreadsheets/d/1bIpAxH0iWEot8tAlIQcvBB_uX-Au-qjX/edit?usp=sharing&ouid=117928085659621731785&rtpof=true&sd=true and place it one level above the Python code (see below). 41 | 42 | Running Instructions 43 | ------------ 44 | When the folders and files above are downloaded (or created) the following programs can be run to see the resulting ratings from gameweek 38: 45 | 46 | 1. GW_38_Ratings.py 47 | 48 | 2. 
the_match_ranking.py 49 | 50 | The following programs can then be run to evaluate the ratings and the xG-model: 51 | 52 | 1. GW_38_Ratings_evaluation.py 53 | 54 | 2. xG_model_evaluation.py 55 | 56 | 3. validation_vs_WhoScored.py 57 | 58 | 59 | Project Organization 60 | ------------ 61 | 62 | ├── README.md <- The top-level README for running this project. 63 | | 64 | ├── Wyscout <- Wyscout data folder. 65 | │ │ 66 | │ ├── players.json 67 | │ │ 68 | │ ├── teams.json 69 | │ │ 70 | │ ├── events 71 | │ │ ├── events_England.json 72 | │ │ ├── events_France.json 73 | │ │ ├── events_Germany.json 74 | │ │ ├── events_Italy.json 75 | │ │ └── events_Spain.json 76 | │ │ 77 | │ └── matches 78 | │ ├── matches_England.json 79 | │ ├── matches_France.json 80 | │ ├── matches_Germany.json 81 | │ ├── matches_Italy.json 82 | │ └── matches_Spain.json 83 | │ 84 | └──Player_rating_Project <- Main folder for this project. 85 | | 86 | │── Gameweek_38.xlsx <- Excel sheet with validation data from WhoScored to compare with. 87 | │ 88 | │── Json_files <- Folder where created json-files are stored. 89 | │ 90 | └── Python_Code <- Source code for this project. 
91 | | 92 | |── create_events_df_eu.py 93 | |── create_KPI_dataframe_EDIT.py 94 | |── create_KPI_dataframe.py 95 | |── FCPython.py 96 | |── fitting_functions.py 97 | |── GW_38_Ratings_evaluation.py 98 | |── GW_38_Ratings.py 99 | |── KPI_functions.py 100 | |── minutes_played.py 101 | |── the_match_ranking.py 102 | |── validation_vs_WhoScored.py 103 | └── xG_model_evaluation.py 104 | 105 | -------- 106 | 107 | By: Jakob Edberger Persson and Emil Danielsson, 2021 108 | -------------------------------------------------------------------------------- /__pycache__/FCPython.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilDanielsson/Player-Rating-Project/ae4b93c0aaa761e8a23c04666a1b1365637955df/__pycache__/FCPython.cpython-38.pyc -------------------------------------------------------------------------------- /__pycache__/KPI_functions.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilDanielsson/Player-Rating-Project/ae4b93c0aaa761e8a23c04666a1b1365637955df/__pycache__/KPI_functions.cpython-38.pyc -------------------------------------------------------------------------------- /__pycache__/fitting_functions.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilDanielsson/Player-Rating-Project/ae4b93c0aaa761e8a23c04666a1b1365637955df/__pycache__/fitting_functions.cpython-38.pyc -------------------------------------------------------------------------------- /__pycache__/percentile_functions.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilDanielsson/Player-Rating-Project/ae4b93c0aaa761e8a23c04666a1b1365637955df/__pycache__/percentile_functions.cpython-38.pyc -------------------------------------------------------------------------------- 
/create_KPI_dataframe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Sep 13 16:54:33 2021 5 | 6 | @author: emildanielsson & JakobEP 7 | 8 | Program description: 9 | 1. Read in data 10 | 2. Creates two dataframes; 11 | df_KPI - Dataframe of all the player's KPI's from each game 12 | df_KPI_info - Dataframe with info of player's KPI's 13 | (3.) Create and store the two dataframes as json-files in the working directory 14 | 15 | Note that this code takes very long time to run and therefore some other KPIs 16 | which were develoloped later have been added by the program: create_KPI_dataframe_EDIT. 17 | This is recomended for future use. 18 | 19 | """ 20 | 21 | # The basics 22 | import pandas as pd 23 | import numpy as np 24 | import json 25 | 26 | 27 | # Statistical fitting of models 28 | # import statsmodels.api as sm 29 | # import statsmodels.formula.api as smf 30 | from sklearn.model_selection import train_test_split 31 | from sklearn.linear_model import LogisticRegression 32 | # from sklearn.preprocessing import PolynomialFeatures 33 | 34 | # Import KPI-funcion 35 | import KPI_functions as kpi 36 | 37 | 38 | #%% 39 | # - Create dataframes from the Wyscout data 40 | "---------------------------------------------------------------------------" 41 | 42 | # Create event dataframe 43 | #df_Europe_events = pd.read_json('Json_files/events_All.json', encoding="unicode_escape") #SLOWER 44 | with open('../Json_files/events_All.json') as f: 45 | data_Europe= json.load(f) 46 | 47 | df_Europe_events = pd.DataFrame(data_Europe) 48 | 49 | # Create match dataframes 50 | df_England_matches = pd.read_json('../../Wyscout/matches/matches_England.json', encoding="unicode_escape") 51 | 52 | df_France_matches = pd.read_json('../../Wyscout/matches/matches_France.json', encoding="unicode_escape") 53 | 54 | df_Germany_matches = 
pd.read_json('../../Wyscout/matches/matches_Germany.json', encoding="unicode_escape") 55 | 56 | df_Italy_matches = pd.read_json('../../Wyscout/matches/matches_Italy.json', encoding="unicode_escape") 57 | 58 | df_Spain_matches = pd.read_json('../../Wyscout/matches/matches_Spain.json', encoding="unicode_escape") 59 | 60 | 61 | # Create players and teams dataframes 62 | df_players = pd.read_json("../../Wyscout/players.json", encoding="unicode_escape") 63 | df_teams = pd.read_json("../../Wyscout/teams.json", encoding="unicode_escape") 64 | 65 | 66 | #%% 67 | # - Merge matches dataframes from all leagues 68 | "---------------------------------------------------------------------------" 69 | 70 | frames_matches = [df_England_matches, df_France_matches, df_Germany_matches, 71 | df_Italy_matches, df_Spain_matches] 72 | 73 | df_Europe_matches = pd.concat(frames_matches, keys = ["England", "France", 74 | "Germany", "Italy", "Spain"]) 75 | 76 | 77 | #%% 78 | # - Read in minutes played data 79 | "---------------------------------------------------------------------------" 80 | 81 | with open('../Json_files/minutes_played_All.json') as f: 82 | data_minutes = json.load(f) 83 | 84 | df_minutes = pd.DataFrame(data_minutes) 85 | 86 | 87 | #%% 88 | # - Read in data for xG-model and get the coeficients dataframes 89 | "---------------------------------------------------------------------------" 90 | 91 | with open('../Json_files/xG_model_v2_All_except_Eng.json') as f: 92 | data_xG_model = json.load(f) 93 | 94 | # Create dataframes 95 | df_xG_model = pd.DataFrame(data_xG_model) 96 | 97 | # Call xG-m 98 | df_log_model_shots_coef, df_log_model_headers_coef, df_log_model_free_kicks_coef, log_model, log_model_headers, log_model_free_kicks = ff.xG_model(df_xG_model) 99 | 100 | 101 | #%% 102 | # - Create the dataframe of all KPI's 103 | "---------------------------------------------------------------------------" 104 | 105 | # Prepare the dataframe with the columns we need 106 | df_KPI_p90 
= pd.DataFrame(columns=['matchId', 107 | 'league', 108 | 'teamName', 109 | 'playerId', 110 | 'shortName', 111 | 'role', 112 | 'minutesPlayed', 113 | 'team_goals', 114 | 'team_conceded_goals', 115 | 'red_card', 116 | # KPI's from here 117 | 'goals', 118 | 'assists', 119 | 'passing%', 120 | 'completed_passes_p90', 121 | 'fouls_p90', 122 | 'aerial%', 123 | 'aerial_wins_p90', 124 | 'shots_p90', 125 | 'dribbles%', 126 | 'succesful_dribbles_p90', 127 | 'key_passes_p90', 128 | 'succesful_through_passes_p90', 129 | 'plus_minus', 130 | 'events_in_box_p90', 131 | 'passes_to_box_p90', 132 | 'creative_passes_p90', 133 | 'succesful_def_actions_p90', 134 | 'progressive_carries_p90', 135 | 'xG_p90', 136 | 'xG_tot', 137 | 'xG_shots', 138 | 'xG_headers', 139 | 'xG_free_kicks', 140 | 'xG_penalties']) 141 | 142 | # Prepare the dataframe with the columns we need 143 | df_KPI_tot = pd.DataFrame(columns=['matchId', 144 | 'league', 145 | 'teamName', 146 | 'playerId', 147 | 'shortName', 148 | 'role', 149 | 'minutesPlayed', 150 | 'team_goals', 151 | 'team_conceded_goals', 152 | 'red_card', 153 | # KPI's from here 154 | 'goals', 155 | 'assists', 156 | 'passing%', 157 | 'completed_passes', 158 | 'fouls', 159 | 'aerial%', 160 | 'aerial_wins', 161 | 'shots', 162 | 'dribbles%', 163 | 'succesful_dribbles', 164 | 'key_passes', 165 | 'succesful_through_passes', 166 | 'plus_minus', 167 | 'events_in_box', 168 | 'passes_to_box', 169 | 'creative_passes', 170 | 'succesful_def_actions', 171 | 'progressive_carries', 172 | 'xG_tot', 173 | 'xG_shots', 174 | 'xG_headers', 175 | 'xG_free_kicks', 176 | 'xG_penalties']) 177 | 178 | # Prepare the dataframe with the columns we need 179 | df_KPI_info = pd.DataFrame(columns=['matchId', 180 | 'league', 181 | 'playerId', 182 | 'shortName', 183 | # KPI-info's from here 184 | 'info_goals', 185 | 'info_assists', 186 | 'info_passing%', 187 | 'info_completed_passes', 188 | 'info_fouls', 189 | 'info_aerial%', 190 | 'info_aerial_wins', 191 | 'info_shots', 192 | 
'info_dribbles%', 193 | 'info_succesful_dribbles', 194 | 'info_key_passes', 195 | 'info_succesful_through_passes', 196 | 'info_plus_minus', 197 | 'info_events_in_box', 198 | 'info_passes_to_box', 199 | 'info_creative_passes', 200 | 'info_succesful_def_actions', 201 | 'info_progressive_carries', 202 | 'info_xG']) 203 | 204 | 205 | # Match id checkpoints 206 | loop_checkpoints = np.arange(0, 2100, 5) 207 | j = 0 208 | 209 | # Loop trough all matches 210 | for i, match in df_Europe_matches.iterrows(): 211 | 212 | # Find the events from match_i 213 | mask_match = df_Europe_events.matchId == match.wyId 214 | df_events_match = df_Europe_events.loc[mask_match] 215 | 216 | # List of all the players involved in match_i 217 | player_match_list = df_events_match['playerId'].unique().tolist() 218 | 219 | ################################################ 220 | # - Find home and away score 221 | "----------------------------------------------" 222 | 223 | # Find teamIds in the match 224 | teams_match_list = df_events_match['teamId'].unique().tolist() 225 | 226 | # Find the match data from df_matches 227 | mask_score = df_Europe_matches.wyId == match.wyId 228 | df_the_match = df_Europe_matches.loc[mask_score] 229 | team_data = df_the_match.teamsData 230 | 231 | ################################################ 232 | # - Get home and away teams and scores 233 | "----------------------------------------------" 234 | home_team_list = [] 235 | away_team_list = [] 236 | for i in range(2): 237 | team_data_i = team_data[0][str(teams_match_list[i])] 238 | team_lineup = team_data_i['formation']['lineup'] 239 | team_bench = team_data_i['formation']['bench'] 240 | 241 | # Get the lineup players 242 | for player in team_lineup: 243 | if team_data_i['side'] == "home": 244 | home_team_list.append(player['playerId']) 245 | elif team_data_i['side'] == "away": 246 | away_team_list.append(player['playerId']) 247 | else: 248 | print("Error: " + team_data_i['side']) 249 | 250 | # Get the bench players 
251 | for player in team_bench: 252 | if team_data_i['side'] == "home": 253 | home_team_list.append(player['playerId']) 254 | elif team_data_i['side'] == "away": 255 | away_team_list.append(player['playerId']) 256 | else: 257 | print("Error: " + team_data_i['side']) 258 | 259 | # Set home and away score 260 | if team_data_i['side'] == "home": 261 | home_team_score = team_data_i['score'] 262 | elif team_data_i['side'] == "away": 263 | away_team_score = team_data_i['score'] 264 | else: 265 | print("Error: " + team_data_i['score']) 266 | 267 | # End of finding home and away teams and score 268 | "----------------------------------------------" 269 | 270 | 271 | # Loop trough all players and get their average position and compute KPI's 272 | for player in player_match_list: 273 | 274 | # Find the minutes played, team and red card 275 | mask_minutes = (df_minutes.playerId == player) & (df_minutes.matchId == match.wyId) 276 | df_player_minutes = df_minutes.loc[mask_minutes] 277 | 278 | # Some players are not registered the subbed in but their events are registerd 279 | # If they are not subbed in correctly in Wyscout matches "df_player_minutes" 280 | # will be empty. Thus we check this here. 
281 | if len(df_player_minutes != 0): 282 | player_minutes = df_player_minutes['minutesPlayed'][0] 283 | player_in_min = df_player_minutes['player_in_min'][0] 284 | player_out_min = df_player_minutes['player_out_min'][0] 285 | player_team = df_player_minutes['teamId'][0] 286 | player_team_name = df_player_minutes['teamName'][0] 287 | red_card_bool = df_player_minutes['red_card'][0] 288 | 289 | # mask to find the given player-events 290 | mask_player = df_events_match.playerId == player 291 | 292 | # New dataframe with all events from 'player' in match 293 | df_events_player = df_events_match.loc[mask_player] 294 | 295 | # Get the role of the player 296 | position = df_events_player['Position'][0] 297 | 298 | # Get the league 299 | league = df_events_player["league"][0] 300 | 301 | # Get the shortName 302 | name = df_events_player['shortName'][0] 303 | 304 | # Get the team goal and goals conceded 305 | if (player in home_team_list): 306 | team_goals = home_team_score 307 | team_conceded_goals = away_team_score 308 | elif (player in away_team_list): 309 | team_goals = away_team_score 310 | team_conceded_goals = home_team_score 311 | else: 312 | print("Error: cant find player in list") 313 | 314 | 315 | ################################################ 316 | # - All function calls to compute kpi's 317 | "----------------------------------------------" 318 | 319 | # goals 320 | goals, goals_info = kpi.nr_goals(df_events_player, player_minutes) 321 | 322 | # assists 323 | assists, assists_info = kpi.nr_assists(df_events_player, player_minutes) 324 | 325 | # passing% 326 | pass_percent, pass_percent_info = kpi.percent_passes_completed(df_events_player, player_minutes) 327 | 328 | # passes_completed 329 | pass_comp, pass_comp_p90, pass_comp_info = kpi.passes_completed(df_events_player, player_minutes) 330 | 331 | # fouls 332 | fouls, fouls_p90, fouls_info = kpi.fouls(df_events_player, player_minutes) 333 | 334 | # aerials% 335 | aerials_percent, aerials_percent_info = 
kpi.percent_aerial_wins(df_events_player, player_minutes) 336 | 337 | # aerials_won 338 | aerial_wins, aerial_wins_p90, aerial_wins_info = kpi.aerials_won(df_events_player, player_minutes) 339 | 340 | # shots 341 | shots, shots_p90, shots_info = kpi.shots(df_events_player, player_minutes) 342 | 343 | # dribbles% 344 | dribbles_percent, dribbles_percent_info = kpi.percent_succesful_dribbles(df_events_player, player_minutes) 345 | 346 | # succesful_dribbles 347 | succesful_dribbles, succesful_dribbles_p90, succesful_dribbles_info = kpi.succesful_dribbles(df_events_player, player_minutes) 348 | 349 | # key_passes 350 | key_passes, key_passes_p90, key_passes_info = kpi.key_passes(df_events_player, player_minutes) 351 | 352 | # succesful_through_passes 353 | succesful_through_passes, succesful_through_passes_p90, succesful_through_passes_info = kpi.succesful_through_passes(df_events_player, player_minutes) 354 | 355 | # plus-minus 356 | plus_minus, plus_minus_info = kpi.plus_minus(df_events_match, player_team, player_minutes, player_in_min, player_out_min) 357 | 358 | # events_in_box 359 | events_in_box, events_in_box_p90, event_in_box_info = kpi.events_in_box(df_events_player, player_minutes) 360 | 361 | # passes_to_box 362 | passes_to_box, passes_to_box_p90, passes_to_box_info = kpi.passes_to_box(df_events_player, player_minutes) 363 | 364 | # creative_passes 365 | creative_passes, creative_passes_p90, creative_passes_info = kpi.creative_passes(df_events_player, player_minutes) 366 | 367 | # defensive_actions 368 | succesful_def_actions, succesful_def_actions_p90, succesful_def_actions_info = kpi.succesful_def_actions(df_events_player, player_minutes) 369 | 370 | # progressive_carries 371 | progressive_carries, progressive_carries_p90, progressive_carries_info = kpi.progressive_carries(df_events_player, player_minutes) 372 | 373 | # xG 374 | xG_tot, xG_tot_p90, xG_info, xG_shots, xG_headers, xG_free_kicks, xG_penalties = kpi.xG(df_events_player, player_minutes, 
df_log_model_shots_coef, df_log_model_headers_coef, df_log_model_free_kicks_coef) 375 | 376 | 377 | 378 | ######################################################## 379 | # - Add rows to df_KPI_p90, df_KPI_tot and df_KPI_info 380 | "------------------------------------------------------" 381 | 382 | # df_KPI_p90 383 | df_KPI_p90.loc[df_KPI_p90.shape[0]] = [match.wyId, league, player_team_name, player, name, 384 | position, player_minutes, team_goals, 385 | team_conceded_goals, red_card_bool, 386 | goals, 387 | assists, 388 | pass_percent, 389 | pass_comp_p90, 390 | fouls_p90, 391 | aerials_percent, 392 | aerial_wins_p90, 393 | shots_p90, 394 | dribbles_percent, 395 | succesful_dribbles_p90, 396 | key_passes_p90, 397 | succesful_through_passes_p90, 398 | plus_minus, 399 | events_in_box_p90, 400 | passes_to_box_p90, 401 | creative_passes_p90, 402 | succesful_def_actions_p90, 403 | progressive_carries_p90, 404 | xG_tot_p90, 405 | xG_tot, 406 | xG_shots, 407 | xG_headers, 408 | xG_free_kicks, 409 | xG_penalties] 410 | 411 | # df_KPI_tot 412 | df_KPI_tot.loc[df_KPI_tot.shape[0]] = [match.wyId, league, player_team_name, player, name, 413 | position, player_minutes, team_goals, 414 | team_conceded_goals, red_card_bool, 415 | goals, 416 | assists, 417 | pass_percent, 418 | pass_comp, 419 | fouls, 420 | aerials_percent, 421 | aerial_wins, 422 | shots, 423 | dribbles_percent, 424 | succesful_dribbles, 425 | key_passes, 426 | succesful_through_passes, 427 | plus_minus, 428 | events_in_box, 429 | passes_to_box, 430 | creative_passes, 431 | succesful_def_actions, 432 | progressive_carries, 433 | xG_tot, 434 | xG_shots, 435 | xG_headers, 436 | xG_free_kicks, 437 | xG_penalties] 438 | 439 | 440 | # df_KPI_info 441 | df_KPI_info.loc[df_KPI_info.shape[0]] = [match.wyId, league, player, name, 442 | goals_info, 443 | assists_info, 444 | pass_percent_info, 445 | pass_comp_info, 446 | fouls_info, 447 | aerials_percent_info, 448 | aerial_wins_info, 449 | shots_info, 450 | 
dribbles_percent_info, 451 | succesful_dribbles_info, 452 | key_passes_info, 453 | succesful_through_passes_info, 454 | plus_minus_info, 455 | event_in_box_info, 456 | passes_to_box_info, 457 | creative_passes_info, 458 | succesful_def_actions_info, 459 | progressive_carries_info, 460 | xG_info] 461 | 462 | 463 | if (j in loop_checkpoints): 464 | print(f"Number of matches with computed KPI's': {j}\n") 465 | 466 | j+=1 467 | 468 | 469 | #%% 470 | # - Save dataframes to json-files, uncommen which to save 471 | "---------------------------------------------------------------------------" 472 | df_KPI_p90.to_json("Json_files/KPI_per_90_All.json") 473 | df_KPI_tot.to_json("Json_files/KPI_tot_All.json") 474 | df_KPI_info.to_json("Json_files/KPI_info_All.json") 475 | 476 | 477 | -------------------------------------------------------------------------------- /create_KPI_dataframe_EDIT.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Sep 13 16:54:33 2021 5 | 6 | @author: emildanielsson & JakobEP 7 | 8 | Program description: 9 | Edit KPI-dtaframes created from create_KPI_dataframe.py 10 | Writes over changes to the same file 11 | 12 | """ 13 | 14 | # The basics 15 | import pandas as pd 16 | import numpy as np 17 | import json 18 | 19 | # Import KPI-funcion 20 | import KPI_functions as kpi 21 | 22 | 23 | #%% 24 | # - Create dataframes from the Wyscout data 25 | "---------------------------------------------------------------------------" 26 | 27 | # Create event dataframe 28 | #df_Europe_events = pd.read_json('Json_files/events_All.json', encoding="unicode_escape") #SLOWER 29 | with open('../Json_files/events_All.json') as f: 30 | data_Europe = json.load(f) 31 | 32 | df_Europe_events = pd.DataFrame(data_Europe) 33 | 34 | # Create match dataframes 35 | df_England_matches = pd.read_json('../../Wyscout/matches/matches_England.json', encoding="unicode_escape") 36 
| 37 | df_France_matches = pd.read_json('../../Wyscout/matches/matches_France.json', encoding="unicode_escape") 38 | 39 | df_Germany_matches = pd.read_json('../../Wyscout/matches/matches_Germany.json', encoding="unicode_escape") 40 | 41 | df_Italy_matches = pd.read_json('../../Wyscout/matches/matches_Italy.json', encoding="unicode_escape") 42 | 43 | df_Spain_matches = pd.read_json('../../Wyscout/matches/matches_Spain.json', encoding="unicode_escape") 44 | 45 | 46 | # Create players and teams dataframes 47 | df_players = pd.read_json("../../Wyscout/players.json", encoding="unicode_escape") 48 | df_teams = pd.read_json("../../Wyscout/teams.json", encoding="unicode_escape") 49 | 50 | 51 | #%% 52 | # - Merge matches dataframes from all leagues 53 | "---------------------------------------------------------------------------" 54 | 55 | frames_matches = [df_England_matches, df_France_matches, df_Germany_matches, 56 | df_Italy_matches, df_Spain_matches] 57 | 58 | df_Europe_matches = pd.concat(frames_matches, keys = ["England", "France", 59 | "Germany", "Italy", "Spain"]) 60 | 61 | 62 | #%% 63 | # - Read in minutes played data 64 | "---------------------------------------------------------------------------" 65 | 66 | with open('../Json_files/minutes_played_All.json') as f: 67 | data_minutes = json.load(f) 68 | 69 | df_minutes = pd.DataFrame(data_minutes) 70 | 71 | 72 | #%% 73 | # - Read in dataframes of all KPI's to edit 74 | "---------------------------------------------------------------------------" 75 | 76 | with open('../Json_files/new_KPI_tot_All.json') as f: 77 | data_kpi_tot = json.load(f) 78 | 79 | with open('../Json_files/new_KPI_per_90_All.json') as f: 80 | data_kpi_p90 = json.load(f) 81 | 82 | # with open('Json_files/KPI_info_All.json') as f: 83 | # data_kpi_info = json.load(f) 84 | 85 | df_KPI_tot = pd.DataFrame(data_kpi_tot) 86 | 87 | df_KPI_p90 = pd.DataFrame(data_kpi_p90) 88 | 89 | #df_KPI_info = pd.DataFrame(data_kpi_info) 90 | 91 | 92 | #%% 93 | # - Find 
number of own goals 94 | "---------------------------------------------------------------------------" 95 | 96 | # Df with all own goals 97 | df_own_goals = kpi.own_goals(df_Europe_events) 98 | 99 | 100 | #%% 101 | # - Loop to add additional KPIs 102 | "---------------------------------------------------------------------------" 103 | 104 | # Match id checkpoints 105 | loop_checkpoints = np.arange(0, 2100, 5) 106 | j = 0 107 | 108 | # Loop through all matches 109 | for i, match in df_Europe_matches.iterrows(): 110 | 111 | # Find the events from match_i 112 | mask_match = df_Europe_events.matchId == match.wyId 113 | df_events_match = df_Europe_events.loc[mask_match] 114 | 115 | # List of all the players involved in match_i 116 | player_match_list = df_events_match['playerId'].unique().tolist() 117 | 118 | # Loop trough all players and get their average position and compute KPI's 119 | for player in player_match_list: 120 | 121 | # Find the minutes played, team and red card 122 | mask_minutes = (df_minutes.playerId == player) & (df_minutes.matchId == match.wyId) 123 | df_player_minutes = df_minutes.loc[mask_minutes] 124 | 125 | # Some players are not registered the subbed in but their events are registerd 126 | # If they are not subbed in correctly in Wyscout matches "df_player_minutes" 127 | # will be empty. Thus we check this here. 
128 | if len(df_player_minutes != 0): 129 | player_minutes = df_player_minutes['minutesPlayed'][0] 130 | 131 | # mask to find the given player-events 132 | mask_player = df_events_match.playerId == player 133 | 134 | # New dataframe with all events from 'player' in match 135 | df_events_player = df_events_match.loc[mask_player] 136 | 137 | 138 | ################################################ 139 | # - Check after own goals from player in match 140 | "----------------------------------------------" 141 | 142 | # Initiate temp variable 143 | # own_goals_player = 0 144 | 145 | # # Read out any eventual own goals 146 | # mask_own_goals = (df_own_goals.playerId == player) & (df_own_goals.matchId == match.wyId) 147 | # df_own_goals_player = df_own_goals.loc[mask_own_goals] 148 | 149 | # # Check there were any own goals 150 | # if len(df_own_goals_player) != 0: 151 | # own_goals_player = len(df_own_goals_player) 152 | 153 | 154 | ################################################ 155 | # - All function calls to compute kpi's 156 | "----------------------------------------------" 157 | 158 | # danger_ball_loses 159 | #danger_ball_loses, danger_ball_loses_p90, danger_ball_loses_info = kpi.danger_ball_loses(df_events_player, player_minutes) 160 | 161 | # yellow_cards 162 | #yellow_cards, yellow_cards_info = kpi.yellow_cards(df_events_player) 163 | 164 | # percent_def_actions 165 | percent_def_actions, percent_def_actions_info = kpi.percent_def_actions(df_events_player, player_minutes) 166 | 167 | ######################################################## 168 | # - Add rows to df_KPI_p90, df_KPI_tot and df_KPI_info 169 | "------------------------------------------------------" 170 | 171 | # df_KPI_p90 172 | mask_insert1 = (df_KPI_p90.matchId == match.wyId) & (df_KPI_p90.playerId == player) 173 | #df_KPI_p90.loc[mask_insert1, 'own_goals'] = own_goals_player 174 | #df_KPI_p90.loc[mask_insert1, 'yellow_cards'] = yellow_cards 175 | #df_KPI_p90.loc[mask_insert1, 'danger_ball_loses'] 
= danger_ball_loses_p90 176 | df_KPI_p90.loc[mask_insert1, 'def_actions%'] = percent_def_actions 177 | 178 | # df_KPI_tot 179 | mask_insert2 = (df_KPI_tot.matchId == match.wyId) & (df_KPI_tot.playerId == player) 180 | #df_KPI_tot.loc[mask_insert2, 'own_goals'] = own_goals_player 181 | #df_KPI_tot.loc[mask_insert2, 'yellow_cards'] = yellow_cards 182 | #df_KPI_tot.loc[mask_insert2, 'danger_ball_loses'] = danger_ball_loses 183 | df_KPI_tot.loc[mask_insert2, 'def_actions%'] = percent_def_actions 184 | 185 | # df_KPI_info 186 | # mask_insert3 = (df_KPI_info.matchId) == match.wyId & (df_KPI_info.playerId == player) 187 | # df_KPI_info.loc[mask_insert3, 'yellow_cards'] = yellow_cards_info 188 | # df_KPI_info.loc[mask_insert3, 'danger_ball_loses'] = danger_ball_loses_info 189 | #df_KPI_info.loc[mask_insert3, 'def_actions%'] = percent_def_actions_info 190 | 191 | 192 | if (j in loop_checkpoints): 193 | print(f"Number of matches with computed KPI's': {j}\n") 194 | 195 | j+=1 196 | 197 | 198 | #%% 199 | # - Create the new columns team_xG_p90, opponents_xG, possesion, etc 200 | "---------------------------------------------------------------------------" 201 | # Find all unique matches 202 | list_matches = df_KPI_tot["matchId"].unique().tolist() 203 | 204 | for match in list_matches: 205 | 206 | # mask for the match to add team_xG 207 | mask_match = df_KPI_tot.matchId == match 208 | df_match = df_KPI_tot.loc[mask_match] 209 | 210 | # List of the team names 211 | list_teams = df_match["teamName"].unique().tolist() 212 | 213 | for team in list_teams: 214 | 215 | # Find the team KPI 216 | mask_team = df_match.teamName == team 217 | df_team = df_match.loc[mask_team] 218 | df_opponent = df_match.loc[~mask_team] 219 | 220 | # Find xG and shots 221 | # team_shots = df_team['shots'].sum() 222 | # opponent_shots = df_opponent['shots'].sum() 223 | team_xG = df_team["xG_tot"].sum() 224 | opponent_xG = df_opponent["xG_tot"].sum() 225 | team_passes = df_team['completed_passes'].sum() 226 | 
opponent_passes = df_opponent['completed_passes'].sum() 227 | 228 | tot_game_passes = team_passes + opponent_passes 229 | 230 | # Find approximate possesion 231 | team_possesion = team_passes / tot_game_passes 232 | opponent_possesion = opponent_passes / tot_game_passes 233 | 234 | # Find PossAdj defnesive actions 235 | for i, player in df_team.iterrows(): 236 | mask_player = ((df_KPI_tot.matchId == match) & (df_KPI_tot.playerId == player.playerId)) 237 | df_player = df_KPI_tot.loc[mask_player] 238 | def_actions = df_player.succesful_def_actions.values[0] 239 | p_adj_def_actions = def_actions / opponent_possesion 240 | df_KPI_tot.loc[mask_player, 'p_adj_succ_def_actions'] = p_adj_def_actions 241 | 242 | # Add to the KPI dataframe 243 | mask_add_xG = ((df_KPI_tot.matchId == match) & (df_KPI_tot.teamName == team)) 244 | df_KPI_tot.loc[mask_add_xG, 'team_xG'] = team_xG 245 | df_KPI_tot.loc[mask_add_xG, 'opponent_xG'] = opponent_xG 246 | df_KPI_tot.loc[mask_add_xG, 'team_possesion'] = team_possesion 247 | df_KPI_tot.loc[mask_add_xG, 'opponent_possesion'] = opponent_possesion 248 | # df_KPI_tot.loc[mask_add_xG, 'team_shots'] = team_shots 249 | # df_KPI_tot.loc[mask_add_xG, 'opponent_shots'] = opponent_shots 250 | 251 | 252 | #%% 253 | # - Save dataframes to json-files 254 | # - Note: Uncomment which to save 255 | "---------------------------------------------------------------------------" 256 | 257 | #df_KPI_p90.to_json("../Json_files/KPI_per_90_All.json") 258 | #df_KPI_tot.to_json("Json_files/KPI_tot_All.json") 259 | #df_KPI_info.to_json("Json_files/new_KPI_info_All.json") 260 | 261 | -------------------------------------------------------------------------------- /create_events_df_eu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Sep 10 12:04:25 2021 5 | 6 | @author: emildanielsson & JakobEP 7 | 8 | Program description: 9 | 1. 
# The basics
import pandas as pd
import numpy as np
import json

import fitting_functions as ff

# Leagues in the order they are concatenated; also used as the "league" key.
LEAGUES = ["England", "France", "Germany", "Italy", "Spain"]

#%%
# - Create dataframes from the Wyscout data
# ---------------------------------------------------------------------------

# Create event dataframes (one per league)
df_England_events = pd.read_json('../../Wyscout/events/events_England.json', encoding="unicode_escape")
df_France_events = pd.read_json('../../Wyscout/events/events_France.json', encoding="unicode_escape")
df_Germany_events = pd.read_json('../../Wyscout/events/events_Germany.json', encoding="unicode_escape")
df_Italy_events = pd.read_json('../../Wyscout/events/events_Italy.json', encoding="unicode_escape")
df_Spain_events = pd.read_json('../../Wyscout/events/events_Spain.json', encoding="unicode_escape")

# Create match dataframes (one per league)
df_England_matches = pd.read_json('../../Wyscout/matches/matches_England.json', encoding="unicode_escape")
df_France_matches = pd.read_json('../../Wyscout/matches/matches_France.json', encoding="unicode_escape")
df_Germany_matches = pd.read_json('../../Wyscout/matches/matches_Germany.json', encoding="unicode_escape")
df_Italy_matches = pd.read_json('../../Wyscout/matches/matches_Italy.json', encoding="unicode_escape")
df_Spain_matches = pd.read_json('../../Wyscout/matches/matches_Spain.json', encoding="unicode_escape")

# Create players and teams dataframes
df_players = pd.read_json("../../Wyscout/players.json", encoding="unicode_escape")
df_teams = pd.read_json("../../Wyscout/teams.json", encoding="unicode_escape")


#%%
# - Merge dataframes from all five leagues (England included)
# ---------------------------------------------------------------------------

frames_events = [df_England_events, df_France_events, df_Germany_events,
                 df_Italy_events, df_Spain_events]

# The concat key becomes the outer index level; move it into a "league" column.
df_Europe_events = pd.concat(frames_events, keys=LEAGUES)
df_Europe_events = df_Europe_events.reset_index(level=[0])
df_Europe_events = df_Europe_events.rename(columns={'level_0': "league"})

frames_matches = [df_England_matches, df_France_matches, df_Germany_matches,
                  df_Italy_matches, df_Spain_matches]

df_Europe_matches = pd.concat(frames_matches, keys=LEAGUES)
df_Europe_matches = df_Europe_matches.reset_index(level=[0])
df_Europe_matches = df_Europe_matches.rename(columns={'level_0': "league"})


#%%
# - Add shortName and position to df_Europe
# ---------------------------------------------------------------------------

# Filter out events with no playerId (0).
# .copy() so the column assignments below write to an independent frame
# instead of a slice of the concatenated one (avoids chained assignment).
mask_filter = df_Europe_events.playerId != 0
df_Europe_events = df_Europe_events[mask_filter].copy()

# Find unique player ids
eu_players = df_Europe_events["playerId"].unique().tolist()

# Player id checkpoints (progress is printed when the counter hits one)
loop_checkpoints = np.arange(0, 2080, 50)

# Loop through player list and add new column for name
for j, player in enumerate(eu_players):

    # Find player short name
    mask_player = df_players.wyId == player
    shortName = df_players.loc[mask_player, 'shortName'].values[0]

    # Write the name on every event of this player
    mask_events_player = df_Europe_events.playerId == player
    df_Europe_events.loc[mask_events_player, 'shortName'] = shortName

    if (j in loop_checkpoints):
        print(f"shortName added: {j}\n")

# Find all unique matches played
matchId_list = df_Europe_events['matchId'].unique().tolist()

# Match id checkpoints
loop_checkpoints = np.arange(0, 2080, 50)

# Loop through all matches
for j, match_i in enumerate(matchId_list):

    # Find the events from match_i
    mask_match = df_Europe_events.matchId == match_i
    df_match = df_Europe_events.loc[mask_match]

    # List of all the players involved in match_i
    player_match_list = df_match['playerId'].unique().tolist()

    # Loop through all players and get their average position
    for player in player_match_list:

        # All events from 'player' in match 'match_i'
        player_df = df_match.loc[df_match.playerId == player]

        # Starting coordinates (first entry of 'positions') of each event
        start_positions = [positions[0] for positions in player_df['positions']]

        # Get the mean positions
        y_mean = sum(start['y'] for start in start_positions) / len(start_positions)
        x_mean = sum(start['x'] for start in start_positions) / len(start_positions)

        # Get the Wyscout-determined role of the player
        mask_player2 = df_players.wyId == player
        position_wyscout = df_players.loc[mask_player2, 'role'].values[0]['name']

        # Call to function
        position = ff.decide_position(x_mean, y_mean, position_wyscout)

        # Add the position to the dataframe
        mask_add_position = (df_Europe_events.matchId == match_i) & (df_Europe_events.playerId == player)
        df_Europe_events.loc[mask_add_position, 'Position'] = position

    if (j in loop_checkpoints):
        print(f"Number of event-modified matches: {j}\n")


# Filter out events with goalkeepers
mask_gk = df_Europe_events.Position == "GK"
df_Europe_events = df_Europe_events[~mask_gk]


#%%
# - Save dataframe of Europe events to working directory
# ---------------------------------------------------------------------------

df_Europe_events.reset_index(inplace=True)
df_Europe_events.to_json("Json_files/events_All.json")

# Test to load in and store as dataframe
with open('Json_files/events_All.json') as f:
    data_Europe_new = json.load(f)

df_Europe_new = pd.DataFrame(data_Europe_new)
11 | 12 | """ 13 | 14 | # The basics 15 | import pandas as pd 16 | import numpy as np 17 | import json 18 | 19 | # Plotting 20 | import matplotlib.pyplot as plt 21 | from mplsoccer import FontManager 22 | from mplsoccer import Pitch, VerticalPitch 23 | 24 | # Statistical fitting of models 25 | import statsmodels.api as sm 26 | import statsmodels.formula.api as smf 27 | from sklearn import preprocessing 28 | from sklearn.preprocessing import MinMaxScaler 29 | from sklearn.preprocessing import RobustScaler 30 | from sklearn.model_selection import train_test_split 31 | from sklearn.linear_model import LogisticRegression 32 | 33 | # For tables 34 | from tabulate import tabulate 35 | 36 | 37 | #%% 38 | # - Load Fonts 39 | "---------------------------------------------------------------------------" 40 | 41 | URL1 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/' 42 | 'fonts/SourceSerifPro-Regular.ttf?raw=true') 43 | serif_regular = FontManager(URL1) 44 | URL2 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/' 45 | 'fonts/SourceSerifPro-ExtraLight.ttf?raw=true') 46 | serif_extra_light = FontManager(URL2) 47 | URL3 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/fonts/' 48 | 'SourceSerifPro-Bold.ttf?raw=true') 49 | serif_bold = FontManager(URL3) 50 | 51 | 52 | #%% 53 | # - Functions 54 | "---------------------------------------------------------------------------" 55 | 56 | """ Function which takes in dataframe of shots and outputs 57 | dataframes which contains information about the logistic 58 | regression models for the different shot-types. 
def _fit_xg_logistic(df_shot_type):
    """Fit one logistic-regression xG model for a single shot type.

    Input:
        df_shot_type - dataframe with the columns 'goal', 'distance'
                       and 'angle_rad' for one type of attempt

    Output:
        df_coef   - dataframe with one 'coef' column: the feature
                    coefficients sorted descending, followed by an
                    'intercept' row
        log_model - the fitted LogisticRegression instance
    """
    df_train_set = df_shot_type[['goal', 'distance', 'angle_rad']].copy()

    # Adding distance squared to df
    df_train_set = df_train_set.assign(distance_sq=df_train_set['distance'] ** 2)

    # y(x) where y = shot result, x = distance, angle, distance squared.
    # Only the training part of the 80/20 split is used by this function.
    x_train, _, y_train, _ = train_test_split(
        df_train_set.drop('goal', axis=1),
        df_train_set['goal'],
        test_size=0.20,
        random_state=10)

    # Create instance and fit model with training data
    log_model = LogisticRegression()
    log_model.fit(x_train, y_train)

    # Read out coefficient(s) into df
    df_coef = pd.DataFrame(log_model.coef_[0],
                           x_train.columns,
                           columns=['coef']).sort_values(by='coef', ascending=False)

    # Add the intercept as the last row
    df_coef.loc['intercept'] = log_model.intercept_[0]
    print(df_coef)

    return df_coef, log_model


def xG_model(df_xG_model):
    """Fit separate logistic xG models for shots, headers and free kicks.

    Regression model variables:
        Dependent variable: goal
        Independent variables: angle (radians), distance, distance squared

    Input:
        df_xG_model - dataframe of attempts with the columns 'header',
                      'free_kick', 'goal', 'distance' and 'angle_rad'.
                      Rows with header == 1 go to the header model, rows
                      with free_kick == 1 to the free-kick model, and rows
                      that are neither to the shot model.

    Output:
        Tuple of (shots coefficients, headers coefficients,
        free-kick coefficients, shots model, headers model,
        free-kick model). Each coefficient dataframe is also printed.
    """
    mask_headers = df_xG_model.header == 1
    mask_free_kicks = df_xG_model.free_kick == 1

    df_xG_shots = df_xG_model[(~mask_headers) & (~mask_free_kicks)]
    df_xG_headers = df_xG_model[mask_headers]
    df_xG_free_kicks = df_xG_model[mask_free_kicks]

    # Same fitting procedure for every attempt type (shots, headers, free kicks)
    df_log_model_shots_coef, log_model = _fit_xg_logistic(df_xG_shots)
    df_log_model_headers_coef, log_model_headers = _fit_xg_logistic(df_xG_headers)
    df_log_model_free_kicks_coef, log_model_free_kicks = _fit_xg_logistic(df_xG_free_kicks)

    return (df_log_model_shots_coef, df_log_model_headers_coef,
            df_log_model_free_kicks_coef, log_model, log_model_headers,
            log_model_free_kicks)

# - End function
#############################################################################


# Wyscout role name -> (left label, right label, central label)
_OUTFIELD_POSITION_LABELS = {
    "Defender": ("LB", "RB", "CB"),
    "Midfielder": ("LM", "RM", "CM"),
    "Forward": ("LW", "RW", "ST"),
}


def decide_position(x, y, position):
    """Determine the position label of a player.

    Inputs:
        x        - average x-coordinate (not used by the decision)
        y        - average y-coordinate; below 30 gives the left label,
                   above 70 the right label, anything else the central one
        position - position taken from the Wyscout "role"-column
                   ("Defender", "Midfielder", "Forward" or "Goalkeeper")

    Output:
        One of LB/RB/CB, LM/RM/CM, LW/RW/ST, "GK" for goalkeepers,
        or "?" for any other role value.
    """
    if isinstance(position, str) and position == "Goalkeeper":
        return "GK"

    labels = _OUTFIELD_POSITION_LABELS.get(position) if isinstance(position, str) else None
    if labels is None:
        return "?"

    left, right, central = labels
    if y < 30:
        return left
    if y > 70:
        return right
    return central

# - End function
#############################################################################
def _fit_kpi_ols(KPI_train, scaler, list_kpi_fitting, dep_var, position, min_minutes):
    """Filter, normalise and fit one OLS model for the current KPI list.

    list_kpi_fitting holds the KPI names with the dependent variable last.
    Returns the fitted statsmodels result and the p-values of the KPIs
    (the 'const' entry is dropped).
    """
    kpi_columns = list_kpi_fitting[:-1]  # Dep. var last

    # .copy() so the scaled values are written to an independent frame
    df_train_filtered = filter_dataframe(KPI_train, position, list_kpi_fitting, min_minutes, 1).copy()

    # Normalise
    df_train_filtered[kpi_columns] = scaler.fit_transform(df_train_filtered[kpi_columns])

    # Linear regression model
    X = sm.add_constant(df_train_filtered[kpi_columns])
    y = df_train_filtered[dep_var]
    fitted_model = sm.OLS(y, X).fit()

    return fitted_model, fitted_model.pvalues.drop('const', axis=0)


def KPI_fitting(KPI_train, scaler, list_kpi, dep_var, position, min_minutes):
    """Fit a linear regression of KPIs against dep_var for a position.

    Iteratively removes one independent variable at a time, the one with
    the highest p-value, until every remaining KPI is considered
    statistically significant (p-value <= 0.05). A KPI whose p-value is
    NaN counts as insignificant and is removed first.

    Regression model variables:
        Dependent variable: dep_var (team_xG or opponent_xG)
        Independent variables: KPI values

    Input:
        KPI_train   - dataframe of KPIs that can be used as training data
        scaler      - chosen scaler method for the normalization of KPIs
                      (must provide fit_transform)
        list_kpi    - list of KPIs used for training data (not modified)
        dep_var     - model dependent variable
        position    - position(s) to find model for, passed on to
                      filter_dataframe
        min_minutes - minimum minutes played for a player in a match
                      to be included in the regression model training data

    Output:
        model_coef       - linear regression model coefficients
        r_squared        - resulting r-squared of the model
        list_kpi_fitting - list of statistically significant KPIs with
                           dep_var appended as the last element

    Raises:
        ValueError - if every KPI is removed, i.e. none is significant
    """
    list_kpi_fitting = list_kpi.copy()

    # Append the dependent variable
    list_kpi_fitting.append(dep_var)

    # First linear regression model for this position
    test_model, model_pvalues = _fit_kpi_ols(KPI_train, scaler, list_kpi_fitting,
                                             dep_var, position, min_minutes)

    # Refit, taking out the KPI with the highest p-value one at a time.
    # NaN <= 0.05 is False, so NaN p-values keep the loop going.
    while not (model_pvalues <= 0.05).all():

        # NaN is ranked as the worst possible p-value
        highest_kpi = model_pvalues.fillna(float("inf")).idxmax()

        # New list of KPIs
        list_kpi_fitting.remove(highest_kpi)

        # Only the dependent variable is left: nothing to fit against
        if len(list_kpi_fitting) == 1:
            raise ValueError(
                f"No statistically significant KPI left for the position {position}")

        test_model, model_pvalues = _fit_kpi_ols(KPI_train, scaler, list_kpi_fitting,
                                                 dep_var, position, min_minutes)

    # Print model after the tuning
    print(f"Model AFTER tuning for the position {position}: \n")
    print(test_model.summary())
    model_coef = test_model.params
    r_squared = test_model.rsquared

    return model_coef, r_squared, list_kpi_fitting

# - End function
#############################################################################
def compute_fitting_ratings(player, model_coef, list_kpi_fitting):
    """Compute the linear-regression fitted result for a player in a match.

    Input:
        player           - KPIs for a player in a given match
                           (indexable by KPI name)
        model_coef       - regression model coefficients, indexable by
                           KPI name and by 'const'
        list_kpi_fitting - list of KPIs the regression model gave as
                           statistically significant; the last element
                           (the dependent variable) is skipped

    Output:
        result - fitted value (xG_team or xG_opponent) for that player
                 in that match
    """
    result = sum(model_coef[kpi] * player[kpi] for kpi in list_kpi_fitting[:-1])

    return result + model_coef['const']

# - End function
#############################################################################


# Default weight of every KPI in the event-based rating
_DEFAULT_EVENT_WEIGHTS = {
    'plus_minus': 0.2,
    'goals': 1,
    'assists': 0.7,
    'own_goals': -0.5,
    'yellow_cards': -0.05,
    'danger_ball_loses': -0.2,
    'xG_tot': -0.1,
    'red_card': -1,
    'aerial%': 0.1,
    'def_actions%': 0.1,
    'p_adj_succ_def_actions': 0.1,
    'succesful_dribbles': 0.05,
    'creative_passes': 0.1,
    'progressive_carries': 0.05,
}

# (position group, weights that replace the defaults for that group)
_POSITION_WEIGHT_OVERRIDES = (
    (frozenset({'LB', 'RB'}), {'def_actions%': 0.2,
                               'progressive_carries': 0.15}),
    (frozenset({'CB'}), {'aerial%': 0.3,
                         'def_actions%': 0.8,
                         'p_adj_succ_def_actions': 0.6}),
    (frozenset({'LM', 'RM'}), {'aerial%': 0.05,
                               'def_actions%': 0.05,
                               'creative_passes': 0.2,
                               'progressive_carries': 0.1,
                               'succesful_dribbles': 0.1}),
    (frozenset({'CM'}), {'creative_passes': 0.3,
                         'succesful_dribbles': 0.1}),
    (frozenset({'LW', 'RW'}), {'aerial%': 0.05,
                               'def_actions%': 0.05,
                               'creative_passes': 0.6,
                               'progressive_carries': 0.3,
                               'succesful_dribbles': 0.4,
                               'p_adj_succ_def_actions': 0.05}),
    (frozenset({'ST'}), {'def_actions%': 0,
                         'p_adj_succ_def_actions': 0}),
)


def compute_events_rating(player, position, df_KPI):
    """Compute the so-called "event-based rating" for a player in a match.

    (Could look over this code and possibly remove df_KPI as input)

    Input:
        player   - information about the player; must expose .matchId
                   and .playerId
        position - position group of the player. A list such as
                   ['LB', 'RB'] or ['CB'] works as before; a tuple/set,
                   a single position string ('CB') or a part of a group
                   (['LB']) is now matched to its group as well.
        df_KPI   - dataframe of KPIs with 'matchId', 'playerId' and one
                   column per weighted KPI

    Output:
        event_rating - weighted sum of the player's KPI values for the
                       match, divided by 20. Prints "Not a valid position"
                       and uses the default weights when the position
                       matches no group.
    """
    dict_weights = dict(_DEFAULT_EVENT_WEIGHTS)

    # Normalise the position argument to a set of position names
    try:
        group = frozenset([position]) if isinstance(position, str) else frozenset(position)
    except TypeError:
        group = frozenset()  # e.g. None: handled as an invalid position

    # Set weight for the different positions
    for members, overrides in _POSITION_WEIGHT_OVERRIDES:
        if group and group <= members:
            # update() keeps the key order of the defaults
            dict_weights.update(overrides)
            break
    else:
        print("Not a valid position")

    # Find the KPI row of this player in this match
    mask_match = ((df_KPI['matchId'] == player.matchId) & (df_KPI['playerId'] == player.playerId))
    df_the_match = df_KPI.loc[mask_match]

    # Sum the event rating
    event_rating = 0
    for weight_name, weight in dict_weights.items():
        value = df_the_match[weight_name].values[0]
        event_rating += (value * weight)

    event_rating = event_rating / 20

    return event_rating

# - End function
#############################################################################
def _percentile_rating(value, percentiles):
    """Translate a raw rating into its percentile-based score.

    percentiles is a Series whose values are rating thresholds and whose
    index is multiplied by 5 to give the score. The score comes from the
    last threshold (in Series order) that value is strictly greater than.
    A value below the first threshold gives 0.1. The same 0.1 floor is
    used when no threshold is exceeded (value equal to the first
    threshold, or NaN), so a result from another row is never reused.
    """
    rating = 0.1
    if value < percentiles.values[0]:
        return rating

    for percentile in percentiles.values:
        if value > percentile:
            rating = round(percentiles[percentiles == percentile].index[0] * 5, 1)

    return rating


def create_rating_dataframe(df_ratings, df_KPI, df_KPI_test, percentiles_fit, percentiles_events, df_matches):
    """Add the final rating and match info to df_ratings and df_KPI_test.

    Finds the percentile ranks of the regression-based rating and the
    event-based rating, sums those two and stores the sum as
    "final_rating".

    (This function might need some improvement, example: remove df_KPI.
    df_KPI and df_KPI_test are included mostly for trial and error
    purposes during the development.)

    Input:
        df_ratings  - dataframe of fitting and event rating results with
                      the columns 'matchId', 'playerId', 'tot_fit_rating',
                      'match_events_rating', 'fitting_rating_off' and
                      'fitting_rating_def'. Modified in place: 'position',
                      'match_info', 'final_rating' and 'gameweek' are added.
        df_KPI      - dataframe of KPIs for both training and test data;
                      provides the 'role' of each player-match
        df_KPI_test - dataframe of KPIs for the test data. Modified in
                      place with the rating columns, 'match_info' and
                      'gameweek'.
        percentiles_fit    - percentile values for the regression-based rating
        percentiles_events - percentile values for the event-based rating
        df_matches  - Wyscout matches dataframe ('wyId', 'label',
                      'gameweek') used for adding info

    Output:
        None
    """
    for i, player in df_ratings.iterrows():
        # NOTE(review): this mask is built on df_KPI but is also used to index
        # df_KPI_test below; that presumably relies on df_KPI_test sharing
        # df_KPI's index - confirm against the caller.
        mask_match = ((df_KPI['matchId'] == player.matchId) & (df_KPI['playerId'] == player.playerId))

        tot_fit_rating = df_ratings.loc[i, 'tot_fit_rating']
        match_events_rating = df_ratings.loc[i, 'match_events_rating']
        fitting_rating_off = df_ratings.loc[i, 'fitting_rating_off']
        fitting_rating_def = df_ratings.loc[i, 'fitting_rating_def']

        # Percentile rank of the regression-based and the event-based rating
        final_fit_rating = _percentile_rating(tot_fit_rating, percentiles_fit)
        final_event_rating = _percentile_rating(match_events_rating, percentiles_events)

        # Sum the regression-based rating and event-based rating
        final_rating = final_fit_rating + final_event_rating

        # Find the match info to easier look up the rating elsewhere
        the_match = df_matches.loc[df_matches['wyId'] == player.matchId]
        match_info = the_match.label.values[0]
        gameweek = the_match.gameweek.values[0]

        # Add the final rating and info to the ratings-df
        df_ratings.loc[i, 'position'] = df_KPI.loc[mask_match, 'role'].values[0]
        df_ratings.loc[i, 'match_info'] = match_info
        df_ratings.loc[i, 'final_rating'] = final_rating
        df_ratings.loc[i, 'gameweek'] = gameweek

        # Mirror the ratings and info into the test-df
        df_KPI_test.loc[mask_match, 'fitting_rating_off'] = fitting_rating_off
        df_KPI_test.loc[mask_match, 'fitting_rating_def'] = fitting_rating_def
        df_KPI_test.loc[mask_match, 'tot_fit_rating'] = tot_fit_rating
        df_KPI_test.loc[mask_match, 'match_events_rating'] = match_events_rating
        df_KPI_test.loc[mask_match, 'final_rating'] = final_rating
        df_KPI_test.loc[mask_match, 'match_info'] = match_info
        df_KPI_test.loc[mask_match, 'gameweek'] = gameweek

# - End function
#############################################################################
def filter_dataframe(df_KPI, positions, list_kpi, min_minutes, min_matches):
    """ Function which filters the KPI dataframe on position, minutes
        played per match and number of matches played.

    Input:
        df_KPI - Dataframe with information about players' KPIs
                 from x number of games.

        positions - positions (values of the "role" column) to filter for

        list_kpi - selected KPI columns to include in the returned dataframe

        min_minutes - a match only counts when the player played more
                      than this many minutes in it

        min_matches - minimum number of such matches a player needs

    Output:
        df_pos_final - Filtered dataframe with the KPI columns plus
                       'playerId', 'shortName', 'teamName' and 'matchId'

    """
    # Rows for the chosen positions, then only the matches with enough minutes
    in_position = df_KPI.loc[df_KPI.role.isin(positions)]
    df_pos = in_position.loc[in_position.minutesPlayed > min_minutes]

    # Players that have at least "min_matches" of the remaining matches
    regulars = [
        player_id
        for player_id in df_pos['playerId'].unique().tolist()
        if len(df_pos.loc[df_pos.playerId == player_id]) >= min_matches
    ]

    # Columns to return: the KPIs plus the identifying columns
    # (work on a copy so the caller's list is left untouched)
    keep_cols = list_kpi.copy()
    keep_cols.extend(['playerId', 'shortName', 'teamName', 'matchId'])

    selected = df_pos.loc[df_pos.playerId.isin(regulars)]
    return selected[df_pos.columns.intersection(keep_cols)]

# - End function
#############################################################################
# Label anchor [x, y] for each position, as seen from the home team
_PITCH_POSITIONS = {
    'LB': [10, 12],
    'LWB': [17, 12],
    'LCB': [3, 30],
    'CB': [2, 50],
    'RB': [10, 88],
    'RWB': [17, 88],
    'RCB': [3, 70],
    'LM': [28, 12],
    'RM': [28, 88],
    'LCM': [20, 35],
    'CM': [17, 50],
    'RCM': [20, 65],
    'CAM': [38, 50],
    'LW': [47, 25],
    'RW': [47, 75],
    'ST': [49, 50],
    'LST': [49, 40],
    'RST': [49, 60],
}

# Team colors
_TEAM_COLORS = {
    'Huddersfield Town FC': "#0E63AD",
    'Manchester United FC': '#DA291C',
    'Tottenham Hotspur FC': '#132257',
    'Newcastle United FC': '#241F20',
    'Stoke City FC': '#E03A3E',
    'Southampton FC': '#D71920',
    'Everton FC': '#003399',
    'Leicester City FC': '#003090',
    'Crystal Palace FC': '#1B458F',
    'West Ham United FC': '#7A263A',
    'Burnley FC': '#6C1D45',
    'Swansea City AFC': '#121212',
    'West Bromwich Albion FC': '#122F67',
    'AFC Bournemouth': '#DA291C',
    'Brighton & Hove Albion FC': '#0057B8',
    'Watford FC': '#FBEE23',
    'Liverpool FC': '#C8102E',
    'Chelsea FC': '#034694',
    'Manchester City FC': '#6CABDD',
    'Arsenal FC': '#EF0107',
}

# Used for teams that are missing from _TEAM_COLORS
_FALLBACK_TEAM_COLOR = '#333333'

# Positions whose labels are drawn on the opposite side of the anchor
_ATTACKERS = ('LW', 'CAM', 'RW', 'ST', 'LST', 'RST')

_TEXT_SIZE = 10
_ALPHA_SCALING = 13

# Vertical offset that leaves room for the rating box
_BOX_ADJUSTMENT = 5

_BENCH_Y = 110


def _two_word_name(player):
    """Return at most the first two words of the name, joined by one space."""
    words = player.split()
    if len(words) <= 1:
        return player
    return " ".join(words[:2])


def _rating_box(team_color, rating):
    """Build the bbox properties; the box gets more opaque the higher the rating."""
    # matplotlib only accepts alpha within [0, 1]
    alpha = min(max(rating / _ALPHA_SCALING, 0.0), 1.0)
    return dict(boxstyle='round', facecolor=team_color, alpha=alpha)


def _player_rating_and_color(df_final_rating, player):
    """Look up the final rating and the team color of the named player."""
    mask_player = df_final_rating.shortName == player
    rating = df_final_rating.loc[mask_player, 'final_rating'].values[0]
    team = df_final_rating.loc[mask_player, 'teamName'].values[0]
    return rating, _TEAM_COLORS.get(team, _FALLBACK_TEAM_COLOR)


def _draw_lineup(ax, df_final_rating, lineup, mirrored):
    """Place name and rating box of every lineup player at his position.

    "mirrored" is True for the away team, which is drawn on the opposite
    half of the pitch.
    """
    for player in lineup:
        mask_player = df_final_rating.shortName == player
        position = df_final_rating.loc[mask_player, 'position'].values[0]
        rating, team_color = _player_rating_and_color(df_final_rating, player)

        x, y = _PITCH_POSITIONS[position]
        is_attacker = position in _ATTACKERS
        if mirrored:
            x, y = 100 - x, 100 - y
            # Away team: right aligned by default, attackers left aligned
            right_aligned = not is_attacker
        else:
            # Home team: left aligned by default, attackers right aligned
            right_aligned = is_attacker
        y += _BOX_ADJUSTMENT

        alignment = "right" if right_aligned else "left"
        box_addition = -3 if right_aligned else 3

        # place a text box with rating
        ax.text(x + box_addition, y - 5, str(round(rating, 1)), ha=alignment,
                fontsize=_TEXT_SIZE, fontproperties=serif_bold.prop,
                bbox=_rating_box(team_color, rating))
        # place the name of the player
        ax.text(x, y, _two_word_name(player), ha=alignment, fontsize=_TEXT_SIZE,
                color=team_color, fontproperties=serif_bold.prop)


def _draw_bench(ax, df_final_rating, bench, ranked_players, start_x, step_x):
    """Place the bench players that got a rating in a row next to the pitch."""
    bench_x = start_x
    for player in bench:
        # Only bench players that actually played are in the ranking
        if player not in ranked_players:
            continue

        rating, team_color = _player_rating_and_color(df_final_rating, player)

        # place a text box with rating
        ax.text(bench_x + 5, _BENCH_Y - 5, str(round(rating, 1)), ha="center",
                fontsize=_TEXT_SIZE, fontproperties=serif_bold.prop,
                bbox=_rating_box(team_color, rating))
        # place name of the benched player
        ax.text(bench_x, _BENCH_Y, _two_word_name(player), ha="left",
                fontsize=_TEXT_SIZE, color=team_color,
                fontproperties=serif_regular.prop)

        # Move on to the next slot only when a player was drawn
        bench_x += step_x


def plot_pitch_ratings(df_final_rating, home_team_lineup, home_team_bench, away_team_lineup, away_team_bench):
    """ Function which plots the final ratings from a match (including subs
        that are present in the ratings dataframe).

    Input:
        df_final_rating - dataframe of all the players ratings from the match
                          (columns used: shortName, position, final_rating,
                          teamName, match_info)
        home_team_lineup - Wyscout shortName of players in home team lineup
        home_team_bench - Wyscout shortName of players on home team bench
        away_team_lineup - Wyscout shortName of players in away team lineup
        away_team_bench - Wyscout shortName of players on away team bench

    Output:
        None (Nice looking plot)

    """
    pitch = Pitch(pitch_type="wyscout")
    _, ax = pitch.draw(figsize=(7, 15))

    # Match label as title
    match_result = df_final_rating.match_info.values[0]
    ax.text(50, -5, match_result, ha="center", fontsize=16, fontproperties=serif_bold.prop)

    # Players that have a rating
    ranked_players = set(df_final_rating['shortName'].tolist())

    # Home team
    _draw_lineup(ax, df_final_rating, home_team_lineup, mirrored=False)
    _draw_bench(ax, df_final_rating, home_team_bench, ranked_players, start_x=-2, step_x=20)

    # Away team
    _draw_lineup(ax, df_final_rating, away_team_lineup, mirrored=True)
    _draw_bench(ax, df_final_rating, away_team_bench, ranked_players, start_x=50, step_x=15)


# --------------------------------------------------------------------------------
# /minutes_played.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 21 12:44:56 2021

@author: emildanielsson

Program description:
    Computes how many minutes each player has played in each game
    from the given event and matches data set.

    Creates and saves a dataframe with the following columns:
        playerId - playerId from Wyscout
        shortName - shortName from Wyscout
        matchId - matchId from Wyscout data
        teamId - teamId from Wyscout data
        teamName - Official team name from Wyscout data
        player_in_min - the minute of the match the player started playing
        player_out_min - the minute of the match the player stopped playing
        minutesPlayed - Minutes played in the given game
        red_card - boolean to show if the player got a red card that game
                   (1 = red card, 0 = no red card)

"""

# The basics
import pandas as pd
import numpy as np
import json


def _at_least_one_minute(minutes):
    """Credit a player that entered the pitch with at least one minute."""
    return minutes if minutes > 0 else 1


#############################################################################
# - Create dataframes from the Wyscout data
#############################################################################

# Create event dataframe
#df_Europe_events = pd.read_json('Json_files/events_All.json', encoding="unicode_escape") #SLOWER
with open('../Json_files/events_All.json') as f:
    data_Europe = json.load(f)

df_Europe_events = pd.DataFrame(data_Europe)

# Create match dataframes
df_England_matches = pd.read_json('../../Wyscout/matches/matches_England.json', encoding="unicode_escape")

df_France_matches = pd.read_json('../../Wyscout/matches/matches_France.json', encoding="unicode_escape")

df_Germany_matches = pd.read_json('../../Wyscout/matches/matches_Germany.json', encoding="unicode_escape")

df_Italy_matches = pd.read_json('../../Wyscout/matches/matches_Italy.json', encoding="unicode_escape")

df_Spain_matches = pd.read_json('../../Wyscout/matches/matches_Spain.json', encoding="unicode_escape")


# Create players and teams dataframes
df_players = pd.read_json("../../Wyscout/players.json", encoding="unicode_escape")
df_teams = pd.read_json("../../Wyscout/teams.json", encoding="unicode_escape")

# Lookup tables so names are not searched for in the dataframes per player
short_names = df_players.drop_duplicates('wyId').set_index('wyId')['shortName'].to_dict()
team_names = df_teams.drop_duplicates('wyId').set_index('wyId')['officialName'].to_dict()  # Could change officialName -> name ??


#############################################################################
# - Merge matches dataframes from all leagues
#############################################################################

frames_matches = [df_England_matches, df_France_matches, df_Germany_matches,
                  df_Italy_matches, df_Spain_matches]

df_Europe_matches = pd.concat(frames_matches, keys=["England", "France",
                                                    "Germany", "Italy", "Spain"])


#############################################################################
# - Creating the dataframe of full playing time for each match
#############################################################################

# Match id checkpoints
loop_checkpoints = np.arange(0, 2080, 50)

# Time (seconds) of the last second-half event of every match, found in one
# pass instead of filtering all events once per match
df_second_half = df_Europe_events.loc[df_Europe_events.matchPeriod == "2H"]
last_event_sec = df_second_half.groupby('matchId')['eventSec'].max()

# Loop through all matches
fulltime_rows = []
for j, (_, match) in enumerate(df_Europe_matches.iterrows()):

    # Time of second half in seconds (NaN if the match has no such events)
    fulltime_sec = last_event_sec.get(match['wyId'], np.nan)

    # Convert to minutes
    fulltime_min = 45 + round(fulltime_sec / 60)

    # Add match and full time (minutes)
    fulltime_rows.append((match.wyId, fulltime_min))

    if (j in loop_checkpoints):
        print(f"Number of matches checked for fulltimes: {j}\n")

df_matches_fulltime = pd.DataFrame(fulltime_rows, columns=['matchId', 'matchDuration'])
match_durations = dict(fulltime_rows)


#############################################################################
# - Creating the dataframe of minutes played for each player in each game
#############################################################################

# Rows are collected in a list and turned into a dataframe once at the end;
# growing a dataframe one row at a time copies it on every insert
minutes_rows = []

# Loop through all matches
for j, (_, match) in enumerate(df_Europe_matches.iterrows()):

    # Get match Id
    matchId = match['wyId']

    # Get full match length
    fulltime_min = match_durations[matchId]

    # Lineups and substitutions are nested in teamsData (keyed by team id)
    for team in match['teamsData'].values():

        # list of the lineup
        lineup = team['formation']['lineup']

        # list of the substitutions
        substitutions = team['formation']['substitutions']

        # Get the team id and name
        teamId = team['teamId']
        teamName = team_names[teamId]

        # Valid substitutions keyed by the player going out. Keeping the whole
        # substitution (and not a position in a filtered list) makes sure the
        # minute that is read belongs to the same substitution.
        subs_by_player_out = {}
        if (substitutions != "null"):
            for sub in substitutions:
                # "Handle" the case when the sub is badly registered
                if sub['playerIn'] != 0 and sub['playerOut'] != 0:
                    subs_by_player_out.setdefault(sub['playerOut'], sub)
                # With this solution some players will have played more minutes
                # than they actually played. But it is not that many matches
                # so I think we are fine with it.

        # Loop through all players in the lineup and get their minutes played
        for player in lineup:

            playerId = player['playerId']
            shortName = short_names[playerId]

            first_sub = subs_by_player_out.get(playerId)

            # The player played for the whole game
            if first_sub is None:
                minutes_rows.append([playerId, shortName, matchId, teamId, teamName,
                                     0, fulltime_min, fulltime_min, 0])
                continue

            # The player was subbed out
            sub_minute = first_sub['minute']
            sub_in_id = first_sub['playerIn']
            shortName_sub = short_names[sub_in_id]
            minutes_rows.append([playerId, shortName, matchId, teamId, teamName,
                                 0, sub_minute, sub_minute, 0])

            second_sub = subs_by_player_out.get(sub_in_id)

            # Normal substitution: the subbed in player finished the game
            if second_sub is None:
                minutes_rows.append([sub_in_id, shortName_sub, matchId, teamId, teamName,
                                     sub_minute, fulltime_min,
                                     _at_least_one_minute(fulltime_min - sub_minute), 0])
                continue

            # The subbed in player was also subbed out (injury for example)
            sub_minute2 = second_sub['minute']
            sub_in_id2 = second_sub['playerIn']
            shortName_sub2 = short_names[sub_in_id2]

            minutes_rows.append([sub_in_id, shortName_sub, matchId, teamId, teamName,
                                 sub_minute, sub_minute2,
                                 _at_least_one_minute(sub_minute2 - sub_minute), 0])
            minutes_rows.append([sub_in_id2, shortName_sub2, matchId, teamId, teamName,
                                 sub_minute2, fulltime_min,
                                 _at_least_one_minute(fulltime_min - sub_minute2), 0])

    if (j in loop_checkpoints):
        print(f"Number of matches checked for minutes: {j}\n")

df_minutes_played = pd.DataFrame(minutes_rows,
                                 columns=['playerId', 'shortName',
                                          'matchId', 'teamId', 'teamName',
                                          'player_in_min', 'player_out_min',
                                          'minutesPlayed', 'red_card'])


#############################################################################
# - Adjust for red cards
#############################################################################

# Filter out the fouls, assumed that red cards only exists as Foul-event
mask_fouls = df_Europe_events.eventName == "Foul"
df_fouls = df_Europe_events.loc[mask_fouls]

# tag 1701 == red card, tag 1703 == second yellow card
red_card_tags = {1701, 1703}

# Loop through the fouls to find matches and players with red cards
for _, foul_i in df_fouls.iterrows():

    foul_tags = {foultag['id'] for foultag in foul_i['tags']}
    if not (foul_tags & red_card_tags):
        continue

    # Find minute of the red card
    if foul_i.matchPeriod == "1H":
        red_card_minute = round(foul_i.eventSec / 60)
    elif foul_i.matchPeriod == "2H":
        red_card_minute = 45 + round(foul_i.eventSec / 60)
    else:
        # No minute can be computed for other periods; skip instead of
        # re-using the minute of an earlier red card
        print("Error" + str(foul_i.matchPeriod))
        continue

    # Find the row of the red carded player in that match
    mask_red_card_player_min = ((df_minutes_played.playerId == foul_i.playerId)
                                & (df_minutes_played.matchId == foul_i.matchId))
    df_red_card = df_minutes_played.loc[mask_red_card_player_min]
    if len(df_red_card) != 0:
        red_card_player_in = df_red_card.player_in_min.values[0]

        # Adjust the dataframe "df_minutes_played" to add the red card info
        df_minutes_played.loc[mask_red_card_player_min, 'player_out_min'] = red_card_minute
        df_minutes_played.loc[mask_red_card_player_min, 'minutesPlayed'] = red_card_minute - red_card_player_in
        df_minutes_played.loc[mask_red_card_player_min, 'red_card'] = 1



############################################################################# 302 | # - Save df_minutes to dataframe "minutes_played_All.json" 303 | "---------------------------------------------------------------------------" 304 | 305 | 306 | df_minutes_played.to_json("../Json_files/minutes_played_All.json") 307 | 308 | # Test to load in and store as dataframe 309 | # with open('../Json_files/minutes_played_All.json') as f: 310 | # data_minutes_new = json.load(f) 311 | 312 | # df_test_new = pd.DataFrame(data_minutes_new) 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | -------------------------------------------------------------------------------- /the_match_ranking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Sep 14 16:41:04 2021 5 | 6 | @author: emildanielsson & JakobEP 7 | 8 | Program description: 9 | Find ratings of all players in the last round 10 | 11 | Algorithm: 12 | 13 | """ 14 | 15 | 16 | # The basics 17 | import pandas as pd 18 | import numpy as np 19 | import json 20 | 21 | # Plotting 22 | import matplotlib.pyplot as plt 23 | from mplsoccer import FontManager 24 | from mplsoccer import Pitch, VerticalPitch 25 | 26 | # Import other functions 27 | import fitting_functions as ff 28 | import KPI_functions as kpi 29 | 30 | # Statistical fitting of models 31 | import statsmodels.api as sm 32 | import statsmodels.formula.api as smf 33 | from sklearn import preprocessing 34 | from sklearn.preprocessing import MinMaxScaler 35 | from sklearn.preprocessing import RobustScaler 36 | from sklearn.model_selection import train_test_split 37 | from sklearn.linear_model import LogisticRegression 38 | 39 | # For tables 40 | from tabulate import tabulate 41 | 42 | # Ignore Future Warnings 43 | import warnings 44 | 
warnings.simplefilter(action='ignore', category=FutureWarning)


def _json_records(path):
    """Open the json file at "path" and return its parsed content."""
    with open(path) as json_file:
        return json.load(json_file)


#%%
# - Load Fonts
#----------------------------------------------------------------------------

_FONT_REPO = 'https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/fonts/'

URL1 = _FONT_REPO + 'SourceSerifPro-Regular.ttf?raw=true'
URL2 = _FONT_REPO + 'SourceSerifPro-ExtraLight.ttf?raw=true'
URL3 = _FONT_REPO + 'SourceSerifPro-Bold.ttf?raw=true'

serif_regular = FontManager(URL1)
serif_extra_light = FontManager(URL2)
serif_bold = FontManager(URL3)


#%%
# - Read in KPI data
#----------------------------------------------------------------------------

# The per_90 file ('Json_files/KPI_per_90_All.json') does not have all
# columns yet, so the totals are used
data_kpi = _json_records('../Json_files/KPI_tot_All_v2.json')
df_KPI = pd.DataFrame(data_kpi)

# Create match dataframes
df_England_matches = pd.read_json('../../Wyscout/matches/matches_England.json', encoding="unicode_escape")


#%%
# - Read in minutes played data
#----------------------------------------------------------------------------

data_minutes = _json_records('../Json_files/minutes_played_All.json')
df_minutes = pd.DataFrame(data_minutes)


#%%
# - Read events data, players and teams
#----------------------------------------------------------------------------

# Create event dataframe
df_events = pd.read_json('../Json_files/events_All.json', encoding="unicode_escape")

# Create players and teams dataframes
df_players = pd.read_json("../../Wyscout/players.json", encoding="unicode_escape")
df_teams = pd.read_json("../../Wyscout/teams.json", encoding="unicode_escape")


#%%
# - Read in data for xG-model and get the coefficients dataframes
#----------------------------------------------------------------------------

data_xG_model = _json_records('../Json_files/xG_model_v2_All_except_Eng.json')
df_xG_model = pd.DataFrame(data_xG_model)

# Fit the xG-models
(df_log_model_shots_coef,
 df_log_model_headers_coef,
 df_log_model_free_kicks_coef,
 log_model,
 log_model_headers,
 log_model_free_kicks) = ff.xG_model(df_xG_model)


#%%
# - Set filter and scaler variables
#----------------------------------------------------------------------------

# Minimum number of minutes a player must have played in a match
# for the match to count (can change)
min_minutes = 20

# Choose method for normalization
# (alternatives: preprocessing.QuantileTransformer(random_state=0), RobustScaler())
scaler = MinMaxScaler()


#%%
# - Create test and train dataset and preprocess data
#----------------------------------------------------------------------------

# Separate df_KPI between PL and the rest of the leagues
mask_PL = df_KPI.league == "England"
df_KPI_EU_train = df_KPI.loc[~mask_PL]
df_KPI_PL = df_KPI.loc[mask_PL]

# The matches of the test gameweek
test_gameweek = 38
df_PL_gameweek_38 = df_England_matches.loc[df_England_matches.gameweek == test_gameweek]
list_gameweek_38_matchId = df_PL_gameweek_38['wyId'].unique().tolist()

# Keep the PL KPIs from all other gameweeks (GW 1-37)
mask_last_gameweeks = df_KPI_PL.matchId.isin(list_gameweek_38_matchId)
df_KPI_PL = df_KPI_PL.loc[~mask_last_gameweeks]


#%%
# - Let User choose a match to get ratings from
#----------------------------------------------------------------------------

print("Choose match Id to get rankings from:\n")
for _, match in df_PL_gameweek_38.iterrows():
    print(match.label, f"matchId: {match.wyId}\n", sep="\n")

print("Enter the match Id to look at: ")

the_matchId = int(input())
#the_matchId = 2500098

# Find the match events
df_the_match_events = df_events.loc[df_events.matchId == the_matchId]

# Df with all own goals
df_own_goals = kpi.own_goals(df_the_match_events)


#%%
# - Create the KPI-dataframe from that match
#----------------------------------------------------------------------------

# Columns describing the player and the match
_INFO_COLUMNS = ['matchId', 'league', 'teamName', 'playerId', 'shortName',
                 'role', 'minutesPlayed', 'team_goals', 'team_conceded_goals',
                 'red_card']

# KPI columns
_KPI_COLUMNS = ['goals', 'assists', 'passing%', 'completed_passes', 'fouls',
                'aerial%', 'aerial_wins', 'shots', 'dribbles%',
                'succesful_dribbles', 'key_passes', 'succesful_through_passes',
                'plus_minus', 'events_in_box', 'passes_to_box',
                'creative_passes', 'succesful_def_actions',
                'progressive_carries', 'xG_tot', 'xG_shots', 'xG_headers',
                'xG_free_kicks', 'xG_penalties', 'own_goals', 'yellow_cards',
                'danger_ball_loses', 'def_actions%']

# Initiate the (empty) dataframe with the columns we need
df_the_match_KPI = pd.DataFrame(columns=_INFO_COLUMNS + _KPI_COLUMNS)


#%%
# - Find home and away score
#----------------------------------------------------------------------------

# Find teamIds in the match
teams_match_list = df_the_match_events['teamId'].unique().tolist()

# Mask for the match data in df_matches
mask_score = df_England_matches.wyId == the_matchId
223 | df_the_match_info = df_England_matches.loc[mask_score] 224 | team_data = df_the_match_info.teamsData.values[0] 225 | 226 | # Get the list of players from events file 227 | players_the_match = df_the_match_events['playerId'].unique().tolist() 228 | 229 | # Shortrname lists 230 | home_team_lineup = [] 231 | away_team_lineup = [] 232 | home_team_bench = [] 233 | away_team_bench = [] 234 | 235 | # playerIds list 236 | home_team_list = [] 237 | away_team_list = [] 238 | for i in range(2): 239 | team_data_i = team_data[str(teams_match_list[i])] 240 | team_lineup = team_data_i['formation']['lineup'] 241 | team_bench = team_data_i['formation']['bench'] 242 | 243 | # HERE COULD WE GET THE LINEUP POSITIONS 244 | 245 | # Get the lineup players 246 | for player in team_lineup: 247 | if player['playerId'] in players_the_match: 248 | if team_data_i['side'] == "home": 249 | home_team_list.append(player['playerId']) 250 | shortName = df_players.loc[df_players.wyId == player['playerId']].shortName.values[0] 251 | home_team_lineup.append(shortName) 252 | elif team_data_i['side'] == "away": 253 | away_team_list.append(player['playerId']) 254 | shortName = df_players.loc[df_players.wyId == player['playerId']].shortName.values[0] 255 | away_team_lineup.append(shortName) 256 | else: 257 | print("Error: " + team_data_i['side']) 258 | 259 | # Get the bench players 260 | for player in team_bench: 261 | if player['playerId'] in players_the_match: 262 | if team_data_i['side'] == "home": 263 | home_team_list.append(player['playerId']) 264 | shortName = df_players.loc[df_players.wyId == player['playerId']].shortName.values[0] 265 | home_team_bench.append(shortName) 266 | elif team_data_i['side'] == "away": 267 | away_team_list.append(player['playerId']) 268 | shortName = df_players.loc[df_players.wyId == player['playerId']].shortName.values[0] 269 | away_team_bench.append(shortName) 270 | else: 271 | print("Error: " + team_data_i['side']) 272 | 273 | # Set home and away score 274 | if 
team_data_i['side'] == "home": 275 | home_team_score = team_data_i['score'] 276 | elif team_data_i['side'] == "away": 277 | away_team_score = team_data_i['score'] 278 | else: 279 | print("Error: " + team_data_i['score']) 280 | 281 | #%% 282 | # Compute the KPIs from the chosen match 283 | "----------------------------------------------" 284 | 285 | # Loop trough all players and get their average position and compute KPI's 286 | for player in players_the_match: 287 | 288 | # Find the minutes played, team and red card 289 | mask_minutes = (df_minutes.playerId == player) 290 | df_player_minutes = df_minutes.loc[mask_minutes] 291 | 292 | # Some players are not registered the subbed in but their events are registerd 293 | # If they are not subbed in correctly in Wyscout matches "df_player_minutes" 294 | # will be empty. Thus we check this here. 295 | if len(df_player_minutes != 0): 296 | player_minutes = df_player_minutes['minutesPlayed'][0] 297 | player_in_min = df_player_minutes['player_in_min'][0] 298 | player_out_min = df_player_minutes['player_out_min'][0] 299 | player_team = df_player_minutes['teamId'][0] 300 | player_team_name = df_player_minutes['teamName'][0] 301 | red_card_bool = df_player_minutes['red_card'][0] 302 | 303 | # New dataframe with all events from 'player' in match 304 | df_events_player = df_the_match_events.loc[df_the_match_events.playerId == player] 305 | 306 | # Get the position of the player 307 | position = df_events_player['Position'].values[0] 308 | 309 | # Get the league 310 | league = df_events_player["league"].values[0] 311 | 312 | # Get the shortName 313 | name = df_events_player['shortName'].values[0] 314 | 315 | # Get the team goal and goals conceded 316 | if (player in home_team_list): 317 | team_goals = home_team_score 318 | team_conceded_goals = away_team_score 319 | elif (player in away_team_list): 320 | team_goals = away_team_score 321 | team_conceded_goals = home_team_score 322 | else: 323 | print("Error: cant find player in 
list") 324 | 325 | ################################################ 326 | # - Check after own goals from player in match 327 | "----------------------------------------------" 328 | 329 | # Initiate temp variable 330 | own_goals = 0 331 | 332 | # Read out any eventual own goals 333 | df_own_goals_player = df_own_goals.loc[df_own_goals.playerId == player] 334 | 335 | # Check there were any own goals 336 | if len(df_own_goals_player) != 0: 337 | own_goals = len(df_own_goals_player) 338 | 339 | 340 | ################################################ 341 | # - All function calls to compute kpi's 342 | # - (Should maybe try to use df.loc[mask, column] = instead) 343 | "----------------------------------------------" 344 | 345 | # goals 346 | goals, goals_info = kpi.nr_goals(df_events_player, player_minutes) 347 | 348 | # assists 349 | assists, assists_info = kpi.nr_assists(df_events_player, player_minutes) 350 | 351 | # passing% 352 | pass_percent, pass_percent_info = kpi.percent_passes_completed(df_events_player, player_minutes) 353 | 354 | # passes_completed 355 | pass_comp, pass_comp_p90, pass_comp_info = kpi.passes_completed(df_events_player, player_minutes) 356 | 357 | # fouls 358 | fouls, fouls_p90, fouls_info = kpi.fouls(df_events_player, player_minutes) 359 | 360 | # aerials% 361 | aerials_percent, aerials_percent_info = kpi.percent_aerial_wins(df_events_player, player_minutes) 362 | 363 | # aerials_won 364 | aerial_wins, aerial_wins_p90, aerial_wins_info = kpi.aerials_won(df_events_player, player_minutes) 365 | 366 | # shots 367 | shots, shots_p90, shots_info = kpi.shots(df_events_player, player_minutes) 368 | 369 | # dribbles% 370 | dribbles_percent, dribbles_percent_info = kpi.percent_succesful_dribbles(df_events_player, player_minutes) 371 | 372 | # succesful_dribbles 373 | succesful_dribbles, succesful_dribbles_p90, succesful_dribbles_info = kpi.succesful_dribbles(df_events_player, player_minutes) 374 | 375 | # key_passes 376 | key_passes, key_passes_p90, 
key_passes_info = kpi.key_passes(df_events_player, player_minutes) 377 | 378 | # succesful_through_passes 379 | succesful_through_passes, succesful_through_passes_p90, succesful_through_passes_info = kpi.succesful_through_passes(df_events_player, player_minutes) 380 | 381 | # plus-minus 382 | plus_minus, plus_minus_info = kpi.plus_minus(df_the_match_events, player_team, player_minutes, player_in_min, player_out_min) 383 | 384 | # events_in_box 385 | events_in_box, events_in_box_p90, event_in_box_info = kpi.events_in_box(df_events_player, player_minutes) 386 | 387 | # passes_to_box 388 | passes_to_box, passes_to_box_p90, passes_to_box_info = kpi.passes_to_box(df_events_player, player_minutes) 389 | 390 | # creative_passes 391 | creative_passes, creative_passes_p90, creative_passes_info = kpi.creative_passes(df_events_player, player_minutes) 392 | 393 | # defensive_actions 394 | succesful_def_actions, succesful_def_actions_p90, succesful_def_actions_info = kpi.succesful_def_actions(df_events_player, player_minutes) 395 | 396 | # progressive_carries 397 | progressive_carries, progressive_carries_p90, progressive_carries_info = kpi.progressive_carries(df_events_player, player_minutes) 398 | 399 | # xG 400 | xG_tot, xG_tot_p90, xG_info, xG_shots, xG_headers, xG_free_kicks, xG_penalties = kpi.xG(df_events_player, player_minutes, df_log_model_shots_coef, df_log_model_headers_coef, df_log_model_free_kicks_coef) 401 | 402 | # danger_ball_loses 403 | danger_ball_loses, danger_ball_loses_p90, danger_ball_loses_info = kpi.danger_ball_loses(df_events_player, player_minutes) 404 | 405 | # yellow_cards 406 | yellow_cards, yellow_cards_info = kpi.yellow_cards(df_events_player) 407 | 408 | # percent_def_actions 409 | percent_def_actions, percent_def_actions_info = kpi.percent_def_actions(df_events_player, player_minutes) 410 | 411 | 412 | ######################################################## 413 | # - Add rows to df_the_match_KPI 414 | 
"------------------------------------------------------" 415 | # df_KPI_tot 416 | df_the_match_KPI.loc[df_the_match_KPI.shape[0]] = [the_matchId, league, player_team_name, player, name, 417 | position, player_minutes, team_goals, 418 | team_conceded_goals, red_card_bool, 419 | goals, 420 | assists, 421 | pass_percent, 422 | pass_comp, 423 | fouls, 424 | aerials_percent, 425 | aerial_wins, 426 | shots, 427 | dribbles_percent, 428 | succesful_dribbles, 429 | key_passes, 430 | succesful_through_passes, 431 | plus_minus, 432 | events_in_box, 433 | passes_to_box, 434 | creative_passes, 435 | succesful_def_actions, 436 | progressive_carries, 437 | xG_tot, 438 | xG_shots, 439 | xG_headers, 440 | xG_free_kicks, 441 | xG_penalties, 442 | own_goals, 443 | yellow_cards, 444 | danger_ball_loses, 445 | percent_def_actions 446 | ] 447 | 448 | #%% 449 | # - Create the new columns team_xG, opponents_xG, possesion 450 | "---------------------------------------------------------------------------" 451 | # List of the team names 452 | list_teams = df_the_match_KPI["teamName"].unique().tolist() 453 | 454 | for team in list_teams: 455 | 456 | # Find the team KPI 457 | mask_team = df_the_match_KPI.teamName == team 458 | df_team = df_the_match_KPI.loc[mask_team] 459 | df_opponent = df_the_match_KPI.loc[~mask_team] 460 | 461 | # Find xG and shots 462 | # team_shots = df_team['shots'].sum() 463 | # opponent_shots = df_opponent['shots'].sum() 464 | team_xG = df_team["xG_tot"].sum() 465 | opponent_xG = df_opponent["xG_tot"].sum() 466 | team_passes = df_team['completed_passes'].sum() 467 | opponent_passes = df_opponent['completed_passes'].sum() 468 | 469 | tot_game_passes = team_passes + opponent_passes 470 | 471 | # Find approximate possesion 472 | team_possesion = team_passes / tot_game_passes 473 | opponent_possesion = opponent_passes / tot_game_passes 474 | 475 | # Find PossAdj defnesive actions 476 | for i, player in df_team.iterrows(): 477 | mask_player = (df_the_match_KPI.playerId == 
player.playerId) 478 | df_player = df_the_match_KPI.loc[mask_player] 479 | def_actions = df_player.succesful_def_actions.values[0] 480 | p_adj_def_actions = def_actions / opponent_possesion 481 | df_the_match_KPI.loc[mask_player, 'p_adj_succ_def_actions'] = p_adj_def_actions 482 | 483 | # Add to the KPI dataframe 484 | mask_add_xG = (df_the_match_KPI.teamName == team) 485 | df_the_match_KPI.loc[mask_add_xG, 'team_xG'] = team_xG 486 | df_the_match_KPI.loc[mask_add_xG, 'opponent_xG'] = opponent_xG 487 | df_the_match_KPI.loc[mask_add_xG, 'team_possesion'] = team_possesion 488 | df_the_match_KPI.loc[mask_add_xG, 'opponent_possesion'] = opponent_possesion 489 | # df_the_match_KPI.loc[mask_add_xG, 'team_shots'] = team_shots 490 | # df_the_match_KPI.loc[mask_add_xG, 'opponent_shots'] = opponent_shots 491 | 492 | #%% 493 | # - Rank the players 494 | "---------------------------------------------------------------------------" 495 | 496 | # Merge the KPIs from the chosen match with the KPIS from 1-37 497 | df_KPI_PL = df_KPI_PL.append(df_the_match_KPI, ignore_index = True) 498 | 499 | # Positions to fit for 500 | positions_fitting = [['LB', 'RB'], ['CB'], ['LM', 'RM'], ['CM'], ['LW', 'RW'], ['ST']] 501 | 502 | # Initiate rating and info dataframe 503 | df_final_rating = pd.DataFrame(columns = ['matchId', 'teamName', 'playerId', 504 | 'shortName', 'position', 'tot_rating', 505 | 'match_events_rating', 'fitting_rating_off', 506 | 'fitting_rating_def', 507 | 'final_rating', 'match_info', 508 | 'gameweek']) 509 | 510 | # Do fitting for all the positins 511 | for position in positions_fitting: 512 | # print(position) 513 | 514 | ################################################ 515 | # - Kpis to fit for 516 | "----------------------------------------------" 517 | 518 | list_kpi_all = ['passing%', 519 | 'completed_passes', 520 | 'fouls', 521 | 'aerial%', 522 | 'aerial_wins', 523 | 'shots', 524 | 'dribbles%', 525 | 'succesful_dribbles', 526 | 'key_passes', 527 | 
'succesful_through_passes', 528 | 'events_in_box', 529 | 'passes_to_box', 530 | 'creative_passes', 531 | 'succesful_def_actions', 532 | 'progressive_carries', 533 | 'red_card', 534 | 'own_goals', 535 | 'yellow_cards', 536 | 'danger_ball_loses', 537 | 'def_actions%', 538 | 'p_adj_succ_def_actions' 539 | ] 540 | 541 | # KPIs when using KPI_tot_All 542 | list_kpi_off = ['passing%', 543 | 'completed_passes', 544 | 'fouls', 545 | #'aerial%', 546 | #'aerial_wins', 547 | 'shots', 548 | 'dribbles%', 549 | #'succesful_dribbles', 550 | 'key_passes', 551 | #'succesful_through_passes', 552 | 'events_in_box', 553 | 'passes_to_box', 554 | #'creative_passes', 555 | #'succesful_def_actions', 556 | #'progressive_carries', 557 | 'red_card', 558 | 'own_goals', 559 | 'yellow_cards', 560 | 'danger_ball_loses', 561 | #'def_actions%', 562 | 'p_adj_succ_def_actions' 563 | ] 564 | 565 | list_kpi_def = ['passing%', 566 | 'completed_passes', 567 | 'fouls', 568 | #'aerial%', 569 | #'aerial_wins', 570 | #'shots', 571 | 'dribbles%', 572 | #'succesful_dribbles', 573 | #'key_passes', 574 | #'succesful_through_passes', 575 | #'events_in_box', 576 | #'passes_to_box', 577 | #'creative_passes', 578 | #'succesful_def_actions', 579 | #'progressive_carries', 580 | 'red_card', 581 | 'own_goals', 582 | 'yellow_cards', 583 | 'danger_ball_loses', 584 | #'def_actions%', 585 | 'p_adj_succ_def_actions' 586 | ] 587 | 588 | # # KPIs when using per_90_All 589 | # list_kpi_p90 = ['passing%', 590 | # 'completed_passes_p90', 591 | # 'fouls_p90', 592 | # 'aerial%', 593 | # 'aerial_wins_p90', 594 | # 'shots_p90', 595 | # 'dribbles%', 596 | # 'succesful_dribbles_p90', 597 | # 'key_passes_p90', 598 | # 'succesful_through_passes_p90', 599 | # 'events_in_box_p90', 600 | # 'passes_to_box_p90', 601 | # 'creative_passes_p90', 602 | # 'succesful_def_actions_p90', 603 | # 'progressive_carries_p90', 604 | # 'red_card', 605 | # 'own_goals', 606 | # 'yellow_cards', 607 | # 'danger_ball_loses', 608 | # 'def_actions%' 609 | # ] 610 
| 611 | # Copy the KPI dataframe to add offensive and defensive 612 | 613 | ################################################ 614 | # - Filter the training data 615 | "----------------------------------------------" 616 | # Call to fitting function to find coeficient and independent variables 617 | dep_var_off = 'team_xG' 618 | model_coef_off, r_squared_off, list_kpi_off_fitting = ff.KPI_fitting(df_KPI_EU_train, scaler, 619 | list_kpi_off, dep_var_off, 620 | position, min_minutes) 621 | 622 | # Call to fitting function to find coeficient and independent variables 623 | dep_var_def = 'opponent_xG' 624 | model_coef_def, r_squared_def, list_kpi_def_fitting = ff.KPI_fitting(df_KPI_EU_train, scaler, 625 | list_kpi_def, dep_var_def, 626 | position, min_minutes) 627 | 628 | 629 | ################################################ 630 | # - Use the coeficient from EU to compute percentiles 631 | # in the PL gameweek 1-37, filtered PL training data 632 | "----------------------------------------------" 633 | 634 | # Merge the KPIs from the chosen match with the KPIS from 1-37 635 | 636 | # Filter and normalise the PL data (including the chosen match) 637 | df_filtered_PL = ff.filter_dataframe(df_KPI_PL, position, list_kpi_all, min_minutes, 1) 638 | df_filtered_PL[list_kpi_all] = scaler.fit_transform(df_filtered_PL[list_kpi_all]) 639 | 640 | # KPIs GW 1-37 641 | df_KPI_PL_train = df_filtered_PL.loc[~(df_filtered_PL.matchId == the_matchId)] 642 | 643 | # Initiate rating dataframe for GW 1-37 644 | df_ratings = pd.DataFrame() 645 | 646 | # Loop through players in gameweek 1-37 647 | for i, player in df_KPI_PL_train.iterrows(): 648 | 649 | # Add some info to dataframe 650 | df_ratings.loc[i, 'matchId'] = player['matchId'] 651 | df_ratings.loc[i, 'teamName'] = player['teamName'] 652 | df_ratings.loc[i, 'playerId'] = player['playerId'] 653 | df_ratings.loc[i, 'shortName'] = player['shortName'] 654 | 655 | ################################################ 656 | # - xG-Fit 657 | 
"----------------------------------------------" 658 | 659 | # Find the fitted xG 660 | xG_fitting_rating_off = ff.compute_fitting_ratings(player, model_coef_off, list_kpi_off_fitting) 661 | 662 | # Multiply the fitted value with r_squared, how good the fit was 663 | xG_fitting_rating_off = xG_fitting_rating_off * r_squared_off 664 | 665 | # Add to df 666 | df_ratings.loc[i, 'fitting_rating_off'] = xG_fitting_rating_off 667 | 668 | ################################################ 669 | # - opponent_xG-Fit (xGC) 670 | "----------------------------------------------" 671 | # Find the fitted opponent xG (xGC) 672 | xGC_fitting_rating_def = ff.compute_fitting_ratings(player, model_coef_def, list_kpi_def_fitting) 673 | 674 | # Multiply the fitted value with r_squared, how good the fit was 675 | xGC_fitting_rating_def = xGC_fitting_rating_def * r_squared_def 676 | 677 | # Add to df 678 | df_ratings.loc[i, 'fitting_rating_def'] = xGC_fitting_rating_def 679 | 680 | ################################################ 681 | # - Match event-rating 682 | "----------------------------------------------" 683 | 684 | # Find the event rating and add to dataframe 685 | match_event_rating = ff.compute_events_rating(player, position, df_KPI) 686 | df_ratings.loc[i, 'match_events_rating'] = match_event_rating 687 | 688 | # Sum fitting rating and add to dataframe 689 | tot_fit_rating = xG_fitting_rating_off - xGC_fitting_rating_def 690 | df_ratings.loc[i, 'tot_fit_rating'] = tot_fit_rating 691 | 692 | 693 | 694 | # Find percentiles from the rankings in gameweek 1-37 PL 695 | percentiles = np.arange(0.01, 1, 0.01) 696 | percentiles_fit = df_ratings['tot_fit_rating'].quantile(percentiles) 697 | percentiles_events = df_ratings['match_events_rating'].quantile(percentiles) 698 | 699 | ################################################ 700 | # - Compute the rankings of the chosen match gameweek 38 for the position 701 | "----------------------------------------------" 702 | # KPIs GW 38 703 | 
df_the_match_KPI_players = df_filtered_PL.loc[df_filtered_PL.matchId == the_matchId] 704 | 705 | # Initiate rating dataframe for GW 38 706 | df_ratings_test = pd.DataFrame() 707 | 708 | # Loop through players from the given match 709 | for i, player in df_the_match_KPI_players.iterrows(): 710 | 711 | # Add some info to dataframe 712 | df_ratings_test.loc[i, 'matchId'] = player['matchId'] 713 | df_ratings_test.loc[i, 'teamName'] = player['teamName'] 714 | df_ratings_test.loc[i, 'playerId'] = player['playerId'] 715 | df_ratings_test.loc[i, 'shortName'] = player['shortName'] 716 | 717 | ################################################ 718 | # - xG-Fit 719 | "----------------------------------------------" 720 | 721 | # Find the fitted xG 722 | xG_fitting_rating_off = ff.compute_fitting_ratings(player, model_coef_off, list_kpi_off_fitting) 723 | 724 | # Multiply the fitted value with r_squared, how good the fit was 725 | xG_fitting_rating_off = xG_fitting_rating_off * r_squared_off 726 | 727 | # Add to df 728 | df_ratings_test.loc[i, 'fitting_rating_off'] = xG_fitting_rating_off 729 | 730 | ################################################ 731 | # - opponent_xG-Fit (xGC) 732 | "----------------------------------------------" 733 | 734 | # Find the fitted opponent xG (xGC) 735 | xGC_fitting_rating_def = ff.compute_fitting_ratings(player, model_coef_def, list_kpi_def_fitting) 736 | 737 | # Multiply the fitted value with r_squared, how good the fit was 738 | xGC_fitting_rating_def = xGC_fitting_rating_def * r_squared_def 739 | 740 | # Add to df 741 | df_ratings_test.loc[i, 'fitting_rating_def'] = xGC_fitting_rating_def 742 | 743 | ################################################ 744 | # - Match event-rating 745 | "----------------------------------------------" 746 | 747 | # Find the event rating and add to dataframe 748 | match_event_rating = ff.compute_events_rating(player, position, df_KPI) 749 | df_ratings_test.loc[i, 'match_events_rating'] = match_event_rating 750 | 
751 | # Sum fitting rating and add to dataframe 752 | tot_fit_rating = xG_fitting_rating_off - xGC_fitting_rating_def 753 | df_ratings_test.loc[i, 'tot_fit_rating'] = tot_fit_rating 754 | 755 | # Modify the df_rating_test dataframe and the gameweek 38 dataframe 756 | ff.create_rating_dataframe(df_ratings_test, df_KPI_PL, df_the_match_KPI_players, 757 | percentiles_fit, percentiles_events, df_England_matches) 758 | 759 | # Merge to the raw rating dataframe 760 | frames = [df_final_rating, df_ratings_test] 761 | df_final_rating = pd.concat(frames) 762 | 763 | 764 | #%% 765 | # - Print the ratings 766 | "---------------------------------------------------------------------------" 767 | print(df_final_rating.match_info.values[0]) 768 | table = df_final_rating[['teamName', 'shortName', 'position', 'final_rating']] 769 | print(tabulate(table)) 770 | 771 | 772 | 773 | #%% 774 | # - Plot the pitch 775 | # - Manually input the ACTUAL player positions 776 | "---------------------------------------------------------------------------" 777 | 778 | positions = ['GK', 'CB', 'LCB', 'RCB', 'LB', 'RB', 'LWB', 'RWB', 'CM', 779 | 'LCM', 'RCM', 'CAM', 'LM', 780 | 'RM', 'LW', 'RW', 'ST', 'LST', 'RST'] 781 | 782 | print("Here is the home team lineup:") 783 | # Print home team players 784 | for player in home_team_lineup: 785 | print(player) 786 | 787 | print("Here is the away team lineup:") 788 | for player in away_team_lineup: 789 | print(player) 790 | 791 | print("Now enter the position for each player in that game.") 792 | print(f"The positions to choose from are the following: \n{positions}") 793 | print("HOME TEAM:") 794 | for player in home_team_lineup: 795 | print(f"Write the position for: {player}") 796 | position = input() 797 | while position not in positions: 798 | print("NOT A VALID POSITION!") 799 | print(f"Write the position for: {player}") 800 | position = input() 801 | mask_player = df_final_rating.shortName == player 802 | df_final_rating.loc[mask_player, 'position'] = 
position 803 | 804 | print("AWAY TEAM:") 805 | for player in away_team_lineup: 806 | print(f"Write the position for: {player}") 807 | position = input() 808 | while position not in positions: 809 | print("NOT A VALID POSITION!") 810 | print(f"Write the position for: {player}") 811 | position = input() 812 | mask_player = df_final_rating.shortName == player 813 | df_final_rating.loc[mask_player, 'position'] = position 814 | 815 | #%% 816 | 817 | # This copy is only done for testing purposes 818 | df_plot_ratings = df_final_rating.copy() 819 | 820 | # Plot final ratings on a pitch 821 | ff.plot_pitch_ratings(df_plot_ratings, home_team_lineup, home_team_bench, away_team_lineup, away_team_bench) 822 | 823 | -------------------------------------------------------------------------------- /validation_vs_WhoScored.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Sep 14 16:41:04 2021 5 | 6 | @author: emildanielsson & JakobEP 7 | 8 | Program description: 9 | Finds ratings of all players in the last round and compares them with 10 | read in ratings from WhoScored (for the same matches in last round). 11 | 12 | OBS! 
Make sure to run GW_38_Ratings.py firstly and then 13 | 14 | """ 15 | 16 | # The basics 17 | import pandas as pd 18 | import numpy as np 19 | import json 20 | 21 | 22 | #%% 23 | # - Read Excels 24 | # - 25 | # - Make sure to choose the correct sheets 26 | # - 27 | "---------------------------------------------------------------------------" 28 | 29 | # Specify the path to the xlsx-file 30 | excel_path = "../Gameweek_38.xlsx" 31 | 32 | df_WhoScored = pd.read_excel(open(excel_path, 'rb'), 33 | sheet_name='WhoScored') 34 | 35 | df_pre_tune = pd.read_excel(open(excel_path, 'rb'), 36 | sheet_name='result_pre_tune') 37 | 38 | df_post_tune = pd.read_excel(open(excel_path, 'rb'), 39 | sheet_name='result_post_tune') 40 | 41 | 42 | 43 | #%% 44 | # - Create validation dataframe 45 | "---------------------------------------------------------------------------" 46 | df_validation = pd.DataFrame() 47 | 48 | # Find all the teams 49 | teams = df_WhoScored.teamName.unique().tolist() 50 | 51 | # Loop through teams and add theri "team_rating" 52 | for team in teams: 53 | 54 | # Whoscored frame sorted 55 | df_WhoScored_team = df_WhoScored.loc[df_WhoScored.teamName == team] 56 | df_WhoScored_team = df_WhoScored_team.sort_values(by='Rating', ascending=False) 57 | WhoScored_players = df_WhoScored_team.shortName.values.tolist() 58 | 59 | # df_pre_tune frame sorted 60 | df_pre_tune_team = df_pre_tune.loc[df_pre_tune.teamName == team] 61 | df_pre_tune_team = df_pre_tune_team.sort_values(by='final_rating', ascending=False) 62 | pre_tune_players = df_pre_tune_team.shortName.values.tolist() 63 | 64 | # df_ost_tune frame sorted 65 | df_post_tune_team = df_post_tune.loc[df_post_tune.teamName == team] 66 | df_post_tune_team = df_post_tune_team.sort_values(by='final_rating', ascending=False) 67 | post_tune_players = df_post_tune_team.shortName.values.tolist() 68 | 69 | for i, player in df_WhoScored_team.iterrows(): 70 | playerName = player.shortName 71 | df_validation.loc[i, 'shortName'] = 
playerName 72 | df_validation.loc[i, 'Position'] = player.position 73 | df_validation.loc[i, 'teamName'] = player.teamName 74 | df_validation.loc[i, 'WhoScored'] = WhoScored_players.index(playerName) + 1 75 | df_validation.loc[i, 'pre_tune'] = pre_tune_players.index(playerName) + 1 76 | df_validation.loc[i, 'post_tune'] = post_tune_players.index(playerName) + 1 77 | 78 | 79 | #%% 80 | # - Validate all players 81 | "---------------------------------------------------------------------------" 82 | 83 | score_pre = 0 84 | score_post = 0 85 | nr_of_players = len(df_validation) 86 | for i, player in df_validation.iterrows(): 87 | score_pre += abs(player.WhoScored - player.pre_tune) 88 | score_post += abs(player.WhoScored - player.post_tune) 89 | 90 | # Divide by the number of players (average "false" in comparison to WhoScored) 91 | score_pre = score_pre / nr_of_players 92 | score_post = score_post / nr_of_players 93 | 94 | # Print Validation for all players 95 | print("All Players validation:") 96 | print(f"pre tuning score = {score_pre}") 97 | print(f"post score = {score_post}\n") 98 | 99 | 100 | 101 | #%% 102 | # - Validate Positions 103 | "---------------------------------------------------------------------------" 104 | 105 | # Positions to fit for 106 | positions = [['LB', 'RB'], ['CB'], ['LM', 'RM'], ['CM'], ['LW', 'RW'], ['ST']] 107 | 108 | for position in positions: 109 | df_validate = df_validation.loc[df_validation.Position.isin(position)] 110 | score_pre = 0 111 | score_post = 0 112 | nr_of_players = len(df_validate) 113 | for i, player in df_validate.iterrows(): 114 | score_pre += abs(player.WhoScored - player.pre_tune) 115 | score_post += abs(player.WhoScored - player.post_tune) 116 | 117 | # Divide by the number of players (average "false" in comparison to WhoScored) 118 | score_pre = score_pre / nr_of_players 119 | score_post = score_post / nr_of_players 120 | 121 | # Print Validation for all players 122 | print(f"Validation {position}") 123 | 
print(f"pre tuning score = {score_pre}") 124 | print(f"post score = {score_post} \n") 125 | 126 | 127 | #%% 128 | # - Write validation results to Excel document 129 | "---------------------------------------------------------------------------" 130 | 131 | # with pd.ExcelWriter("../Gameweek_38.xlsx", mode="a", engine="openpyxl", if_sheet_exists = "new") as writer: 132 | # df_validation.to_excel(writer, sheet_name="WhoScored_Validation", 133 | # columns=['shortName', 'Position', 'teamName', 'WhoScored', 'pre_tune', 'post_tune'], 134 | # header=True, index=False) -------------------------------------------------------------------------------- /xG_model_evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Sep 17 14:42:31 2021 5 | 6 | @author: emildanielsson & JakobEP 7 | 8 | Program description: 9 | 10 | Evaluating and validating created xG-models with test-data. 11 | 12 | """ 13 | 14 | #%% 15 | # - Imports used 16 | "---------------------------------------------------------------------------" 17 | 18 | # Basics 19 | import pandas as pd 20 | import numpy as np 21 | import json 22 | 23 | # Plotting 24 | import matplotlib.pyplot as plt 25 | import seaborn as sns 26 | from mplsoccer import FontManager 27 | 28 | # Import other functions 29 | import fitting_functions as ff 30 | import FCPython 31 | 32 | # Statistical fitting of models 33 | from sklearn import metrics 34 | 35 | 36 | #%% 37 | # - Plot settings 38 | "---------------------------------------------------------------------------" 39 | 40 | # Read in fonts 41 | URL1 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/' 42 | 'fonts/SourceSerifPro-Regular.ttf?raw=true') 43 | serif_regular = FontManager(URL1) 44 | 45 | URL2 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/' 46 | 'fonts/SourceSerifPro-ExtraLight.ttf?raw=true') 47 | serif_extra_light =
FontManager(URL2) 48 | URL3 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/fonts/' 49 | 'SourceSerifPro-Bold.ttf?raw=true') 50 | serif_bold = FontManager(URL3) 51 | 52 | 53 | #%% 54 | # - Create dataframes from the Wyscout data, uncomment if needed 55 | "---------------------------------------------------------------------------" 56 | 57 | """ 58 | 59 | # Create event dataframe 60 | with open('../Json_files/events_All.json') as f: 61 | data_Europe = json.load(f) 62 | 63 | df_Europe_events = pd.DataFrame(data_Europe) 64 | 65 | 66 | # Filter out events for England 67 | df_PL_events = df_Europe_events[df_Europe_events.league == 'England'] 68 | 69 | # Save as .json-file (so it can be read in directly in the future) 70 | df_PL_events.to_json("../Json_files/events_PL.json") 71 | 72 | """ 73 | 74 | #%% 75 | # - Read in event data for PL, uncomment if needed 76 | "---------------------------------------------------------------------------" 77 | """ 78 | # Create event dataframe 79 | with open('../Json_files/events_PL.json') as f: 80 | data_PL = json.load(f) 81 | 82 | df_PL_events = pd.DataFrame(data_PL) 83 | 84 | """ 85 | 86 | #%% 87 | # - Read in data for xG-model 88 | "---------------------------------------------------------------------------" 89 | 90 | with open('../Json_files/xG_model_v2_All_except_Eng.json') as f: 91 | data_xG_model_All = json.load(f) 92 | 93 | 94 | with open('../Json_files/xG_model_v2_England_only.json') as f: 95 | data_xG_model_PL = json.load(f) 96 | 97 | 98 | # Create dataframes 99 | df_All_xG_model = pd.DataFrame(data_xG_model_All) 100 | df_PL_xG_model = pd.DataFrame(data_xG_model_PL) 101 | 102 | 103 | #%% 104 | # - Get the coeficients dataframes 105 | "---------------------------------------------------------------------------" 106 | 107 | # Call xG-m function 108 | df_log_model_shots_coef, df_log_model_headers_coef, df_log_model_free_kicks_coef, log_model_shots, log_model_headers, log_model_free_kicks = 
ff.xG_model(df_All_xG_model) 109 | 110 | 111 | #%% 112 | # - Print out fitting results 113 | "---------------------------------------------------------------------------" 114 | 115 | #print("\n=============== xG-model shots ======================") 116 | 117 | #print(log_model_shots) 118 | 119 | 120 | #%% 121 | # - Filter out headers and freekicks for PL data 122 | "---------------------------------------------------------------------------" 123 | 124 | mask_headers = df_PL_xG_model.header == 1 125 | mask_free_kicks = df_PL_xG_model.free_kick == 1 126 | 127 | df_xG_shots = df_PL_xG_model[(~mask_headers) & (~mask_free_kicks)] 128 | df_xG_headers = df_PL_xG_model[mask_headers] 129 | df_xG_free_kicks = df_PL_xG_model[mask_free_kicks] 130 | 131 | 132 | #%% 133 | # - Split data - PL 134 | "---------------------------------------------------------------------------" 135 | 136 | x_testSet = df_xG_shots[['distance', 'angle_rad']].copy() # change df 137 | y_testSet = df_xG_shots[['goal']].copy() # change df 138 | 139 | # Adding distance squared to df 140 | squaredD = x_testSet['distance']**2 141 | x_testSet = x_testSet.assign(distance_sq = squaredD) 142 | 143 | 144 | #%% 145 | # - Make predictions - PL 146 | "---------------------------------------------------------------------------" 147 | 148 | # Find prediciton probabilities 149 | y_pred_prob = log_model_shots.predict_proba(x_testSet) # change model 150 | 151 | # Specify thresholds 152 | threshold5 = [0.5] 153 | threshold4 = [0.4] 154 | threshold2 = [0.2] 155 | threshold05 = [0.05] 156 | 157 | # Final predicitons 158 | #y_pred = y_pred_prob[y_pred_prob[:, 1] > threshold] 159 | 160 | y_pred5 = (y_pred_prob[:, 1] > threshold5).astype('float') 161 | y_pred2 = (y_pred_prob[:, 1] > threshold2).astype('float') 162 | y_pred05 = (y_pred_prob[:, 1] > threshold05).astype('float') 163 | y_pred4 = (y_pred_prob[:, 1] > threshold4).astype('float') 164 | 165 | y_pred = (y_pred_prob[:, 1] > threshold4).astype('float') # change 166 | 167 
# Get confusion matrix
# One matrix per decision threshold. They are computed once here and reused
# by the heatmap, the printed statistics and the ROC markers further down.
cnf_matrix5 = metrics.confusion_matrix(y_testSet, y_pred5)
cnf_matrix2 = metrics.confusion_matrix(y_testSet, y_pred2)
cnf_matrix05 = metrics.confusion_matrix(y_testSet, y_pred05)
cnf_matrix4 = metrics.confusion_matrix(y_testSet, y_pred4)

#%%
# - Visualize the confusion matrix using Heatmap - PL
"---------------------------------------------------------------------------"

class_names = [0, 1] # name of classes

fig, ax = plt.subplots(figsize=(8, 6))
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# create heatmap (to show another threshold, change the matrix here and the
# threshold in the title below together)
sns.heatmap(pd.DataFrame(cnf_matrix4), annot=True, annot_kws={"size": 14},
            cmap="Oranges", fmt='g', cbar=False, linewidths=2, linecolor='orange')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title(f'Confusion matrix for Threshold: {threshold4[0]}', y=1.1, fontweight='bold', fontsize=20, fontproperties=serif_regular.prop)
plt.ylabel('Actual label', fontweight='bold', fontsize=18, fontproperties=serif_regular.prop)
plt.xlabel('Predicted label', fontweight='bold', fontsize=18, fontproperties=serif_regular.prop)

# Set ticks size
plt.xticks(fontsize=14, fontweight='bold', fontproperties=serif_regular.prop)
plt.yticks(fontsize=14, fontweight='bold', fontproperties=serif_regular.prop)


#%%
# - Print stats - PL
"---------------------------------------------------------------------------"

# Recall - the proportion of the truly positive points that are predicted as
# positive. A high recall (close to 1) is good, and a low recall (close to 0)
# indicates a problem with false negatives.
#
# Precision - the proportion of true positives among the points predicted as
# positive. A high precision (close to 1) is good, and a low precision
# (close to 0) indicates a problem with false positives.


def _sensitivity_specificity(cm):
    """Return ``(sensitivity, specificity)`` for a 2x2 confusion matrix.

    ``cm`` is indexed as ``cm[actual][predicted]``, the layout produced by
    ``metrics.confusion_matrix`` above.
    """
    true_neg, false_pos = cm[0][0], cm[0][1]
    false_neg, true_pos = cm[1][0], cm[1][1]
    return true_pos/(true_pos + false_neg), true_neg/(false_pos + true_neg)


# Same matrices as above; the cm_* names are kept as aliases instead of
# recomputing identical results.
cm_5 = cnf_matrix5
cm_2 = cnf_matrix2
cm_05 = cnf_matrix05
cm_4 = cnf_matrix4

# sensitivity = the ability of the model to correctly identify shots that
# resulted in a goal.
# specificity = the ability of the model to correctly identify shots that did
# not result in a goal.
sensitivity_5, specificity_5 = _sensitivity_specificity(cm_5)
sensitivity_2, specificity_2 = _sensitivity_specificity(cm_2)
sensitivity_05, specificity_05 = _sensitivity_specificity(cm_05)
sensitivity_4, specificity_4 = _sensitivity_specificity(cm_4)

print("\n=============== xG-model performance ======================")

# y_pred is the thresholded prediction vector chosen in the prediction cell.
print("Accuracy:", metrics.accuracy_score(y_testSet, y_pred))
print("Precision:", metrics.precision_score(y_testSet, y_pred))
print("Recall:", metrics.recall_score(y_testSet, y_pred))

# Change these together with the threshold shown in the heatmap.
print('sensitivity = ' + str(sensitivity_4))
print('specificity = '+ str(specificity_4) )

# NOTE(review): r2_score is a regression metric and is applied here to the
# thresholded 0/1 predictions; kept for continuity with earlier results.
print("R-sq. score:", metrics.r2_score(y_testSet, y_pred, sample_weight=None, multioutput='uniform_average'))

# Alternative way of drawing a confusion matrix (not executed):
#   cm_display = ConfusionMatrixDisplay(cnf_matrix4).plot(cmap='OrRd')
#   cm_display.im_.colorbar.remove()
#   plt.title('Confusion Matrix for Threshold = 0.4')

#%%
# - Plot ROC-curve - PL
"---------------------------------------------------------------------------"

from sklearn.metrics import roc_curve

fig, axes = plt.subplots(figsize=(11, 7))

# Continuous scores from the fitted model trace the full ROC curve
# (swap the model here when evaluating another shot type).
y_score = log_model_shots.decision_function(x_testSet)
fpr, tpr, _ = roc_curve(y_testSet, y_score, pos_label=log_model_shots.classes_[1])
plt.plot(fpr,tpr, label='ROC for xG-model shots')

# Mark the operating point of each fixed threshold on the curve.
for spec, sens, colour, point_label in (
        (specificity_5, sensitivity_5, 'orange', 'Threshold = 0.5'),
        (specificity_2, sensitivity_2, 'red', 'Threshold = 0.2'),
        (specificity_05, sensitivity_05, 'green', 'Threshold = 0.05'),
        (specificity_4, sensitivity_4, 'purple', 'Threshold = 0.4'),
):
    plt.scatter(1 - spec, sens, c=colour, s=100, label=point_label)

# Diagonal reference line.
y_45 = np.linspace(0,1,100)
plt.plot(y_45,y_45,linestyle='dashed', c='cyan', label='random guess')
plt.legend(prop={"family": "Times New Roman", "size": 12})
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate (1 - Specificity)', fontweight='bold', fontsize=16, fontproperties=serif_regular.prop)
plt.ylabel('True Positive Rate (Sensitivity)', fontweight='bold', fontsize=16, fontproperties=serif_regular.prop)
plt.title('ROC Curve', fontweight='bold', fontsize=24, fontproperties=serif_regular.prop)


#%%
# - Evaluate xG-model by plotting
"---------------------------------------------------------------------------"

# NOTE(review): the row positions used here (0 = angle, 1 = distance squared,
# 2 = distance, 3 = intercept) are taken as given. x_testSet's column order is
# distance, angle_rad, distance_sq, so confirm these positions against how
# ff.xG_model builds df_log_model_shots_coef.
coef_angle = df_log_model_shots_coef.iloc[0].values[0]

coef_distance = df_log_model_shots_coef.iloc[2].values[0]

coef_distance_sq = df_log_model_shots_coef.iloc[1].values[0]

B0 = df_log_model_shots_coef.iloc[3].values[0]


def calculate_xG(sh):
    """Return the logistic goal probability for one shot.

    ``sh`` is a mapping with the keys 'distance', 'D2' (distance squared) and
    'angle'. The coefficients and intercept are the module-level values read
    from df_log_model_shots_coef above.
    """
    linear_term = (coef_distance*sh['distance'] + coef_distance_sq*sh['D2']
                   + coef_angle*sh['angle'] + B0)
    xG = 1/(1 + np.exp(-linear_term))
    return xG


#Create a 2D map of xG
GRID_SIZE = 65      # number of cells along each axis of the map
GOAL_WIDTH = 7.32   # goal width used in the angle formula (presumably metres)

pgoal_2d = np.zeros((GRID_SIZE, GRID_SIZE))

for x in range(GRID_SIZE):
    for y in range(GRID_SIZE):
        # x is the first grid index; the lateral offset is measured from the
        # middle of the second axis.
        lateral = abs(y - GRID_SIZE/2)
        dist_sq = x**2 + lateral**2

        # Angle from the arctan expression; negative results are shifted by
        # pi so the angle stays non-negative.
        a = np.arctan(GOAL_WIDTH * x / (dist_sq - (GOAL_WIDTH/2)**2))
        if a < 0:
            a = np.pi + a

        # Only the three features read by calculate_xG are supplied.
        sh = {'angle': a, 'distance': np.sqrt(dist_sq), 'D2': dist_sq}

        pgoal_2d[x, y] = calculate_xG(sh)

(fig3, ax3) = FCPython.createGoalMouth()
pos = ax3.imshow(pgoal_2d, extent=[-1, 65, 65, -1], aspect='auto',cmap=plt.cm.Reds,vmin=0, vmax=0.3)
fig3.colorbar(pos, ax=ax3)
ax3.set_title('xG-model goal probabilities', fontsize=24, fontproperties=serif_regular.prop)
plt.xlim((0,66))
plt.ylim((-3,35))
plt.gca().set_aspect('equal', adjustable='box')
plt.show()