├── .gitignore
├── FCPython.py
├── GW_38_Ratings.py
├── GW_38_Ratings_evaluation.py
├── KPI_functions.py
├── README.md
├── __pycache__
    ├── FCPython.cpython-38.pyc
    ├── KPI_functions.cpython-38.pyc
    ├── fitting_functions.cpython-38.pyc
    └── percentile_functions.cpython-38.pyc
├── create_KPI_dataframe.py
├── create_KPI_dataframe_EDIT.py
├── create_events_df_eu.py
├── fitting_functions.py
├── minutes_played.py
├── the_match_ranking.py
├── validation_vs_WhoScored.py
└── xG_model_evaluation.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Ignore Mac system files
 2 | .DS_store
 3 | 
 4 | # Ignore node_modules folder
 5 | #node_modules
 6 | 
 7 | # Ignore files related to API keys
 8 | #.env
 9 | 
10 | # Ignore SASS config files
11 | #.sass-cache


--------------------------------------------------------------------------------
/FCPython.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Wed Mar 25 17:32:00 2020
  5 | 
  6 | @author: davsu428
  7 | """
  8 | import matplotlib.pyplot as plt
  9 | from matplotlib.patches import Arc
 10 | 
 11 | def createPitch(length,width, unity,linecolor): # in meters
 12 |     # Code by @JPJ_dejong
 13 | 
 14 |     """
 15 |     creates a plot in which the 'length' is the length of the pitch (goal to goal).
 16 |     And 'width' is the width of the pitch (sideline to sideline). 
 17 |     Fill in the unity in meters or in yards.
 18 | 
 19 |     """
 20 |     #Set unity
 21 |     if unity == "meters":
 22 |         # Set boundaries
 23 |         if length >= 120.5 or width >= 75.5:
 24 |             return(str("Field dimensions are too big for meters as unity, didn't you mean yards as unity?\
 25 |                        Otherwise the maximum length is 120 meters and the maximum width is 75 meters. Please try again"))
 26 |         #Run program if unity and boundaries are accepted
 27 |         else:
 28 |             #Create figure
 29 |             fig=plt.figure()
 30 |             #fig.set_size_inches(7, 5)
 31 |             ax=fig.add_subplot(1,1,1)
 32 |            
 33 |             #Pitch Outline & Centre Line
 34 |             plt.plot([0,0],[0,width], color=linecolor)
 35 |             plt.plot([0,length],[width,width], color=linecolor)
 36 |             plt.plot([length,length],[width,0], color=linecolor)
 37 |             plt.plot([length,0],[0,0], color=linecolor)
 38 |             plt.plot([length/2,length/2],[0,width], color=linecolor)
 39 |             
 40 |             #Left Penalty Area
 41 |             plt.plot([16.5 ,16.5],[(width/2 +16.5),(width/2-16.5)],color=linecolor)
 42 |             plt.plot([0,16.5],[(width/2 +16.5),(width/2 +16.5)],color=linecolor)
 43 |             plt.plot([16.5,0],[(width/2 -16.5),(width/2 -16.5)],color=linecolor)
 44 |             
 45 |             #Right Penalty Area
 46 |             plt.plot([(length-16.5),length],[(width/2 +16.5),(width/2 +16.5)],color=linecolor)
 47 |             plt.plot([(length-16.5), (length-16.5)],[(width/2 +16.5),(width/2-16.5)],color=linecolor)
 48 |             plt.plot([(length-16.5),length],[(width/2 -16.5),(width/2 -16.5)],color=linecolor)
 49 |             
 50 |             #Left 5-meters Box
 51 |             plt.plot([0,5.5],[(width/2+7.32/2+5.5),(width/2+7.32/2+5.5)],color=linecolor)
 52 |             plt.plot([5.5,5.5],[(width/2+7.32/2+5.5),(width/2-7.32/2-5.5)],color=linecolor)
 53 |             plt.plot([5.5,0.5],[(width/2-7.32/2-5.5),(width/2-7.32/2-5.5)],color=linecolor)
 54 |             
 55 |             #Right 5 -eters Box
 56 |             plt.plot([length,length-5.5],[(width/2+7.32/2+5.5),(width/2+7.32/2+5.5)],color=linecolor)
 57 |             plt.plot([length-5.5,length-5.5],[(width/2+7.32/2+5.5),width/2-7.32/2-5.5],color=linecolor)
 58 |             plt.plot([length-5.5,length],[width/2-7.32/2-5.5,width/2-7.32/2-5.5],color=linecolor)
 59 |             
 60 |             #Prepare Circles
 61 |             centreCircle = plt.Circle((length/2,width/2),9.15,color=linecolor,fill=False)
 62 |             centreSpot = plt.Circle((length/2,width/2),0.8,color=linecolor)
 63 |             leftPenSpot = plt.Circle((11,width/2),0.8,color=linecolor)
 64 |             rightPenSpot = plt.Circle((length-11,width/2),0.8,color=linecolor)
 65 |             
 66 |             #Draw Circles
 67 |             ax.add_patch(centreCircle)
 68 |             ax.add_patch(centreSpot)
 69 |             ax.add_patch(leftPenSpot)
 70 |             ax.add_patch(rightPenSpot)
 71 |             
 72 |             #Prepare Arcs
 73 |             leftArc = Arc((11,width/2),height=18.3,width=18.3,angle=0,theta1=308,theta2=52,color=linecolor)
 74 |             rightArc = Arc((length-11,width/2),height=18.3,width=18.3,angle=0,theta1=128,theta2=232,color=linecolor)
 75 |             
 76 |             #Draw Arcs
 77 |             ax.add_patch(leftArc)
 78 |             ax.add_patch(rightArc)
 79 |             #Axis titles
 80 | 
 81 |     #check unity again
 82 |     elif unity == "yards":
 83 |         #check boundaries again
 84 |         if length <= 95:
 85 |             return(str("Didn't you mean meters as unity?"))
 86 |         elif length >= 131 or width >= 101:
 87 |             return(str("Field dimensions are too big. Maximum length is 130, maximum width is 100"))
 88 |         #Run program if unity and boundaries are accepted
 89 |         else:
 90 |             #Create figure
 91 |             fig=plt.figure()
 92 |             #fig.set_size_inches(7, 5)
 93 |             ax=fig.add_subplot(1,1,1)
 94 |            
 95 |             #Pitch Outline & Centre Line
 96 |             plt.plot([0,0],[0,width], color=linecolor)
 97 |             plt.plot([0,length],[width,width], color=linecolor)
 98 |             plt.plot([length,length],[width,0], color=linecolor)
 99 |             plt.plot([length,0],[0,0], color=linecolor)
100 |             plt.plot([length/2,length/2],[0,width], color=linecolor)
101 |             
102 |             #Left Penalty Area
103 |             plt.plot([18 ,18],[(width/2 +18),(width/2-18)],color=linecolor)
104 |             plt.plot([0,18],[(width/2 +18),(width/2 +18)],color=linecolor)
105 |             plt.plot([18,0],[(width/2 -18),(width/2 -18)],color=linecolor)
106 |             
107 |             #Right Penalty Area
108 |             plt.plot([(length-18),length],[(width/2 +18),(width/2 +18)],color=linecolor)
109 |             plt.plot([(length-18), (length-18)],[(width/2 +18),(width/2-18)],color=linecolor)
110 |             plt.plot([(length-18),length],[(width/2 -18),(width/2 -18)],color=linecolor)
111 |             
112 |             #Left 6-yard Box
113 |             plt.plot([0,6],[(width/2+7.32/2+6),(width/2+7.32/2+6)],color=linecolor)
114 |             plt.plot([6,6],[(width/2+7.32/2+6),(width/2-7.32/2-6)],color=linecolor)
115 |             plt.plot([6,0],[(width/2-7.32/2-6),(width/2-7.32/2-6)],color=linecolor)
116 |             
117 |             #Right 6-yard Box
118 |             plt.plot([length,length-6],[(width/2+7.32/2+6),(width/2+7.32/2+6)],color=linecolor)
119 |             plt.plot([length-6,length-6],[(width/2+7.32/2+6),width/2-7.32/2-6],color=linecolor)
120 |             plt.plot([length-6,length],[(width/2-7.32/2-6),width/2-7.32/2-6],color=linecolor)
121 |             
122 |             #Prepare Circles; 10 yards distance. penalty on 12 yards
123 |             centreCircle = plt.Circle((length/2,width/2),10,color=linecolor,fill=False)
124 |             centreSpot = plt.Circle((length/2,width/2),0.8,color=linecolor)
125 |             leftPenSpot = plt.Circle((12,width/2),0.8,color=linecolor)
126 |             rightPenSpot = plt.Circle((length-12,width/2),0.8,color=linecolor)
127 |             
128 |             #Draw Circles
129 |             ax.add_patch(centreCircle)
130 |             ax.add_patch(centreSpot)
131 |             ax.add_patch(leftPenSpot)
132 |             ax.add_patch(rightPenSpot)
133 |             
134 |             #Prepare Arcs
135 |             leftArc = Arc((11,width/2),height=20,width=20,angle=0,theta1=312,theta2=48,color=linecolor)
136 |             rightArc = Arc((length-11,width/2),height=20,width=20,angle=0,theta1=130,theta2=230,color=linecolor)
137 |             
138 |             #Draw Arcs
139 |             ax.add_patch(leftArc)
140 |             ax.add_patch(rightArc)
141 |                 
142 |     #Tidy Axes
143 |     plt.axis('off')
144 |     
145 |     return fig,ax
146 | 
147 | 
def createPitchOld(linecolor="black"):
    #Taken from FC Python
    """
    Create a figure with a fixed-size 130 x 90 pitch drawn on it.

    Parameters
    ----------
    linecolor : matplotlib colour used for all lines, circles and arcs.
        Defaults to "black". The original body referenced 'linecolor'
        without defining it, so every call raised NameError.

    Returns
    -------
    (fig, ax) : the new figure and its single axes.
    """
    #Create figure
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)

    segments = [
        #Pitch Outline & Centre Line
        ([0, 0], [0, 90]),
        ([0, 130], [90, 90]),
        ([130, 130], [90, 0]),
        ([130, 0], [0, 0]),
        ([65, 65], [0, 90]),
        #Left Penalty Area
        ([16.5, 16.5], [65, 25]),
        ([0, 16.5], [65, 65]),
        ([16.5, 0], [25, 25]),
        #Right Penalty Area
        ([130, 113.5], [65, 65]),
        ([113.5, 113.5], [65, 25]),
        ([113.5, 130], [25, 25]),
        #Left 6-yard Box (the bottom line now reaches the goal line at x=0)
        ([0, 5.5], [54, 54]),
        ([5.5, 5.5], [54, 36]),
        ([5.5, 0], [36, 36]),
        #Right 6-yard Box
        ([130, 124.5], [54, 54]),
        ([124.5, 124.5], [54, 36]),
        ([124.5, 130], [36, 36]),
    ]
    for xs, ys in segments:
        ax.plot(xs, ys, color=linecolor)

    #Circles: centre circle, centre spot and both penalty spots
    ax.add_patch(plt.Circle((65, 45), 9.15, color=linecolor, fill=False))
    ax.add_patch(plt.Circle((65, 45), 0.8, color=linecolor))
    ax.add_patch(plt.Circle((11, 45), 0.8, color=linecolor))
    ax.add_patch(plt.Circle((119, 45), 0.8, color=linecolor))

    #Penalty arcs
    ax.add_patch(Arc((11, 45), height=18.3, width=18.3, angle=0,
                     theta1=310, theta2=50, color=linecolor))
    ax.add_patch(Arc((119, 45), height=18.3, width=18.3, angle=0,
                     theta1=130, theta2=230, color=linecolor))

    #Tidy Axes
    ax.axis('off')

    return fig,ax
205 | 
def createGoalMouth():
    #Adopted from FC Python
    """
    Create an 8 x 6 inch figure showing the area in front of one goal.

    Draws three sides of a 65 x 50 outline, the penalty area, the
    6-yard box, the goal frame, the penalty spot and the penalty arc,
    all in black, then hides the axes.

    Returns
    -------
    (fig, ax) : the new figure and its single axes.
    """
    figure = plt.figure(figsize=(8, 6))
    axes = figure.add_subplot(1,1,1)

    linecolor = 'black'

    segments = (
        #Outline (goal line and both sides)
        ([0, 65], [0, 0]),
        ([65, 65], [50, 0]),
        ([0, 0], [50, 0]),
        #Penalty area
        ([12.5, 52.5], [16.5, 16.5]),
        ([52.5, 52.5], [16.5, 0]),
        ([12.5, 12.5], [0, 16.5]),
        #6-yard box
        ([41.5, 41.5], [5.5, 0]),
        ([23.5, 41.5], [5.5, 5.5]),
        ([23.5, 23.5], [0, 5.5]),
        #Goal
        ([41.5-5.34, 41.5-5.34], [-2, 0]),
        ([23.5+5.34, 41.5-5.34], [-2, -2]),
        ([23.5+5.34, 23.5+5.34], [0, -2]),
    )
    for xs, ys in segments:
        plt.plot(xs, ys, color=linecolor)

    #Penalty spot
    axes.add_patch(plt.Circle((65/2, 11), 0.8, color=linecolor))

    #Penalty arc
    axes.add_patch(Arc((32.5, 11), height=18.3, width=18.3, angle=0,
                       theta1=38, theta2=142, color=linecolor))

    #Tidy Axes
    plt.axis('off')

    return figure, axes
250 | 
251 | 


--------------------------------------------------------------------------------
/GW_38_Ratings.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Tue Sep 14 16:41:04 2021
  5 | 
  6 | @author: emildanielsson & JakobEP
  7 | 
  8 | Program description: 
  9 |    Find ratings of all players in the last round.
 10 |    
 11 | Algorithm: 
 12 |     
 13 | """
 14 | 
 15 | 
 16 | # The basics
 17 | import pandas as pd
 18 | import numpy as np
 19 | import json
 20 | 
 21 | # Plotting
 22 | import matplotlib.pyplot as plt
 23 | from mplsoccer import FontManager
 24 | 
 25 | # Import other functions
 26 | import fitting_functions as ff
 27 | 
 28 | # Statistical fitting of models
 29 | import statsmodels.api as sm
 30 | import statsmodels.formula.api as smf
 31 | from sklearn import preprocessing
 32 | from sklearn.preprocessing import MinMaxScaler
 33 | from sklearn.preprocessing import RobustScaler
 34 | 
 35 | # For tables
 36 | from tabulate import tabulate
 37 | 
 38 | # Ignore Future Warnings
 39 | import warnings
 40 | warnings.simplefilter(action='ignore', category=FutureWarning)
 41 | 
 42 | 
#%%
# - Read in the KPI data
"---------------------------------------------------------------------------"

# Alternative input kept for reference: the per-90 KPI file, which does not
# have all columns yet.
# with open('Json_files/KPI_per_90_All.json') as f:
#     data_kpi = json.load(f)

# Load the KPI JSON file (path is relative to the working directory)
with open('../Json_files/KPI_tot_All_v2.json') as f:
    data_kpi = json.load(f)

# KPI rows as a dataframe; the 'league' column is used further down to
# separate the England rows from the other leagues
df_KPI = pd.DataFrame(data_kpi)


# Create the match dataframe for England (its 'gameweek' and 'wyId' columns
# are used later to pick out the gameweek-38 matches)
df_England_matches = pd.read_json('../../Wyscout/matches/matches_England.json', encoding="unicode_escape")


#%%
# - Read in the minutes played data
"---------------------------------------------------------------------------"

with open('../Json_files/minutes_played_All.json') as f:
    data_minutes = json.load(f)

# NOTE: df_minutes is not referenced again in this script
df_minutes = pd.DataFrame(data_minutes)
 69 | 
 70 | 
################################################
# - Load Fonts
"----------------------------------------------"

# One mplsoccer FontManager per Source Serif Pro weight (regular, extra
# light, bold), each created from a GitHub raw-file URL.
# NOTE: none of the three font objects is used later in this script.
URL1 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/'
        'fonts/SourceSerifPro-Regular.ttf?raw=true')
serif_regular = FontManager(URL1)
URL2 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/'
        'fonts/SourceSerifPro-ExtraLight.ttf?raw=true')
serif_extra_light = FontManager(URL2)
URL3 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/fonts/'
        'SourceSerifPro-Bold.ttf?raw=true')
serif_bold = FontManager(URL3)
 84 | 
 85 | 
 86 | 
 87 | #%%
 88 | # - Set filter and scaler varables
 89 | "---------------------------------------------------------------------------"
 90 | 
 91 | # Now we want to filter out those who have not played at least 
 92 | # 10 matches with 20 minutes in each match (can change)
 93 | min_minutes = 20
 94 | 
 95 | # Choose method for normalizaion
 96 | scaler = MinMaxScaler()
 97 | #scaler = preprocessing.QuantileTransformer(random_state=0)
 98 | #scaler = RobustScaler()
 99 | 
100 | 
101 | #%%
102 | # - Create test and train dataset and preprocess data
103 | "---------------------------------------------------------------------------"
104 | 
105 | # Seperate df_KPI beteween PL and the rest of the legaues
106 | mask_PL = df_KPI.league == "England"
107 | df_KPI_PL = df_KPI.loc[mask_PL]
108 | df_KPI_EU_train = df_KPI.loc[~mask_PL]
109 | 
110 | 
111 | #%%
112 | # - Rank the players 
113 | "---------------------------------------------------------------------------"
114 | 
115 | # Positions to fit for
116 | positions_fitting = [['LB', 'RB'], ['CB'], ['LM', 'RM'], ['CM'], ['LW', 'RW'], ['ST']]
117 | 
118 | # Initiate rating and info dataframe
119 | df_final_rating = pd.DataFrame(columns = ['matchId', 'teamName', 'playerId',
120 |                                           'shortName', 'position', 'tot_rating',
121 |                                           'match_events_rating', 'fitting_rating_off',
122 |                                           'fitting_rating_def',
123 |                                           'final_rating', 'match_info',
124 |                                           'gameweek'])
125 | 
126 | # Initiate rating and info dataframe
127 | df_final_rating2 = pd.DataFrame(columns = ['matchId', 'teamName', 'playerId',
128 |                                           'shortName', 'position', 'tot_rating',
129 |                                           'match_events_rating', 'fitting_rating_off',
130 |                                           'fitting_rating_def',
131 |                                           'final_rating', 'match_info',
132 |                                           'gameweek'])
133 | 
134 | 
# KPI sets are identical for every position, so they are defined once here.
# Each loop iteration still gets fresh list copies, exactly as before.

# All KPIs (used to filter and normalise the PL data)
KPI_ALL = (
    'passing%',
    'completed_passes',
    'fouls',
    'aerial%',
    'aerial_wins',
    'shots',
    'dribbles%',
    'succesful_dribbles',
    'key_passes',
    'succesful_through_passes',
    'events_in_box',
    'passes_to_box',
    'creative_passes',
    'succesful_def_actions',
    'progressive_carries',
    'red_card',
    'own_goals',
    'yellow_cards',
    'danger_ball_loses',
    'def_actions%',
    'p_adj_succ_def_actions',
)

# KPIs to fit for when using dep_var "team_xG"
KPI_OFF = (
    'passing%',
    'completed_passes',
    'fouls',
    'shots',
    'dribbles%',
    'key_passes',
    'events_in_box',
    'passes_to_box',
    'red_card',
    'own_goals',
    'yellow_cards',
    'danger_ball_loses',
    'p_adj_succ_def_actions',
)

# KPIs to fit for when using dep_var "opponent_xG"
KPI_DEF = (
    'passing%',
    'completed_passes',
    'fouls',
    'dribbles%',
    'red_card',
    'own_goals',
    'yellow_cards',
    'danger_ball_loses',
    'p_adj_succ_def_actions',
)

# Match ids of the test gameweek (independent of the position)
test_gameweek = 38
df_PL_gameweek_38 = df_England_matches.loc[df_England_matches.gameweek == test_gameweek]
list_gameweek_38_matchId = df_PL_gameweek_38['wyId'].unique().tolist()


def _rate_players(df_players, position, fit_off, fit_def, df_kpi_all):
    """
    Compute the rating columns for every row of df_players.

    Parameters
    ----------
    df_players : KPI rows (one per player and match) to rate.
    position : position group currently being fitted.
    fit_off, fit_def : (model_coef, r_squared, kpi_list) tuples as returned
        by ff.KPI_fitting for 'team_xG' and 'opponent_xG' respectively.
    df_kpi_all : KPI dataframe handed on to ff.compute_events_rating.

    Returns
    -------
    Dataframe indexed like df_players with the columns matchId, teamName,
    playerId, shortName, fitting_rating_off, fitting_rating_def,
    match_events_rating and tot_fit_rating.
    """
    coef_off, r2_off, kpis_off = fit_off
    coef_def, r2_def, kpis_def = fit_def

    df_out = pd.DataFrame()
    for i, player in df_players.iterrows():

        # Add some info to dataframe
        df_out.loc[i, 'matchId'] = player['matchId']
        df_out.loc[i, 'teamName'] = player['teamName']
        df_out.loc[i, 'playerId'] = player['playerId']
        df_out.loc[i, 'shortName'] = player['shortName']

        # Fitted xG, multiplied by r_squared (how good the fit was)
        rating_off = ff.compute_fitting_ratings(player, coef_off, kpis_off) * r2_off
        df_out.loc[i, 'fitting_rating_off'] = rating_off

        # Fitted opponent xG (xGC), multiplied by r_squared
        rating_def = ff.compute_fitting_ratings(player, coef_def, kpis_def) * r2_def
        df_out.loc[i, 'fitting_rating_def'] = rating_def

        # Match event-rating
        df_out.loc[i, 'match_events_rating'] = ff.compute_events_rating(player, position, df_kpi_all)

        # Regression-based rating: offensive fit minus defensive fit
        df_out.loc[i, 'tot_fit_rating'] = rating_off - rating_def

    return df_out


# Do fitting for all the positions
for position in positions_fitting:

    # Fresh KPI lists for this position
    list_kpi_all = list(KPI_ALL)
    list_kpi_off = list(KPI_OFF)
    list_kpi_def = list(KPI_DEF)

    ################################################
    # - Find model coefficients, r-squared and statistically significant kpis
    "----------------------------------------------"
    # Fit on the non-PL data with "team_xG" as dependent variable
    dep_var_off = 'team_xG'
    model_coef_off, r_squared_off, list_kpi_off_fitting = ff.KPI_fitting(df_KPI_EU_train, scaler,
                                                  list_kpi_off, dep_var_off,
                                                  position, min_minutes)

    # Fit on the non-PL data with "opponent_xG" as dependent variable
    dep_var_def = 'opponent_xG'
    model_coef_def, r_squared_def, list_kpi_def_fitting = ff.KPI_fitting(df_KPI_EU_train, scaler,
                                                  list_kpi_def, dep_var_def,
                                                  position, min_minutes)

    fit_off = (model_coef_off, r_squared_off, list_kpi_off_fitting)
    fit_def = (model_coef_def, r_squared_def, list_kpi_def_fitting)

    ################################################
    # - Use the coefficients from EU to compute percentiles
    #   in the PL gameweek 1-37, filtered PL training data
    "----------------------------------------------"

    # Filter and normalise the PL data (including GW 38)
    df_filtered_PL = ff.filter_dataframe(df_KPI_PL, position, list_kpi_all, min_minutes, 1)
    df_filtered_PL[list_kpi_all] = scaler.fit_transform(df_filtered_PL[list_kpi_all])

    # Separate gameweek 38 from the rest of the PL season
    mask_last_gameweeks = df_filtered_PL.matchId.isin(list_gameweek_38_matchId)

    # KPIs GW 1-37 and their ratings
    df_KPI_PL_train = df_filtered_PL.loc[~mask_last_gameweeks]
    df_ratings = _rate_players(df_KPI_PL_train, position, fit_off, fit_def, df_KPI)

    # Find percentiles from the ratings in gameweek 1-37 PL
    percentiles = np.arange(0.01, 1, 0.01)
    percentiles_fit = df_ratings['tot_fit_rating'].quantile(percentiles)
    percentiles_events = df_ratings['match_events_rating'].quantile(percentiles)

    ################################################
    # - Compute the ratings of gameweek 38 for the position
    "----------------------------------------------"
    # KPIs GW 38 and their ratings
    df_KPI_PL_gameweek_38 = df_filtered_PL.loc[mask_last_gameweeks]
    df_ratings_test = _rate_players(df_KPI_PL_gameweek_38, position, fit_off, fit_def, df_KPI)

    # Modify the df_ratings_test dataframe and the gameweek 38 dataframe
    ff.create_rating_dataframe(df_ratings_test, df_KPI_PL, df_KPI_PL_gameweek_38,
                               percentiles_fit, percentiles_events, df_England_matches)

    # Modify the rating dataframe from gameweek 1-37
    # NOTE(review): this call passes df_KPI where the GW 38 call above passes
    # df_KPI_PL - confirm against ff.create_rating_dataframe that this is intended.
    ff.create_rating_dataframe(df_ratings, df_KPI, df_KPI_PL_train,
                               percentiles_fit, percentiles_events, df_England_matches)

    # Merge the rating dataframe GW 38
    df_final_rating = pd.concat([df_final_rating, df_ratings_test])

    # Merge the rating dataframe [GW1-37]
    df_final_rating2 = pd.concat([df_final_rating2, df_ratings])
376 | 
377 | 
#%%
# Mean and sum rating per player from gameweek 1-37
df_mean_rating = df_final_rating2.groupby(['shortName', 'teamName'], as_index=False)["final_rating"].mean()
# NOTE(review): the sum is grouped by shortName only, while the mean above
# also groups by teamName - confirm this difference is intended.
df_sum_rating = df_final_rating2.groupby(['shortName'], as_index=False)["final_rating"].sum()

# Append both sheets to the existing Excel file with a single writer, so the
# workbook is loaded and saved once instead of twice. mode="a" requires that
# the file already exists; if_sheet_exists="new" creates a new sheet rather
# than overwriting one that has the same name.
with pd.ExcelWriter("../Gameweek_38.xlsx", mode="a", engine="openpyxl", if_sheet_exists = "new") as writer:
    df_mean_rating.to_excel(writer, sheet_name="mean_rating",
                            #columns=['shortName', 'position', 'teamName', 'final_rating'],
                            header=True, index=False)
    df_sum_rating.to_excel(writer, sheet_name="sum_rating",
                           #columns=['shortName', 'position', 'teamName', 'final_rating'],
                           header=True, index=False)
394 | 
395 | 
#%%
# - Print and save the ratings to use for validation_vs_WhoScored
# ---------------------------------------------------------------------------

# Ratings that belong to the last gameweek, and the matches they cover
df_gameweek_38 = df_final_rating[df_final_rating["gameweek"] == 38]
rated_matches = df_gameweek_38["matchId"].unique().tolist()

# One table per rated match: match info first, then the player ratings
for match_id in rated_matches:
    match_rows = df_final_rating[df_final_rating["matchId"] == match_id]
    print(match_rows["match_info"].values[0])
    print(tabulate(match_rows[['teamName', 'shortName', 'position', 'final_rating']]))


# Ask for the name of the sheet the results are stored under
print("Adding results Gameweek_38.xlsx, choose filename:\n")
file_name = input()

# Append the gameweek-38 ratings to the Excel file used for validation
with pd.ExcelWriter("../Gameweek_38.xlsx", mode="a", engine="openpyxl", if_sheet_exists="new") as writer:
    df_gameweek_38.to_excel(
        writer,
        sheet_name=file_name,
        columns=['teamName', 'shortName', 'position', 'final_rating'],
        header=True,
        index=False,
    )
419 | 
420 | 
421 | 
422 | 
423 | 
424 | 
425 | 
426 | 
427 | 


--------------------------------------------------------------------------------
/GW_38_Ratings_evaluation.py:
--------------------------------------------------------------------------------
  1 | 
  2 | #!/usr/bin/env python3
  3 | # -*- coding: utf-8 -*-
  4 | """
  5 | Created on Tue Sep 14 16:41:04 2021
  6 | 
  7 | @author: emildanielsson & JakobEP
  8 | 
  9 | Program description: 
 10 |    Find ratings of all players in the last round
 11 |    
 12 | Algorithm: 
 13 |     
 14 | """
 15 | 
 16 | 
 17 | # The basics
 18 | import pandas as pd
 19 | import numpy as np
 20 | import json
 21 | 
 22 | # Plotting
 23 | import matplotlib.pyplot as plt
 24 | from mplsoccer import FontManager
 25 | 
 26 | # Import other functions
 27 | import percentile_functions as pf
 28 | import fitting_functions as ff
 29 | 
 30 | # Statistical fitting of models
 31 | import statsmodels.api as sm
 32 | import statsmodels.formula.api as smf
 33 | from sklearn import preprocessing
 34 | from sklearn.preprocessing import MinMaxScaler
 35 | from sklearn.preprocessing import RobustScaler
 36 | import statistics
 37 | 
 38 | # For tables
 39 | from tabulate import tabulate
 40 | 
 41 | # Ignore Future Warnings
 42 | import warnings
 43 | warnings.simplefilter(action='ignore', category=FutureWarning)
 44 | 
 45 | 
 46 | #%%
 47 | # - Read in data KPI data
 48 | "---------------------------------------------------------------------------"
 49 | 
 50 | # Test to load in and store as dataframe per_90 dont have all collumns yet
 51 | # with open('Json_files/KPI_per_90_All.json') as f:
 52 | #     data_kpi = json.load(f)
 53 |     
 54 | with open('Json_files/KPI_tot_All_v2.json') as f:
 55 |     data_kpi = json.load(f)
 56 |     
 57 | df_KPI = pd.DataFrame(data_kpi)
 58 | 
 59 | 
 60 | # Create match dataframes
 61 | df_England_matches = pd.read_json('../Wyscout/matches/matches_England.json', encoding="unicode_escape")
 62 | 
 63 | 
 64 | #%%
 65 | # - Read in minutes played data
 66 | "---------------------------------------------------------------------------"
 67 | 
 68 | with open('Json_files/minutes_played_All.json') as f:
 69 |     data_minutes = json.load(f)
 70 |     
 71 | df_minutes = pd.DataFrame(data_minutes)
 72 | 
 73 | 
 74 | ################################################
 75 | # - Load Fonts
 76 | "----------------------------------------------"
 77 | 
 78 | URL1 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/'
 79 |         'fonts/SourceSerifPro-Regular.ttf?raw=true')
 80 | serif_regular = FontManager(URL1)
 81 | URL2 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/'
 82 |         'fonts/SourceSerifPro-ExtraLight.ttf?raw=true')
 83 | serif_extra_light = FontManager(URL2)
 84 | URL3 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/fonts/'
 85 |         'SourceSerifPro-Bold.ttf?raw=true')
 86 | serif_bold = FontManager(URL3)
 87 | 
 88 | 
 89 | 
 90 | #%%
 91 | # - Set filter and scaler variables
 92 | "---------------------------------------------------------------------------"
 93 | # Minimum-minutes threshold handed to the fitting and filtering helpers further down
 94 | # Now we want to filter out those who have not played at least 
 95 | # 10 matches with 20 minutes in each match (can change)
 96 | min_minutes = 20
 97 | # NOTE(review): no "10 matches" limit is set in this cell - confirm where it is enforced
 98 | # Choose method for normalization
 99 | scaler = MinMaxScaler()
100 | #scaler = preprocessing.QuantileTransformer(random_state=0)
101 | #scaler = RobustScaler()
102 | 
103 | 
104 | #%%
105 | # - Create test and train dataset and preprocess data
106 | "---------------------------------------------------------------------------"
107 | # Rows with league == "England" are held out; all remaining rows form the training set
108 | # Separate df_KPI between PL and the rest of the leagues
109 | mask_PL = df_KPI.league == "England"
110 | df_KPI_PL = df_KPI.loc[mask_PL]
111 | df_KPI_EU_train = df_KPI.loc[~mask_PL]
112 | 
113 | 
114 | #%%
115 | # - Rank the players 
116 | "---------------------------------------------------------------------------"
117 | # Fits one offensive and one defensive model per position group and builds the PL test data
118 | # Positions to fit for
119 | #positions_fitting = [['LB', 'RB'], ['CB'], ['LM', 'RM'], ['CM'], ['LW', 'RW'], ['ST']]
120 | positions_fitting = [['ST']]
121 | #positions_fitting = [['CB']]
122 | 
123 | # Names assigned inside the loop are overwritten on every pass; later cells see the last pass
124 | # Do fitting for all the positions
125 | for position in positions_fitting:
126 |     # print(position)
127 | 
128 |     ################################################
129 |     # - KPIs to fit for
130 |     "----------------------------------------------"
131 |     # All KPI columns; the last two (team_xG, opponent_xG) are skipped when scaling ([:-2])
132 |     list_kpi_all = ['passing%', 
133 |             'completed_passes',
134 |             'fouls',
135 |             'aerial%',
136 |             'aerial_wins',
137 |             'shots',
138 |             'dribbles%',
139 |             'succesful_dribbles',
140 |             'key_passes',
141 |             'succesful_through_passes',
142 |             'events_in_box',
143 |             'passes_to_box',
144 |             'creative_passes',
145 |             'succesful_def_actions',
146 |             'progressive_carries',
147 |             'red_card',
148 |             'own_goals',
149 |             'yellow_cards',
150 |             'danger_ball_loses',
151 |             'def_actions%',
152 |             'p_adj_succ_def_actions',
153 |             'team_xG',
154 |             'opponent_xG'
155 |             ] 
156 |     # Candidate KPIs for the offensive model (dependent variable: team_xG)
157 |     # KPIs when using KPI_tot_All
158 |     list_kpi_off = ['passing%', 
159 |                 'completed_passes',
160 |                 'fouls',
161 |                 #'aerial%',
162 |                 #'aerial_wins',
163 |                 'shots',
164 |                 'dribbles%',
165 |                 #'succesful_dribbles',
166 |                 'key_passes',
167 |                 #'succesful_through_passes',
168 |                 'events_in_box',
169 |                 'passes_to_box',
170 |                 #'creative_passes',
171 |                 #'succesful_def_actions', 
172 |                 #'progressive_carries',
173 |                 'red_card',
174 |                 'own_goals',
175 |                 'yellow_cards',
176 |                 'danger_ball_loses',
177 |                 #'def_actions%',
178 |                 'p_adj_succ_def_actions'
179 |                 ] 
180 |     # Candidate KPIs for the defensive model (dependent variable: opponent_xG)
181 |     list_kpi_def = ['passing%', 
182 |                 'completed_passes',
183 |                 'fouls',
184 |                 #'aerial%',
185 |                 #'aerial_wins',
186 |                 #'shots',
187 |                 'dribbles%',
188 |                 #'succesful_dribbles',
189 |                 #'key_passes',
190 |                 #'succesful_through_passes',
191 |                 #'events_in_box',
192 |                 #'passes_to_box',
193 |                 #'creative_passes',
194 |                 #'succesful_def_actions',
195 |                 #'progressive_carries',
196 |                 'red_card',
197 |                 'own_goals',
198 |                 'yellow_cards',
199 |                 'danger_ball_loses',
200 |                 #'def_actions%',
201 |                 'p_adj_succ_def_actions'
202 |                 ] 
203 | 
204 |     ################################################
205 |     # - Find model coefficients, r-squared and statistically significant KPIs
206 |     "----------------------------------------------"
207 |     # Call to fitting function to find coefficients and independent variables (offensive model)
208 |     dep_var_off = 'team_xG'
209 |     model_coef_off, r_squared_off, list_kpi_off_fitting, model_off = ff.KPI_fitting(df_KPI_EU_train, scaler,
210 |                                                   list_kpi_off, dep_var_off,
211 |                                                   position, min_minutes)
212 |     
213 |     # Call to fitting function to find coefficients and independent variables (defensive model)
214 |     dep_var_def = 'opponent_xG'
215 |     model_coef_def, r_squared_def, list_kpi_def_fitting, model_def = ff.KPI_fitting(df_KPI_EU_train, scaler,
216 |                                                   list_kpi_def, dep_var_def,
217 |                                                   position, min_minutes)
218 |     
219 |     
220 |     ################################################
221 |     # - Use the coefficients from EU to compute percentiles
222 |     #   in the PL gameweek 1-37, filtered PL training data
223 |     "----------------------------------------------"
224 |     # NOTE(review): the trailing 1 is presumably a minimum match count - verify in pf.filter_dataframe
225 |     # Filter and normalise the PL data (including GW 38)
226 |     df_filtered_PL = pf.filter_dataframe(df_KPI_PL, position, list_kpi_all, min_minutes, 1)
227 |     df_filtered_PL[list_kpi_all[:-2]] = scaler.fit_transform(df_filtered_PL[list_kpi_all[:-2]]) 
228 |     # fit_transform above refits the scaler on the PL rows (GW 38 included)
229 |     # Separate gameweek 38 from PL
230 |     test_gameweek = 38
231 |     df_PL_gameweek_38 = df_England_matches.loc[df_England_matches.gameweek == test_gameweek]
232 |     list_gameweek_38_matchId = df_PL_gameweek_38['wyId'].unique().tolist()
233 |     mask_last_gameweeks = df_filtered_PL.matchId.isin(list_gameweek_38_matchId)
234 |     
235 |     # KPIs GW 1-37
236 |     df_KPI_PL_test = df_filtered_PL.loc[~mask_last_gameweeks]
237 |     # [:-1] drops the last entry of each fitted KPI list - presumably the target column; verify
238 |     # Find test data
239 |     X_test_off = df_KPI_PL_test[list_kpi_off_fitting[:-1]]
240 |     X_test_def = df_KPI_PL_test[list_kpi_def_fitting[:-1]]
241 |     
242 |     # Add constant to test data
243 |     X_test_off = sm.add_constant(X_test_off)
244 |     X_test_def = sm.add_constant(X_test_def)
245 |     
246 |     # Loop through players in gameweek 1-37
247 |     #for i, player in df_KPI_PL_test.iterrows():
248 | 
249 | 
250 | 
251 | 
252 | 
253 | #%%
254 | # - Evaluate fitting
255 | "---------------------------------------------------------------------------"
256 | # model_* and X_test_* are whatever the last pass of the position loop left behind
257 | # Out of sample prediction
258 | y_pred_off = model_off.predict(X_test_off)
259 | y_pred_def = model_def.predict(X_test_def)
260 | 
261 | 
262 | 
263 | #%%
264 | # - Plot fitted values and computed team xG-values
265 | "---------------------------------------------------------------------------"
266 | 
267 | x_plot = np.arange(len(y_pred_off))
268 | y_plot = df_KPI_PL_test['team_xG'].copy()
269 | y_pred_plot = y_pred_off
270 | 
271 | y_diff = abs(y_plot - y_pred_plot)
272 | 
273 | 
274 | # Create figure and axes
275 | fig1, ax1 = plt.subplots(figsize=(12, 6))
276 | 
277 | width = 0.35  # the width of the bars
278 | 
279 | rects1 = ax1.bar(x_plot[0:30] - width/2, y_plot[0:30], width, label='xG-team actual')
280 | rects2 = ax1.bar(x_plot[0:30] + width/2, y_pred_plot[0:30], width, label='xG-team predicted')
281 | 
282 | #plt.bar(x_plot[0:50], y_plot[0:50], color='purple', label='xG-team actual')
283 | #plt.bar(x_plot[0:50], y_pred_plot[0:50], color='orange', label='xG-team predicted')
284 | #ax1.plot(x_plot[0:50], y_diff[0:50], '--', color='red', label='xG-team difference')
285 | 
286 | # x and y labels
287 | ax1.set_xlabel('matches', fontweight='bold', fontsize=20, fontproperties=serif_bold.prop)
288 | ax1.set_ylabel('xG', fontweight='bold', fontsize=20, fontproperties=serif_bold.prop)
289 | 
290 | # Adding title and subtitle
291 | fig1.text(0.05, 1, f"Actual and predicted xG-team values for position: {positions_fitting[0][0]} \n", fontsize=22,
292 |              fontproperties=serif_bold.prop)
293 | fig1.text(0.05, 1, 'First 30 matches in PL season 2017/18', fontsize=18,
294 |              fontproperties=serif_regular.prop)
295 | 
296 | # Add legend
297 | ax1.legend(loc='best', prop={"family": "Times New Roman", 'size': 14})
298 | 
299 | # Add grid and zorder
300 | ax1.grid(ls="dotted", lw=0.3, color="grey", alpha=1, zorder=1)
301 | 
302 | # The tight_layout() function in pyplot module of matplotlib library is used 
303 | # to automatically adjust subplot parameters to give specified padding.
304 | plt.tight_layout()
305 | plt.show()
306 | 
307 | #%%
308 | # - Statistics
309 | "---------------------------------------------------------------------------"
310 | # statistics.variance / statistics.stdev are the sample (n - 1) estimators
311 | # Difference
312 | y_diff_mean = y_diff.mean()
313 | y_diff_var = statistics.variance(y_diff)
314 | #y_diff_covar = statistics.covariance(y_plot, y_pred_plot)
315 | y_diff_stdvar = statistics.stdev(y_diff)
316 | 
317 | # Actual xG-team
318 | y_plot_mean = y_plot.mean()
319 | y_plot_var = statistics.variance(y_plot)
320 | y_plot_stdvar = statistics.stdev(y_plot)
321 | 
322 | # Predicted xG-team
323 | y_pred_plot_mean = y_pred_plot.mean()
324 | y_pred_plot_var = statistics.variance(y_pred_plot)
325 | y_pred_plot_stdvar = statistics.stdev(y_pred_plot)
326 | 
327 | 
328 | #%%
329 | # - Print stats
330 | "---------------------------------------------------------------------------"
331 | print('\n')
332 | print('=============== y_diff statistics: ================ ')
333 | print(f"Mean: {y_diff_mean}")
334 | print(f"Variance: {y_diff_var}")
335 | print(f"Standard deviation: {y_diff_stdvar}")
336 | 
337 | print('\n')
338 | print('=============== Actual xG-team statistics: ================ ')
339 | print(f"Mean: {y_plot_mean}")
340 | print(f"Variance: {y_plot_var}")
341 | print(f"Standard deviation: {y_plot_stdvar}")
342 | 
343 | print('\n')
344 | print('=============== Predicted xG-team statistics: ================ ')
345 | print(f"Mean: {y_pred_plot_mean}")
346 | print(f"Variance: {y_pred_plot_var}")
347 | print(f"Standard deviation: {y_pred_plot_stdvar}")
348 | 
349 | 
350 | #%%
351 | # - Plot fitted values and computed team xGC-values
352 | "---------------------------------------------------------------------------"
353 | 
354 | x_plot2 = np.arange(len(y_pred_def))
355 | y_plot2 = df_KPI_PL_test['opponent_xG'].copy()
356 | y_pred_plot2 = y_pred_def
357 | 
358 | y_diff2 = abs(y_plot2 - y_pred_plot2)
359 | 
360 | # Create figure and axes
361 | fig2, ax2 = plt.subplots(figsize=(12, 6))
362 | 
363 | width = 0.35  # the width of the bars
364 | 
365 | rects1 = ax2.bar(x_plot2[0:30] - width/2, y_plot2[0:30], width, label='xGC-team actual')
366 | rects2 = ax2.bar(x_plot2[0:30] + width/2, y_pred_plot2[0:30], width, label='xGC-team predicted')
367 | 
368 | #plt.bar(x_plot[0:50], y_plot[0:50], color='purple', label='xG-team actual')
369 | #plt.bar(x_plot[0:50], y_pred_plot[0:50], color='orange', label='xG-team predicted')
370 | #ax1.plot(x_plot[0:50], y_diff[0:50], '--', color='red', label='xG-team difference')
371 | 
372 | # x and y labels
373 | ax2.set_xlabel('matches', fontweight='bold', fontsize=20, fontproperties=serif_bold.prop)
374 | ax2.set_ylabel('xGC', fontweight='bold', fontsize=20, fontproperties=serif_bold.prop)
375 | 
376 | # Add legend
377 | ax2.legend(loc='best', prop={"family": "Times New Roman", 'size': 14})
378 | 
379 | # Add grid and zorder
380 | ax2.grid(ls="dotted", lw=0.3, color="grey", alpha=1, zorder=1)
381 | 
382 | # Adding title and subtitle
383 | fig2.text(0.05, 1, f"Actual and predicted xGC-team values for position: {positions_fitting[0][0]} \n", fontsize=22,
384 |              fontproperties=serif_bold.prop)
385 | fig2.text(0.05, 1, 'First 30 matches in PL season 2017/18', fontsize=18,
386 |              fontproperties=serif_regular.prop)
387 | 
388 | # The tight_layout() function in pyplot module of matplotlib library is used 
389 | # to automatically adjust subplot parameters to give specified padding.
390 | plt.tight_layout()
391 | plt.show()
392 | 
393 | #%%
394 | # - Statistics
395 | "---------------------------------------------------------------------------"
396 | # statistics.variance / statistics.stdev are the sample (n - 1) estimators
397 | # Difference
398 | y_diff2_mean = y_diff2.mean()
399 | y_diff2_var = statistics.variance(y_diff2)
400 | #y_diff_covar = statistics.covariance(y_plot, y_pred_plot)
401 | y_diff2_stdvar = statistics.stdev(y_diff2)
402 | 
403 | # Actual xGC-team
404 | y_plot2_mean = y_plot2.mean()
405 | y_plot2_var = statistics.variance(y_plot2)
406 | y_plot2_stdvar = statistics.stdev(y_plot2)
407 | 
408 | # Predicted xGC-team
409 | y_pred_plot2_mean = y_pred_plot2.mean()
410 | y_pred_plot2_var = statistics.variance(y_pred_plot2)
411 | y_pred_plot2_stdvar = statistics.stdev(y_pred_plot2)
412 | 
413 | 
414 | #%%
415 | # - Print stats
416 | "---------------------------------------------------------------------------"
417 | print('\n')
418 | print('=============== y_diff2 statistics: ================ ')
419 | print(f"Mean: {y_diff2_mean}")
420 | print(f"Variance: {y_diff2_var}")
421 | print(f"Standard deviation: {y_diff2_stdvar}")
422 | 
423 | print('\n')
424 | print('=============== Actual xGC-team statistics: ================ ')
425 | print(f"Mean: {y_plot2_mean}")
426 | print(f"Variance: {y_plot2_var}")
427 | print(f"Standard deviation: {y_plot2_stdvar}")
428 | 
429 | print('\n')
430 | print('=============== Predicted xGC-team statistics: ================ ')
431 | print(f"Mean: {y_pred_plot2_mean}")
432 | print(f"Variance: {y_pred_plot2_var}")
433 | print(f"Standard deviation: {y_pred_plot2_stdvar}")
434 | 
435 | 
436 | 
437 | 
438 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Player_Rating_Project
  2 | ==============================
  3 | 
  4 | Instructions for how to run the files, and what needs to be downloaded beforehand, for the Player_Rating_Project. The project was carried out at Uppsala University for the course "Advanced Course on Topics in Scientific Computing I", HT2021 period 1.
  5 | 
  6 | Python Packages Needed
  7 | ------------
  8 | - `pandas`
  9 | - `numpy`
 10 | - `json`
 11 | - `matplotlib`
 12 | - `seaborn`
 13 | - `mplsoccer`
 14 | - `sklearn`
 15 | - `statsmodels`
 16 | - `tabulate`
 17 | 
 18 | Downloads
 19 | ------------
 20 | Make sure to have Python3 downloaded, along with needed packages listed above.
 21 | 
 22 | Get the Wyscout data from: https://figshare.com/collections/Soccer_match_event_dataset/4415000/2 
 23 | 
 24 | The following data sets from Wyscout are needed: "events.json", "matches.json", "players.json" and "teams.json".
 25 | 
 26 | Place the downloaded Wyscout data in a folder named: `Wyscout`, placed two levels above the Python code (see below).
 27 | 
 28 | Download the folder 'Json_files' from https://drive.google.com/drive/folders/1Yhta6-kl6Z9sn_Uy2JpMC9UiNObn6VFz?usp=sharing and place it one level above the Python code (see below). If the Wyscout data has been downloaded, the files in this folder can also be generated by running the following programs in order:
 29 |     
 30 |     1. create_events_df_eu.py
 31 |     
 32 |     2. minutes_played.py
 33 |     
 34 |     3. create_KPI_dataframe.py 
 35 |     
 36 |     (4.) create_KPI_dataframe_EDIT.py (need some modifications, see comments)
 37 |     
 38 | This is not recommended, though, since create_KPI_dataframe.py takes quite a long time to run.
 39 | 
 40 | Also download the Excel sheet `Gameweek_38.xlsx` from https://docs.google.com/spreadsheets/d/1bIpAxH0iWEot8tAlIQcvBB_uX-Au-qjX/edit?usp=sharing&ouid=117928085659621731785&rtpof=true&sd=true and place it one level above the Python code (see below).
 41 | 
 42 | Running Instructions
 43 | ------------
 44 | When the folders and files above are downloaded (or created), the following programs can be run to see the resulting ratings from gameweek 38:
 45 | 
 46 |     1. GW_38_Ratings.py
 47 |     
 48 |     2. the_match_ranking.py
 49 |     
 50 | The following programs can then be run to evaluate the ratings and the xG-model:
 51 | 
 52 |     1. GW_38_Ratings_evaluation.py
 53 |     
 54 |     2. xG_model_evaluation.py
 55 |     
 56 |     3. validation_vs_WhoScored.py
 57 | 
 58 | 
 59 | Project Organization
 60 | ------------
 61 | 
 62 |     ├── README.md                               <- The top-level README for running this project.
 63 |     |
 64 |     ├── Wyscout                                 <- Wyscout data folder.
 65 |     │   │
 66 |     │   ├── players.json
 67 |     │   │
 68 |     │   ├── teams.json  
 69 |     │   │
 70 |     │   ├── events            
 71 |     │   │   ├── events_England.json
 72 |     │   │   ├── events_France.json
 73 |     │   │   ├── events_Germany.json
 74 |     │   │   ├── events_Italy.json
 75 |     │   │   └── events_Spain.json
 76 |     │   │
 77 |     │   └── matches            
 78 |     │       ├── matches_England.json
 79 |     │       ├── matches_France.json
 80 |     │       ├── matches_Germany.json
 81 |     │       ├── matches_Italy.json
 82 |     │       └── matches_Spain.json
 83 |     │
 84 |     └──Player_rating_Project                    <- Main folder for this project.
 85 |         |
 86 |         │── Gameweek_38.xlsx                    <- Excel with validation data from Whoscored to compare with.
 87 |         │
 88 |         │── Json_files                          <- Folder where created json-files are stored.
 89 |         │
 90 |         └── Python_Code                         <- Source code for this project.
 91 |             |
 92 |             |── create_events_df_eu.py
 93 |             |── create_KPI_dataframe_EDIT.py
 94 |             |── create_KPI_dataframe.py
 95 |             |── FCPython.py
 96 |             |── fitting_functions.py
 97 |             |── GW_38_Ratings_evaluation.py
 98 |             |── GW_38_Ratings.py
 99 |             |── KPI_functions.py
100 |             |── minutes_played.py
101 |             |── the_match_ranking.py
102 |             |── validation_vs_WhoScored.py
103 |             └── xG_model_evaluation.py
104 | 
105 | --------
106 | 
107 | By: Jakob Edberger Persson and Emil Danielsson, 2021
108 | 


--------------------------------------------------------------------------------
/__pycache__/FCPython.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilDanielsson/Player-Rating-Project/ae4b93c0aaa761e8a23c04666a1b1365637955df/__pycache__/FCPython.cpython-38.pyc


--------------------------------------------------------------------------------
/__pycache__/KPI_functions.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilDanielsson/Player-Rating-Project/ae4b93c0aaa761e8a23c04666a1b1365637955df/__pycache__/KPI_functions.cpython-38.pyc


--------------------------------------------------------------------------------
/__pycache__/fitting_functions.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilDanielsson/Player-Rating-Project/ae4b93c0aaa761e8a23c04666a1b1365637955df/__pycache__/fitting_functions.cpython-38.pyc


--------------------------------------------------------------------------------
/__pycache__/percentile_functions.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilDanielsson/Player-Rating-Project/ae4b93c0aaa761e8a23c04666a1b1365637955df/__pycache__/percentile_functions.cpython-38.pyc


--------------------------------------------------------------------------------
/create_KPI_dataframe.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Mon Sep 13 16:54:33 2021
  5 | 
  6 | @author: emildanielsson & JakobEP
  7 | 
  8 | Program description: 
  9 |     1. Read in data
 10 |     2. Creates two dataframes; 
 11 |         df_KPI      - Dataframe of all the player's KPI's from each game
 12 |         df_KPI_info - Dataframe with info of player's KPI's 
 13 |     (3.) Create and store the two dataframes as json-files in the working directory
 14 | 
 15 |     Note that this code takes a very long time to run, and therefore some other KPIs 
 16 |     which were developed later have been added by the program: create_KPI_dataframe_EDIT.
 17 |     That program is recommended for future use. 
 18 |     
 19 | """
 20 | 
 21 | # The basics
 22 | import pandas as pd
 23 | import numpy as np
 24 | import json
 25 | 
 26 | 
 27 | # Statistical fitting of models
 28 | # import statsmodels.api as sm
 29 | # import statsmodels.formula.api as smf
 30 | from sklearn.model_selection import train_test_split
 31 | from sklearn.linear_model import LogisticRegression
 32 | # from sklearn.preprocessing import PolynomialFeatures
 33 | 
 34 | # Import KPI-funcion
 35 | import KPI_functions as kpi
 36 | 
 37 | 
 38 | #%%
 39 | # - Create dataframes from the Wyscout data
 40 | "---------------------------------------------------------------------------"
 41 | # Json_files is read from one level up, the Wyscout folder from two levels up
 42 | # Create event dataframe
 43 | #df_Europe_events = pd.read_json('Json_files/events_All.json', encoding="unicode_escape") #SLOWER
 44 | with open('../Json_files/events_All.json') as f:
 45 |     data_Europe= json.load(f)
 46 |     
 47 | df_Europe_events = pd.DataFrame(data_Europe)
 48 | 
 49 | # Create match dataframes (one per league)
 50 | df_England_matches = pd.read_json('../../Wyscout/matches/matches_England.json', encoding="unicode_escape")
 51 | 
 52 | df_France_matches = pd.read_json('../../Wyscout/matches/matches_France.json', encoding="unicode_escape")
 53 | 
 54 | df_Germany_matches = pd.read_json('../../Wyscout/matches/matches_Germany.json', encoding="unicode_escape")
 55 | 
 56 | df_Italy_matches = pd.read_json('../../Wyscout/matches/matches_Italy.json', encoding="unicode_escape")
 57 | 
 58 | df_Spain_matches = pd.read_json('../../Wyscout/matches/matches_Spain.json', encoding="unicode_escape")
 59 | 
 60 | 
 61 | # Create players and teams dataframes
 62 | df_players = pd.read_json("../../Wyscout/players.json", encoding="unicode_escape")
 63 | df_teams = pd.read_json("../../Wyscout/teams.json", encoding="unicode_escape")
 64 | 
 65 | 
 66 | #%%
 67 | # - Merge matches dataframes from all leagues 
 68 | "---------------------------------------------------------------------------"
 69 | # The order of frames_matches must match the order of the league names passed as keys
 70 | frames_matches = [df_England_matches, df_France_matches, df_Germany_matches, 
 71 |                   df_Italy_matches, df_Spain_matches]
 72 | # keys= makes the league name the outer level of the index of the combined dataframe
 73 | df_Europe_matches = pd.concat(frames_matches, keys = ["England", "France",
 74 |                                                       "Germany", "Italy", "Spain"])
 75 | 
 76 | 
 77 | #%%
 78 | # - Read in minutes played data
 79 | "---------------------------------------------------------------------------"
 80 | # Loaded with json + DataFrame, the same way as the event data in this file
 81 | with open('../Json_files/minutes_played_All.json') as f:
 82 |     data_minutes = json.load(f)
 83 |     
 84 | df_minutes = pd.DataFrame(data_minutes)
 85 | 
 86 | 
 87 | #%%
 88 | # - Read in data for xG-model and get the coeficients dataframes
 89 | "---------------------------------------------------------------------------"  
 90 | 
 91 | with open('../Json_files/xG_model_v2_All_except_Eng.json') as f:
 92 |     data_xG_model = json.load(f)
 93 | 
 94 | # Create dataframes
 95 | df_xG_model = pd.DataFrame(data_xG_model)  
 96 | 
 97 | # Call xG-m
 98 | df_log_model_shots_coef, df_log_model_headers_coef, df_log_model_free_kicks_coef, log_model, log_model_headers, log_model_free_kicks = ff.xG_model(df_xG_model)
 99 | 
100 | 
101 | #%%
102 | # - Create the dataframe of all KPI's
103 | "---------------------------------------------------------------------------"
104 | # Empty dataframe for the per-90 KPIs: match/player context columns first, KPI columns after
105 | # Prepare the dataframe with the columns we need
106 | df_KPI_p90 = pd.DataFrame(columns=['matchId',
107 |                                    'league',
108 |                                'teamName',
109 |                                'playerId',
110 |                                'shortName',
111 |                                'role',
112 |                                'minutesPlayed',
113 |                                'team_goals',
114 |                                'team_conceded_goals',
115 |                                'red_card',
116 |                                # KPI's from here
117 |                                'goals',
118 |                                'assists',
119 |                                'passing%',
120 |                                'completed_passes_p90',
121 |                                'fouls_p90',
122 |                                'aerial%',
123 |                                'aerial_wins_p90',
124 |                                'shots_p90',
125 |                                'dribbles%',
126 |                                'succesful_dribbles_p90',
127 |                                'key_passes_p90',
128 |                                'succesful_through_passes_p90',
129 |                                'plus_minus',
130 |                                'events_in_box_p90',
131 |                                'passes_to_box_p90',
132 |                                'creative_passes_p90',
133 |                                'succesful_def_actions_p90',
134 |                                'progressive_carries_p90',
135 |                                'xG_p90',
136 |                                'xG_tot',
137 |                                'xG_shots',
138 |                                'xG_headers',
139 |                                'xG_free_kicks',
140 |                                'xG_penalties'])
141 | 
142 | # Empty dataframe for the KPI totals: same columns as df_KPI_p90 minus the _p90 suffixes and xG_p90
143 | df_KPI_tot = pd.DataFrame(columns=['matchId',
144 |                                    'league',
145 |                                'teamName',
146 |                                'playerId',
147 |                                'shortName',
148 |                                'role',
149 |                                'minutesPlayed',
150 |                                'team_goals',
151 |                                'team_conceded_goals',
152 |                                'red_card',
153 |                                # KPI's from here
154 |                                'goals',
155 |                                'assists',
156 |                                'passing%',
157 |                                'completed_passes',
158 |                                'fouls',
159 |                                'aerial%',
160 |                                'aerial_wins',
161 |                                'shots',
162 |                                'dribbles%',
163 |                                'succesful_dribbles',
164 |                                'key_passes',
165 |                                'succesful_through_passes',
166 |                                'plus_minus',
167 |                                'events_in_box',
168 |                                'passes_to_box',
169 |                                'creative_passes',
170 |                                'succesful_def_actions',
171 |                                'progressive_carries',
172 |                                'xG_tot',
173 |                                'xG_shots',
174 |                                'xG_headers',
175 |                                'xG_free_kicks',
176 |                                'xG_penalties'])
177 | 
178 | # Empty dataframe for the KPI info: identifier columns plus one 'info_<kpi>' column per KPI
179 | df_KPI_info = pd.DataFrame(columns=['matchId',
180 |                                     'league',
181 |                                'playerId',
182 |                                'shortName',                              
183 |                                # KPI-info's from here
184 |                                'info_goals',
185 |                                'info_assists',
186 |                                'info_passing%',
187 |                                'info_completed_passes',
188 |                                'info_fouls',
189 |                                'info_aerial%',
190 |                                'info_aerial_wins',
191 |                                'info_shots',
192 |                                'info_dribbles%',
193 |                                'info_succesful_dribbles',
194 |                                'info_key_passes',
195 |                                'info_succesful_through_passes',
196 |                                'info_plus_minus',
197 |                                'info_events_in_box',
198 |                                'info_passes_to_box',
199 |                                'info_creative_passes',
200 |                                'info_succesful_def_actions',
201 |                                'info_progressive_carries',
202 |                                'info_xG'])
203 | 
204 | 
205 | # Match id checkpoints: 0, 5, 10, ..., 2095; j starts at 0 (presumably a progress counter - verify)
206 | loop_checkpoints = np.arange(0, 2100, 5)
207 | j = 0
208 | 
209 | # Loop trough all matches
210 | for i, match in df_Europe_matches.iterrows():
211 |     
212 |     # Find the events from match_i
213 |     mask_match = df_Europe_events.matchId == match.wyId
214 |     df_events_match = df_Europe_events.loc[mask_match]
215 |     
216 |     # List of all the players involved in match_i
217 |     player_match_list = df_events_match['playerId'].unique().tolist()
218 |     
219 |     ################################################
220 |     # - Find home and away score
221 |     "----------------------------------------------"
222 |     
223 |     # Find teamIds in the match
224 |     teams_match_list = df_events_match['teamId'].unique().tolist()
225 |     
226 |     # Find the match data from df_matches
227 |     mask_score = df_Europe_matches.wyId == match.wyId
228 |     df_the_match = df_Europe_matches.loc[mask_score]
229 |     team_data = df_the_match.teamsData
230 |     
231 |     ################################################
232 |     # - Get home and away teams and scores
233 |     "----------------------------------------------"
234 |     home_team_list = []
235 |     away_team_list = []
236 |     for i in range(2):
237 |         team_data_i = team_data[0][str(teams_match_list[i])]
238 |         team_lineup = team_data_i['formation']['lineup']
239 |         team_bench = team_data_i['formation']['bench']
240 |         
241 |         # Get the lineup players
242 |         for player in team_lineup:
243 |             if team_data_i['side'] == "home":
244 |                 home_team_list.append(player['playerId'])
245 |             elif team_data_i['side'] == "away":
246 |                 away_team_list.append(player['playerId'])
247 |             else:
248 |                 print("Error: " + team_data_i['side'])
249 |         
250 |         # Get the bench players
251 |         for player in team_bench:
252 |             if team_data_i['side'] == "home":
253 |                 home_team_list.append(player['playerId'])
254 |             elif team_data_i['side'] == "away":
255 |                 away_team_list.append(player['playerId'])
256 |             else:
257 |                 print("Error: " + team_data_i['side'])
258 |                 
259 |         # Set home and away score
260 |         if team_data_i['side'] == "home":
261 |             home_team_score = team_data_i['score']
262 |         elif team_data_i['side'] == "away":
263 |             away_team_score = team_data_i['score']
264 |         else:
265 |             print("Error: " + team_data_i['score'])
266 |                     
267 |     # End of finding home and away teams and score
268 |     "----------------------------------------------"
269 | 
270 |     
271 |     # Loop trough all players and get their average position and compute KPI's
272 |     for player in player_match_list:
273 |         
274 |         # Find the minutes played, team and red card
275 |         mask_minutes = (df_minutes.playerId == player) & (df_minutes.matchId == match.wyId)
276 |         df_player_minutes = df_minutes.loc[mask_minutes]
277 |         
278 |         # Some players are not registered the subbed in but their events are registerd
279 |         # If they are not subbed in correctly in Wyscout matches "df_player_minutes"
280 |         # will be empty. Thus we check this here. 
281 |         if len(df_player_minutes != 0):
282 |             player_minutes = df_player_minutes['minutesPlayed'][0]
283 |             player_in_min = df_player_minutes['player_in_min'][0]
284 |             player_out_min = df_player_minutes['player_out_min'][0]
285 |             player_team = df_player_minutes['teamId'][0]
286 |             player_team_name = df_player_minutes['teamName'][0]
287 |             red_card_bool = df_player_minutes['red_card'][0]
288 |             
289 |             # mask to find the given player-events
290 |             mask_player = df_events_match.playerId == player
291 |             
292 |             # New dataframe with all events from 'player' in match
293 |             df_events_player = df_events_match.loc[mask_player]
294 |             
295 |             # Get the role of the player
296 |             position = df_events_player['Position'][0]
297 |             
298 |             # Get the league
299 |             league = df_events_player["league"][0]
300 |             
301 |             # Get the shortName
302 |             name = df_events_player['shortName'][0]
303 |             
304 |             # Get the team goal and goals conceded
305 |             if (player in home_team_list):
306 |                 team_goals = home_team_score
307 |                 team_conceded_goals = away_team_score
308 |             elif (player in away_team_list):
309 |                 team_goals = away_team_score
310 |                 team_conceded_goals = home_team_score
311 |             else:
312 |                 print("Error: cant find player in list")
313 |             
314 |             
315 |             ################################################
316 |             # - All function calls to compute kpi's
317 |             "----------------------------------------------"
318 |             
319 |             # goals
320 |             goals, goals_info = kpi.nr_goals(df_events_player, player_minutes)
321 |             
322 |             # assists
323 |             assists, assists_info = kpi.nr_assists(df_events_player, player_minutes)
324 |             
325 |             # passing%
326 |             pass_percent, pass_percent_info = kpi.percent_passes_completed(df_events_player, player_minutes)
327 |             
328 |             # passes_completed
329 |             pass_comp, pass_comp_p90, pass_comp_info = kpi.passes_completed(df_events_player, player_minutes)
330 |             
331 |             # fouls
332 |             fouls, fouls_p90, fouls_info = kpi.fouls(df_events_player, player_minutes)
333 |             
334 |             # aerials%
335 |             aerials_percent, aerials_percent_info = kpi.percent_aerial_wins(df_events_player, player_minutes)
336 |             
337 |             # aerials_won
338 |             aerial_wins, aerial_wins_p90, aerial_wins_info = kpi.aerials_won(df_events_player, player_minutes)
339 |             
340 |             # shots
341 |             shots, shots_p90, shots_info = kpi.shots(df_events_player, player_minutes)
342 |             
343 |             # dribbles%
344 |             dribbles_percent, dribbles_percent_info = kpi.percent_succesful_dribbles(df_events_player, player_minutes)
345 |             
346 |             # succesful_dribbles
347 |             succesful_dribbles, succesful_dribbles_p90, succesful_dribbles_info = kpi.succesful_dribbles(df_events_player, player_minutes)
348 |             
349 |             # key_passes
350 |             key_passes, key_passes_p90, key_passes_info = kpi.key_passes(df_events_player, player_minutes)
351 |             
352 |             # succesful_through_passes
353 |             succesful_through_passes, succesful_through_passes_p90, succesful_through_passes_info = kpi.succesful_through_passes(df_events_player, player_minutes)
354 |             
355 |             # plus-minus
356 |             plus_minus, plus_minus_info = kpi.plus_minus(df_events_match, player_team, player_minutes, player_in_min, player_out_min)
357 |             
358 |             # events_in_box
359 |             events_in_box, events_in_box_p90, event_in_box_info = kpi.events_in_box(df_events_player, player_minutes)
360 |             
361 |             # passes_to_box
362 |             passes_to_box, passes_to_box_p90, passes_to_box_info = kpi.passes_to_box(df_events_player, player_minutes)
363 |             
364 |             # creative_passes
365 |             creative_passes, creative_passes_p90, creative_passes_info = kpi.creative_passes(df_events_player, player_minutes)
366 |             
367 |             # defensive_actions
368 |             succesful_def_actions, succesful_def_actions_p90, succesful_def_actions_info = kpi.succesful_def_actions(df_events_player, player_minutes)
369 |             
370 |             # progressive_carries 
371 |             progressive_carries, progressive_carries_p90, progressive_carries_info = kpi.progressive_carries(df_events_player, player_minutes) 
372 |             
373 |             # xG
374 |             xG_tot, xG_tot_p90, xG_info, xG_shots, xG_headers, xG_free_kicks, xG_penalties = kpi.xG(df_events_player, player_minutes, df_log_model_shots_coef, df_log_model_headers_coef, df_log_model_free_kicks_coef)
375 |             
376 |             
377 |             
378 |             ########################################################
379 |             # - Add rows to df_KPI_p90, df_KPI_tot and df_KPI_info
380 |             "------------------------------------------------------"
381 |             
382 |             # df_KPI_p90
383 |             df_KPI_p90.loc[df_KPI_p90.shape[0]] = [match.wyId, league, player_team_name, player, name,
384 |                                            position, player_minutes, team_goals, 
385 |                                            team_conceded_goals, red_card_bool,
386 |                                            goals,
387 |                                            assists,
388 |                                            pass_percent,
389 |                                            pass_comp_p90,
390 |                                            fouls_p90,
391 |                                            aerials_percent,
392 |                                            aerial_wins_p90,
393 |                                            shots_p90,
394 |                                            dribbles_percent,
395 |                                            succesful_dribbles_p90,
396 |                                            key_passes_p90,
397 |                                            succesful_through_passes_p90,
398 |                                            plus_minus,
399 |                                            events_in_box_p90,
400 |                                            passes_to_box_p90,
401 |                                            creative_passes_p90,
402 |                                            succesful_def_actions_p90,
403 |                                            progressive_carries_p90,
404 |                                            xG_tot_p90,
405 |                                            xG_tot,
406 |                                            xG_shots,
407 |                                            xG_headers,
408 |                                            xG_free_kicks,
409 |                                            xG_penalties]
410 |             
411 |             # df_KPI_tot
412 |             df_KPI_tot.loc[df_KPI_tot.shape[0]] = [match.wyId, league, player_team_name, player, name,
413 |                                            position, player_minutes, team_goals, 
414 |                                            team_conceded_goals, red_card_bool,
415 |                                            goals,
416 |                                            assists,
417 |                                            pass_percent,
418 |                                            pass_comp,
419 |                                            fouls,
420 |                                            aerials_percent,
421 |                                            aerial_wins,
422 |                                            shots,
423 |                                            dribbles_percent,
424 |                                            succesful_dribbles,
425 |                                            key_passes,
426 |                                            succesful_through_passes,
427 |                                            plus_minus,
428 |                                            events_in_box,
429 |                                            passes_to_box,
430 |                                            creative_passes,
431 |                                            succesful_def_actions,
432 |                                            progressive_carries,
433 |                                            xG_tot,
434 |                                            xG_shots,
435 |                                            xG_headers,
436 |                                            xG_free_kicks,
437 |                                            xG_penalties]
438 |             
439 |             
440 |             # df_KPI_info
441 |             df_KPI_info.loc[df_KPI_info.shape[0]] = [match.wyId, league, player, name,
442 |                                                       goals_info,
443 |                                                       assists_info,
444 |                                                       pass_percent_info,
445 |                                                       pass_comp_info,
446 |                                                       fouls_info,
447 |                                                       aerials_percent_info,
448 |                                                       aerial_wins_info,
449 |                                                       shots_info,
450 |                                                       dribbles_percent_info,
451 |                                                       succesful_dribbles_info,
452 |                                                       key_passes_info,
453 |                                                       succesful_through_passes_info,
454 |                                                       plus_minus_info,
455 |                                                       event_in_box_info,
456 |                                                       passes_to_box_info,
457 |                                                       creative_passes_info,
458 |                                                       succesful_def_actions_info,
459 |                                                       progressive_carries_info,
460 |                                                       xG_info]
461 |         
462 |         
463 |     if (j in loop_checkpoints):
464 |         print(f"Number of matches with computed KPI's': {j}\n")
465 | 
466 |     j+=1
467 | 
468 | 
469 | #%%
470 | # - Save dataframes to json-files (comment out any that should not be written)
471 | "---------------------------------------------------------------------------" 
472 | df_KPI_p90.to_json("Json_files/KPI_per_90_All.json")
473 | df_KPI_tot.to_json("Json_files/KPI_tot_All.json")
474 | df_KPI_info.to_json("Json_files/KPI_info_All.json")
475 | 
476 |         
477 |     


--------------------------------------------------------------------------------
/create_KPI_dataframe_EDIT.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Mon Sep 13 16:54:33 2021
  5 | 
  6 | @author: emildanielsson & JakobEP
  7 | 
  8 | Program description: 
  9 |     Edit KPI-dataframes created from create_KPI_dataframe.py
 10 |     Writes over changes to the same file
 11 |     
 12 | """
 13 | 
 14 | # The basics
 15 | import pandas as pd
 16 | import numpy as np
 17 | import json
 18 | 
 19 | # Import KPI-functions
 20 | import KPI_functions as kpi
 21 | 
 22 | 
 23 | #%%
 24 | # - Create dataframes from the Wyscout data
 25 | "---------------------------------------------------------------------------"
 26 | 
 27 | # Create event dataframe via json.load + pd.DataFrame (the pd.read_json variant below is marked as slower)
 28 | #df_Europe_events = pd.read_json('Json_files/events_All.json', encoding="unicode_escape") #SLOWER
 29 | with open('../Json_files/events_All.json') as f:
 30 |     data_Europe = json.load(f)
 31 |     
 32 | df_Europe_events = pd.DataFrame(data_Europe)
 33 | 
 34 | # Create match dataframes, one per league
 35 | df_England_matches = pd.read_json('../../Wyscout/matches/matches_England.json', encoding="unicode_escape")
 36 | 
 37 | df_France_matches = pd.read_json('../../Wyscout/matches/matches_France.json', encoding="unicode_escape")
 38 | 
 39 | df_Germany_matches = pd.read_json('../../Wyscout/matches/matches_Germany.json', encoding="unicode_escape")
 40 | 
 41 | df_Italy_matches = pd.read_json('../../Wyscout/matches/matches_Italy.json', encoding="unicode_escape")
 42 | 
 43 | df_Spain_matches = pd.read_json('../../Wyscout/matches/matches_Spain.json', encoding="unicode_escape")
 44 | 
 45 | 
 46 | # Create players and teams dataframes (neither is referenced again in this script)
 47 | df_players = pd.read_json("../../Wyscout/players.json", encoding="unicode_escape")
 48 | df_teams = pd.read_json("../../Wyscout/teams.json", encoding="unicode_escape")
 49 | 
 50 | 
 51 | #%%
 52 | # - Merge matches dataframes from all leagues (the league name becomes the outer index level)
 53 | "---------------------------------------------------------------------------"
 54 | 
 55 | frames_matches = [df_England_matches, df_France_matches, df_Germany_matches, 
 56 |                   df_Italy_matches, df_Spain_matches]
 57 | 
 58 | df_Europe_matches = pd.concat(frames_matches, keys = ["England", "France",
 59 |                                                       "Germany", "Italy", "Spain"])
 60 | 
 61 | 
 62 | #%%
 63 | # - Read in the minutes-played data (looked up by playerId and matchId in the loop below)
 64 | "---------------------------------------------------------------------------"
 65 | 
 66 | with open('../Json_files/minutes_played_All.json') as f:
 67 |     data_minutes = json.load(f)
 68 |     
 69 | df_minutes = pd.DataFrame(data_minutes)
 70 | 
 71 | 
 72 | #%%
 73 | # - Read in the KPI dataframes to edit (total and per-90; loading of the info file is commented out)
 74 | "---------------------------------------------------------------------------"
 75 | 
 76 | with open('../Json_files/new_KPI_tot_All.json') as f:
 77 |     data_kpi_tot = json.load(f)
 78 |     
 79 | with open('../Json_files/new_KPI_per_90_All.json') as f:
 80 |     data_kpi_p90 = json.load(f)
 81 |     
 82 | # with open('Json_files/KPI_info_All.json') as f:
 83 | #     data_kpi_info = json.load(f)
 84 |     
 85 | df_KPI_tot = pd.DataFrame(data_kpi_tot)
 86 |     
 87 | df_KPI_p90 = pd.DataFrame(data_kpi_p90)
 88 | 
 89 | #df_KPI_info = pd.DataFrame(data_kpi_info)
 90 | 
 91 | 
 92 | #%%
 93 | # - Find number of own goals
 94 | "---------------------------------------------------------------------------"
 95 | 
 96 | # Df with all own goals (only consumed by the commented-out own-goal code in the loop below)
 97 | df_own_goals = kpi.own_goals(df_Europe_events)
 98 | 
 99 |     
100 | #%%
101 | # - Loop to add additional KPIs
102 | "---------------------------------------------------------------------------"
103 | 
104 | # Match id checkpoints
105 | loop_checkpoints = np.arange(0, 2100, 5)
106 | j = 0
107 | 
108 | # Loop through all matches
109 | for i, match in df_Europe_matches.iterrows():
110 |     
111 |     # Find the events from match_i
112 |     mask_match = df_Europe_events.matchId == match.wyId
113 |     df_events_match = df_Europe_events.loc[mask_match]
114 |     
115 |     # List of all the players involved in match_i
116 |     player_match_list = df_events_match['playerId'].unique().tolist()
117 |     
118 |     # Loop trough all players and get their average position and compute KPI's
119 |     for player in player_match_list:
120 |         
121 |         # Find the minutes played, team and red card
122 |         mask_minutes = (df_minutes.playerId == player) & (df_minutes.matchId == match.wyId)
123 |         df_player_minutes = df_minutes.loc[mask_minutes]
124 |         
125 |         # Some players are not registered the subbed in but their events are registerd
126 |         # If they are not subbed in correctly in Wyscout matches "df_player_minutes"
127 |         # will be empty. Thus we check this here. 
128 |         if len(df_player_minutes != 0):
129 |             player_minutes = df_player_minutes['minutesPlayed'][0]
130 |             
131 |             # mask to find the given player-events
132 |             mask_player = df_events_match.playerId == player
133 |             
134 |             # New dataframe with all events from 'player' in match
135 |             df_events_player = df_events_match.loc[mask_player]
136 |             
137 |             
138 |             ################################################
139 |             # - Check after own goals from player in match
140 |             "----------------------------------------------"
141 |             
142 |             # Initiate temp variable
143 |             # own_goals_player = 0
144 |             
145 |             # # Read out any eventual own goals 
146 |             # mask_own_goals = (df_own_goals.playerId == player) & (df_own_goals.matchId == match.wyId)
147 |             # df_own_goals_player = df_own_goals.loc[mask_own_goals]
148 |             
149 |             # # Check there were any own goals
150 |             # if len(df_own_goals_player) != 0:
151 |             #     own_goals_player = len(df_own_goals_player)
152 |             
153 |             
154 |             ################################################
155 |             # - All function calls to compute kpi's
156 |             "----------------------------------------------"
157 |             
158 |             # danger_ball_loses
159 |             #danger_ball_loses, danger_ball_loses_p90, danger_ball_loses_info = kpi.danger_ball_loses(df_events_player, player_minutes)
160 |             
161 |             # yellow_cards
162 |             #yellow_cards, yellow_cards_info = kpi.yellow_cards(df_events_player)
163 |             
164 |             # percent_def_actions
165 |             percent_def_actions, percent_def_actions_info = kpi.percent_def_actions(df_events_player, player_minutes)
166 |             
167 |             ########################################################
168 |             # - Add rows to df_KPI_p90, df_KPI_tot and df_KPI_info
169 |             "------------------------------------------------------"
170 |             
171 |             # df_KPI_p90
172 |             mask_insert1 = (df_KPI_p90.matchId == match.wyId) & (df_KPI_p90.playerId == player)
173 |             #df_KPI_p90.loc[mask_insert1, 'own_goals'] = own_goals_player
174 |             #df_KPI_p90.loc[mask_insert1, 'yellow_cards'] = yellow_cards
175 |             #df_KPI_p90.loc[mask_insert1, 'danger_ball_loses'] = danger_ball_loses_p90
176 |             df_KPI_p90.loc[mask_insert1, 'def_actions%'] = percent_def_actions
177 |             
178 |             # df_KPI_tot
179 |             mask_insert2 = (df_KPI_tot.matchId == match.wyId) & (df_KPI_tot.playerId == player)
180 |             #df_KPI_tot.loc[mask_insert2, 'own_goals'] = own_goals_player
181 |             #df_KPI_tot.loc[mask_insert2, 'yellow_cards'] = yellow_cards
182 |             #df_KPI_tot.loc[mask_insert2, 'danger_ball_loses'] = danger_ball_loses
183 |             df_KPI_tot.loc[mask_insert2, 'def_actions%'] = percent_def_actions
184 |             
185 |             # df_KPI_info
186 |             # mask_insert3 = (df_KPI_info.matchId) == match.wyId & (df_KPI_info.playerId == player)
187 |             # df_KPI_info.loc[mask_insert3, 'yellow_cards'] = yellow_cards_info
188 |             # df_KPI_info.loc[mask_insert3, 'danger_ball_loses'] = danger_ball_loses_info
189 |             #df_KPI_info.loc[mask_insert3, 'def_actions%'] = percent_def_actions_info
190 |         
191 |         
192 |     if (j in loop_checkpoints):
193 |         print(f"Number of matches with computed KPI's': {j}\n")
194 | 
195 |     j+=1
196 | 
197 | 
198 | #%%
199 | # - Create the new columns team_xG, opponent_xG, team_possesion, opponent_possesion and p_adj_succ_def_actions
200 | "---------------------------------------------------------------------------"
201 | # Find all unique matches 
202 | list_matches = df_KPI_tot["matchId"].unique().tolist()
203 | 
204 | for match in list_matches:
205 |     
206 |     # All KPI rows (one per player) belonging to this match
207 |     mask_match = df_KPI_tot.matchId == match
208 |     df_match = df_KPI_tot.loc[mask_match]
209 |     
210 |     # List of the team names
211 |     list_teams = df_match["teamName"].unique().tolist()
212 |     
213 |     for team in list_teams:
214 |         
215 |         # Split the match rows into this team and everyone else (the opponent)
216 |         mask_team = df_match.teamName == team
217 |         df_team = df_match.loc[mask_team]
218 |         df_opponent = df_match.loc[~mask_team]
219 |         
220 |         # Sum xG and completed passes per side (the shot totals are commented out)
221 |         # team_shots = df_team['shots'].sum()
222 |         # opponent_shots = df_opponent['shots'].sum()
223 |         team_xG = df_team["xG_tot"].sum()
224 |         opponent_xG = df_opponent["xG_tot"].sum()
225 |         team_passes = df_team['completed_passes'].sum()
226 |         opponent_passes = df_opponent['completed_passes'].sum()
227 |         
228 |         tot_game_passes = team_passes + opponent_passes
229 |         
230 |         # Approximate possession as each side's share of the completed passes
231 |         team_possesion = team_passes / tot_game_passes
232 |         opponent_possesion = opponent_passes / tot_game_passes
233 |         
234 |         # Possession-adjusted defensive actions: successful def. actions / opponent possession
235 |         for i, player in df_team.iterrows():
236 |             mask_player =  ((df_KPI_tot.matchId == match) & (df_KPI_tot.playerId == player.playerId))
237 |             df_player = df_KPI_tot.loc[mask_player]
238 |             def_actions = df_player.succesful_def_actions.values[0]
239 |             p_adj_def_actions = def_actions / opponent_possesion
240 |             df_KPI_tot.loc[mask_player, 'p_adj_succ_def_actions'] = p_adj_def_actions
241 |         
242 |         # Write the team-level values to every row of this team in this match
243 |         mask_add_xG = ((df_KPI_tot.matchId == match) & (df_KPI_tot.teamName == team))
244 |         df_KPI_tot.loc[mask_add_xG, 'team_xG'] = team_xG
245 |         df_KPI_tot.loc[mask_add_xG, 'opponent_xG'] = opponent_xG
246 |         df_KPI_tot.loc[mask_add_xG, 'team_possesion'] = team_possesion
247 |         df_KPI_tot.loc[mask_add_xG, 'opponent_possesion'] = opponent_possesion
248 |         # df_KPI_tot.loc[mask_add_xG, 'team_shots'] = team_shots
249 |         # df_KPI_tot.loc[mask_add_xG, 'opponent_shots'] = opponent_shots
250 | 
251 | 
252 | #%%
253 | # - Save dataframes to json-files
254 | # - Note: every write below is commented out; uncomment the ones to save
255 | "---------------------------------------------------------------------------" 
256 | 
257 | #df_KPI_p90.to_json("../Json_files/KPI_per_90_All.json")
258 | #df_KPI_tot.to_json("Json_files/KPI_tot_All.json")
259 | #df_KPI_info.to_json("Json_files/new_KPI_info_All.json")
260 | 
261 |         


--------------------------------------------------------------------------------
/create_events_df_eu.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Fri Sep 10 12:04:25 2021
  5 | 
  6 | @author: emildanielsson & JakobEP
  7 | 
  8 | Program description:
  9 |     1. Read in Wyscout data, Events, Players, Matches and Teams
 10 |     2. Filtering of the event and player data
 11 |         - get rid of gk's from players data
 12 |         - get rid of gk-events from event data
 13 |         - get rid of events with unknown playerId
 14 |     3. Merge all the league event files to one dataframe (df)
 15 |     4. Create and store a new events.json file in the working directory
 16 |         - Added column "Position" with the detected position
 17 |         - Added column "shortName" with the shortName from Wyscout
 18 |         
 19 | 
 20 | """
 21 | 
 22 | # The basics
 23 | import pandas as pd
 24 | import numpy as np
 25 | import json
 26 | 
 27 | import fitting_functions as ff
 28 |  
 29 | #%%
 30 | # - Create dataframes from the Wyscout data
 31 | "---------------------------------------------------------------------------"
 32 | 
 33 | # Create event dataframes, one per league
 34 | df_England_events = pd.read_json('../../Wyscout/events/events_England.json', encoding="unicode_escape")
 35 | 
 36 | df_France_events = pd.read_json('../../Wyscout/events/events_France.json', encoding="unicode_escape")
 37 | 
 38 | df_Germany_events = pd.read_json('../../Wyscout/events/events_Germany.json', encoding="unicode_escape")
 39 | 
 40 | df_Italy_events = pd.read_json('../../Wyscout/events/events_Italy.json', encoding="unicode_escape")
 41 | 
 42 | df_Spain_events = pd.read_json('../../Wyscout/events/events_Spain.json', encoding="unicode_escape")
 43 | 
 44 | 
 45 | # Create match dataframes, one per league
 46 | df_England_matches = pd.read_json('../../Wyscout/matches/matches_England.json', encoding="unicode_escape")
 47 | 
 48 | df_France_matches = pd.read_json('../../Wyscout/matches/matches_France.json', encoding="unicode_escape")
 49 | 
 50 | df_Germany_matches = pd.read_json('../../Wyscout/matches/matches_Germany.json', encoding="unicode_escape")
 51 | 
 52 | df_Italy_matches = pd.read_json('../../Wyscout/matches/matches_Italy.json', encoding="unicode_escape")
 53 | 
 54 | df_Spain_matches = pd.read_json('../../Wyscout/matches/matches_Spain.json', encoding="unicode_escape")
 55 | 
 56 | 
 57 | # Create players and teams dataframes
 58 | df_players = pd.read_json("../../Wyscout/players.json", encoding="unicode_escape")
 59 | df_teams = pd.read_json("../../Wyscout/teams.json", encoding="unicode_escape")
 60 | 
 61 | 
 62 | #%%
 63 | # - Merge dataframes from all five leagues (England included) and keep the league name as a "league" column
 64 | "---------------------------------------------------------------------------"
 65 | 
 66 | frames_events = [df_England_events, df_France_events, df_Germany_events,
 67 |           df_Italy_events, df_Spain_events]
 68 | 
 69 | df_Europe_events = pd.concat(frames_events, keys = ["England", "France", "Germany", "Italy", "Spain"])
 70 | df_Europe_events = df_Europe_events.reset_index(level=[0])
 71 | df_Europe_events = df_Europe_events.rename(columns ={'level_0': "league"})
 72 | 
 73 | 
 74 | frames_matches = [df_England_matches, df_France_matches, df_Germany_matches, 
 75 |                   df_Italy_matches, df_Spain_matches]
 76 | 
 77 | df_Europe_matches = pd.concat(frames_matches, keys = ["England", "France", "Germany", "Italy", "Spain"])
 78 | df_Europe_matches = df_Europe_matches.reset_index(level=[0])
 79 | df_Europe_matches = df_Europe_matches.rename(columns ={'level_0': "league"})
 80 | 
 81 | 
 82 | #%%
 83 | # - Add shortName and position to df_Europe_events
 84 | "---------------------------------------------------------------------------"
 85 | 
 86 | # Filter out events with no playerId (0) 
 87 | mask_filter = df_Europe_events.playerId != 0
 88 | df_Europe_events = df_Europe_events[mask_filter]
 89 | 
 90 | # Find unique player ids
 91 | eu_players = df_Europe_events["playerId"].unique().tolist()
 92 | 
 93 | # Progress checkpoints: a status line every 50th player (no output once j exceeds 2050)
 94 | loop_checkpoints = np.arange(0,2080,50)
 95 | j = 0
 96 | 
 97 | # Loop through player list and add new column for name
 98 | for player in eu_players:
 99 |     
100 |     # Find player short name (raises IndexError if the id is missing from df_players)
101 |     mask_player = df_players.wyId == player
102 |     shortName = df_players.loc[mask_player, 'shortName'].values[0]
103 |     
104 |     # Write the name onto every event of this player
105 |     mask_events_player = df_Europe_events.playerId == player
106 |     df_Europe_events.loc[mask_events_player, 'shortName'] = shortName
107 |     
108 |     if (j in loop_checkpoints):
109 |         print(f"shortName added: {j}\n")
110 |     
111 |     j+=1
112 |     
113 | # Find all unique matches played
114 | matchId_list = df_Europe_events['matchId'].unique().tolist()    
115 | 
116 | # Progress checkpoints: a status line every 50th match
117 | loop_checkpoints = np.arange(0,2080,50)
118 | j = 0
119 | 
120 | # Loop through all matches 
121 | for match_i in matchId_list:
122 |     
123 |     # Find the events from match_i
124 |     mask_match = df_Europe_events.matchId == match_i
125 |     df_match = df_Europe_events.loc[mask_match]
126 |     
127 |     # List of all the players involved in match_i
128 |     player_match_list = df_match['playerId'].unique().tolist()
129 |     
130 |     # Loop through all players and get their average position
131 |     for player in player_match_list:
132 |         
133 |         # mask to find the given player-events
134 |         mask_player = df_match.playerId == player
135 |         
136 |         # mask to find player from df_players
137 |         mask_player2 = df_players.wyId == player
138 |         
139 |         # New dataframe with all events from 'player' in match 'match_i'
140 |         player_df = df_match.loc[mask_player]
141 | 
142 |         # Initiate lists to be filled with x and y coordinates
143 |         x_list = []
144 |         y_list = []
145 |         
146 |         # Get list of all starting coordinates from each event of the player
147 |         for i, event in player_df.iterrows():
148 |             x_list.append(event['positions'][0]['x'])
149 |             y_list.append(event['positions'][0]['y'])
150 |             
151 |         # Get the mean starting position over the player's events in this match
152 |         y_mean = sum(y_list) / len(y_list)
153 |         x_mean = sum(x_list) / len(x_list)
154 |         
155 |         # Get the Wyscout-determined role of the player
156 |         position_wyscout = df_players.loc[mask_player2]['role'].values[0]['name']
157 |         
158 |         # Decide the position from the mean coordinates and the Wyscout role
159 |         position = ff.decide_position(x_mean, y_mean, position_wyscout)
160 |         
161 |         # Add the position to the dataframe
162 |         mask_add_position = (df_Europe_events.matchId == match_i) & (df_Europe_events.playerId == player)
163 |         df_Europe_events.loc[mask_add_position, 'Position'] = position
164 |         
165 |     if (j in loop_checkpoints):
166 |         print(f"Number of event-modified matches: {j}\n")
167 |     
168 |     j+=1
169 | 
170 | 
171 | # Filter out all events whose assigned Position is "GK"
172 | mask_gk = df_Europe_events.Position == "GK"
173 | df_Europe_events = df_Europe_events[~mask_gk]
174 | 
175 | 
176 | #%%
177 | # - Save dataframe of Europe events to working directory
178 | "---------------------------------------------------------------------------"
179 | 
180 | df_Europe_events.reset_index(inplace=True)
181 | df_Europe_events.to_json("Json_files/events_All.json")
182 | 
183 | # Test to load in and store as dataframe
184 | with open('Json_files/events_All.json') as f:
185 |     data_Europe_new = json.load(f)
186 |     
187 | df_Europe_new = pd.DataFrame(data_Europe_new)
188 | 
189 | 
190 | 
191 |     
192 |     
193 |     
194 |     
195 |     
196 |     
197 |     
198 |     
199 |     
200 |     
201 |     
202 |     
203 |     
204 |     
205 |     
206 |     
207 |     


--------------------------------------------------------------------------------
/fitting_functions.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Tue Sep 14 16:41:04 2021
  5 | 
  6 | @author: emildanielsson & JakobEP
  7 | 
  8 | Program description: 
  9 |     
 10 |     Functions for fitting.
 11 |     
 12 | """
 13 | 
 14 | # The basics
 15 | import pandas as pd
 16 | import numpy as np
 17 | import json
 18 | 
 19 | # Plotting
 20 | import matplotlib.pyplot as plt
 21 | from mplsoccer import FontManager
 22 | from mplsoccer import Pitch, VerticalPitch
 23 | 
 24 | # Statistical fitting of models
 25 | import statsmodels.api as sm
 26 | import statsmodels.formula.api as smf
 27 | from sklearn import preprocessing
 28 | from sklearn.preprocessing import MinMaxScaler
 29 | from sklearn.preprocessing import RobustScaler
 30 | from sklearn.model_selection import train_test_split
 31 | from sklearn.linear_model import LogisticRegression
 32 | 
 33 | # For tables
 34 | from tabulate import tabulate
 35 | 
 36 | 
 37 | #%%
 38 | # - Load Fonts
 39 | "---------------------------------------------------------------------------" 
 40 | 
 41 | URL1 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/'
 42 |         'fonts/SourceSerifPro-Regular.ttf?raw=true')
 43 | serif_regular = FontManager(URL1)
 44 | URL2 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/'
 45 |         'fonts/SourceSerifPro-ExtraLight.ttf?raw=true')
 46 | serif_extra_light = FontManager(URL2)
 47 | URL3 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/fonts/'
 48 |         'SourceSerifPro-Bold.ttf?raw=true')
 49 | serif_bold = FontManager(URL3)
 50 | 
 51 | 
 52 | #%%
 53 | # - Functions
 54 | "---------------------------------------------------------------------------" 
 55 | 
 56 | """ Function which takes in dataframe of shots and outputs 
 57 |     dataframes which contains information about the logistic
 58 |     regression models for the different shot-types. 
 59 | 
 60 | Description: 
 61 |     
 62 |     Regression model variables:
 63 | 
 64 |         Dependent variable: Goal
 65 |         Independent variables: Angle, Distance, Distance squared 
 66 |     
 67 |     Input:
 68 |            df_xG_model - dataframe for all shots, headers, freekicks, penalties
 69 |            and their tags (go/no goal)
 70 |         
 71 |     Output: 
 72 |             dataframes for all coefficients and fitted log models
 73 |         
 74 | """
 75 | def xG_model(df_xG_model):
 76 |     
 77 |     #################################################
 78 |     # - Filter out headers and freekicks
 79 |     "------------------------------------------------"
 80 |     
 81 |     mask_headers = df_xG_model.header == 1
 82 |     mask_free_kicks = df_xG_model.free_kick == 1
 83 |     
 84 |     df_xG_shots = df_xG_model[(~mask_headers) & (~mask_free_kicks)]
 85 |     df_xG_headers = df_xG_model[mask_headers]
 86 |     df_xG_free_kicks = df_xG_model[mask_free_kicks]
 87 |     
 88 |     
 89 |     #################################################
 90 |     # - Split data into test and training sets, 
 91 |     #   looking at distance (dist) and angle (ang) in radians. xG-shots.
 92 |     "------------------------------------------------"
 93 |     
 94 |     df_trainSet = df_xG_shots[['goal', 'distance', 'angle_rad']].copy()
 95 |     
 96 |     # Adding distance squared to df
 97 |     squaredD = df_trainSet['distance']**2
 98 |     df_trainSet = df_trainSet.assign(distance_sq = squaredD)
 99 |     
100 |     # y(x) where y = shot result, x1 = distance, x2 = angle
101 |     x_train, x_test, y_train, y_test = train_test_split(df_trainSet.drop('goal', axis=1), 
102 |                                                         df_trainSet['goal'], test_size=0.20, 
103 |                                                         random_state=10)
104 |     
105 |     
106 |     #################################################
107 |     # - Create logistic model and fit it to data. xG-shots.
108 |     "------------------------------------------------"
109 |     
110 |     # Create instance
111 |     log_model = LogisticRegression()
112 |     
113 |     # Fit model with training data
114 |     log_model.fit(x_train, y_train)
115 |     
116 |     # Read out coefficent(s) into df
117 |     log_model_coef = log_model.coef_[0]
118 |     
119 |     # Create df of fit
120 |     df_log_model_shots_coef = pd.DataFrame(log_model_coef, 
121 |                  x_train.columns, 
122 |                  columns=['coef']).sort_values(by='coef', ascending=False)
123 |     
124 |     # Add to df
125 |     df_log_model_shots_coef.loc['intercept'] = log_model.intercept_[0]
126 |     print(df_log_model_shots_coef)
127 |     
128 |     
129 |     #################################################
130 |     # - Split data into test and training sets, 
131 |     #   looking at distance (dist) and angle (ang) in radians. xG-headers.
132 |     "------------------------------------------------"
133 |     
134 |     df_trainSet_headers = df_xG_headers[['goal', 'distance', 'angle_rad']].copy()
135 |     
136 |     # Adding distance squared to df
137 |     squaredD = df_trainSet_headers['distance']**2
138 |     df_trainSet_headers = df_trainSet_headers.assign(distance_sq = squaredD)
139 |     
140 |     # y(x) where y = shot result, x1 = distance, x2 = angle
141 |     x_train_h, x_test_h, y_train_h, y_test_h = train_test_split(df_trainSet_headers.drop('goal', axis=1), 
142 |                                                         df_trainSet_headers['goal'], test_size=0.20, 
143 |                                                         random_state=10)
144 |     
145 |     
146 |     #################################################
147 |     # - Create logistic model and fit it to data. xG-headers.
148 |     "------------------------------------------------"
149 |     
150 |     # Create instance
151 |     log_model_headers = LogisticRegression()
152 |     
153 |     # Fit model with training data
154 |     log_model_headers.fit(x_train_h, y_train_h)
155 |     
156 |     # Read out coefficent(s) into df
157 |     log_model_headers_coef = log_model_headers.coef_[0]
158 |     
159 |     # Create df of fit
160 |     df_log_model_headers_coef = pd.DataFrame(log_model_headers_coef, 
161 |                  x_train_h.columns, 
162 |                  columns=['coef']).sort_values(by='coef', ascending=False)
163 |     
164 |     # Add to df
165 |     df_log_model_headers_coef.loc['intercept'] = log_model_headers.intercept_[0]
166 |     print(df_log_model_headers_coef)
167 |     
168 |     
169 |     #################################################
170 |     # - Split data into test and training sets, 
171 |     #   looking at distance (dist) and angle (ang) in radians. xG-free-kicks.
172 |     "------------------------------------------------"
173 |     
174 |     df_trainSet_free_kicks = df_xG_free_kicks[['goal', 'distance', 'angle_rad']].copy()
175 |     
176 |     # Adding distance squared to df
177 |     squaredD = df_trainSet_free_kicks['distance']**2
178 |     df_trainSet_free_kicks = df_trainSet_free_kicks.assign(distance_sq = squaredD)
179 |     
180 |     # y(x) where y = shot result, x1 = distance, x2 = angle
181 |     x_train_f, x_test_f, y_train_f, y_test_f = train_test_split(df_trainSet_free_kicks.drop('goal', axis=1), 
182 |                                                         df_trainSet_free_kicks['goal'], test_size=0.20, 
183 |                                                         random_state=10)
184 |     
185 |     
186 |     #################################################
187 |     # - Create logistic model and fit it to data. xG-free-kicks.
188 |     "------------------------------------------------"
189 |     
190 |     # Create instance
191 |     log_model_free_kicks = LogisticRegression()
192 |     
193 |     # Fit model with training data
194 |     log_model_free_kicks.fit(x_train_f, y_train_f)
195 |     
196 |     # Read out coefficent(s) into df
197 |     log_model_free_kicks_coef = log_model_free_kicks.coef_[0]
198 |     
199 |     # Create df of fit
200 |     df_log_model_free_kicks_coef = pd.DataFrame(log_model_free_kicks_coef, 
201 |                  x_train_f.columns, 
202 |                  columns=['coef']).sort_values(by='coef', ascending=False)
203 |     
204 |     # Add to df
205 |     df_log_model_free_kicks_coef.loc['intercept'] = log_model_free_kicks.intercept_[0]
206 |     print(df_log_model_free_kicks_coef)
207 |     
208 |     return df_log_model_shots_coef, df_log_model_headers_coef, df_log_model_free_kicks_coef, log_model, log_model_headers, log_model_free_kicks
209 | 
210 | # - End function
211 | #############################################################################
212 | 
213 | 
214 | 
def decide_position(x, y, position):
    """Translate a Wyscout role and an average y-coordinate into a
    position label.

    Input:
        x - average x-coordinate (accepted but not used in the decision)
        y - average y-coordinate
        position - role name taken from the Wyscout "role"-column

    Output:
        "GK" for the role "Goalkeeper". For "Defender", "Midfielder" and
        "Forward": the L-label when y < 30, the R-label when y > 70 and
        the central label otherwise (LB/CB/RB, LM/CM/RM, LW/ST/RW).
        "?" for any other role.
    """
    # (role, (label for y < 30, central label, label for y > 70))
    outfield_labels = (
        ("Defender", ("LB", "CB", "RB")),
        ("Midfielder", ("LM", "CM", "RM")),
        ("Forward", ("LW", "ST", "RW")),
    )

    for role, (low_y_label, central_label, high_y_label) in outfield_labels:
        if position == role:
            if y < 30:
                return low_y_label
            if y > 70:
                return high_y_label
            return central_label

    return "GK" if position == "Goalkeeper" else "?"
246 | 
247 | # - End function
248 | #############################################################################
249 | 
250 | 
251 | 
def KPI_fitting(KPI_train, scaler, list_kpi, dep_var, position, min_minutes):
    """Fit a linear regression (OLS) of dep_var against the KPIs for a
    given position, dropping insignificant KPIs one at a time.

    Description:

        Regression model variables:

            Dependent variable: dep_var (team_xG or opponent_xG)
            Independent variables: KPI values

        In every round the training data is filtered, the KPI columns are
        normalised with the scaler and an OLS model with a constant is
        fitted. As long as any KPI has a p-value > 0.05, the KPI with the
        highest p-value is removed and the fit is repeated. The summary
        of the last model is printed.

    Input:
        KPI_train - dataframe of KPIs that can be used as training data
        scaler - chosen scaler for the normalisation of the KPIs
            (its fit_transform is called in every round)
        list_kpi - list of KPIs used for training data (not modified)
        dep_var - model dependent variable
        position - position(s) to find the model for; forwarded as-is
            to filter_dataframe
        min_minutes - minimum minutes played for a player in a match
            to be included in the regression model training data

    Output:
        model_coef - linear regression model coefficients (incl. 'const')
        r_squared - resulting r-squared of the model
        list_kpi_fitting - the remaining KPIs followed by dep_var as
            the last element
    """

    # Working list: candidate KPIs with the dependent variable kept last
    list_kpi_fitting = list_kpi.copy()
    list_kpi_fitting.append(dep_var)

    ################################################
    # - Fit, check the p-values, drop the worst KPI, repeat
    "----------------------------------------------"
    while True:

        # Everything except the last entry is an independent variable
        kpi_columns = list_kpi_fitting[:-1]

        # Filter the training data and normalise the KPI columns
        df_train_filtered = filter_dataframe(KPI_train, position, list_kpi_fitting, min_minutes, 1)
        df_train_filtered[kpi_columns] = scaler.fit_transform(df_train_filtered[kpi_columns])

        # Linear regression model with intercept
        X = sm.add_constant(df_train_filtered[kpi_columns])
        y = df_train_filtered[dep_var]
        test_model = sm.OLS(y, X).fit()

        # The intercept is never a candidate for removal
        model_pvalues = test_model.pvalues.drop('const', axis = 0)

        # Done when every remaining KPI is significant
        if (model_pvalues.values <= 0.05).all():
            break

        # Otherwise take out the KPI with the highest p-value
        highest_kpi = model_pvalues[model_pvalues == model_pvalues.values.max()].index[0]
        list_kpi_fitting.remove(highest_kpi)

    # Print model after the tuning
    print(f"Model AFTER tuning for the position {position}: \n")
    print(test_model.summary())

    return test_model.params, test_model.rsquared, list_kpi_fitting
346 | 
347 | # - End function
348 | #############################################################################
349 | 
350 | 
351 | 
def compute_fitting_ratings(player, model_coef, list_kpi_fitting):
    """Evaluate the fitted linear regression model for one player in
    one match.

    Input:
        player - KPIs for a player in a given match (indexable by KPI name)
        model_coef - regression model coefficients, indexable by KPI name
            and with the intercept under 'const'
        list_kpi_fitting - list of KPI names followed by one trailing
            entry which is skipped (the dependent variable)

    Output:
        result - sum of coefficient * KPI value over the KPIs plus the
            intercept (fitted xG_team or xG_opponent)
    """
    # The last entry is not a KPI, so it takes no part in the sum
    kpi_names = list_kpi_fitting[:-1]

    fitted_value = 0
    for kpi_name in kpi_names:
        fitted_value = fitted_value + (model_coef[kpi_name] * player[kpi_name])

    # The intercept is added last
    return fitted_value + model_coef['const']
379 |     
380 |     # - End function
381 | #############################################################################
382 | 
383 | 
384 | 
def compute_events_rating(player, position, df_KPI):
    """Compute the so-called "event-based rating" for a player in a
    given match.

    Description:

        Every KPI in the weight table is multiplied by its weight, the
        products are summed and the sum is divided by 20. A default set of
        weights is used, with some of them overridden depending on the
        position group.

    Input:
        player - information about the player; must expose the
            attributes matchId and playerId
        position - position group of the player. Either a single label
            ('CB') or any iterable of labels (['LB', 'RB'], ('RB', 'LB'),
            ['LB'], ...). All labels must belong to one of the groups
            LB/RB, CB, LM/RM, CM, LW/RW or ST; otherwise
            "Not a valid position" is printed and the default weights
            are used.
        df_KPI - dataframe of KPIs with the columns 'matchId', 'playerId'
            and one column per weighted KPI

    Output:
        event_rating - resulting "event-based rating"

    Raises:
        IndexError - if df_KPI has no row for this player in this match
    """

    # default weights
    dict_weights = {'plus_minus': 0.2,
                    'goals': 1,
                    'assists': 0.7,
                    'own_goals': -0.5,
                    'yellow_cards': -0.05,
                    'danger_ball_loses': -0.2,
                    'xG_tot': -0.1,
                    'red_card': -1,
                    'aerial%': 0.1,
                    'def_actions%': 0.1,
                    'p_adj_succ_def_actions': 0.1,
                    'succesful_dribbles': 0.05,
                    'creative_passes': 0.1,
                    'progressive_carries': 0.05
                    }

    # Weights that differ from the defaults, per position group
    position_overrides = (
        ({'LB', 'RB'}, {'def_actions%': 0.2,
                        'progressive_carries': 0.15}),
        ({'CB'}, {'aerial%': 0.3,
                  'def_actions%': 0.8,
                  'p_adj_succ_def_actions': 0.6}),
        ({'LM', 'RM'}, {'aerial%': 0.05,
                        'def_actions%': 0.05,
                        'creative_passes': 0.2,
                        'progressive_carries': 0.1,
                        'succesful_dribbles': 0.1}),
        ({'CM'}, {'creative_passes': 0.3,
                  'succesful_dribbles': 0.1}),
        ({'LW', 'RW'}, {'aerial%': 0.05,
                        'def_actions%': 0.05,
                        'creative_passes': 0.6,
                        'progressive_carries': 0.3,
                        'succesful_dribbles': 0.4,
                        'p_adj_succ_def_actions': 0.05}),
        ({'ST'}, {'def_actions%': 0,
                  'p_adj_succ_def_actions': 0}),
    )

    # Normalise the requested position(s) to a set of labels. A plain
    # string is one label and must not be split into its characters.
    if isinstance(position, str):
        requested = {position}
    else:
        try:
            requested = set(position)
        except TypeError:
            # Not iterable / not hashable: handled as an invalid position
            requested = set()

    # Set weights for the matching position group (order-insensitive)
    for group, overrides in position_overrides:
        if requested and requested <= group:
            dict_weights.update(overrides)
            break
    else:
        print("Not a valid position")

    # Find the KPI row of this player in this match
    mask_match = ((df_KPI['matchId'] == player.matchId) & (df_KPI['playerId'] == player.playerId))
    df_the_match = df_KPI.loc[mask_match]
    if df_the_match.empty:
        raise IndexError(
            f"No KPI row for playerId {player.playerId} in matchId {player.matchId}")

    # Sum the event rating (first matching row is used)
    event_rating = 0
    for weight_name, weight in dict_weights.items():
        value = df_the_match[weight_name].values[0]
        event_rating += (value * weight)

    event_rating = event_rating / 20

    return event_rating
465 | 
466 | # - End function
467 | #############################################################################
468 | 
469 | 
470 | 
def _percentile_rating(value, percentiles):
    """Convert a raw rating into a percentile-based rating.

    Input:
        value - raw rating
        percentiles - Series of percentile values indexed by the
            corresponding quantile

    Output:
        0.1 if the value does not exceed any percentile value, otherwise
        the quantile of the last percentile value (in Series order) that
        the value exceeds, multiplied by 5 and rounded to one decimal.
    """
    # Default also covers a value exactly equal to the lowest percentile
    # (and NaN), where none of the comparisons below is true.
    rating = 0.1

    if value < percentiles.values[0]:
        return rating

    for percentile in percentiles.values:
        if value > percentile:
            # Look the quantile up by value; first match on duplicates
            rating = round(percentiles[percentiles == percentile].index[0] * 5, 1)

    return rating


def create_rating_dataframe(df_ratings, df_KPI, df_KPI_test, percentiles_fit, percentiles_events, df_matches):
    """Find the percentile ranks of the regression-based rating and the
    event-based rating, sum them and store the sum as "final_rating" in
    both df_ratings and df_KPI_test.

    (This function might need some improvement, example: remove df_KPI)

    df_KPI and df_KPI_test are included mostly for trial and error purposes
    during the development.

    Description:

        For every row of df_ratings, 'tot_fit_rating' is ranked against
        percentiles_fit and 'match_events_rating' against
        percentiles_events (each rank is between 0.1 and 5 * the highest
        quantile). The two ranks are added to form the final rating.

    Input:
        df_ratings - dataframe of fitting and event rating results with
            the columns 'matchId', 'playerId', 'fitting_rating_off',
            'fitting_rating_def', 'tot_fit_rating' and
            'match_events_rating'. Modified in place: 'position',
            'match_info', 'final_rating' and 'gameweek' are set.

        df_KPI - dataframe of KPIs for both training and test data; the
            'role' of the player is read from here.

        df_KPI_test - dataframe of KPIs for the test data. Modified in
            place: the ratings, 'match_info' and 'gameweek' are set on
            the rows of the player in the match.
            NOTE(review): the row mask is built on df_KPI and reused to
            index df_KPI_test, so the two frames presumably share index
            labels - confirm against the callers.

        percentiles_fit - percentile values for the regression-based rating
        percentiles_events - percentile values for the event-based rating
        df_matches - Wyscout matches dataframe ('wyId', 'label',
            'gameweek') used for adding match info

    Output:
        None
    """
    for i, player in df_ratings.iterrows():
        mask_match = ((df_KPI['matchId'] == player.matchId) & (df_KPI['playerId'] == player.playerId))

        tot_fit_rating = df_ratings.loc[i, 'tot_fit_rating']
        match_events_rating = df_ratings.loc[i, 'match_events_rating']

        # Find percentile rank of the regression-based and event-based rating
        final_fit_rating = _percentile_rating(tot_fit_rating, percentiles_fit)
        final_event_rating = _percentile_rating(match_events_rating, percentiles_events)

        # Sum the regression-based rating and event-based rating
        final_rating = final_fit_rating + final_event_rating

        # Find the match info to easier look up the rating elsewhere
        the_match = df_matches.loc[df_matches['wyId'] == player.matchId]
        match_info = the_match.label.values[0]
        gameweek = the_match.gameweek.values[0]

        # Add the final rating and info to the ratings-df
        df_ratings.loc[i, 'position'] = df_KPI.loc[mask_match, 'role'].values[0]
        df_ratings.loc[i, 'match_info'] = match_info
        df_ratings.loc[i, 'final_rating'] = final_rating
        df_ratings.loc[i, 'gameweek'] = gameweek

        # Copy the ratings and info over to the test-df
        df_KPI_test.loc[mask_match, 'fitting_rating_off'] = df_ratings.loc[i, 'fitting_rating_off']
        df_KPI_test.loc[mask_match, 'fitting_rating_def'] = df_ratings.loc[i, 'fitting_rating_def']
        df_KPI_test.loc[mask_match, 'tot_fit_rating'] = tot_fit_rating
        df_KPI_test.loc[mask_match, 'match_events_rating'] = match_events_rating
        df_KPI_test.loc[mask_match, 'final_rating'] = final_rating
        df_KPI_test.loc[mask_match, 'match_info'] = match_info
        df_KPI_test.loc[mask_match, 'gameweek'] = gameweek
551 | 
552 | 
553 | # - End function
554 | #############################################################################
555 |         
556 | 
557 | 
def filter_dataframe(df_KPI, positions, list_kpi, min_minutes, min_matches):
    """Filter the KPI dataframe on position, minutes played and number
    of matches, and keep only the relevant columns.

    Description:

    Input:
            df_KPI - Dataframe with information about players' KPIs
            from x number of games. Needs the columns 'role',
            'minutesPlayed' and 'playerId'.

            positions - list-like of positions to filter for
            (matched against the 'role' column)

            list_kpi - selected KPI columns to include in the returned
            dataframe (not modified)

            min_minutes - only rows with strictly more minutes played
            than this are kept

            min_matches - a player is kept only if at least this many
            of his rows remain after the position and minutes filters

    Output:
            df_pos_final - Filtered dataframe, restricted to the columns
            of list_kpi plus 'playerId', 'shortName', 'teamName' and
            'matchId' (those that exist in df_KPI)

    """

    # Keep the rows of the chosen positions
    df_pos = df_KPI.loc[df_KPI.role.isin(positions)]

    # Keep the rows where more than "min_minutes" were played
    df_pos = df_pos.loc[df_pos.minutesPlayed > min_minutes]

    # Number of remaining rows per player, computed in one pass and mapped
    # back onto the rows. A missing playerId gets no count and is treated
    # as 0 matches.
    matches_per_player = df_pos['playerId'].value_counts()
    nr_of_matches = df_pos['playerId'].map(matches_per_player).fillna(0)

    # Keep the players with at least "min_matches" matches
    df_pos_final = df_pos.loc[nr_of_matches >= min_matches]

    # Only return the relevant columns
    list_columns = [*list_kpi, 'playerId', 'shortName', 'teamName', 'matchId']
    df_pos_final = df_pos_final[df_pos.columns.intersection(list_columns)]

    return df_pos_final
613 | 
614 | # - End function
615 | #############################################################################
616 | 
617 |         
618 |         
619 | """ Function which plots the final ratings from a match (including subs with
620 |     more than 20 minutes played).
621 | 
622 | 
623 |     Description: 
624 |     
625 |     Input:
626 |         df_final_rating - dataframe of all the players ratings from the match
627 |         home_team_lineup - Wycout shortName of players in home team lineup
628 |         home_team_bench - Wycout shortName of players on home team bench
629 |         away_team_lineup - Wycout shortName of players in away team lineup
630 |         away_team_bench - Wycout shortName of players on away team bench
631 |         
632 |     Output: 
633 |         None (Nice looking plot)
634 | 
635 | """ 
636 | def plot_pitch_ratings(df_final_rating, home_team_lineup, home_team_bench, away_team_lineup, away_team_bench):
637 |     pitch = Pitch(pitch_type="wyscout")
638 |     fig, ax = pitch.draw(figsize=(7,15))
639 |     
640 |     match_result = df_final_rating.match_info.values[0]
641 |     
642 |     ax.text(50, -5, match_result, ha = "center", fontsize = 16, fontproperties = serif_bold.prop)
643 |     
644 |     text_size = 10
645 |     
646 |     alpha_scaling = 13
647 |     
648 |     pitch_positions = {
649 |         'LB':  [10, 12],
650 |         'LWB':  [17, 12],
651 |         'LCB': [3, 30],
652 |         'CB': [2, 50],
653 |         'RB': [10, 88],
654 |         'RWB': [17, 88],
655 |         'RCB': [3, 70],
656 |         'LM': [28, 12],
657 |         'RM': [28, 88],
658 |         'LCM': [20, 35],
659 |         'CM': [17, 50],
660 |         'RCM': [20, 65],
661 |         'CAM': [38, 50],
662 |         'LW': [47, 25],
663 |         'RW': [47, 75],
664 |         'ST': [49, 50],
665 |         'LST': [49, 40],
666 |         'RST': [49, 60],
667 |         }
668 |     
669 |     
670 |     # Team colors 
671 |     team_colors = {
672 |         'Huddersfield Town FC':  "#0E63AD",
673 |         'Manchester United FC':  '#DA291C',
674 |         'Tottenham Hotspur FC': '#132257',
675 |         'Newcastle United FC': '#241F20',
676 |         'Stoke City FC': '#E03A3E',
677 |         'Southampton FC': '#D71920',
678 |         'Everton FC': '#003399',
679 |         'Leicester City FC': '#003090',
680 |         'Crystal Palace FC':'#1B458F',
681 |         'West Ham United FC': '#7A263A',
682 |         'Burnley FC': '#6C1D45',
683 |         'Swansea City AFC': '#121212',
684 |         'West Bromwich Albion FC': '#122F67',
685 |         'AFC Bournemouth': '#DA291C',
686 |         'Brighton & Hove Albion FC': '#0057B8',
687 |         'Watford FC': '#FBEE23',
688 |         'Liverpool FC': '#C8102E',
689 |         'Chelsea FC': '#034694',
690 |         'Manchester City FC': '#6CABDD',
691 |         'Arsenal FC':'#EF0107'
692 |         }
693 |     
694 |     attackers = ['LW', 'CAM',
695 |         'RW',
696 |         'ST',
697 |         'LST',
698 |         'RST']
699 |     
700 |     # Creta list of the players in ranking dataframe
701 |     ranked_players = df_final_rating['shortName'].tolist()
702 |     
703 |     # adjust for the rating box in the plot
704 |     box_adjustment = 5
705 |     
706 |     # Place the home team lineup on the pitch
707 |     for player in home_team_lineup:
708 |         mask_player = df_final_rating.shortName == player
709 |         position = df_final_rating.loc[mask_player, 'position'].values[0]
710 |         rating = df_final_rating.loc[mask_player, 'final_rating'].values[0]
711 |         team = df_final_rating.loc[mask_player, 'teamName'].values[0]
712 |         
713 |         # Set the team_color
714 |         team_color = team_colors[team]
715 |         
716 |         # Make sure to seperate name if it is too long
717 |         shortName = player.split()
718 |         shortName_new = ""
719 |         if len(shortName) == 1:
720 |                 shortName_new = player
721 |         else:
722 |              for i in range(2):
723 |                 shortName_new += shortName[i]
724 |                 if i == 0: 
725 |                     shortName_new += " "
726 |                     
727 |         
728 |         x = pitch_positions[position][0]
729 |         y = pitch_positions[position][1] + box_adjustment
730 |         
731 |         alignment = "left"
732 |         box_addition = 3
733 |         if position in attackers:
734 |             alignment = "right"
735 |             box_addition = -3
736 |             
737 |         props = dict(boxstyle='round', facecolor=team_color, alpha=rating/alpha_scaling)
738 |         # place a text box with rating
739 |         ax.text(x+box_addition, y-5, str(round(rating, 1)), ha = alignment, fontsize = text_size,
740 |              fontproperties = serif_bold.prop, bbox=props)
741 |         ax.text(x, y, shortName_new, ha = alignment, fontsize = text_size, color=team_color,
742 |              fontproperties = serif_bold.prop) # add fonts
743 |     
744 |     
745 |     # Place the home team bench
746 |     bench_x = -2
747 |     bench_y = 110
748 |     for player in home_team_bench:
749 |         
750 |         # check if the bench player played
751 |         if player in ranked_players:
752 |             
753 |             mask_player = df_final_rating.shortName == player
754 |             rating = df_final_rating.loc[mask_player, 'final_rating'].values[0]           
755 |             team = df_final_rating.loc[mask_player, 'teamName'].values[0]
756 |         
757 |             # Set the team_color
758 |             team_color = team_colors[team]
759 |             
760 |             # Make sure to seperate name if it is too long
761 |             shortName = player.split()
762 |             shortName_new = ""
763 |             if len(shortName) == 1:
764 |                     shortName_new = player
765 |             else:
766 |                  for i in range(2):
767 |                     shortName_new += shortName[i]
768 |                     if i == 0: 
769 |                         shortName_new += " "
770 |                         print("hej")
771 |                     
772 |             props = dict(boxstyle='round', facecolor=team_color, alpha=rating/alpha_scaling)
773 |             # place a text box with rating
774 |             ax.text(bench_x+5, bench_y-5, str(round(rating, 1)), ha = "center", fontsize = text_size,
775 |                     fontproperties = serif_bold.prop, bbox=props)
776 |             ax.text(bench_x, bench_y, shortName_new, ha = "left", fontsize = text_size, color=team_color,
777 |              fontproperties = serif_regular.prop) # add fonts
778 |             bench_x += 20
779 |     
780 |     # Place the away team lineup
781 |     for player in away_team_lineup:
782 |         mask_player = df_final_rating.shortName == player
783 |         position = df_final_rating.loc[mask_player, 'position'].values[0]
784 |         rating = df_final_rating.loc[mask_player, 'final_rating'].values[0]
785 |         team = df_final_rating.loc[mask_player, 'teamName'].values[0]
786 |         
787 |         # Set the team_color
788 |         team_color = team_colors[team]
789 |         
790 |         # Make sure to seperate name if it is too long
791 |         shortName = player.split()
792 |         shortName_new = ""
793 |         if len(shortName) == 1:
794 |                 shortName_new = player
795 |         else:
796 |              for i in range(2):
797 |                 shortName_new += shortName[i]
798 |                 if i == 0: 
799 |                     shortName_new += " "
800 |                     print("hej")
801 |         
802 |         alignment = "right"
803 |         box_addition = -3
804 |         if position in attackers:
805 |             alignment = "left"
806 |             box_addition = +3
807 |         
808 |         x = 100-pitch_positions[position][0]
809 |         y = 100-pitch_positions[position][1] + box_adjustment
810 |         
811 |         # place a text box with rating
812 |         # these are matplotlib.patch.Patch properties
813 |         props = dict(boxstyle='round', facecolor=team_color, alpha=rating/alpha_scaling)
814 |         ax.text(x+box_addition, y-5, str(round(rating, 1)), ha = alignment, fontsize = text_size,
815 |              fontproperties = serif_bold.prop, bbox=props)
816 |         ax.text(x, y, shortName_new, fontsize = text_size, ha = alignment, color=team_color,
817 |                  fontproperties = serif_bold.prop) # add fonts
818 |     
819 |     bench_x = 50
820 |     bench_y = 110
821 |     for player in away_team_bench:
822 |         # check if the bench player played
823 |         if player in ranked_players:
824 |             mask_player = df_final_rating.shortName == player
825 |             rating = df_final_rating.loc[mask_player, 'final_rating'].values[0]
826 |             team = df_final_rating.loc[mask_player, 'teamName'].values[0]
827 |         
828 |             # Set the team_color
829 |             team_color = team_colors[team]
830 |             
831 |             # Make sure to seperate name if it is too long
832 |             shortName = player.split()
833 |             shortName_new = ""
834 |             if len(shortName) == 1:
835 |                     shortName_new = player
836 |             else:
837 |                  for i in range(2):
838 |                     shortName_new += shortName[i]
839 |                     if i == 0: 
840 |                         shortName_new += " "
841 |                         print("hej")
842 |                     
843 |             # place a text box with rating
844 |             props = dict(boxstyle='round', facecolor=team_color, alpha=rating/alpha_scaling)
845 |             ax.text(bench_x+5, bench_y-5, str(round(rating, 1)), ha = "center", fontsize = text_size,
846 |                     fontproperties = serif_bold.prop, bbox=props)
847 |             
848 |             # place name of the benched player
849 |             ax.text(bench_x, bench_y, shortName_new, ha = "left", fontsize = text_size, color=team_color,
850 |              fontproperties = serif_regular.prop) # add fonts
851 |             bench_x += 15       
852 |             
853 |     
854 |     
855 |             


--------------------------------------------------------------------------------
/minutes_played.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Tue Sep 21 12:44:56 2021
  5 | 
  6 | @author: emildanielsson
  7 | """
  8 | 
  9 | #!/usr/bin/env python3
 10 | # -*- coding: utf-8 -*-
 11 | """
 12 | Created on Mon Sep 13 11:51:54 2021
 13 | 
 14 | @author: emildanielsson
 15 | 
 16 | Program description: 
 17 |     Computes how many minutes each player has played in each game 
 18 |     from the given event and matches data set  
 19 |     
 20 |     Creates and saves a dataframe with the following columns:
 21 |         playerId - playerId from Wyscout 
 22 |         shortName - shortName from Wyscout
 23 |         matchId - matchId from Wyscout data
 24 |         teamId - teamId from Wyscout data
 25 |         teamName - Official team name from Wyscout data
 26 |         player_in_min - the minute of the match the player started playing
 27 |         player_out_min - the minute of the match the player stopped playing
 28 |         minutesPlayed - Minutes played in the given game
 29 |         red_card - flag showing if the player got a red card that game  
 30 |                     (1 = red card, 0 = no red card)
 31 |         
 32 | """
 33 | 
 34 | # The basics
 35 | import pandas as pd
 36 | import numpy as np
 37 | import json
 38 | 
 39 | 
 40 | #############################################################################
 41 | # - Create dataframes from the Wyscout data
 42 | "---------------------------------------------------------------------------"
 43 | 
 44 | # Create event dataframe
 45 | #df_Europe_events = pd.read_json('Json_files/events_All.json', encoding="unicode_escape") #SLOWER
 46 | with open('../Json_files/events_All.json') as f:
 47 |     data_Europe= json.load(f)
 48 |     
 49 | df_Europe_events = pd.DataFrame(data_Europe)
 50 | 
 51 | # Create match dataframes
 52 | df_England_matches = pd.read_json('../../Wyscout/matches/matches_England.json', encoding="unicode_escape")
 53 | 
 54 | df_France_matches = pd.read_json('../../Wyscout/matches/matches_France.json', encoding="unicode_escape")
 55 | 
 56 | df_Germany_matches = pd.read_json('../../Wyscout/matches/matches_Germany.json', encoding="unicode_escape")
 57 | 
 58 | df_Italy_matches = pd.read_json('../../Wyscout/matches/matches_Italy.json', encoding="unicode_escape")
 59 | 
 60 | df_Spain_matches = pd.read_json('../../Wyscout/matches/matches_Spain.json', encoding="unicode_escape")
 61 | 
 62 | 
 63 | # Create players and teams dataframes
 64 | df_players = pd.read_json("../../Wyscout/players.json", encoding="unicode_escape")
 65 | df_teams = pd.read_json("../../Wyscout/teams.json", encoding="unicode_escape")
 66 | 
 67 | 
 68 | 
 69 | #############################################################################
 70 | # - Merge matches dataframes from all leagues 
 71 | "---------------------------------------------------------------------------"
 72 | 
 73 | frames_matches = [df_England_matches, df_France_matches, df_Germany_matches, 
 74 |                   df_Italy_matches, df_Spain_matches]
 75 | 
 76 | df_Europe_matches = pd.concat(frames_matches, keys = ["England", "France",
 77 |                                                       "Germany", "Italy", "Spain"])
 78 | 
 79 | 
 80 | #############################################################################
 81 | # - Creating the dataframe of full playing time for each match
 82 | "---------------------------------------------------------------------------"
 83 | 
 84 | # Prepares the dataframe with the columns we need
 85 | df_matches_fulltime=pd.DataFrame(columns=['matchId','matchDuration'])
 86 | 
 87 | # Match id checkpoints
 88 | loop_checkpoints = np.arange(0,2080,50)
 89 | j = 0
 90 | 
 91 | # Loop trough all matches
 92 | for i, match in df_Europe_matches.iterrows():
 93 |     
 94 |     # Find the events from match
 95 |     mask_match = (df_Europe_events.matchId == match['wyId']) & (df_Europe_events.matchPeriod == "2H")
 96 |     df_match = df_Europe_events.loc[mask_match]
 97 |     
 98 |     # time ofsecond half in seconds
 99 |     fulltime_sec = df_match['eventSec'].max()
100 |     
101 |     # Convert to minutes
102 |     fulltime_min = 45 + round(fulltime_sec / 60)
103 |     
104 |     # Add match and full time (minutes) to dataframe
105 |     df_matches_fulltime.loc[df_matches_fulltime.shape[0]] = [match.wyId, fulltime_min]
106 |     
107 |     if (j in loop_checkpoints):
108 |         print(f"Number of matches checked for fulltimes: {j}\n")
109 |     
110 |     j+=1
111 | 
#############################################################################
# - Creating the dataframe of minutes played for each player in each game
"---------------------------------------------------------------------------"

def _short_name(player_id):
    """Return the shortName of the player with the given Wyscout id."""
    return df_players.loc[df_players.wyId == player_id].shortName.values[0]


def _at_least_one_minute(minutes):
    """Return minutes, or 1 if the computed playing time is zero or negative."""
    if minutes <= 0:
        return 1
    return minutes


# Columns of the minutes-played dataframe (one row per player and match)
minutes_columns = ['playerId', 'shortName',
                   'matchId', 'teamId', 'teamName',
                   'player_in_min', 'player_out_min',
                   'minutesPlayed', 'red_card']

# The rows are gathered in a list and turned into a dataframe once after the
# loop, since growing a dataframe row by row copies it on every insert.
minutes_rows = []

# Loop through all matches
for j, (_, match) in enumerate(df_Europe_matches.iterrows()):

    # Lineups and substitutions are nested in teamsData
    team_data = match['teamsData']

    # Get match Id
    matchId = match['wyId']

    # Get full match length
    fulltime_min = df_matches_fulltime.loc[df_matches_fulltime['matchId'] == matchId]['matchDuration'].values[0]

    # Loop through both teams in the match (the keys are team ids as strings)
    for team_key in team_data:

        # Fetches the team to look at
        team = team_data[team_key]

        # list of the lineup
        lineup = team['formation']['lineup']

        # list of the substitutions
        substitutions = team['formation']['substitutions']

        # Get the team id
        teamId = team['teamId']

        # Get the team name
        teamName = df_teams.loc[df_teams.wyId == teamId].officialName.values[0]  # Could change officialName -> name ??

        # Parallel lists describing the correctly registered substitutions:
        # the same index gives the player coming on, the player going off and
        # the minute of the change. The minute is stored here as well because
        # badly registered substitutions are skipped, so an index into these
        # lists is not a valid index into "substitutions".
        sub_ins = []
        sub_outs = []
        sub_minutes = []
        if substitutions != "null":
            for sub in substitutions:
                # "Handle" the case when the sub is badly registered
                if sub['playerIn'] != 0 and sub['playerOut'] != 0:
                    sub_ins.append(sub['playerIn'])
                    sub_outs.append(sub['playerOut'])
                    sub_minutes.append(sub['minute'])
                # With this solution some players will have played more minutes
                # than they actually played. But it is not that many matches
                # so I think we are fine with it.

        # Loop through all players in the lineup and get their minutes played
        for player in lineup:

            # Get the current playerId and shortName
            playerId = player['playerId']
            shortName = _short_name(playerId)

            # The player played for the whole game
            if playerId not in sub_outs:
                minutes_rows.append([playerId, shortName, matchId, teamId, teamName, 0, fulltime_min, fulltime_min, 0])
                continue

            # The player was subbed out: find the substitution, its minute
            # and the player that came in
            sub_index = sub_outs.index(playerId)
            sub_minute = sub_minutes[sub_index]
            sub_in_id = sub_ins[sub_index]
            shortName_sub = _short_name(sub_in_id)

            # Add minutes played by the subbed out player
            minutes_rows.append([playerId, shortName, matchId, teamId, teamName, 0, sub_minute, sub_minute, 0])

            # Handle the case if the subbed in player also is subbed out (injury for example)
            if sub_in_id in sub_outs:

                # Find the second substitution, its minute and the player that came in
                sub_index2 = sub_outs.index(sub_in_id)
                sub_minute2 = sub_minutes[sub_index2]
                sub_in_id2 = sub_ins[sub_index2]
                shortName_sub2 = _short_name(sub_in_id2)

                # Add minutes played by the subbed in and out player
                sub_playing_minutes2 = _at_least_one_minute(sub_minute2 - sub_minute)
                minutes_rows.append([sub_in_id, shortName_sub, matchId, teamId, teamName, sub_minute, sub_minute2, sub_playing_minutes2, 0])

                # Add minutes played by the second subbed in player
                sub_playing_minutes3 = _at_least_one_minute(fulltime_min - sub_minute2)
                minutes_rows.append([sub_in_id2, shortName_sub2, matchId, teamId, teamName, sub_minute2, fulltime_min, sub_playing_minutes3, 0])

            # Normal substitution
            else:
                # Add minutes played by the subbed in player
                sub_playing_minutes = _at_least_one_minute(fulltime_min - sub_minute)
                minutes_rows.append([sub_in_id, shortName_sub, matchId, teamId, teamName, sub_minute, fulltime_min, sub_playing_minutes, 0])

    # Report the progress every 50th match
    if j % 50 == 0:
        print(f"Number of matches checked for minutes: {j}\n")

df_minutes_played = pd.DataFrame(minutes_rows, columns=minutes_columns)
248 | 
249 | 
250 | 
#############################################################################
# - Adjust for red cards
"---------------------------------------------------------------------------"

# tag 1701 == red card, tag 1703 == second yellow card
red_card_tags = {1701, 1703}

# Minutes to add to the event time for each handled match period
period_start_min = {"1H": 0, "2H": 45}

# Filter out the fouls, assumed that red cards only exist as Foul-events
mask_fouls = df_Europe_events.eventName == "Foul"
df_fouls = df_Europe_events.loc[mask_fouls]

# Loop through the fouls to find matches and players with red cards
for i, foul_i in df_fouls.iterrows():

    # Skip fouls that did not lead to a sending off
    foul_tags = {foultag['id'] for foultag in foul_i['tags']}
    if not (foul_tags & red_card_tags):
        continue

    # A foul outside the two handled periods has no known minute. Report it
    # and skip it, instead of going on with an undefined minute or with the
    # minute left over from an earlier red card.
    if foul_i.matchPeriod not in period_start_min:
        print("Error" + str(foul_i.matchPeriod))
        continue

    # Get the red carded playerId and matchId
    red_carded_player = foul_i.playerId
    red_carded_match = foul_i.matchId

    # Find minute of the red card
    red_card_minute = period_start_min[foul_i.matchPeriod] + round(foul_i.eventSec / 60)

    # Find the row of the red carded player in that match. It is missing
    # when the player has no row in "df_minutes_played" for the match.
    mask_red_card_player_min = ((df_minutes_played.playerId == red_carded_player)
                                & (df_minutes_played.matchId == red_carded_match))
    df_red_card = df_minutes_played.loc[mask_red_card_player_min]
    if len(df_red_card) == 0:
        continue

    # The minute the red carded player got in
    red_card_player_in = df_red_card.player_in_min.values[0]

    # Adjust the dataframe "df_minutes_played" to add the red card info
    df_minutes_played.loc[mask_red_card_player_min, 'player_out_min'] = red_card_minute
    df_minutes_played.loc[mask_red_card_player_min, 'minutesPlayed'] = red_card_minute - red_card_player_in
    df_minutes_played.loc[mask_red_card_player_min, 'red_card'] = 1



#############################################################################
# - Save df_minutes to dataframe "minutes_played_All.json"
"---------------------------------------------------------------------------"


df_minutes_played.to_json("../Json_files/minutes_played_All.json")
307 | 
308 | # Test to load in and store as dataframe
309 | # with open('../Json_files/minutes_played_All.json') as f:
310 | #     data_minutes_new = json.load(f)
311 |     
312 | # df_test_new = pd.DataFrame(data_minutes_new)
313 | 
314 | 
315 | 
316 | 
317 | 
318 | 
319 | 
320 | 
321 | 
322 | 
323 | 
324 | 
325 | 
326 | 
327 | 
328 | 
329 | 
330 | 
331 | 
332 | 
333 | 
334 | 
335 | 
336 | 
337 | 
338 | 
339 | 
340 | 
341 | 


--------------------------------------------------------------------------------
/the_match_ranking.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Tue Sep 14 16:41:04 2021
  5 | 
  6 | @author: emildanielsson & JakobEP
  7 | 
  8 | Program description: 
  9 |    Find ratings of all players in the last round
 10 |    
 11 | Algorithm: 
 12 |     
 13 | """
 14 | 
 15 | 
 16 | # The basics
 17 | import pandas as pd
 18 | import numpy as np
 19 | import json
 20 | 
 21 | # Plotting
 22 | import matplotlib.pyplot as plt
 23 | from mplsoccer import FontManager
 24 | from mplsoccer import Pitch, VerticalPitch
 25 | 
 26 | # Import other functions
 27 | import fitting_functions as ff
 28 | import KPI_functions as kpi
 29 | 
 30 | # Statistical fitting of models
 31 | import statsmodels.api as sm
 32 | import statsmodels.formula.api as smf
 33 | from sklearn import preprocessing
 34 | from sklearn.preprocessing import MinMaxScaler
 35 | from sklearn.preprocessing import RobustScaler
 36 | from sklearn.model_selection import train_test_split
 37 | from sklearn.linear_model import LogisticRegression
 38 | 
 39 | # For tables
 40 | from tabulate import tabulate
 41 | 
 42 | # Ignore Future Warnings
 43 | import warnings
 44 | warnings.simplefilter(action='ignore', category=FutureWarning)
 45 | 
 46 | 
 47 | #%%
 48 | # - Load Fonts
 49 | "---------------------------------------------------------------------------"
 50 | 
 51 | URL1 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/'
 52 |         'fonts/SourceSerifPro-Regular.ttf?raw=true')
 53 | serif_regular = FontManager(URL1)
 54 | URL2 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/'
 55 |         'fonts/SourceSerifPro-ExtraLight.ttf?raw=true')
 56 | serif_extra_light = FontManager(URL2)
 57 | URL3 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/fonts/'
 58 |         'SourceSerifPro-Bold.ttf?raw=true')
 59 | serif_bold = FontManager(URL3)
 60 | 
 61 | 
 62 | #%%
 63 | # - Read in data KPI data
 64 | "---------------------------------------------------------------------------"
 65 | 
 66 | # Test to load in and store as dataframe per_90 dont have all collumns yet
 67 | # with open('Json_files/KPI_per_90_All.json') as f:
 68 | #     data_kpi = json.load(f)
 69 |     
 70 | with open('../Json_files/KPI_tot_All_v2.json') as f:
 71 |     data_kpi = json.load(f)
 72 |     
 73 | df_KPI = pd.DataFrame(data_kpi)
 74 | 
 75 | 
 76 | # Create match dataframes
 77 | df_England_matches = pd.read_json('../../Wyscout/matches/matches_England.json', encoding="unicode_escape")
 78 | 
 79 | 
 80 | #%%
 81 | # - Read in minutes played data
 82 | "---------------------------------------------------------------------------"
 83 | 
 84 | with open('../Json_files/minutes_played_All.json') as f:
 85 |     data_minutes = json.load(f)
 86 |     
 87 | df_minutes = pd.DataFrame(data_minutes)
 88 | 
 89 | 
 90 | #%%
 91 | # - Read PL events data, players and teams
 92 | "---------------------------------------------------------------------------"
 93 | 
 94 | # Create event dataframe for PL
 95 | df_events = pd.read_json('../Json_files/events_All.json', encoding="unicode_escape")
 96 | 
 97 | # Create players and teams dataframes
 98 | df_players = pd.read_json("../../Wyscout/players.json", encoding="unicode_escape")
 99 | df_teams = pd.read_json("../../Wyscout/teams.json", encoding="unicode_escape")
100 | 
101 | 
#%%
# - Read in data for xG-model and get the coefficients dataframes
"---------------------------------------------------------------------------"

with open('../Json_files/xG_model_v2_All_except_Eng.json') as xg_file:
    data_xG_model = json.load(xg_file)

# Create dataframes
df_xG_model = pd.DataFrame(data_xG_model)

# ff.xG_model hands back six values. Judging by the names they are bound to,
# these are three coefficient dataframes followed by three fitted models
# (shots, headers, free kicks) - confirm against fitting_functions.xG_model.
(
    df_log_model_shots_coef,
    df_log_model_headers_coef,
    df_log_model_free_kicks_coef,
    log_model,
    log_model_headers,
    log_model_free_kicks,
) = ff.xG_model(df_xG_model)
114 | 
115 | 
#%%
# - Set filter and scaler variables
"---------------------------------------------------------------------------"

# Minimum number of minutes used when filtering the players (can change)
min_minutes = 20

# Method for normalization, the alternatives are kept for reference
scaler = MinMaxScaler()
#scaler = preprocessing.QuantileTransformer(random_state=0)
#scaler = RobustScaler()


#%%
# - Create test and train dataset and preprocess data
"---------------------------------------------------------------------------"

# Gameweek that is held out from the PL KPIs
test_gameweek = 38

# Separate df_KPI between PL and the rest of the leagues
mask_PL = df_KPI['league'] == "England"
df_KPI_EU_train = df_KPI[~mask_PL]
df_KPI_PL = df_KPI[mask_PL]

# Matches of the held-out gameweek and their ids
is_test_gameweek = df_England_matches['gameweek'] == test_gameweek
df_PL_gameweek_38 = df_England_matches[is_test_gameweek]
list_gameweek_38_matchId = df_PL_gameweek_38['wyId'].unique().tolist()

# KPIs GW 1-37: drop the rows that belong to the held-out gameweek
mask_last_gameweeks = df_KPI_PL['matchId'].isin(list_gameweek_38_matchId)
df_KPI_PL = df_KPI_PL[~mask_last_gameweeks]
146 | 
147 | 
#%%
# - Let User choose a match to get ratings from
"---------------------------------------------------------------------------"

print("Choose match Id to get rankings from:\n")
for i, match in df_PL_gameweek_38.iterrows():
    print(match.label)
    print(f"matchId: {match.wyId}\n")

# Ids that can be looked up in df_England_matches, which is where the lineups
# and the score of the chosen match are read from further down.
known_match_ids = set(df_England_matches['wyId'])

# Keep asking until a known match id is entered. A mistyped id would
# otherwise only fail later, when the match data is looked up.
while True:
    print("Enter the match Id to look at: ")
    answer = input()
    #answer = "2500098"

    try:
        the_matchId = int(answer)
    except ValueError:
        print(f"'{answer}' is not a number, try again.\n")
        continue

    if the_matchId in known_match_ids:
        break
    print(f"No match with Id {the_matchId} was found, try again.\n")

# Find the match events
df_the_match_events = df_events.loc[df_events.matchId == the_matchId]

# Df with all own goals
df_own_goals = kpi.own_goals(df_the_match_events)
167 | 
168 | 
#%%
# - Create the KPI-dataframe from that match
"---------------------------------------------------------------------------"

# Columns describing the player and the match
match_info_columns = [
    'matchId', 'league', 'teamName', 'playerId', 'shortName', 'role',
    'minutesPlayed', 'team_goals', 'team_conceded_goals', 'red_card',
]

# Columns holding the KPI's
kpi_columns = [
    'goals', 'assists', 'passing%', 'completed_passes', 'fouls',
    'aerial%', 'aerial_wins', 'shots', 'dribbles%', 'succesful_dribbles',
    'key_passes', 'succesful_through_passes', 'plus_minus',
    'events_in_box', 'passes_to_box', 'creative_passes',
    'succesful_def_actions', 'progressive_carries',
    'xG_tot', 'xG_shots', 'xG_headers', 'xG_free_kicks', 'xG_penalties',
    'own_goals', 'yellow_cards', 'danger_ball_loses', 'def_actions%',
]

# Initiate the (empty) dataframe with the columns we need
df_the_match_KPI = pd.DataFrame(columns=match_info_columns + kpi_columns)
212 | 
213 | 
#%%
# - Find home and away score
"----------------------------------------------"

# Find teamIds in the match
teams_match_list = df_the_match_events['teamId'].unique().tolist()

# Find the match data from df_matches
mask_score = df_England_matches.wyId == the_matchId
df_the_match_info = df_England_matches.loc[mask_score]
team_data = df_the_match_info.teamsData.values[0]

# Get the list of players from events file
players_the_match = df_the_match_events['playerId'].unique().tolist()

# Same players as a set, for the membership tests below
players_with_events = set(players_the_match)

# Shortname lists
home_team_lineup = []
away_team_lineup = []
home_team_bench = []
away_team_bench = []

# playerIds list
home_team_list = []
away_team_list = []

# Lists to fill for each side: (playerIds, lineup names, bench names)
lists_by_side = {
    "home": (home_team_list, home_team_lineup, home_team_bench),
    "away": (away_team_list, away_team_lineup, away_team_bench),
}

# Look at the first two teams found in the events
for team_id in teams_match_list[:2]:
    team_data_i = team_data[str(team_id)]
    side = team_data_i['side']

    # Report and skip a team that is neither home nor away
    if side not in lists_by_side:
        print("Error: " + str(side))
        continue

    # Set home and away score. This is done once per team and does not
    # depend on the bench, so it is also set when the bench list is empty.
    if side == "home":
        home_team_score = team_data_i['score']
    else:
        away_team_score = team_data_i['score']

    # HERE COULD WE GET THE LINEUP POSITIONS

    team_ids, lineup_names, bench_names = lists_by_side[side]
    team_lineup = team_data_i['formation']['lineup']
    team_bench = team_data_i['formation']['bench']

    # Get the lineup players first and the bench players after them, keeping
    # only the players that have at least one event in the match
    for squad, squad_names in ((team_lineup, lineup_names),
                               (team_bench, bench_names)):
        for player in squad:
            player_id = player['playerId']
            if player_id in players_with_events:
                team_ids.append(player_id)
                shortName = df_players.loc[df_players.wyId == player_id].shortName.values[0]
                squad_names.append(shortName)
280 |         
281 | #%%
# Compute the KPIs from the chosen match
"----------------------------------------------"

# Loop through all players of the match, read their minutes/team information
# and compute their KPIs. One row per player is added to df_the_match_KPI.
for player in players_the_match:

    # Find the minutes played, team and red card
    mask_minutes = (df_minutes.playerId == player)
    df_player_minutes = df_minutes.loc[mask_minutes]

    # Some players are not registered as subbed in although their events are
    # registered. In that case "df_player_minutes" is empty and the player
    # is skipped.
    if len(df_player_minutes) == 0:
        continue

    # Take the first matching row by position (.iloc), which does not depend
    # on what the index labels of df_minutes happen to be.
    player_minutes = df_player_minutes['minutesPlayed'].iloc[0]
    player_in_min = df_player_minutes['player_in_min'].iloc[0]
    player_out_min = df_player_minutes['player_out_min'].iloc[0]
    player_team = df_player_minutes['teamId'].iloc[0]
    player_team_name = df_player_minutes['teamName'].iloc[0]
    red_card_bool = df_player_minutes['red_card'].iloc[0]

    # New dataframe with all events from 'player' in match
    df_events_player = df_the_match_events.loc[df_the_match_events.playerId == player]

    # Get the position of the player
    position = df_events_player['Position'].values[0]

    # Get the league
    league = df_events_player["league"].values[0]

    # Get the shortName
    name = df_events_player['shortName'].values[0]

    # Get the team goals and goals conceded
    if player in home_team_list:
        team_goals = home_team_score
        team_conceded_goals = away_team_score
    elif player in away_team_list:
        team_goals = away_team_score
        team_conceded_goals = home_team_score
    else:
        # NOTE(review): in this branch team_goals / team_conceded_goals keep
        # the values from the previous player (or are undefined for the
        # first player) - confirm whether such players should be skipped.
        print("Error: cant find player in list")

    ################################################
    # - Check for own goals from player in match
    # ----------------------------------------------

    # Number of own goals registered for the player (0 if there are none)
    df_own_goals_player = df_own_goals.loc[df_own_goals.playerId == player]
    own_goals = len(df_own_goals_player)

    ################################################
    # - All function calls to compute kpi's
    # - (Should maybe try to use df.loc[mask, column] = instead)
    # ----------------------------------------------

    # goals
    goals, goals_info = kpi.nr_goals(df_events_player, player_minutes)

    # assists
    assists, assists_info = kpi.nr_assists(df_events_player, player_minutes)

    # passing%
    pass_percent, pass_percent_info = kpi.percent_passes_completed(df_events_player, player_minutes)

    # passes_completed
    pass_comp, pass_comp_p90, pass_comp_info = kpi.passes_completed(df_events_player, player_minutes)

    # fouls
    fouls, fouls_p90, fouls_info = kpi.fouls(df_events_player, player_minutes)

    # aerials%
    aerials_percent, aerials_percent_info = kpi.percent_aerial_wins(df_events_player, player_minutes)

    # aerials_won
    aerial_wins, aerial_wins_p90, aerial_wins_info = kpi.aerials_won(df_events_player, player_minutes)

    # shots
    shots, shots_p90, shots_info = kpi.shots(df_events_player, player_minutes)

    # dribbles%
    dribbles_percent, dribbles_percent_info = kpi.percent_succesful_dribbles(df_events_player, player_minutes)

    # succesful_dribbles
    succesful_dribbles, succesful_dribbles_p90, succesful_dribbles_info = kpi.succesful_dribbles(df_events_player, player_minutes)

    # key_passes
    key_passes, key_passes_p90, key_passes_info = kpi.key_passes(df_events_player, player_minutes)

    # succesful_through_passes
    succesful_through_passes, succesful_through_passes_p90, succesful_through_passes_info = kpi.succesful_through_passes(df_events_player, player_minutes)

    # plus-minus (uses all match events, not only the player's own)
    plus_minus, plus_minus_info = kpi.plus_minus(df_the_match_events, player_team, player_minutes, player_in_min, player_out_min)

    # events_in_box
    events_in_box, events_in_box_p90, event_in_box_info = kpi.events_in_box(df_events_player, player_minutes)

    # passes_to_box
    passes_to_box, passes_to_box_p90, passes_to_box_info = kpi.passes_to_box(df_events_player, player_minutes)

    # creative_passes
    creative_passes, creative_passes_p90, creative_passes_info = kpi.creative_passes(df_events_player, player_minutes)

    # defensive_actions
    succesful_def_actions, succesful_def_actions_p90, succesful_def_actions_info = kpi.succesful_def_actions(df_events_player, player_minutes)

    # progressive_carries
    progressive_carries, progressive_carries_p90, progressive_carries_info = kpi.progressive_carries(df_events_player, player_minutes)

    # xG
    xG_tot, xG_tot_p90, xG_info, xG_shots, xG_headers, xG_free_kicks, xG_penalties = kpi.xG(df_events_player, player_minutes, df_log_model_shots_coef, df_log_model_headers_coef, df_log_model_free_kicks_coef)

    # danger_ball_loses
    danger_ball_loses, danger_ball_loses_p90, danger_ball_loses_info = kpi.danger_ball_loses(df_events_player, player_minutes)

    # yellow_cards
    yellow_cards, yellow_cards_info = kpi.yellow_cards(df_events_player)

    # percent_def_actions
    percent_def_actions, percent_def_actions_info = kpi.percent_def_actions(df_events_player, player_minutes)

    ########################################################
    # - Add the player's row to df_the_match_KPI
    # ------------------------------------------------------
    # The order of the values must match the column order of df_the_match_KPI
    df_the_match_KPI.loc[df_the_match_KPI.shape[0]] = [the_matchId, league, player_team_name, player, name,
                                   position, player_minutes, team_goals,
                                   team_conceded_goals, red_card_bool,
                                   goals,
                                   assists,
                                   pass_percent,
                                   pass_comp,
                                   fouls,
                                   aerials_percent,
                                   aerial_wins,
                                   shots,
                                   dribbles_percent,
                                   succesful_dribbles,
                                   key_passes,
                                   succesful_through_passes,
                                   plus_minus,
                                   events_in_box,
                                   passes_to_box,
                                   creative_passes,
                                   succesful_def_actions,
                                   progressive_carries,
                                   xG_tot,
                                   xG_shots,
                                   xG_headers,
                                   xG_free_kicks,
                                   xG_penalties,
                                   own_goals,
                                   yellow_cards,
                                   danger_ball_loses,
                                   percent_def_actions
                                   ]
447 |     
448 | #%%
# - Add the team-level columns team_xG, opponent_xG and possession, plus the
# - possession-adjusted defensive actions of every player
"---------------------------------------------------------------------------"
# Names of the teams that have rows in the match KPI dataframe
list_teams = df_the_match_KPI["teamName"].unique().tolist()

for team in list_teams:

    # Rows of this team and rows of everybody else (the opponent)
    mask_team = df_the_match_KPI.teamName == team
    df_team = df_the_match_KPI.loc[mask_team]
    df_opponent = df_the_match_KPI.loc[~mask_team]

    # Summed xG per side (team/opponent shots were considered as well but
    # are currently not used)
    team_xG = df_team["xG_tot"].sum()
    opponent_xG = df_opponent["xG_tot"].sum()

    # Possession is approximated by each side's share of the completed passes
    team_passes = df_team['completed_passes'].sum()
    opponent_passes = df_opponent['completed_passes'].sum()
    tot_game_passes = team_passes + opponent_passes
    team_possesion = team_passes / tot_game_passes
    opponent_possesion = opponent_passes / tot_game_passes

    # Possession-adjusted defensive actions: the player's successful
    # defensive actions divided by the opponent's possession share
    for _, team_player in df_team.iterrows():
        mask_player = (df_the_match_KPI.playerId == team_player.playerId)
        def_actions = df_the_match_KPI.loc[mask_player].succesful_def_actions.values[0]
        df_the_match_KPI.loc[mask_player, 'p_adj_succ_def_actions'] = def_actions / opponent_possesion

    # Write the team-level values to every row of the team
    team_values = {
        'team_xG': team_xG,
        'opponent_xG': opponent_xG,
        'team_possesion': team_possesion,
        'opponent_possesion': opponent_possesion,
    }
    mask_add_xG = (df_the_match_KPI.teamName == team)
    for column, value in team_values.items():
        df_the_match_KPI.loc[mask_add_xG, column] = value
491 | 
492 | #%%
# - Rank the players
"---------------------------------------------------------------------------"

# Merge the KPIs from the chosen match with the KPIs from gameweek 1-37.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# NOTE: re-running this cell adds the chosen match to df_KPI_PL again.
df_KPI_PL = pd.concat([df_KPI_PL, df_the_match_KPI], ignore_index=True)

# Positions to fit for
positions_fitting = [['LB', 'RB'], ['CB'], ['LM', 'RM'], ['CM'], ['LW', 'RW'], ['ST']]

# Initiate rating and info dataframe
df_final_rating = pd.DataFrame(columns = ['matchId', 'teamName', 'playerId',
                                          'shortName', 'position', 'tot_rating',
                                          'match_events_rating', 'fitting_rating_off',
                                          'fitting_rating_def',
                                          'final_rating', 'match_info',
                                          'gameweek'])


def _rate_players(df_players_kpi, position, fit_off, fit_def):
    """Compute the raw ratings for every row (player-match) of a KPI dataframe.

    Parameters
    ----------
    df_players_kpi : DataFrame with one filtered and scaled KPI row per player.
    position : the position group (list of position names) being rated.
    fit_off, fit_def : the (model_coef, r_squared, list_kpi_fitting) tuples
        returned by ff.KPI_fitting for 'team_xG' and 'opponent_xG'.

    Returns
    -------
    DataFrame indexed like df_players_kpi with the columns matchId, teamName,
    playerId, shortName, fitting_rating_off, fitting_rating_def,
    match_events_rating and tot_fit_rating.
    """
    model_coef_off, r_squared_off, list_kpi_off_fitting = fit_off
    model_coef_def, r_squared_def, list_kpi_def_fitting = fit_def

    df_out = pd.DataFrame()

    for i, player in df_players_kpi.iterrows():

        # Add some info to dataframe
        df_out.loc[i, 'matchId'] = player['matchId']
        df_out.loc[i, 'teamName'] = player['teamName']
        df_out.loc[i, 'playerId'] = player['playerId']
        df_out.loc[i, 'shortName'] = player['shortName']

        # xG-fit: fitted value multiplied by r_squared (how good the fit was)
        xG_fitting_rating_off = ff.compute_fitting_ratings(player, model_coef_off, list_kpi_off_fitting)
        xG_fitting_rating_off = xG_fitting_rating_off * r_squared_off
        df_out.loc[i, 'fitting_rating_off'] = xG_fitting_rating_off

        # opponent_xG-fit (xGC): fitted value multiplied by r_squared
        xGC_fitting_rating_def = ff.compute_fitting_ratings(player, model_coef_def, list_kpi_def_fitting)
        xGC_fitting_rating_def = xGC_fitting_rating_def * r_squared_def
        df_out.loc[i, 'fitting_rating_def'] = xGC_fitting_rating_def

        # Match event-rating
        df_out.loc[i, 'match_events_rating'] = ff.compute_events_rating(player, position, df_KPI)

        # Total fitting rating: offensive fit minus defensive (conceded) fit
        df_out.loc[i, 'tot_fit_rating'] = xG_fitting_rating_off - xGC_fitting_rating_def

    return df_out


# Do fitting for all the positions
for position in positions_fitting:
    # print(position)

    ################################################
    # - KPIs to fit for
    # (The lists are rebuilt for every position so that each call to
    #  ff.KPI_fitting receives a fresh list.)
    "----------------------------------------------"

    list_kpi_all = ['passing%',
            'completed_passes',
            'fouls',
            'aerial%',
            'aerial_wins',
            'shots',
            'dribbles%',
            'succesful_dribbles',
            'key_passes',
            'succesful_through_passes',
            'events_in_box',
            'passes_to_box',
            'creative_passes',
            'succesful_def_actions',
            'progressive_carries',
            'red_card',
            'own_goals',
            'yellow_cards',
            'danger_ball_loses',
            'def_actions%',
            'p_adj_succ_def_actions'
            ]

    # KPIs when using KPI_tot_All (commented entries are excluded from the fit)
    list_kpi_off = ['passing%',
                'completed_passes',
                'fouls',
                #'aerial%',
                #'aerial_wins',
                'shots',
                'dribbles%',
                #'succesful_dribbles',
                'key_passes',
                #'succesful_through_passes',
                'events_in_box',
                'passes_to_box',
                #'creative_passes',
                #'succesful_def_actions',
                #'progressive_carries',
                'red_card',
                'own_goals',
                'yellow_cards',
                'danger_ball_loses',
                #'def_actions%',
                'p_adj_succ_def_actions'
                ]

    list_kpi_def = ['passing%',
                'completed_passes',
                'fouls',
                #'aerial%',
                #'aerial_wins',
                #'shots',
                'dribbles%',
                #'succesful_dribbles',
                #'key_passes',
                #'succesful_through_passes',
                #'events_in_box',
                #'passes_to_box',
                #'creative_passes',
                #'succesful_def_actions',
                #'progressive_carries',
                'red_card',
                'own_goals',
                'yellow_cards',
                'danger_ball_loses',
                #'def_actions%',
                'p_adj_succ_def_actions'
                ]

    # # KPIs when using per_90_All
    # list_kpi_p90 = ['passing%',
    #             'completed_passes_p90',
    #             'fouls_p90',
    #             'aerial%',
    #             'aerial_wins_p90',
    #             'shots_p90',
    #             'dribbles%',
    #             'succesful_dribbles_p90',
    #             'key_passes_p90',
    #             'succesful_through_passes_p90',
    #             'events_in_box_p90',
    #             'passes_to_box_p90',
    #             'creative_passes_p90',
    #             'succesful_def_actions_p90',
    #             'progressive_carries_p90',
    #             'red_card',
    #             'own_goals',
    #             'yellow_cards',
    #             'danger_ball_loses',
    #             'def_actions%'
    #             ]

    ################################################
    # - Fit the models on the training data
    "----------------------------------------------"
    # Offensive fit: coefficients and independent variables for team_xG
    dep_var_off = 'team_xG'
    model_coef_off, r_squared_off, list_kpi_off_fitting = ff.KPI_fitting(df_KPI_EU_train, scaler,
                                                  list_kpi_off, dep_var_off,
                                                  position, min_minutes)

    # Defensive fit: coefficients and independent variables for opponent_xG
    dep_var_def = 'opponent_xG'
    model_coef_def, r_squared_def, list_kpi_def_fitting = ff.KPI_fitting(df_KPI_EU_train, scaler,
                                                  list_kpi_def, dep_var_def,
                                                  position, min_minutes)

    fit_off = (model_coef_off, r_squared_off, list_kpi_off_fitting)
    fit_def = (model_coef_def, r_squared_def, list_kpi_def_fitting)

    ################################################
    # - Use the coefficients from EU to compute percentiles
    #   in the PL gameweek 1-37, filtered PL training data
    "----------------------------------------------"

    # Filter and normalise the PL data (including the chosen match)
    df_filtered_PL = ff.filter_dataframe(df_KPI_PL, position, list_kpi_all, min_minutes, 1)
    df_filtered_PL[list_kpi_all] = scaler.fit_transform(df_filtered_PL[list_kpi_all])

    # KPIs GW 1-37 (everything except the chosen match)
    df_KPI_PL_train = df_filtered_PL.loc[~(df_filtered_PL.matchId == the_matchId)]

    # Ratings for GW 1-37
    df_ratings = _rate_players(df_KPI_PL_train, position, fit_off, fit_def)

    # Find percentiles from the ratings in gameweek 1-37 PL
    percentiles = np.arange(0.01, 1, 0.01)
    percentiles_fit = df_ratings['tot_fit_rating'].quantile(percentiles)
    percentiles_events = df_ratings['match_events_rating'].quantile(percentiles)

    ################################################
    # - Compute the ratings of the chosen match for the position
    "----------------------------------------------"
    # KPIs of the chosen match
    df_the_match_KPI_players = df_filtered_PL.loc[df_filtered_PL.matchId == the_matchId]

    # Ratings for the players of the chosen match
    df_ratings_test = _rate_players(df_the_match_KPI_players, position, fit_off, fit_def)

    # Modify the df_ratings_test dataframe and the chosen-match dataframe
    # (the return value is not used; df_ratings_test is passed on below)
    ff.create_rating_dataframe(df_ratings_test, df_KPI_PL, df_the_match_KPI_players,
                               percentiles_fit, percentiles_events, df_England_matches)

    # Merge to the raw rating dataframe
    frames = [df_final_rating, df_ratings_test]
    df_final_rating = pd.concat(frames)
762 | 
763 | 
764 | #%%
# - Show the ratings
"---------------------------------------------------------------------------"
# Headline: the match information stored on the first rated player
print(df_final_rating['match_info'].values[0])

# Rating table with one row per rated player
table = df_final_rating.loc[:, ['teamName', 'shortName', 'position', 'final_rating']]
print(tabulate(table))
770 | 
771 | 
772 | 
773 | #%%
# - Prepare the pitch plot
# - The ACTUAL position of every starting player is entered manually
"---------------------------------------------------------------------------"

# Positions that are accepted as input
positions = ['GK', 'CB', 'LCB', 'RCB', 'LB', 'RB', 'LWB', 'RWB', 'CM',
             'LCM', 'RCM', 'CAM', 'LM',
             'RM', 'LW', 'RW', 'ST', 'LST', 'RST']

# Show both starting lineups, home team first
for heading, lineup in (("Here is the home team lineup:", home_team_lineup),
                        ("Here is the away team lineup:", away_team_lineup)):
    print(heading)
    for player in lineup:
        print(player)

print("Now enter the position for each player in that game.")
print(f"The positions to choose from are the following: \n{positions}")

# Ask for the position of every starting player, home team first, and store
# it on the player's row(s) in the rating dataframe
for heading, lineup in (("HOME TEAM:", home_team_lineup),
                        ("AWAY TEAM:", away_team_lineup)):
    print(heading)
    for player in lineup:
        # Keep asking until one of the accepted positions is entered
        while True:
            print(f"Write the position for: {player}")
            position = input()
            if position in positions:
                break
            print("NOT A VALID POSITION!")
        mask_player = df_final_rating.shortName == player
        df_final_rating.loc[mask_player, 'position'] = position
814 |     
815 | #%%
816 | 
# Work on a copy of the final ratings (the copy only exists for testing
# purposes)
df_plot_ratings = df_final_rating.copy()

# Draw the final ratings on a pitch
ff.plot_pitch_ratings(
    df_plot_ratings,
    home_team_lineup,
    home_team_bench,
    away_team_lineup,
    away_team_bench,
)
822 | 
823 | 


--------------------------------------------------------------------------------
/validation_vs_WhoScored.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Tue Sep 14 16:41:04 2021
  5 | 
  6 | @author: emildanielsson & JakobEP
  7 | 
  8 | Program description: 
  9 |    Finds ratings of all players in the last round and compares them with 
 10 |    read in ratings from WhoScored (for the same matches in last round).
 11 |    
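 12 |    OBS! Make sure to run GW_38_Ratings.py first; this script then reads the sheets 'WhoScored', 'result_pre_tune' and 'result_post_tune' from ../Gameweek_38.xlsx.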
 13 |     
 14 | """
 15 | 
 16 | # The basics
 17 | import pandas as pd
 18 | import numpy as np
 19 | import json
 20 | 
 21 | 
 22 | #%%
 23 | # - Read Excels
 24 | # - 
 25 | # - Make sure to choose the correct sheets
 26 | # - 
 27 | "---------------------------------------------------------------------------"
 28 | 
 29 | # Specify the path to the xlsx-file
 30 | excel_path = "../Gameweek_38.xlsx"
 31 | 
 32 | df_WhoScored = pd.read_excel(open(excel_path, 'rb'),
 33 |               sheet_name='WhoScored')  
 34 | 
 35 | df_pre_tune = pd.read_excel(open(excel_path, 'rb'),
 36 |               sheet_name='result_pre_tune') 
 37 | 
 38 | df_post_tune = pd.read_excel(open(excel_path, 'rb'),
 39 |               sheet_name='result_post_tune') 
 40 | 
 41 | 
 42 | 
 43 | #%%
 44 | # - Create validation dataframe
 45 | "---------------------------------------------------------------------------"
 46 | df_validation = pd.DataFrame()
 47 | 
 48 | # Find all the teams
 49 | teams = df_WhoScored.teamName.unique().tolist()
 50 | 
 51 | # Loop through teams and add theri "team_rating"
 52 | for team in teams:
 53 |     
 54 |     # Whoscored frame sorted
 55 |     df_WhoScored_team = df_WhoScored.loc[df_WhoScored.teamName == team]
 56 |     df_WhoScored_team = df_WhoScored_team.sort_values(by='Rating', ascending=False)
 57 |     WhoScored_players = df_WhoScored_team.shortName.values.tolist()
 58 |     
 59 |     # df_pre_tune frame sorted
 60 |     df_pre_tune_team = df_pre_tune.loc[df_pre_tune.teamName == team]
 61 |     df_pre_tune_team = df_pre_tune_team.sort_values(by='final_rating', ascending=False)
 62 |     pre_tune_players = df_pre_tune_team.shortName.values.tolist()
 63 |     
 64 |     # df_ost_tune frame sorted
 65 |     df_post_tune_team = df_post_tune.loc[df_post_tune.teamName == team]
 66 |     df_post_tune_team = df_post_tune_team.sort_values(by='final_rating', ascending=False)
 67 |     post_tune_players = df_post_tune_team.shortName.values.tolist()
 68 |     
 69 |     for i, player in df_WhoScored_team.iterrows():
 70 |         playerName = player.shortName
 71 |         df_validation.loc[i, 'shortName'] = playerName
 72 |         df_validation.loc[i, 'Position'] = player.position
 73 |         df_validation.loc[i, 'teamName'] = player.teamName
 74 |         df_validation.loc[i, 'WhoScored'] = WhoScored_players.index(playerName) + 1
 75 |         df_validation.loc[i, 'pre_tune'] = pre_tune_players.index(playerName) + 1
 76 |         df_validation.loc[i, 'post_tune'] = post_tune_players.index(playerName) + 1 
 77 |         
 78 | 
 79 | #%%
 80 | # - Validate all players 
 81 | "---------------------------------------------------------------------------"
 82 | 
 83 | score_pre = 0
 84 | score_post = 0
 85 | nr_of_players = len(df_validation)
 86 | for i, player in df_validation.iterrows():
 87 |     score_pre += abs(player.WhoScored - player.pre_tune) 
 88 |     score_post += abs(player.WhoScored - player.post_tune) 
 89 |     
 90 | # Divide by the number of players (average "false" in comparison to WhoScored)
 91 | score_pre = score_pre / nr_of_players
 92 | score_post = score_post / nr_of_players
 93 |     
 94 | # Print Validation for all players
 95 | print("All Players validation:")
 96 | print(f"pre tuning score = {score_pre}")
 97 | print(f"post score = {score_post}\n")
 98 | 
 99 | 
100 | 
101 | #%%
# - Validate Positions
"---------------------------------------------------------------------------"

# Position groups to validate
positions = [['LB', 'RB'], ['CB'], ['LM', 'RM'], ['CM'], ['LW', 'RW'], ['ST']]

for position in positions:
    # Players of the validation dataframe that belong to the position group
    df_validate = df_validation.loc[df_validation.Position.isin(position)]
    nr_of_players = len(df_validate)

    print(f"Validation {position}")

    # A group without any players has no average; report it instead of
    # dividing by zero
    if nr_of_players == 0:
        print("no players found for this position group \n")
        continue

    # Summed rank difference to WhoScored, before and after tuning
    score_pre = 0
    score_post = 0
    for i, player in df_validate.iterrows():
        score_pre += abs(player.WhoScored - player.pre_tune)
        score_post += abs(player.WhoScored - player.post_tune)

    # Divide by the number of players (average "error" compared to WhoScored)
    score_pre = score_pre / nr_of_players
    score_post = score_post / nr_of_players

    # Print the validation for the position group
    print(f"pre tuning score = {score_pre}")
    print(f"post score = {score_post} \n")
125 |         
126 | 
127 | #%%
128 | # - Write validation results to Excel document
129 | "---------------------------------------------------------------------------"
130 | 
131 | # with pd.ExcelWriter("../Gameweek_38.xlsx", mode="a", engine="openpyxl", if_sheet_exists = "new") as writer:
132 | #     df_validation.to_excel(writer, sheet_name="WhoScored_Validation",
133 | #                             columns=['shortName', 'Position', 'teamName', 'WhoScored', 'pre_tune', 'post_tune'],
134 | #                     header=True, index=False)


--------------------------------------------------------------------------------
/xG_model_evaluation.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Fri Sep 17 14:42:31 2021
  5 | 
  6 | @author: emildanielsson & JakobEP
  7 | 
  8 | Program description:
  9 |     
 10 |     Evaluating and validating created xG-models with test-data.
 11 | 
 12 | """
 13 | 
 14 | #%%
 15 | # - Imports used
 16 | "---------------------------------------------------------------------------"
 17 | 
 18 | # Basics
 19 | import pandas as pd
 20 | import numpy as np
 21 | import json
 22 | 
 23 | # Plotting
 24 | import matplotlib.pyplot as plt
 25 | import seaborn as sns
 26 | from mplsoccer import FontManager
 27 | 
 28 | # Import other functions
 29 | import fitting_functions as ff
 30 | import FCPython 
 31 | 
 32 | # Statistical fitting of models
 33 | from sklearn import metrics
 34 | 
 35 | 
 36 | #%%
 37 | # - Plot settings
 38 | "---------------------------------------------------------------------------"
 39 | 
 40 | # Read in fonts
 41 | URL1 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/'
 42 |         'fonts/SourceSerifPro-Regular.ttf?raw=true')
 43 | serif_regular = FontManager(URL1)
 44 | 
 45 | URL2 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/'
 46 |         'fonts/SourceSerifPro-ExtraLight.ttf?raw=true')
 47 | serif_extra_light = FontManager(URL2)
 48 | URL3 = ('https://github.com/googlefonts/SourceSerifProGFVersion/blob/main/fonts/'
 49 |         'SourceSerifPro-Bold.ttf?raw=true')
 50 | serif_bold = FontManager(URL3)
 51 | 
 52 | 
 53 | #%%
 54 | # - Create dataframes from the Wyscout data, uncomment if needed
 55 | "---------------------------------------------------------------------------"
 56 | 
 57 | """
 58 | 
 59 | # Create event dataframe
 60 | with open('../Json_files/events_All.json') as f:
 61 |     data_Europe = json.load(f)
 62 |     
 63 | df_Europe_events = pd.DataFrame(data_Europe)
 64 | 
 65 | 
 66 | # Filter out events for England
 67 | df_PL_events = df_Europe_events[df_Europe_events.league == 'England']
 68 | 
 69 | # Save as .json-file (so it can be read in directly in the future)
 70 | df_PL_events.to_json("../Json_files/events_PL.json")
 71 | 
 72 | """
 73 | 
 74 | #%%
 75 | # - Read in event data for PL, uncomment if needed
 76 | "---------------------------------------------------------------------------" 
 77 | """
 78 | # Create event dataframe
 79 | with open('../Json_files/events_PL.json') as f:
 80 |     data_PL = json.load(f)
 81 |     
 82 | df_PL_events = pd.DataFrame(data_PL)
 83 | 
 84 | """
 85 | 
 86 | #%%
 87 | # - Read in data for xG-model
 88 | "---------------------------------------------------------------------------"  
 89 | 
 90 | with open('../Json_files/xG_model_v2_All_except_Eng.json') as f:
 91 |     data_xG_model_All = json.load(f)
 92 |     
 93 |     
 94 | with open('../Json_files/xG_model_v2_England_only.json') as f:
 95 |     data_xG_model_PL = json.load(f)
 96 | 
 97 | 
 98 | # Create dataframes
 99 | df_All_xG_model = pd.DataFrame(data_xG_model_All)  
100 | df_PL_xG_model = pd.DataFrame(data_xG_model_PL)
101 | 
102 | 
#%%
# - Get the coefficient dataframes
"---------------------------------------------------------------------------"

# Run ff.xG_model on the data loaded from xG_model_v2_All_except_Eng.json.
# It returns six objects, unpacked here as three coefficient dataframes
# followed by three model objects (shots, headers, free kicks).
(
    df_log_model_shots_coef,
    df_log_model_headers_coef,
    df_log_model_free_kicks_coef,
    log_model_shots,
    log_model_headers,
    log_model_free_kicks,
) = ff.xG_model(df_All_xG_model)
109 | 
110 | 
111 | #%%
112 | # - Print out fitting results
113 | "---------------------------------------------------------------------------"
114 | 
115 | #print("\n=============== xG-model shots  ======================")
116 | 
117 | #print(log_model_shots)
118 | 
119 | 
120 | #%%
121 | # - Filter out headers and freekicks for PL data
122 | "---------------------------------------------------------------------------"
123 | 
# Boolean row selectors for the two special shot types
mask_headers = df_PL_xG_model['header'].eq(1)
mask_free_kicks = df_PL_xG_model['free_kick'].eq(1)

# Headers and free kicks each get their own frame ...
df_xG_headers = df_PL_xG_model.loc[mask_headers]
df_xG_free_kicks = df_PL_xG_model.loc[mask_free_kicks]

# ... and whatever is neither of the two is treated as a regular shot
df_xG_shots = df_PL_xG_model.loc[~(mask_headers | mask_free_kicks)]
130 | 
131 | 
132 | #%%
133 | # - Split data - PL
134 | "---------------------------------------------------------------------------" 
135 | 
# Target: the goal column, kept as a one-column dataframe
y_testSet = df_xG_shots[['goal']].copy()                      # change df

# Features: distance and angle, plus distance squared as a third column.
# NOTE(review): the resulting column order is distance, angle_rad,
# distance_sq - presumably this has to match the order used when the model
# was fitted in ff.xG_model; confirm against fitting_functions.
squaredD = df_xG_shots['distance']**2
x_testSet = df_xG_shots[['distance', 'angle_rad']].assign(distance_sq=squaredD)     # change df
142 | 
143 | 
144 | #%%
145 | # - Make predictions - PL
146 | "---------------------------------------------------------------------------" 
147 | 
# Find prediction probabilities
y_pred_prob = log_model_shots.predict_proba(x_testSet)         # change model

# Column 1 is used as the goal probability throughout
# (presumably class 1 = goal; log_model_shots.classes_ is not shown here)
goal_prob = y_pred_prob[:, 1]

# Specify thresholds (kept as one-element lists; threshold4[0] is read
# again when the confusion-matrix figure is titled)
threshold5, threshold4, threshold2, threshold05 = [0.5], [0.4], [0.2], [0.05]

# Final predictions: 1.0 where the goal probability exceeds the threshold
y_pred5 = (goal_prob > threshold5).astype('float')
y_pred2 = (goal_prob > threshold2).astype('float')
y_pred05 = (goal_prob > threshold05).astype('float')
y_pred4 = (goal_prob > threshold4).astype('float')

# Predictions used for the printed summary statistics
y_pred = (goal_prob > threshold4).astype('float')          # change

# Get one confusion matrix per threshold
cnf_matrix5, cnf_matrix2, cnf_matrix05, cnf_matrix4 = (
    metrics.confusion_matrix(y_testSet, predicted)
    for predicted in (y_pred5, y_pred2, y_pred05, y_pred4)
)
172 | 
173 | #%%
174 | # - Visualize the confusion matrix using Heatmap - PL
175 | "---------------------------------------------------------------------------"
176 | 
class_names = [0, 1]  # the two outcome classes

fig, ax = plt.subplots(figsize=(8, 6))
tick_marks = np.arange(len(class_names))
for place_ticks in (plt.xticks, plt.yticks):
    place_ticks(tick_marks, class_names)

# Draw the confusion matrix for the selected threshold as an annotated heatmap
sns.heatmap(
    pd.DataFrame(cnf_matrix4),                                                  # change
    annot=True,
    annot_kws={"size": 14},
    cmap="Oranges",
    fmt='g',
    cbar=False,
    linewidths=2,
    linecolor='orange',
)
ax.xaxis.set_label_position("top")
plt.tight_layout()

# Shared text styling for the axis labels and the tick labels
axis_label_style = dict(fontweight='bold', fontsize=18, fontproperties=serif_regular.prop)
tick_label_style = dict(fontsize=14, fontweight='bold', fontproperties=serif_regular.prop)

plt.title(f'Confusion matrix for Threshold: {threshold4[0]}',                   # change
          y=1.1, fontweight='bold', fontsize=20, fontproperties=serif_regular.prop)
plt.ylabel('Actual label', **axis_label_style)
plt.xlabel('Predicted label', **axis_label_style)

# Set ticks size
plt.xticks(**tick_label_style)
plt.yticks(**tick_label_style)
196 | 
197 | 
198 | #%%
199 | # - Print stats - PL
200 | "---------------------------------------------------------------------------"
201 | 
# Recall - the proportion of the actually positive points that are predicted
# as positive. A high recall (close to 1) is good, and a low recall (close
# to 0) indicates a problem with false negatives.
#
# Precision - the proportion of the points predicted as positive that are
# actually positive. A high precision (close to 1) is good, and a low
# precision (close to 0) indicates a problem with false positives.


def _sensitivity_specificity(cm):
    """Return (sensitivity, specificity) for a 2x2 confusion matrix.

    ``cm`` is indexed as cm[actual][predicted], the layout produced by
    metrics.confusion_matrix for the labels 0 and 1.
    """
    (true_neg, false_pos), (false_neg, true_pos) = cm
    sensitivity = true_pos / (true_pos + false_neg)
    specificity = true_neg / (false_pos + true_neg)
    return sensitivity, specificity


# One confusion matrix per threshold
cm_5 = metrics.confusion_matrix(y_testSet, y_pred5)
cm_2 = metrics.confusion_matrix(y_testSet, y_pred2)
cm_05 = metrics.confusion_matrix(y_testSet, y_pred05)
cm_4 = metrics.confusion_matrix(y_testSet, y_pred4)

# sensitivity = the ability of the model to correctly identify shots that
#               resulted in a goal
# specificity = the ability of the model to correctly identify shots that
#               did not result in a goal
sensitivity_5, specificity_5 = _sensitivity_specificity(cm_5)
sensitivity_2, specificity_2 = _sensitivity_specificity(cm_2)
sensitivity_05, specificity_05 = _sensitivity_specificity(cm_05)
sensitivity_4, specificity_4 = _sensitivity_specificity(cm_4)

print("\n=============== xG-model performance  ======================")

# y_pred holds the predictions for the threshold chosen in the prediction cell
print("Accuracy:", metrics.accuracy_score(y_testSet, y_pred))
print("Precision:", metrics.precision_score(y_testSet, y_pred))
print("Recall:", metrics.recall_score(y_testSet, y_pred))

print('sensitivity = ' + str(sensitivity_4))                                                   # change
print('specificity = '+ str(specificity_4) )                                                   # change

# NOTE(review): r2_score is a regression metric and is evaluated here on the
# thresholded 0/1 predictions - confirm this is the intended quantity.
print("R-sq. score:", metrics.r2_score(y_testSet, y_pred, sample_weight=None, multioutput='uniform_average'))
236 | 
237 | # OR
238 | """
239 | 
240 | cm_display = ConfusionMatrixDisplay(cm_dis_3).plot(cmap='OrRd')
241 | cm_display.im_.colorbar.remove()
242 | plt.title('Confusion Matrix for Threshold = 0.3')
243 | """
244 | 
245 | #%%
246 | # - Plot ROC-curve - PL
247 | "---------------------------------------------------------------------------"
248 | 
from sklearn.metrics import roc_curve

fig, axes = plt.subplots(figsize=(11, 7))

# ROC curve from the model's decision scores on the test set
y_score = log_model_shots.decision_function(x_testSet)                                    # change model
fpr, tpr, _ = roc_curve(y_testSet, y_score, pos_label=log_model_shots.classes_[1])        # change model
plt.plot(fpr, tpr, label='ROC for xG-model shots')

# Mark the operating point of every evaluated threshold on the curve
operating_points = (
    (specificity_5, sensitivity_5, 'orange', 'Threshold = 0.5'),
    (specificity_2, sensitivity_2, 'red', 'Threshold = 0.2'),
    (specificity_05, sensitivity_05, 'green', 'Threshold = 0.05'),
    (specificity_4, sensitivity_4, 'purple', 'Threshold = 0.4'),
)
for spec, sens, marker_colour, legend_text in operating_points:
    plt.scatter(1 - spec, sens, c=marker_colour, s=100, label=legend_text)

# Diagonal reference line
y_45 = np.linspace(0, 1, 100)
plt.plot(y_45, y_45, linestyle='dashed', c='cyan', label='random guess')

plt.legend(prop={"family": "Times New Roman", "size": 12})
plt.xlim([0, 1])
plt.ylim([0, 1])

roc_label_style = dict(fontweight='bold', fontsize=16, fontproperties=serif_regular.prop)
plt.xlabel('False Positive Rate (1 - Specificity)', **roc_label_style)
plt.ylabel('True Positive Rate (Sensitivity)', **roc_label_style)
plt.title('ROC Curve', fontweight='bold', fontsize=24, fontproperties=serif_regular.prop)
268 | 
269 | 
270 | #%%
271 | # - Evaluate xG-model by plotting
272 | "---------------------------------------------------------------------------"
273 | 
# Pull the four shot-model parameters out of the coefficient dataframe by row
# position: row 0 -> angle, row 1 -> distance squared, row 2 -> distance,
# row 3 -> intercept (first column of each row).
# NOTE(review): this relies on the row order produced by ff.xG_model, which
# is not visible here - confirm against fitting_functions.
coef_angle, coef_distance_sq, coef_distance, B0 = (
    df_log_model_shots_coef.iloc[row].values[0] for row in range(4)
)
281 | 
# Return xG value for more general model
def calculate_xG(sh, coefs=None):
    """Return the logistic-model goal probability for a shot.

    Parameters
    ----------
    sh : mapping
        Must provide the keys 'distance', 'D2' and 'angle'. Values may be
        plain numbers or NumPy arrays (arrays are evaluated element-wise).
    coefs : sequence of four numbers, optional
        Coefficients in the order (distance, distance squared, angle,
        intercept). When omitted, the module-level ``coef_distance``,
        ``coef_distance_sq``, ``coef_angle`` and ``B0`` are used.

    Returns
    -------
    The value 1 / (1 + exp(-z)), where z is the weighted sum of the three
    features plus the intercept.
    """
    if coefs is None:
        # Default to the coefficients extracted at module level
        coefs = (coef_distance, coef_distance_sq, coef_angle, B0)
    w_distance, w_distance_sq, w_angle, intercept = coefs

    linear_term = (w_distance*sh['distance'] + w_distance_sq*sh['D2']
                   + w_angle*sh['angle'] + intercept)
    xG = 1/(1 + np.exp(-linear_term))
    return xG
288 | 
289 | 
# Create a 2D map of xG on a 65 x 65 grid. pgoal_2d[x, y] is the model output
# for a shot taken x units out from the goal line and y units across, with
# the goal centre at x = 0, y = 65/2. The grid is evaluated in one
# vectorised pass, and only the three features calculate_xG reads are built.
goal_width = 7.32
grid_x, grid_y = np.meshgrid(np.arange(65), np.arange(65), indexing='ij')

# Squared distance from each grid point to the goal centre
dist_sq = grid_x**2 + np.abs(grid_y - 65/2)**2

# Angle under which the goal mouth is seen; arctan returns a negative value
# where the denominator is negative, which is shifted by pi to stay positive
opening_angle = np.arctan(goal_width * grid_x / (dist_sq - (goal_width/2)**2))
opening_angle = np.where(opening_angle < 0, np.pi + opening_angle, opening_angle)

pgoal_2d = calculate_xG({
    'angle': opening_angle,
    'distance': np.sqrt(dist_sq),
    'D2': dist_sq,
})
310 | 
# Draw the probability map on the figure returned by FCPython.createGoalMouth
fig3, ax3 = FCPython.createGoalMouth()
pos = ax3.imshow(
    pgoal_2d,
    extent=[-1, 65, 65, -1],
    aspect='auto',
    cmap=plt.cm.Reds,
    vmin=0,
    vmax=0.3,       # colour scale saturates at a probability of 0.3
)
fig3.colorbar(pos, ax=ax3)
ax3.set_title('xG-model goal probabilities', fontsize=24, fontproperties=serif_regular.prop)

# Crop the view and use the same scale on both axes
plt.xlim((0, 66))
plt.ylim((-3, 35))
plt.gca().set_aspect('equal', adjustable='box')
plt.show()
319 | 
320 | 
321 | 
322 | 


--------------------------------------------------------------------------------