├── README.md
├── environment.yml
├── LICENSE
├── .gitignore
├── young_outliers.ipynb
├── 02_data_checks.ipynb
├── 06_xa_map.ipynb
├── 01_statsbomb_json_to_feather.ipynb
└── demo_crawley.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # statsbomb-explore
2 | Exploring statsbomb data with mplsoccer
3 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: statsbomb-explore
2 | channels:
3 | - anaconda
4 | dependencies:
5 | - jupyter
6 | - pandas
7 | - scipy
8 | - seaborn
9 | - beautifulsoup4
10 | - pyarrow
11 | - scikit-learn
12 | - pillow
13 | - openpyxl
14 | - requests
19 | - pip
20 | - pip:
21 | - mplsoccer
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The data are licensed under StatsBomb Public Data User Agreement: https://github.com/statsbomb/open-data/blob/master/LICENSE.pdf.
2 | The code uses the MIT license.
3 |
4 | Code:
5 | MIT License
6 |
7 | Copyright (c) 2020 Andrew Rowlinson
8 |
9 | Permission is hereby granted, free of charge, to any person obtaining a copy
10 | of this software and associated documentation files (the "Software"), to deal
11 | in the Software without restriction, including without limitation the rights
12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 | copies of the Software, and to permit persons to whom the Software is
14 | furnished to do so, subject to the following conditions:
15 |
16 | The above copyright notice and this permission notice shall be included in all
17 | copies or substantial portions of the Software.
18 |
19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 | SOFTWARE.
26 |
27 | Data:
28 | Please refer to the StatsBomb Public Data User Agreement: https://github.com/statsbomb/open-data/blob/master/LICENSE.pdf.
29 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # data folder
132 | data/
133 |
--------------------------------------------------------------------------------
/young_outliers.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "from sklearn.covariance import EmpiricalCovariance"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "df = pd.read_csv('https://raw.githubusercontent.com/mancunian1792/2019_2020_football_analysis/master/data/big5_full_stats.csv')"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "# string to number\n",
30 | "df['playing_minutes'] = pd.to_numeric(df.playing_minutes.str.replace(',', ''))"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "# drop object columns\n",
40 | "df.drop(['per90_matches', 'xg_team_success_matches'], axis=1, inplace=True)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "# keep young outfield players (aged 24 or under, playing 900 or more minutes, excluding goalkeepers)\n",
50 | "df = df[(df.age <= 24) & (df.playing_minutes >= 900) & (df.position != 'GK')].copy()"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "# calculate the Mahalanobis distance and sort the dataframe so the largest distances are at the top\n",
60 | "player_values_array = df[df.columns[7:]].values # subset columns with stats\n",
61 | "cov = EmpiricalCovariance().fit(player_values_array)\n",
62 | "df['dist'] = cov.mahalanobis(player_values_array)\n",
63 | "df.sort_values('dist', ascending=False, inplace=True)\n",
64 | "df.reset_index(drop=True, inplace=True)"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "#df.head(40)"
74 | ]
75 | }
76 | ],
77 | "metadata": {
78 | "kernelspec": {
79 | "display_name": "Python 3",
80 | "language": "python",
81 | "name": "python3"
82 | },
83 | "language_info": {
84 | "codemirror_mode": {
85 | "name": "ipython",
86 | "version": 3
87 | },
88 | "file_extension": ".py",
89 | "mimetype": "text/x-python",
90 | "name": "python",
91 | "nbconvert_exporter": "python",
92 | "pygments_lexer": "ipython3",
93 | "version": "3.8.3"
94 | }
95 | },
96 | "nbformat": 4,
97 | "nbformat_minor": 4
98 | }
99 |
--------------------------------------------------------------------------------
/02_data_checks.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import os"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Load dataframes"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "DATA_PATH = os.path.join(os.getcwd(),'data')\n",
28 | "SHOT_PATH = os.path.join(DATA_PATH,'freeze.parquet')\n",
29 | "df_shots = pd.read_parquet(SHOT_PATH)"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "MATCH_PATH = os.path.join(DATA_PATH,'match.parquet')\n",
39 | "df_match = pd.read_parquet(MATCH_PATH)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 4,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "EVENTS_PATH = os.path.join(DATA_PATH,'event.parquet')\n",
49 | "df_events = pd.read_parquet(EVENTS_PATH)"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 5,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "RELATED_PATH = os.path.join(DATA_PATH,'related.parquet')\n",
59 | "df_related_events = pd.read_parquet(RELATED_PATH)"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 6,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "RELATED_PATH = os.path.join(DATA_PATH,'lineup.parquet')\n",
69 | "df_lineup = pd.read_parquet(RELATED_PATH)"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 7,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "RELATED_PATH = os.path.join(DATA_PATH,'tactic.parquet')\n",
79 | "df_tactics = pd.read_parquet(RELATED_PATH)"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "# Check that all events have matches and vice versa"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "Some event files don't have match info"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 8,
99 | "metadata": {
100 | "scrolled": true
101 | },
102 | "outputs": [
103 | {
104 | "data": {
105 | "text/plain": [
106 | "{22536, 265905, 266234, 266466, 266574, 266933, 267161, 267405, 267609, 267679}"
107 | ]
108 | },
109 | "execution_count": 8,
110 | "metadata": {},
111 | "output_type": "execute_result"
112 | }
113 | ],
114 | "source": [
115 | "set(df_events.match_id.unique()).symmetric_difference(set(df_match.match_id.unique()))"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "# Check all shots have freeze frames"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {},
128 | "source": [
129 | "All non-penalties have freeze frames. Some penalties have the goalkeeper location."
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 9,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "set_shots = set(df_events.loc[df_events.type_name=='Shot','id'].unique())\n",
139 | "set_freeze = set(df_shots.id.unique())"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 10,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "data": {
149 | "text/plain": [
150 | "set()"
151 | ]
152 | },
153 | "execution_count": 10,
154 | "metadata": {},
155 | "output_type": "execute_result"
156 | }
157 | ],
158 | "source": [
159 | "# all freeze frames have shots\n",
160 | "set_freeze - set_shots"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 11,
166 | "metadata": {},
167 | "outputs": [
168 | {
169 | "name": "stdout",
170 | "output_type": "stream",
171 | "text": [
173 | "Number of shots without freeze frame: 259\n"
177 | ]
178 | },
179 | {
180 | "data": {
181 | "text/plain": [
183 | "Penalty 259\n",
187 | "Name: shot_type_name, dtype: int64"
188 | ]
189 | },
190 | "execution_count": 11,
191 | "metadata": {},
192 | "output_type": "execute_result"
193 | }
194 | ],
195 | "source": [
196 | "# the shots without freeze frames are penalties\n",
197 | "print('Number of shots without freeze frame:',len(set_shots)-len(set_freeze))\n",
198 | "df_events[df_events.id.isin(set_shots - set_freeze)].shot_type_name.value_counts()"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 12,
204 | "metadata": {},
205 | "outputs": [
206 | {
207 | "data": {
208 | "text/plain": [
209 | "Goalkeeper 36\n",
210 | "Left Midfield 1\n",
211 | "Name: player_position_name, dtype: int64"
212 | ]
213 | },
214 | "execution_count": 12,
215 | "metadata": {},
216 | "output_type": "execute_result"
217 | }
218 | ],
219 | "source": [
220 | "# some penalties have the location of the goalkeeper, one has the location of the left midfield\n",
221 | "penalty_ids = df_events[df_events.shot_type_name=='Penalty'].id\n",
222 | "df_shots[df_shots.id.isin(penalty_ids)].player_position_name.value_counts()"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "# Check related events"
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "metadata": {},
235 | "source": [
236 | "Note I made a change to the preprocessing to link all events both ways.\n",
237 | "\n",
238 | "In the docs it said that related_event was a comma separated list of the Ids of related events. For example, a shot might be related to the Goalkeeper event, and a Block Event. The corresponding events will have the Id of the shot in their related_events column.\n",
239 | "\n",
240 | "When I explored the data, often carries didn't have the corresponding event.\n",
241 | "\n",
242 | "Now this is fixed."
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 13,
248 | "metadata": {},
249 | "outputs": [
250 | {
251 | "data": {
252 | "text/plain": [
253 | "set()"
254 | ]
255 | },
256 | "execution_count": 13,
257 | "metadata": {},
258 | "output_type": "execute_result"
259 | }
260 | ],
261 | "source": [
262 | "set1 = set(df_related_events.id.unique())\n",
263 | "set2 = set(df_related_events.id_related.unique())\n",
264 | "set(set1).symmetric_difference(set2)"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "metadata": {},
270 | "source": [
271 | "# Are team names consistent between events and match"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "Yes!"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 14,
284 | "metadata": {},
285 | "outputs": [
286 | {
287 | "name": "stdout",
288 | "output_type": "stream",
289 | "text": [
290 | "True\n"
291 | ]
292 | }
293 | ],
294 | "source": [
295 | "away_teams = (df_match[['away_team_id', 'away_team_name']]\n",
296 | " .drop_duplicates()\n",
297 | " .rename({'away_team_id':'team_id','away_team_name':'team_name'},axis=1))\n",
298 | "home_teams = (df_match[['home_team_id', 'home_team_name']]\n",
299 | " .drop_duplicates()\n",
300 | " .rename({'home_team_id':'team_id','home_team_name':'team_name'},axis=1))\n",
301 | "teams = pd.concat([away_teams,home_teams]).drop_duplicates()\n",
302 | "print(teams.team_id.nunique()==len(teams))"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 15,
308 | "metadata": {},
309 | "outputs": [
310 | {
311 | "name": "stdout",
312 | "output_type": "stream",
313 | "text": [
314 | "Number of differences: 0\n"
315 | ]
316 | }
317 | ],
318 | "source": [
319 | "teams_from_events = df_events[['team_id','team_name']].drop_duplicates()\n",
320 | "teams_from_events = teams_from_events.merge(teams,on='team_id',how='outer')\n",
321 | "print('Number of differences:',(teams_from_events.team_name_x != teams_from_events.team_name_y).sum())"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 16,
327 | "metadata": {},
328 | "outputs": [
329 | {
330 | "data": {
331 | "text/html": [
332 | "
\n",
333 | "\n",
346 | "
\n",
347 | " \n",
348 | " \n",
349 | " | \n",
350 | " team_id | \n",
351 | " team_name_x | \n",
352 | " team_name_y | \n",
353 | "
\n",
354 | " \n",
355 | " \n",
356 | " \n",
357 | "
\n",
358 | "
"
359 | ],
360 | "text/plain": [
361 | "Empty DataFrame\n",
362 | "Columns: [team_id, team_name_x, team_name_y]\n",
363 | "Index: []"
364 | ]
365 | },
366 | "execution_count": 16,
367 | "metadata": {},
368 | "output_type": "execute_result"
369 | }
370 | ],
371 | "source": [
372 | "teams_from_events[(teams_from_events.team_name_x != teams_from_events.team_name_y)]"
373 | ]
374 | },
375 | {
376 | "cell_type": "markdown",
377 | "metadata": {},
378 | "source": [
379 | "# Are player names consistent?"
380 | ]
381 | },
382 | {
383 | "cell_type": "markdown",
384 | "metadata": {},
385 | "source": [
386 | "Yes!"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 17,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "player1 = df_shots.loc[df_shots.player_id.notnull(),['player_id','player_name']].drop_duplicates()\n",
396 | "player2 = df_lineup.loc[df_lineup.player_id.notnull(),['player_id','player_name']].drop_duplicates()\n",
397 | "player3 = df_tactics.loc[df_tactics.player_id.notnull(),['player_id','player_name']].drop_duplicates()\n",
398 | "player4 = df_events.loc[df_events.player_id.notnull(),['player_id','player_name']].drop_duplicates()"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 18,
404 | "metadata": {},
405 | "outputs": [],
406 | "source": [
407 | "players = (player1.merge(player2,how='outer',on='player_id',suffixes=['_shot','_lineup'])\n",
408 | " .merge(player3,how='outer',on='player_id')\n",
409 | " .merge(player4,how='outer',on='player_id',suffixes=['_tactics','_events']))"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": 19,
415 | "metadata": {},
416 | "outputs": [
417 | {
418 | "data": {
419 | "text/plain": [
420 | "0"
421 | ]
422 | },
423 | "execution_count": 19,
424 | "metadata": {},
425 | "output_type": "execute_result"
426 | }
427 | ],
428 | "source": [
429 | "# check player names in shots matches events\n",
430 | "len(players[((players.player_name_events != players.player_name_shot) &\n",
431 | " (players.player_name_shot.notnull())&\n",
432 | " (players.player_name_events.notnull()))])"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": 20,
438 | "metadata": {},
439 | "outputs": [
440 | {
441 | "data": {
442 | "text/plain": [
443 | "0"
444 | ]
445 | },
446 | "execution_count": 20,
447 | "metadata": {},
448 | "output_type": "execute_result"
449 | }
450 | ],
451 | "source": [
452 | "# check player names in lineups matches events\n",
453 | "len(players[((players.player_name_events != players.player_name_lineup) &\n",
454 | " (players.player_name_lineup.notnull())&\n",
455 | " (players.player_name_events.notnull()))])"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": 21,
461 | "metadata": {},
462 | "outputs": [
463 | {
464 | "data": {
465 | "text/html": [
466 | "\n",
467 | "\n",
480 | "
\n",
481 | " \n",
482 | " \n",
483 | " | \n",
484 | " player_id | \n",
485 | " player_name_shot | \n",
486 | " player_name_lineup | \n",
487 | " player_name_tactics | \n",
488 | " player_name_events | \n",
489 | "
\n",
490 | " \n",
491 | " \n",
492 | " \n",
493 | "
\n",
494 | "
"
495 | ],
496 | "text/plain": [
497 | "Empty DataFrame\n",
498 | "Columns: [player_id, player_name_shot, player_name_lineup, player_name_tactics, player_name_events]\n",
499 | "Index: []"
500 | ]
501 | },
502 | "execution_count": 21,
503 | "metadata": {},
504 | "output_type": "execute_result"
505 | }
506 | ],
507 | "source": [
508 | "players[((players.player_name_events != players.player_name_lineup) &\n",
509 | " (players.player_name_lineup.notnull())&\n",
510 | " (players.player_name_events.notnull()))]"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": 22,
516 | "metadata": {},
517 | "outputs": [
518 | {
519 | "data": {
520 | "text/plain": [
521 | "0"
522 | ]
523 | },
524 | "execution_count": 22,
525 | "metadata": {},
526 | "output_type": "execute_result"
527 | }
528 | ],
529 | "source": [
530 | "# check player names in tactics matches events\n",
531 | "len(players[((players.player_name_events != players.player_name_tactics) &\n",
532 | " (players.player_name_tactics.notnull())&\n",
533 | " (players.player_name_events.notnull()))])"
534 | ]
535 | },
536 | {
537 | "cell_type": "markdown",
538 | "metadata": {},
539 | "source": [
540 | "# Are scorelines correct (exclude shoot outs)"
541 | ]
542 | },
543 | {
544 | "cell_type": "markdown",
545 | "metadata": {},
546 | "source": [
547 | "Yes!"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": 23,
553 | "metadata": {},
554 | "outputs": [],
555 | "source": [
556 | "team_goals_from_events = df_events[((df_events.outcome_name=='Goal')|\n",
557 | " (df_events.type_name=='Own Goal For'))&(df_events.period!=5)]\n",
558 | "team_goals_from_events = pd.DataFrame(team_goals_from_events.groupby(['match_id','team_name'])\n",
559 | " .id.nunique()).reset_index()\n",
560 | "team_goals_from_events.rename({'id':'number_goals_events'},axis=1,inplace=True)\n",
561 | "teams_home_away = df_match[['match_id','away_team_name','home_team_name']]\n",
562 | "team_goals_from_events = team_goals_from_events.merge(teams_home_away,on='match_id',validate='m:1')\n",
563 | "mask_home = team_goals_from_events.team_name == team_goals_from_events.home_team_name\n",
564 | "team_goals_from_events.loc[mask_home,'team_status'] = 'home_score_events' \n",
565 | "team_goals_from_events.loc[~mask_home,'team_status'] = 'away_score_events'\n",
566 | "team_goals_from_events = team_goals_from_events[['match_id','team_status','number_goals_events']]\n",
567 | "team_goals_from_events = (team_goals_from_events.pivot(index='match_id',\n",
568 | " columns='team_status',\n",
569 | " values='number_goals_events')\n",
570 | " .reset_index())\n",
571 | "team_goals_from_events.replace({np.nan:0},inplace=True)\n",
572 | "df_match = df_match.merge(team_goals_from_events,on='match_id',how='outer')\n",
573 | "df_match.away_score_events.replace({np.nan:0},inplace=True)\n",
574 | "df_match.home_score_events.replace({np.nan:0},inplace=True)"
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": 24,
580 | "metadata": {},
581 | "outputs": [
582 | {
583 | "data": {
584 | "text/html": [
585 | "\n",
586 | "\n",
599 | "
\n",
600 | " \n",
601 | " \n",
602 | " | \n",
603 | " match_id | \n",
604 | " match_date | \n",
605 | " kick_off | \n",
606 | " home_score | \n",
607 | " away_score | \n",
608 | " last_updated | \n",
609 | " match_week | \n",
610 | " competition_id | \n",
611 | " competition_country_name | \n",
612 | " competition_name | \n",
613 | " ... | \n",
614 | " home_team_managers_country_name | \n",
615 | " away_team_managers_id | \n",
616 | " away_team_managers_name | \n",
617 | " away_team_managers_nickname | \n",
618 | " away_team_managers_dob | \n",
619 | " away_team_managers_country_id | \n",
620 | " away_team_managers_country_name | \n",
621 | " metadata_xy_fidelity_version | \n",
622 | " away_score_events | \n",
623 | " home_score_events | \n",
624 | "
\n",
625 | " \n",
626 | " \n",
627 | " \n",
628 | "
\n",
629 | "
0 rows × 50 columns
\n",
630 | "
"
631 | ],
632 | "text/plain": [
633 | "Empty DataFrame\n",
634 | "Columns: [match_id, match_date, kick_off, home_score, away_score, last_updated, match_week, competition_id, competition_country_name, competition_name, season_id, season_name, home_team_id, home_team_name, competition_gender, home_team_group, home_team_country_id, home_team_country_name, away_team_id, away_team_name, away_team_group, away_team_country_id, away_team_country_name, metadata_data_version, metadata_shot_fidelity_version, competition_stage_id, competition_stage_name, stadium_id, stadium_name, stadium_country_id, stadium_country_name, referee_id, referee_name, referee_country_id, referee_country_name, home_team_managers_id, home_team_managers_name, home_team_managers_nickname, home_team_managers_dob, home_team_managers_country_id, home_team_managers_country_name, away_team_managers_id, away_team_managers_name, away_team_managers_nickname, away_team_managers_dob, away_team_managers_country_id, away_team_managers_country_name, metadata_xy_fidelity_version, away_score_events, home_score_events]\n",
635 | "Index: []\n",
636 | "\n",
637 | "[0 rows x 50 columns]"
638 | ]
639 | },
640 | "execution_count": 24,
641 | "metadata": {},
642 | "output_type": "execute_result"
643 | }
644 | ],
645 | "source": [
646 | "df_match[df_match.home_score != df_match.home_score_events]"
647 | ]
648 | },
649 | {
650 | "cell_type": "code",
651 | "execution_count": 25,
652 | "metadata": {},
653 | "outputs": [
654 | {
655 | "data": {
656 | "text/html": [
657 | "\n",
658 | "\n",
671 | "
\n",
672 | " \n",
673 | " \n",
674 | " | \n",
675 | " match_id | \n",
676 | " match_date | \n",
677 | " kick_off | \n",
678 | " home_score | \n",
679 | " away_score | \n",
680 | " last_updated | \n",
681 | " match_week | \n",
682 | " competition_id | \n",
683 | " competition_country_name | \n",
684 | " competition_name | \n",
685 | " ... | \n",
686 | " home_team_managers_country_name | \n",
687 | " away_team_managers_id | \n",
688 | " away_team_managers_name | \n",
689 | " away_team_managers_nickname | \n",
690 | " away_team_managers_dob | \n",
691 | " away_team_managers_country_id | \n",
692 | " away_team_managers_country_name | \n",
693 | " metadata_xy_fidelity_version | \n",
694 | " away_score_events | \n",
695 | " home_score_events | \n",
696 | "
\n",
697 | " \n",
698 | " \n",
699 | " \n",
700 | "
\n",
701 | "
0 rows × 50 columns
\n",
702 | "
"
703 | ],
704 | "text/plain": [
705 | "Empty DataFrame\n",
706 | "Columns: [match_id, match_date, kick_off, home_score, away_score, last_updated, match_week, competition_id, competition_country_name, competition_name, season_id, season_name, home_team_id, home_team_name, competition_gender, home_team_group, home_team_country_id, home_team_country_name, away_team_id, away_team_name, away_team_group, away_team_country_id, away_team_country_name, metadata_data_version, metadata_shot_fidelity_version, competition_stage_id, competition_stage_name, stadium_id, stadium_name, stadium_country_id, stadium_country_name, referee_id, referee_name, referee_country_id, referee_country_name, home_team_managers_id, home_team_managers_name, home_team_managers_nickname, home_team_managers_dob, home_team_managers_country_id, home_team_managers_country_name, away_team_managers_id, away_team_managers_name, away_team_managers_nickname, away_team_managers_dob, away_team_managers_country_id, away_team_managers_country_name, metadata_xy_fidelity_version, away_score_events, home_score_events]\n",
707 | "Index: []\n",
708 | "\n",
709 | "[0 rows x 50 columns]"
710 | ]
711 | },
712 | "execution_count": 25,
713 | "metadata": {},
714 | "output_type": "execute_result"
715 | }
716 | ],
717 | "source": [
718 | "df_match[df_match.away_score != df_match.away_score_events]"
719 | ]
720 | },
721 | {
722 | "cell_type": "markdown",
723 | "metadata": {},
724 | "source": [
725 | "# Number of events in each file"
726 | ]
727 | },
728 | {
729 | "cell_type": "code",
730 | "execution_count": 26,
731 | "metadata": {},
732 | "outputs": [
733 | {
734 | "data": {
735 | "text/plain": [
737 | "count 808.000000\n",
738 | "mean 3585.856436\n",
739 | "std 400.381267\n",
740 | "min 2173.000000\n",
741 | "25% 3330.000000\n",
742 | "50% 3588.000000\n",
743 | "75% 3855.500000\n",
753 | "max 5026.000000\n",
754 | "Name: id, dtype: float64"
755 | ]
756 | },
757 | "execution_count": 26,
758 | "metadata": {},
759 | "output_type": "execute_result"
760 | }
761 | ],
762 | "source": [
763 | "df_events.groupby('match_id')['id'].nunique().describe()"
764 | ]
765 | }
766 | ],
767 | "metadata": {
768 | "kernelspec": {
769 | "display_name": "Python 3",
770 | "language": "python",
771 | "name": "python3"
772 | },
773 | "language_info": {
774 | "codemirror_mode": {
775 | "name": "ipython",
776 | "version": 3
777 | },
778 | "file_extension": ".py",
779 | "mimetype": "text/x-python",
780 | "name": "python",
781 | "nbconvert_exporter": "python",
782 | "pygments_lexer": "ipython3",
783 | "version": "3.8.3"
784 | }
785 | },
786 | "nbformat": 4,
787 | "nbformat_minor": 2
788 | }
789 |
--------------------------------------------------------------------------------
/06_xa_map.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "from mplsoccer.pitch import Pitch\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import os\n",
14 | "import matplotlib.gridspec as gridspec"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "# Load datasets"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "#DATA_PATH = os.path.join(os.getcwd(),'data')\n",
31 | "#EVENTS_PATH = os.path.join(DATA_PATH,'events')\n",
32 | "#df_events = pd.read_feather(EVENTS_PATH)"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "# Get all passes for Samuel Eto'o"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 3,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "#df_pass = df_events[df_events.type_name=='Pass'].copy()\n",
49 | "#df_pass = df_pass[['id','player_id','player_name','pass_assisted_shot_id',\n",
50 | "# 'x', 'y', 'pass_end_x', 'pass_end_y']].copy()\n",
51 | "#df_pass = df_pass[df_pass.player_id==19298].copy()"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "# Get all shots and merge them onto passes for outcomes"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "#df_shot = df_events[df_events.type_name=='Shot'].dropna(axis=1,how='all').copy()\n",
68 | "#df_shot = df_shot[['id','shot_statsbomb_xg','shot_outcome_name']].copy()\n",
69 | "#df_shot = df_shot.rename({'id':'pass_assisted_shot_id'},axis=1)"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 5,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "#df_pass = df_pass.merge(df_shot,on='pass_assisted_shot_id',how='left')\n",
79 | "# add assist column and drop shot outcome\n",
80 | "#df_pass['assist'] = df_pass['shot_outcome_name'] == 'Goal'\n",
81 | "#df_pass.drop('shot_outcome_name',axis=1,inplace=True)"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 7,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "df_pass = pd.read_csv('LubalaAssists.csv')"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 8,
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "data": {
100 | "text/html": [
101 | "\n",
102 | "\n",
115 | "
\n",
116 | " \n",
117 | " \n",
118 | " | \n",
119 | " x_pass_start | \n",
120 | " y_pass_start | \n",
121 | " x_pass_end | \n",
122 | " y_pass_end | \n",
123 | " xg | \n",
124 | " assist | \n",
125 | "
\n",
126 | " \n",
127 | " \n",
128 | " \n",
129 | " | 0 | \n",
130 | " 93.0 | \n",
131 | " 47.5 | \n",
132 | " 90.1 | \n",
133 | " 47.1 | \n",
134 | " 0.00 | \n",
135 | " True | \n",
136 | "
\n",
137 | " \n",
138 | " | 1 | \n",
139 | " 83.1 | \n",
140 | " 39.3 | \n",
141 | " 85.3 | \n",
142 | " 53.1 | \n",
143 | " 0.19 | \n",
144 | " False | \n",
145 | "
\n",
146 | " \n",
147 | " | 2 | \n",
148 | " 53.2 | \n",
149 | " 50.9 | \n",
150 | " 40.2 | \n",
151 | " 54.0 | \n",
152 | " 0.00 | \n",
153 | " False | \n",
154 | "
\n",
155 | " \n",
156 | " | 3 | \n",
157 | " 46.2 | \n",
158 | " 52.4 | \n",
159 | " 40.5 | \n",
160 | " 43.9 | \n",
161 | " 0.22 | \n",
162 | " True | \n",
163 | "
\n",
164 | " \n",
165 | " | 4 | \n",
166 | " 84.1 | \n",
167 | " 37.9 | \n",
168 | " 99.0 | \n",
169 | " 19.3 | \n",
170 | " 0.00 | \n",
171 | " False | \n",
172 | "
\n",
173 | " \n",
174 | "
\n",
175 | "
"
176 | ],
177 | "text/plain": [
178 | " x_pass_start y_pass_start x_pass_end y_pass_end xg assist\n",
179 | "0 93.0 47.5 90.1 47.1 0.00 True\n",
180 | "1 83.1 39.3 85.3 53.1 0.19 False\n",
181 | "2 53.2 50.9 40.2 54.0 0.00 False\n",
182 | "3 46.2 52.4 40.5 43.9 0.22 True\n",
183 | "4 84.1 37.9 99.0 19.3 0.00 False"
184 | ]
185 | },
186 | "execution_count": 8,
187 | "metadata": {},
188 | "output_type": "execute_result"
189 | }
190 | ],
191 | "source": [
192 | "df_pass.head()"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "# Subset the data for the lines and shots (assist or other)"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 10,
205 | "metadata": {},
206 | "outputs": [],
207 | "source": [
208 | "#boolean mask for assists or high xg. Use this to subset data\n",
209 | "mask_line = (df_pass.assist==True)|(df_pass.xg>=0.1)\n",
210 | "df_line = df_pass[mask_line].copy()\n",
211 | "# boolean mask for assists. Use this to subset data\n",
212 | "mask_assist = (mask_line) & (df_pass.assist==True)\n",
213 | "df_assist = df_pass[mask_assist].copy()\n",
214 | "# boolean mask for other passes (no assist/goal). Use this to subset data\n",
215 | "mask_other = (mask_line) & (df_pass.assist==False)\n",
216 | "df_other = df_pass[mask_other].copy()"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "# Plot the data"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 16,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "data": {
233 | "image/png": "\n",
234 | "text/plain": [
235 | ""
236 | ]
237 | },
238 | "metadata": {},
239 | "output_type": "display_data"
240 | }
241 | ],
242 | "source": [
243 | "pad = 1/72\n",
244 | "figsize1 = 1536/72\n",
245 | "figsize2 = 1125/72\n",
246 | "fig = plt.figure(figsize=(figsize1, figsize2),facecolor='#2f3653') \n",
247 | "gs = gridspec.GridSpec(2, 2, width_ratios=[3.13, 1])\n",
248 | "ax1 = plt.subplot(gs[:, 0])\n",
249 | "ax2 = plt.subplot(gs[0, 1])\n",
250 | "ax3 = plt.subplot(gs[1, 1])\n",
251 | "pitch = Pitch(pitch_type='opta',orientation='vertical',view='half',layout=(1,1),figsize=(10,10),\n",
252 | " pitch_color='#2f3653',line_color='#82868f',goal_type='box',linewidth=2,\n",
253 | " pad_bottom=0.2,pad_top=4)\n",
254 | "pitch.draw(ax1)\n",
255 | "pitch.draw(ax2)\n",
256 | "pitch.draw(ax3)\n",
257 | "#plot lines\n",
258 | "pitch.lines(df_line.x_pass_start,df_line.y_pass_start,df_line.x_pass_end,df_line.y_pass_end,\n",
259 | " lw=9,transparent=True,comet=True,ax=ax1)\n",
260 | "pitch.lines(df_line.x_pass_start,df_line.y_pass_start,df_line.x_pass_end,df_line.y_pass_end,\n",
261 | " lw=9,transparent=True,comet=True,ax=ax2)\n",
262 | "# plot assists\n",
263 | "pitch.plot(df_assist.x_pass_end,df_assist.y_pass_end,\n",
264 | " marker='o', color='None',markersize=12,markerfacecolor='#34afed',\n",
265 | " linestyle='None',markeredgecolor='#34afed',ax=ax1)\n",
266 | "pitch.plot(df_assist.x_pass_end,df_assist.y_pass_end,\n",
267 | " marker='o', color='None',markersize=7,markerfacecolor='#34afed',\n",
268 | " linestyle='None',markeredgecolor='#34afed',ax=ax2)\n",
269 | "# plot other\n",
270 | "pitch.plot(df_other.x_pass_end,df_other.y_pass_end,markerfacecolor='#2f3653',\n",
271 | " marker='o', color='None',markersize=12,zorder=3,\n",
272 | " linestyle='None',markeredgecolor='#34afed',ax=ax1)\n",
273 | "pitch.plot(df_other.x_pass_end,df_other.y_pass_end,markerfacecolor='#2f3653',\n",
274 | " marker='o', color='None',markersize=7,zorder=3,\n",
275 | " linestyle='None',markeredgecolor='#34afed',ax=ax2)\n",
276 | "# plot pass start locations\n",
277 | "pitch.plot(df_pass.x_pass_start,df_pass.y_pass_start,\n",
278 | " marker='o', color='#a43967',markersize=10,alpha=0.25,linestyle='None',ax=ax1)\n",
279 | "pitch.plot(df_pass.x_pass_start,df_pass.y_pass_start,\n",
280 | " marker='o', color='#a43967',markersize=10,alpha=0.1,linestyle='None',ax=ax3)"
281 | ]
282 | }
283 | ],
284 | "metadata": {
285 | "kernelspec": {
286 | "display_name": "Python 3",
287 | "language": "python",
288 | "name": "python3"
289 | },
290 | "language_info": {
291 | "codemirror_mode": {
292 | "name": "ipython",
293 | "version": 3
294 | },
295 | "file_extension": ".py",
296 | "mimetype": "text/x-python",
297 | "name": "python",
298 | "nbconvert_exporter": "python",
299 | "pygments_lexer": "ipython3",
300 | "version": "3.8.2"
301 | }
302 | },
303 | "nbformat": 4,
304 | "nbformat_minor": 2
305 | }
306 |
--------------------------------------------------------------------------------
/01_statsbomb_json_to_feather.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import glob\n",
12 | "import os"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "This notebook takes the StatsBomb json files and turns them into feather files. These are extremely fast to load, so they are well suited to this kind of prototyping analysis. See: https://medium.com/@steven.p.dye/feather-files-faster-than-the-speed-of-light-d4666ce24387.\n",
20 | "\n",
21 | "They are not really meant for long term storage though. The event files are then combined from all the matches."
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "# Change these paths/ parameters\n",
29 | "You will need to change these paths/ parameters depending on where the StatsBomb open-data is located, how and where you want to save the resulting data, and if you only want the new files to be processed."
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "# open-data folder is in the parent directory (one level up). To change if run elsewhere\n",
39 | "STATSBOMB_DATA = os.path.join('..','open-data','data')\n",
40 | "# save files in folder in current directory. To change if want to save elsewhere\n",
41 | "DATA_PATH = os.path.join(os.getcwd(),'data')\n",
42 | "# if true, only processes files that don't already have an event file\n",
43 | "process_new_only = True"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "# Delete event data included in error"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "One event file seems to be added to the statsbomb data in error. See: https://github.com/statsbomb/open-data/issues/13. Deleting it here for consistency."
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 3,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | "../open-data/data/events/7298.json removed\n",
70 | "../open-data/data/lineups/7298.json removed\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "ERROR_FILES = [os.path.join(STATSBOMB_DATA,'events','7298.json'),\n",
76 | " os.path.join(STATSBOMB_DATA,'lineups','7298.json')]\n",
77 | "for file in ERROR_FILES:\n",
78 | " if os.path.isfile(file):\n",
79 | " os.remove(file)\n",
80 | " print(file,'removed')"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "# Setup folders"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "I set up the following folders in a new data directory folder (location set above). These are the places we will save the processed json files, in feather-format.
\n",
95 | "├── data
\n",
96 | "│ ├── events_raw <- Data from the event file
\n",
97 | "│ ├── related_events_raw <- Data with the info on how events are connected.
\n",
98 | "│ ├── shot_freeze_raw <- Data with the individual shot freeze frames
\n",
99 | "│ └── tactics_raw <- Data with the lineup tactics.
"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 4,
105 | "metadata": {},
106 | "outputs": [],
107 | "source": [
108 | "def make_dir(PATH):\n",
109 | " if os.path.isdir(PATH)==False: os.mkdir(PATH)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 5,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "# locations of new folders\n",
119 | "RAW_EVENT_PATH = os.path.join(DATA_PATH,'events_raw')\n",
120 | "RAW_RELATED_PATH = os.path.join(DATA_PATH,'related_events_raw')\n",
121 | "RAW_SHOT_PATH = os.path.join(DATA_PATH,'shot_freeze_raw')\n",
122 | "RAW_TACTICS_PATH = os.path.join(DATA_PATH,'tactics_raw')"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 6,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "# making directories\n",
132 | "make_dir(DATA_PATH)\n",
133 | "make_dir(RAW_EVENT_PATH)\n",
134 | "make_dir(RAW_RELATED_PATH)\n",
135 | "make_dir(RAW_SHOT_PATH)\n",
136 | "make_dir(RAW_TACTICS_PATH)"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "# Get file paths"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "Retrieve a list of json file paths from which we will extract the information."
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 7,
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "MATCH_PATH = glob.glob(os.path.join(STATSBOMB_DATA,'matches','**','*.json'),recursive=True)\n",
160 | "LINEUP_PATH = glob.glob(os.path.join(STATSBOMB_DATA,'lineups','**','*.json'),recursive=True)\n",
161 | "EVENT_PATH = glob.glob(os.path.join(STATSBOMB_DATA,'events','**','*.json'),recursive=True)\n",
162 | "COMPETITION_PATH = os.path.join(STATSBOMB_DATA,'competitions.json')"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "# Format competition data"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "Get the competition data and save in feather format."
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 8,
182 | "metadata": {},
183 | "outputs": [
184 | {
185 | "name": "stdout",
186 | "output_type": "stream",
187 | "text": [
188 | "Number of competitions in data: 20\n"
189 | ]
190 | }
191 | ],
192 | "source": [
193 | "df_competition = pd.read_json(COMPETITION_PATH,convert_dates=['match_updated','match_available'])\n",
194 | "df_competition.sort_values(['competition_id','season_id'],inplace=True)\n",
195 | "df_competition.reset_index(drop=True,inplace=True)\n",
196 | "print('Number of competitions in data:',len(df_competition))"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 9,
202 | "metadata": {},
203 | "outputs": [
204 | {
205 | "name": "stdout",
206 | "output_type": "stream",
207 | "text": [
208 | "\n",
209 | "RangeIndex: 20 entries, 0 to 19\n",
210 | "Data columns (total 8 columns):\n",
211 | "competition_id 20 non-null int64\n",
212 | "season_id 20 non-null int64\n",
213 | "country_name 20 non-null object\n",
214 | "competition_name 20 non-null object\n",
215 | "competition_gender 20 non-null object\n",
216 | "season_name 20 non-null object\n",
217 | "match_updated 20 non-null datetime64[ns]\n",
218 | "match_available 20 non-null datetime64[ns]\n",
219 | "dtypes: datetime64[ns](2), int64(2), object(4)\n",
220 | "memory usage: 1.4+ KB\n"
221 | ]
222 | }
223 | ],
224 | "source": [
225 | "# save to feather-format and show info\n",
226 | "df_competition.to_feather(os.path.join(DATA_PATH,'competition'))\n",
227 | "df_competition.info()"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "# Format match data"
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {},
240 | "source": [
241 | "Get the match data and save in feather format."
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 10,
247 | "metadata": {},
248 | "outputs": [
249 | {
250 | "name": "stdout",
251 | "output_type": "stream",
252 | "text": [
253 | "Number of match files in data: 20\n",
254 | "Number of matches in data: 778\n"
255 | ]
256 | }
257 | ],
258 | "source": [
259 | "print('Number of match files in data:',len(MATCH_PATH))\n",
260 | "match_list_dfs = [pd.read_json(file,convert_dates=['match_date','last_updated']) for file in MATCH_PATH]\n",
261 | "df_match = pd.concat(match_list_dfs,sort=False)\n",
262 | "print('Number of matches in data:',len(df_match))"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 11,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "def split_dict_col(df,col):\n",
272 | " '''function to split a dictionary column into separate columns'''\n",
273 | " # handle missings by filling with an empty dictionary\n",
274 | " df[col] = df[col].apply(lambda x: {} if pd.isna(x) else x)\n",
275 | " # split the non missings and change column names\n",
276 | " df_temp_cols = pd.io.json.json_normalize(df[col]).set_index(df.index)\n",
277 | " col_names = df_temp_cols.columns\n",
278 | " # note add column description to column name if doesn't already contain it\n",
279 | " col_names = [(c).replace('.','_') if c[:len(col)]==col else (col+'_'+c).replace('.','_') for c in col_names]\n",
280 | " df[col_names] = df_temp_cols\n",
281 | " # drop old column\n",
282 | " df.drop(col,axis=1,inplace=True)\n",
283 | " return df"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 12,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "# loop through the columns that are still dictionary columns and add them as separate cols to the dataframe\n",
293 | "dictionary_columns = ['competition','season','home_team','away_team','metadata','competition_stage',\n",
294 | " 'stadium','referee']\n",
295 | "for col in dictionary_columns:\n",
296 | " df_match = split_dict_col(df_match,col)\n",
297 | "# convert kickoff to datetime - date + kickoff time\n",
298 | "df_match['kick_off'] = pd.to_datetime(df_match.match_date.astype(str) +' '+ df_match.kick_off)\n",
299 | "# drop one gender column as always equal to the other\n",
300 | "# drop match status as always available\n",
301 | "df_match.drop(['away_team_gender','match_status'],axis=1,inplace=True)\n",
302 | "df_match.rename({'home_team_gender':'competition_gender'},axis=1,inplace=True)\n",
303 | "# manager is a list (len=1) containing a dictionary so let's split into columns\n",
304 | "df_match['home_team_managers'] = df_match.home_team_managers.str[0]\n",
305 | "df_match = split_dict_col(df_match,'home_team_managers')\n",
306 | "df_match['away_team_managers'] = df_match.away_team_managers.str[0]\n",
307 | "df_match = split_dict_col(df_match,'away_team_managers')\n",
308 | "df_match['home_team_managers_dob'] = pd.to_datetime(df_match['home_team_managers_dob'])\n",
309 | "df_match['away_team_managers_dob'] = pd.to_datetime(df_match['away_team_managers_dob'])\n",
310 | "for col in ['competition_id','season_id','home_team_id','competition_stage_id']:\n",
311 | " df_match[col] = df_match[col].astype(np.int64)\n",
312 | "# sort and reset index: ready for exporting to feather\n",
313 | "df_match.sort_values('kick_off',inplace=True)\n",
314 | "df_match.reset_index(inplace=True,drop=True)"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 13,
320 | "metadata": {},
321 | "outputs": [
322 | {
323 | "name": "stdout",
324 | "output_type": "stream",
325 | "text": [
326 | "\n",
327 | "RangeIndex: 778 entries, 0 to 777\n",
328 | "Data columns (total 48 columns):\n",
329 | "match_id 778 non-null int64\n",
330 | "match_date 778 non-null datetime64[ns]\n",
331 | "kick_off 778 non-null datetime64[ns]\n",
332 | "home_score 778 non-null int64\n",
333 | "away_score 778 non-null int64\n",
334 | "last_updated 778 non-null datetime64[ns]\n",
335 | "match_week 778 non-null int64\n",
336 | "competition_id 778 non-null int64\n",
337 | "competition_country_name 778 non-null object\n",
338 | "competition_name 778 non-null object\n",
339 | "season_id 778 non-null int64\n",
340 | "season_name 778 non-null object\n",
341 | "home_team_id 778 non-null int64\n",
342 | "home_team_name 778 non-null object\n",
343 | "competition_gender 778 non-null object\n",
344 | "home_team_group 100 non-null object\n",
345 | "home_team_country_id 777 non-null float64\n",
346 | "home_team_country_name 777 non-null object\n",
347 | "away_team_id 778 non-null int64\n",
348 | "away_team_name 778 non-null object\n",
349 | "away_team_group 100 non-null object\n",
350 | "away_team_country_id 776 non-null float64\n",
351 | "away_team_country_name 776 non-null object\n",
352 | "metadata_data_version 778 non-null object\n",
353 | "metadata_shot_fidelity_version 591 non-null object\n",
354 | "metadata_xy_fidelity_version 501 non-null object\n",
355 | "competition_stage_id 778 non-null int64\n",
356 | "competition_stage_name 778 non-null object\n",
357 | "stadium_id 680 non-null float64\n",
358 | "stadium_name 680 non-null object\n",
359 | "stadium_country_id 549 non-null float64\n",
360 | "stadium_country_name 549 non-null object\n",
361 | "referee_id 732 non-null float64\n",
362 | "referee_name 732 non-null object\n",
363 | "referee_country_id 325 non-null float64\n",
364 | "referee_country_name 325 non-null object\n",
365 | "home_team_managers_id 520 non-null float64\n",
366 | "home_team_managers_name 520 non-null object\n",
367 | "home_team_managers_nickname 173 non-null object\n",
368 | "home_team_managers_dob 436 non-null datetime64[ns]\n",
369 | "home_team_managers_country_id 520 non-null float64\n",
370 | "home_team_managers_country_name 520 non-null object\n",
371 | "away_team_managers_id 520 non-null float64\n",
372 | "away_team_managers_name 520 non-null object\n",
373 | "away_team_managers_nickname 163 non-null object\n",
374 | "away_team_managers_dob 438 non-null datetime64[ns]\n",
375 | "away_team_managers_country_id 520 non-null float64\n",
376 | "away_team_managers_country_name 520 non-null object\n",
377 | "dtypes: datetime64[ns](5), float64(10), int64(9), object(24)\n",
378 | "memory usage: 291.9+ KB\n"
379 | ]
380 | }
381 | ],
382 | "source": [
383 | "# save to feather-format and show info\n",
384 | "df_match.to_feather(os.path.join(DATA_PATH,'match'))\n",
385 | "df_match.info()"
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {},
391 | "source": [
392 | "# Format lineup data"
393 | ]
394 | },
395 | {
396 | "cell_type": "markdown",
397 | "metadata": {},
398 | "source": [
399 | "Get the lineup data and save in feather format."
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 14,
405 | "metadata": {},
406 | "outputs": [
407 | {
408 | "name": "stdout",
409 | "output_type": "stream",
410 | "text": [
411 | "Number of lineup files in data: 778\n"
412 | ]
413 | }
414 | ],
415 | "source": [
416 | "print('Number of lineup files in data:',len(LINEUP_PATH))\n",
417 | "# read each file into a dataframe; a list comprehension can't be used because the match_id has to be created from the file name\n",
418 | "lineup_list_dfs = []\n",
419 | "for file in LINEUP_PATH:\n",
420 | " df_temp = pd.read_json(file)\n",
421 | " df_temp['match_id'] = os.path.basename(file[:-5])\n",
422 | " lineup_list_dfs.append(df_temp)\n",
423 | "df_lineup = pd.concat(lineup_list_dfs,sort=False)\n",
424 | "df_lineup.reset_index(inplace=True,drop=True)\n",
425 | "# each row has a column named lineup that contains a list of dictionaries (one per player)\n",
426 | "# we split the list into separate columns and then create a new row for each player using melt\n",
427 | "df_lineup_players = df_lineup.lineup.apply(pd.Series)\n",
428 | "df_lineup = df_lineup.merge(df_lineup_players,left_index=True,right_index=True)\n",
429 | "df_lineup.drop('lineup',axis=1,inplace=True)\n",
430 | "df_lineup = df_lineup.melt(id_vars = ['team_id','team_name','match_id'], value_name = 'player')\n",
431 | "df_lineup.drop('variable',axis=1,inplace=True)\n",
432 | "df_lineup = df_lineup[df_lineup.player.notnull()].copy()\n",
433 | "df_lineup = split_dict_col(df_lineup,'player')\n",
434 | "# turn ids to integers if no missings\n",
435 | "df_lineup['match_id'] = df_lineup.match_id.astype(np.int64)\n",
436 | "df_lineup['player_id'] = df_lineup.player_id.astype(np.int64)\n",
437 | "# sort and reset index: ready for exporting to feather\n",
438 | "df_lineup.sort_values('player_id',inplace=True)\n",
439 | "df_lineup.reset_index(inplace=True,drop=True)"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": 15,
445 | "metadata": {},
446 | "outputs": [
447 | {
448 | "name": "stdout",
449 | "output_type": "stream",
450 | "text": [
451 | "\n",
452 | "RangeIndex: 21416 entries, 0 to 21415\n",
453 | "Data columns (total 9 columns):\n",
454 | "team_id 21416 non-null int64\n",
455 | "team_name 21416 non-null object\n",
456 | "match_id 21416 non-null int64\n",
457 | "player_id 21416 non-null int64\n",
458 | "player_name 21416 non-null object\n",
459 | "player_nickname 12156 non-null object\n",
460 | "player_jersey_number 21409 non-null float64\n",
461 | "player_country_id 21328 non-null float64\n",
462 | "player_country_name 21328 non-null object\n",
463 | "dtypes: float64(2), int64(3), object(4)\n",
464 | "memory usage: 1.5+ MB\n"
465 | ]
466 | }
467 | ],
468 | "source": [
469 | "# save to feather-format and show info\n",
470 | "df_lineup.to_feather(os.path.join(DATA_PATH,'lineup'))\n",
471 | "df_lineup.info()"
472 | ]
473 | },
474 | {
475 | "cell_type": "markdown",
476 | "metadata": {},
477 | "source": [
478 | "# Format event data"
479 | ]
480 | },
481 | {
482 | "cell_type": "markdown",
483 | "metadata": {},
484 | "source": [
485 | "Get the event data and save in feather format:\n",
486 | " - an events dataframe\n",
487 | " - a related events dataframe\n",
488 | " - a shot freeze frame dataframe\n",
489 | " - a tactics lineup dataframe\n",
490 | " \n",
491 | "Each match is stored in a separate dataframe"
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": 16,
497 | "metadata": {},
498 | "outputs": [],
499 | "source": [
500 | "def list_dictionary_to_df(df,col,value_name,var_name):\n",
501 | " '''Some columns are a list of dictionaries. This turns them into a new dataframe of rows'''\n",
502 | " df = df.loc[df[col].notnull(),['id',col]]\n",
503 | " df.set_index('id',inplace=True)\n",
504 | " df = df[col].apply(pd.Series).copy()\n",
505 | " df.reset_index(inplace=True)\n",
506 | " df = df.melt(id_vars='id',value_name=value_name,var_name=var_name)\n",
507 | " df[var_name] = df[var_name] + 1\n",
508 | " df = df[df[value_name].notnull()].copy()\n",
509 | " df.reset_index(inplace=True,drop=True)\n",
510 | " return df"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": 17,
516 | "metadata": {},
517 | "outputs": [],
518 | "source": [
519 | "def split_location_cols(df,col,new_cols):\n",
520 | " ''' Location is stored as a list. split into columns'''\n",
521 | " if col in df.columns:\n",
522 | " df[new_cols] = df[col].apply(pd.Series)\n",
523 | " df.drop(col,axis=1,inplace=True)"
524 | ]
525 | },
526 | {
527 | "cell_type": "code",
528 | "execution_count": 18,
529 | "metadata": {},
530 | "outputs": [
531 | {
532 | "name": "stdout",
533 | "output_type": "stream",
534 | "text": [
535 | "Number of event files in data: 778\n"
536 | ]
537 | }
538 | ],
539 | "source": [
540 | "print('Number of event files in data:',len(EVENT_PATH))"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": 19,
546 | "metadata": {},
547 | "outputs": [
548 | {
549 | "name": "stdout",
550 | "output_type": "stream",
551 | "text": [
552 | "Matches with no event file: []\n",
553 | "Events with no match file: []\n"
554 | ]
555 | }
556 | ],
557 | "source": [
558 | "EVENT_FILE_NAMES = np.array([os.path.basename(file)[:-5] for file in EVENT_PATH]).astype(int)\n",
559 | "# quick check that all events have matches and vice versa.\n",
560 | "print('Matches with no event file:',list(set(df_match.match_id) - set(EVENT_FILE_NAMES)))\n",
561 | "print('Events with no match file:',list(set(EVENT_FILE_NAMES) - set(df_match.match_id)))"
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": 20,
567 | "metadata": {},
568 | "outputs": [
569 | {
570 | "name": "stdout",
571 | "output_type": "stream",
572 | "text": [
573 | "Event files to process: 1\n"
574 | ]
575 | }
576 | ],
577 | "source": [
578 | "# if you set process_new_only to True then we will not process event jsons which already have feather files\n",
579 | "if process_new_only:\n",
580 | " event_set = set([os.path.basename(file) for file in glob.glob(os.path.join(RAW_EVENT_PATH,'*'))])\n",
581 | " related_set = set([os.path.basename(file) for file in glob.glob(os.path.join(RAW_RELATED_PATH,'*'))])\n",
582 | " shot_set = set([os.path.basename(file) for file in glob.glob(os.path.join(RAW_SHOT_PATH,'*'))])\n",
583 | " tactics_set = set([os.path.basename(file) for file in glob.glob(os.path.join(RAW_TACTICS_PATH,'*'))])\n",
584 | " to_delete = set.intersection(event_set,related_set,shot_set,tactics_set)\n",
585 | " mask_delete = [False if file in to_delete else True for file in (EVENT_FILE_NAMES).astype(str)]\n",
586 | " EVENT_PATH = np.array(EVENT_PATH)[mask_delete].tolist()\n",
587 | " print('Event files to process:',np.array(mask_delete).sum())"
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": 21,
593 | "metadata": {},
594 | "outputs": [],
595 | "source": [
596 | "def create_event_feather_files(PATH):\n",
597 | " ''' Extracts individual event jsons and loads as four feather-format files: events, related events,\n",
598 | " shot freeze frames, and tactics lineups'''\n",
599 | " # timestamp defaults to today's date so store as a string - feather can't store time objects\n",
600 | " df = pd.read_json(PATH,encoding='utf-8')\n",
601 | " df['timestamp'] = df['timestamp'].dt.time.astype(str)\n",
602 | " \n",
603 | " # get match id\n",
604 | " match_id = int(os.path.basename(PATH)[:-5])\n",
605 | " \n",
606 | " # loop through the columns that are still dictionary columns and add them as separate cols to the dataframe\n",
607 | " # these are nested dataframes in the docs - although dribbled_past/ pressure isn't needed here?\n",
608 | " # also some others are needed: type, possession_team, play_pattern, team, tactics, player, position\n",
609 | " dictionary_columns = ['50_50','bad_behaviour','ball_receipt','ball_recovery','block','carry',\n",
610 | " 'clearance','dribble','duel','foul_committed','foul_won','goalkeeper',\n",
611 | " 'half_end','half_start','injury_stoppage','interception',\n",
612 | " 'miscontrol','pass','play_pattern','player','player_off','position',\n",
613 | " 'possession_team','shot','substitution','tactics','team','type',] \n",
614 | " for col in dictionary_columns:\n",
615 | " if col in df.columns:\n",
616 | " df = split_dict_col(df,col)\n",
617 | " \n",
618 | " # sort and reset index: ready for exporting to feather\n",
619 | " df.sort_values(['minute','second','timestamp','possession'],inplace=True)\n",
620 | " df.reset_index(inplace=True,drop=True)\n",
621 | " \n",
622 | " # split location info to x, y and (z for shot) columns and drop old columns\n",
623 | " split_location_cols(df,'location',['x','y'])\n",
624 | " split_location_cols(df,'pass_end_location',['pass_end_x','pass_end_y'])\n",
625 | " split_location_cols(df,'carry_end_location',['carry_end_x','carry_end_y'])\n",
626 | " split_location_cols(df,'shot_end_location',['shot_end_x','shot_end_y','shot_end_z'])\n",
627 | " split_location_cols(df,'goalkeeper_end_location',['goalkeeper_end_x','goalkeeper_end_y'])\n",
628 | " \n",
629 | " # replace weird * character in the type_name for ball receipt\n",
630 | " df['type_name'] = df['type_name'].replace({'Ball Receipt*':'Ball Receipt'})\n",
631 | " \n",
632 | " # create a related events dataframe\n",
633 | " df_related_events = list_dictionary_to_df(df,col='related_events',\n",
634 | " value_name='related_event',var_name='event_related_id')\n",
635 | " # some carries don't have the corresponding events. This makes sure all events are linked both ways\n",
636 | " df_related_events.drop('event_related_id',axis=1,inplace=True)\n",
637 | " df_related_events_reverse = df_related_events.rename({'related_event':'id','id':'related_event'},axis=1)\n",
638 | " df_related_events = pd.concat([df_related_events,df_related_events_reverse],sort=False)\n",
639 | " df_related_events.drop_duplicates(inplace=True)\n",
640 | " # and add on the type_names, index for easier lookups of how the events are related\n",
641 | " df_event_type = df[['id','type_name','index']].copy()\n",
642 | " df_related_events = df_related_events.merge(df_event_type,on='id',how='left',validate='m:1')\n",
643 | " df_event_type.rename({'id':'related_event'},axis=1,inplace=True)\n",
644 | " df_related_events = df_related_events.merge(df_event_type,on='related_event',\n",
645 | " how='left',validate='m:1',suffixes=['','_related'])\n",
646 | " df_related_events.rename({'related_event':'id_related'},axis=1,inplace=True)\n",
647 | " \n",
648 | " # create a shot freeze frame dataframe - also splits dictionary of player details into columns\n",
649 | " df_shot_freeze = list_dictionary_to_df(df,col='shot_freeze_frame',\n",
650 | " value_name='player',var_name='event_freeze_id')\n",
651 | " df_shot_freeze = split_dict_col(df_shot_freeze,'player')\n",
652 | " split_location_cols(df_shot_freeze,'player_location',['x','y'])\n",
653 | "\n",
654 | " # create a tactics lineup frame dataframe - also splits dictionary of player details into columns\n",
655 | " df_tactics_lineup = list_dictionary_to_df(df,col='tactics_lineup',\n",
656 | " value_name='player',var_name='event_tactics_id')\n",
657 | " df_tactics_lineup = split_dict_col(df_tactics_lineup,'player')\n",
658 | " \n",
659 | " # drop columns stored as a separate table \n",
660 | " df.drop(['related_events','shot_freeze_frame','tactics_lineup'],axis=1,inplace=True)\n",
661 | " \n",
662 | " # add match id to dataframes\n",
663 | " df['match_id'] = match_id\n",
664 | " df_related_events['match_id'] = match_id\n",
665 | " df_shot_freeze['match_id'] = match_id \n",
666 | " df_tactics_lineup['match_id'] = match_id\n",
667 | " \n",
668 | " # save as feather files\n",
669 | " df.to_feather(os.path.join(RAW_EVENT_PATH,str(match_id)))\n",
670 | " df_related_events.to_feather(os.path.join(RAW_RELATED_PATH,str(match_id)))\n",
671 | " df_shot_freeze.to_feather(os.path.join(RAW_SHOT_PATH,str(match_id)))\n",
672 | " df_tactics_lineup.to_feather(os.path.join(RAW_TACTICS_PATH,str(match_id)))"
673 | ]
674 | },
675 | {
676 | "cell_type": "code",
677 | "execution_count": 22,
678 | "metadata": {},
679 | "outputs": [
680 | {
681 | "name": "stdout",
682 | "output_type": "stream",
683 | "text": [
684 | "0 2275036.json\n"
685 | ]
686 | }
687 | ],
688 | "source": [
689 |     "# loop through and save all the event jsons as 4 separate feather files\n",
690 | "for i, file in enumerate(EVENT_PATH):\n",
691 | " create_event_feather_files(file)\n",
692 | " if i%10 == 0:\n",
693 | " print(i,os.path.basename(file))"
694 | ]
695 | },
696 | {
697 | "cell_type": "markdown",
698 | "metadata": {},
699 | "source": [
700 | "# Combine the raw dataframes and save as a single dataframe"
701 | ]
702 | },
703 | {
704 | "cell_type": "markdown",
705 | "metadata": {},
706 | "source": [
707 | "Combine the event dataframes into a single dataframe for each type:\n",
708 | "- events\n",
709 | "- related_events\n",
710 | "- shot freeze frame\n",
711 | "- tactics\n",
712 | "\n",
713 |     "Note that the resulting feather file will be large (3 GB+)."
714 | ]
715 | },
716 | {
717 | "cell_type": "code",
718 | "execution_count": 23,
719 | "metadata": {},
720 | "outputs": [],
721 | "source": [
722 | "def combine_single_file(PATH,SAVE_PATH):\n",
723 | " ''' loads individual feather files and combines into a mega feather file'''\n",
724 | " files = glob.glob(os.path.join(PATH,'*'))\n",
725 | " dfs = [pd.read_feather(file) for file in files]\n",
726 | " df = pd.concat(dfs,sort=False)\n",
727 | " if 'index' in df.columns:\n",
728 | " df.sort_values(['match_id','index'],inplace=True)\n",
729 | " df.reset_index(drop=True,inplace=True)\n",
730 | " print(df.info(verbose=True,null_counts=True))\n",
731 | " df.to_feather(SAVE_PATH)"
732 | ]
733 | },
734 | {
735 | "cell_type": "code",
736 | "execution_count": 24,
737 | "metadata": {},
738 | "outputs": [
739 | {
740 | "name": "stderr",
741 | "output_type": "stream",
742 | "text": [
743 | "/home/andy/anaconda3/envs/statsbomb-explore/lib/python3.7/site-packages/pyarrow/pandas_compat.py:752: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.\n",
744 | " labels, = index.labels\n"
745 | ]
746 | },
747 | {
748 | "name": "stdout",
749 | "output_type": "stream",
750 | "text": [
751 | "\n",
752 | "RangeIndex: 2797557 entries, 0 to 2797556\n",
753 | "Data columns (total 153 columns):\n",
754 | "id 2797557 non-null object\n",
755 | "index 2797557 non-null int64\n",
756 | "period 2797557 non-null int64\n",
757 | "timestamp 2797557 non-null object\n",
758 | "minute 2797557 non-null int64\n",
759 | "second 2797557 non-null int64\n",
760 | "possession 2797557 non-null int64\n",
761 | "duration 2046570 non-null float64\n",
762 | "off_camera 27283 non-null float64\n",
763 | "out 16363 non-null float64\n",
764 | "under_pressure 604676 non-null float64\n",
765 | "counterpress 86916 non-null float64\n",
766 | "ball_receipt_outcome_id 110299 non-null float64\n",
767 | "ball_receipt_outcome_name 110299 non-null object\n",
768 | "ball_recovery_offensive 298 non-null object\n",
769 | "ball_recovery_recovery_failure 6325 non-null object\n",
770 | "block_save_block 176 non-null object\n",
771 | "block_offensive 423 non-null object\n",
772 | "clearance_right_foot 7995 non-null object\n",
773 | "clearance_body_part_id 26528 non-null float64\n",
774 | "clearance_body_part_name 26528 non-null object\n",
775 | "clearance_left_foot 4733 non-null object\n",
776 | "clearance_head 13659 non-null object\n",
777 | "clearance_aerial_won 4886 non-null object\n",
778 | "dribble_outcome_id 32587 non-null float64\n",
779 | "dribble_outcome_name 32587 non-null object\n",
780 | "dribble_overrun 1948 non-null object\n",
781 | "duel_type_id 48472 non-null float64\n",
782 | "duel_type_name 48472 non-null object\n",
783 | "duel_outcome_id 30420 non-null float64\n",
784 | "duel_outcome_name 30420 non-null object\n",
785 | "foul_committed_offensive 921 non-null object\n",
786 | "foul_committed_type_id 1397 non-null float64\n",
787 | "foul_committed_type_name 1397 non-null object\n",
788 | "foul_committed_card_id 2492 non-null float64\n",
789 | "foul_committed_card_name 2492 non-null object\n",
790 | "foul_committed_penalty 229 non-null object\n",
791 | "foul_won_defensive 5429 non-null object\n",
792 | "foul_won_penalty 192 non-null object\n",
793 | "goalkeeper_type_id 23560 non-null float64\n",
794 | "goalkeeper_type_name 23560 non-null object\n",
795 | "goalkeeper_position_id 19906 non-null float64\n",
796 | "goalkeeper_position_name 19906 non-null object\n",
797 | "goalkeeper_outcome_id 11203 non-null float64\n",
798 | "goalkeeper_outcome_name 11203 non-null object\n",
799 | "goalkeeper_body_part_id 5730 non-null float64\n",
800 | "goalkeeper_body_part_name 5730 non-null object\n",
801 | "goalkeeper_technique_id 7702 non-null float64\n",
802 | "goalkeeper_technique_name 7702 non-null object\n",
803 | "half_start_late_video_start 32 non-null object\n",
804 | "interception_outcome_id 15212 non-null float64\n",
805 | "interception_outcome_name 15212 non-null object\n",
806 | "miscontrol_aerial_won 723 non-null object\n",
807 | "pass_length 769576 non-null float64\n",
808 | "pass_angle 769576 non-null float64\n",
809 | "pass_recipient_id 716166 non-null float64\n",
810 | "pass_recipient_name 716166 non-null object\n",
811 | "pass_height_id 769576 non-null float64\n",
812 | "pass_height_name 769576 non-null object\n",
813 | "pass_type_id 151071 non-null float64\n",
814 | "pass_type_name 151071 non-null object\n",
815 | "pass_body_part_id 724127 non-null float64\n",
816 | "pass_body_part_name 724127 non-null object\n",
817 | "pass_outcome_id 163709 non-null float64\n",
818 | "pass_outcome_name 163709 non-null object\n",
819 | "pass_cross 16541 non-null object\n",
820 | "pass_assisted_shot_id 13984 non-null object\n",
821 | "pass_shot_assist 12333 non-null object\n",
822 | "pass_switch 20890 non-null object\n",
823 | "pass_aerial_won 11234 non-null object\n",
824 | "pass_goal_assist 1651 non-null object\n",
825 | "pass_no_touch 507 non-null object\n",
826 | "pass_inswinging 1871 non-null object\n",
827 | "pass_technique_id 8987 non-null float64\n",
828 | "pass_technique_name 8987 non-null object\n",
829 | "pass_cut_back 1444 non-null object\n",
830 | "pass_straight 520 non-null object\n",
831 | "pass_through_ball 5002 non-null object\n",
832 | "pass_outswinging 1594 non-null object\n",
833 | "play_pattern_id 2797557 non-null int64\n",
834 | "play_pattern_name 2797557 non-null object\n",
835 | "player_id 2782289 non-null float64\n",
836 | "player_name 2782289 non-null object\n",
837 | "position_id 2782289 non-null float64\n",
838 | "position_name 2782289 non-null object\n",
839 | "possession_team_id 2797557 non-null int64\n",
840 | "possession_team_name 2797557 non-null object\n",
841 | "shot_statsbomb_xg 19934 non-null float64\n",
842 | "shot_key_pass_id 13984 non-null object\n",
843 | "shot_aerial_won 1209 non-null object\n",
844 | "shot_type_id 19934 non-null float64\n",
845 | "shot_type_name 19934 non-null object\n",
846 | "shot_body_part_id 19934 non-null float64\n",
847 | "shot_body_part_name 19934 non-null object\n",
848 | "shot_technique_id 19934 non-null float64\n",
849 | "shot_technique_name 19934 non-null object\n",
850 | "shot_outcome_id 19934 non-null float64\n",
851 | "shot_outcome_name 19934 non-null object\n",
852 | "shot_first_time 5275 non-null object\n",
853 | "shot_one_on_one 1239 non-null object\n",
854 | "substitution_outcome_id 4289 non-null float64\n",
855 | "substitution_outcome_name 4289 non-null object\n",
856 | "substitution_replacement_id 4294 non-null float64\n",
857 | "substitution_replacement_name 4294 non-null object\n",
858 | "tactics_formation 2929 non-null float64\n",
859 | "team_id 2797557 non-null int64\n",
860 | "team_name 2797557 non-null object\n",
861 | "type_id 2797557 non-null int64\n",
862 | "type_name 2797557 non-null object\n",
863 | "x 2775055 non-null float64\n",
864 | "y 2775055 non-null float64\n",
865 | "pass_end_x 769576 non-null float64\n",
866 | "pass_end_y 769576 non-null float64\n",
867 | "carry_end_x 637990 non-null float64\n",
868 | "carry_end_y 637990 non-null float64\n",
869 | "shot_end_x 19934 non-null float64\n",
870 | "shot_end_y 19934 non-null float64\n",
871 | "shot_end_z 14422 non-null float64\n",
872 | "goalkeeper_end_x 12306 non-null float64\n",
873 | "goalkeeper_end_y 12306 non-null float64\n",
874 | "match_id 2797557 non-null int64\n",
875 | "50_50_outcome_id 1232 non-null float64\n",
876 | "50_50_outcome_name 1232 non-null object\n",
877 | "bad_behaviour_card_id 545 non-null float64\n",
878 | "bad_behaviour_card_name 545 non-null object\n",
879 | "dribble_nutmeg 1064 non-null object\n",
880 | "foul_committed_advantage 2940 non-null object\n",
881 | "foul_won_advantage 3034 non-null object\n",
882 | "goalkeeper_success_in_play 15 non-null object\n",
883 | "injury_stoppage_in_chain 303 non-null object\n",
884 | "pass_miscommunication 478 non-null object\n",
885 | "pass_backheel 978 non-null object\n",
886 | "block_deflection 849 non-null object\n",
887 | "dribble_no_touch 84 non-null object\n",
888 | "pass_deflected 875 non-null object\n",
889 | "shot_deflected 195 non-null object\n",
890 | "clearance_other 141 non-null object\n",
891 | "shot_open_goal 236 non-null object\n",
892 | "goalkeeper_punched_out 90 non-null object\n",
893 | "shot_redirect 63 non-null object\n",
894 | "goalkeeper_lost_in_play 20 non-null object\n",
895 | "goalkeeper_shot_saved_off_target 72 non-null object\n",
896 | "shot_saved_off_target 72 non-null object\n",
897 | "shot_follows_dribble 17 non-null object\n",
898 | "goalkeeper_shot_saved_to_post 54 non-null object\n",
899 | "shot_saved_to_post 49 non-null object\n",
900 | "goalkeeper_lost_out 10 non-null object\n",
901 | "half_end_early_video_end 8 non-null object\n",
902 | "goalkeeper_saved_to_post 2 non-null object\n",
903 | "goalkeeper_success_out 8 non-null object\n",
904 | "player_off_permanent 7 non-null object\n",
905 | "goalkeeper_penalty_saved_to_post 1 non-null object\n",
906 | "shot_kick_off 1 non-null object\n",
907 | "dtypes: float64(49), int64(10), object(94)\n",
908 | "memory usage: 3.2+ GB\n",
909 | "None\n"
910 | ]
911 | }
912 | ],
913 | "source": [
914 | "combine_single_file(RAW_EVENT_PATH,SAVE_PATH=os.path.join(DATA_PATH,'events'))"
915 | ]
916 | },
917 | {
918 | "cell_type": "code",
919 | "execution_count": 25,
920 | "metadata": {},
921 | "outputs": [
922 | {
923 | "name": "stdout",
924 | "output_type": "stream",
925 | "text": [
926 | "\n",
927 | "RangeIndex: 5450328 entries, 0 to 5450327\n",
928 | "Data columns (total 7 columns):\n",
929 | "id 5450328 non-null object\n",
930 | "id_related 5450328 non-null object\n",
931 | "type_name 5450328 non-null object\n",
932 | "index 5450328 non-null int64\n",
933 | "type_name_related 5450328 non-null object\n",
934 | "index_related 5450328 non-null int64\n",
935 | "match_id 5450328 non-null int64\n",
936 | "dtypes: int64(3), object(4)\n",
937 | "memory usage: 291.1+ MB\n",
938 | "None\n"
939 | ]
940 | }
941 | ],
942 | "source": [
943 | "combine_single_file(RAW_RELATED_PATH,SAVE_PATH=os.path.join(DATA_PATH,'related_events'))"
944 | ]
945 | },
946 | {
947 | "cell_type": "code",
948 | "execution_count": 26,
949 | "metadata": {},
950 | "outputs": [
951 | {
952 | "name": "stdout",
953 | "output_type": "stream",
954 | "text": [
955 | "\n",
956 | "RangeIndex: 244803 entries, 0 to 244802\n",
957 | "Data columns (total 10 columns):\n",
958 | "id 244803 non-null object\n",
959 | "event_freeze_id 244803 non-null int64\n",
960 | "player_teammate 244803 non-null bool\n",
961 | "player_id 244803 non-null int64\n",
962 | "player_name 244803 non-null object\n",
963 | "player_position_id 244803 non-null int64\n",
964 | "player_position_name 244803 non-null object\n",
965 | "x 244803 non-null float64\n",
966 | "y 244803 non-null float64\n",
967 | "match_id 244803 non-null int64\n",
968 | "dtypes: bool(1), float64(2), int64(4), object(3)\n",
969 | "memory usage: 17.0+ MB\n",
970 | "None\n"
971 | ]
972 | }
973 | ],
974 | "source": [
975 | "combine_single_file(RAW_SHOT_PATH,SAVE_PATH=os.path.join(DATA_PATH,'shot_freeze_frame'))"
976 | ]
977 | },
978 | {
979 | "cell_type": "code",
980 | "execution_count": 27,
981 | "metadata": {},
982 | "outputs": [
983 | {
984 | "name": "stdout",
985 | "output_type": "stream",
986 | "text": [
987 | "\n",
988 | "RangeIndex: 32211 entries, 0 to 32210\n",
989 | "Data columns (total 8 columns):\n",
990 | "id 32211 non-null object\n",
991 | "event_tactics_id 32211 non-null int64\n",
992 | "player_jersey_number 32204 non-null float64\n",
993 | "player_id 32211 non-null int64\n",
994 | "player_name 32211 non-null object\n",
995 | "player_position_id 32211 non-null int64\n",
996 | "player_position_name 32211 non-null object\n",
997 | "match_id 32211 non-null int64\n",
998 | "dtypes: float64(1), int64(4), object(3)\n",
999 | "memory usage: 2.0+ MB\n",
1000 | "None\n"
1001 | ]
1002 | }
1003 | ],
1004 | "source": [
1005 | "combine_single_file(RAW_TACTICS_PATH,SAVE_PATH=os.path.join(DATA_PATH,'tactics'))"
1006 | ]
1007 | }
1008 | ],
1009 | "metadata": {
1010 | "kernelspec": {
1011 | "display_name": "Python 3",
1012 | "language": "python",
1013 | "name": "python3"
1014 | },
1015 | "language_info": {
1016 | "codemirror_mode": {
1017 | "name": "ipython",
1018 | "version": 3
1019 | },
1020 | "file_extension": ".py",
1021 | "mimetype": "text/x-python",
1022 | "name": "python",
1023 | "nbconvert_exporter": "python",
1024 | "pygments_lexer": "ipython3",
1025 | "version": "3.7.5"
1026 | }
1027 | },
1028 | "nbformat": 4,
1029 | "nbformat_minor": 2
1030 | }
1031 |
--------------------------------------------------------------------------------
/demo_crawley.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from mplsoccer.pitch import Pitch\n",
10 | "import numpy as np\n",
11 | "import matplotlib.pyplot as plt\n",
12 | "from urllib.request import urlopen\n",
13 | "from PIL import Image\n",
14 | "import numpy as np"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "background_color = '#004D98'\n",
24 | "line_color='#d2dde1'\n",
25 | "marker_color = '#dcdf4c'\n",
26 | "figsize = (9,16)\n",
27 | "width, height = figsize\n",
28 | "aspect = width/height"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 3,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "pitch = Pitch(pitch_color=background_color,line_color=line_color,orientation='vertical',pitch_type='opta',\n",
38 | " view='half')\n",
39 |     "# this pitch is used to draw a legend - cheap hack: its lines are the same color as the background\n",
40 |     "# so the lines won't show up\n",
41 | "empty_pitch = Pitch(pitch_color=background_color,line_color=background_color,\n",
42 | " orientation='vertical',pitch_type='opta',view='full')"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 4,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "IMAGE_URL = 'https://upload.wikimedia.org/wikipedia/en/8/8b/Crawley_Town_FC_logo.png'\n",
52 | "crawley_logo = np.array(Image.open(urlopen(IMAGE_URL)))"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 5,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# for this image the aspect is different so you have to calculate the logo height from its width\n",
62 | "# so that you can get the height and width of the logo axis right\n",
63 | "def calculate_display_height(img,img_display_width,aspect):\n",
64 | " img_height, img_width, _ = img.shape\n",
65 | " img_aspect = img_width/img_height\n",
66 | " img_display_height = img_display_width/img_aspect*aspect\n",
67 | " return img_display_height"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 |     "# Note that the logo is slightly transparent (alpha = 0.9). I like this better, but you can delete the alpha argument to make it fully opaque."
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 6,
80 | "metadata": {
81 | "scrolled": false
82 | },
83 | "outputs": [
84 | {
85 | "data": {
86 | "image/png": "\n",
87 | "text/plain": [
88 | ""
89 | ]
90 | },
91 | "metadata": {},
92 | "output_type": "display_data"
93 | }
94 | ],
95 | "source": [
96 | "# setup a figure\n",
97 | "fig = plt.figure(figsize=figsize,facecolor=background_color);\n",
98 | "# The dimensions [left, bottom, width, height] of the new axes.\n",
99 | "# All quantities are in fractions of figure width and height.\n",
100 | "pitch_top_rect = (0,0.5,1,0.45)\n",
101 | "ax_pitch_top = fig.add_axes(pitch_top_rect)\n",
102 | "legend1_rect = (0.2,0.45,0.1,0.05)\n",
103 | "ax_legend1 = fig.add_axes(legend1_rect)\n",
104 | "ax_legend1.axis('off')\n",
105 | "legend2_rect = (0.5,0.45,0.1,0.05) #dimensions for line legend\n",
106 | "ax_legend2 = fig.add_axes(legend2_rect) # ax for legend2\n",
107 | "pitch_bottom_rect = (0,0,1,0.45)\n",
108 | "ax_pitch_bottom = fig.add_axes(pitch_bottom_rect)\n",
109 | "title_rect = (0.02,0.95,0.7,0.05)\n",
110 | "ax_title= fig.add_axes(title_rect)\n",
111 | "ax_title.axis('off')\n",
112 | "logo_display_width = 0.2\n",
113 | "logo_display_height = calculate_display_height(crawley_logo,logo_display_width,aspect)\n",
114 | "logo_rect = (1-logo_display_width,1-logo_display_height,logo_display_width,logo_display_height)\n",
115 | "ax_logo = fig.add_axes(logo_rect)\n",
116 | "ax_logo.axis('off')\n",
117 | "ax_logo.imshow(crawley_logo,alpha=0.9)\n",
118 | "# draw pitches\n",
119 | "pitch.draw(ax=ax_pitch_top)\n",
120 | "pitch.draw(ax=ax_pitch_bottom)\n",
121 | "# draw legend circle\n",
122 | "ax_legend1.scatter(0.5,0.5,c=marker_color,s=100);\n",
123 | "# draw legend line\n",
124 | "empty_pitch.draw(ax=ax_legend2)\n",
125 | "empty_pitch.lines(np.array([20]),np.array([70]),np.array([70]),np.array([20]),\n",
126 | " comet=True,transparent=True,ax=ax_legend2,color=marker_color);\n",
127 | "# add title\n",
128 | "fig.text(0.03,0.98,\"Crawley Town\",verticalalignment='top',horizontalalignment='left',fontsize=50,color=line_color);\n",
129 | "fig.text(0.3,0.47,\"Pass start location\",fontsize=15,color=line_color);\n",
130 | "fig.text(0.6,0.47,\"Assist/ high Xg\",fontsize=15,color=line_color);"
131 | ]
132 | }
133 | ],
134 | "metadata": {
135 | "kernelspec": {
136 | "display_name": "Python 3",
137 | "language": "python",
138 | "name": "python3"
139 | },
140 | "language_info": {
141 | "codemirror_mode": {
142 | "name": "ipython",
143 | "version": 3
144 | },
145 | "file_extension": ".py",
146 | "mimetype": "text/x-python",
147 | "name": "python",
148 | "nbconvert_exporter": "python",
149 | "pygments_lexer": "ipython3",
150 | "version": "3.8.2"
151 | }
152 | },
153 | "nbformat": 4,
154 | "nbformat_minor": 4
155 | }
156 |
--------------------------------------------------------------------------------