├── README.md ├── data_processing.py ├── data_summary.ipynb ├── requirements.txt └── visualzation.py /README.md: -------------------------------------------------------------------------------- 1 | # An integrated dataset of spatiotemporal and event data in elite soccer 2 | 3 | This is the official repository for the paper: 4 | >Bassek, M., Rein, R., Weber, H., Memmert, D. (2025). An integrated dataset of 5 | > spatiotemporal and event data in elite soccer. *Scientific Data, 12*(1), 195. https://doi.org/10.1038/s41597-025-04505-y 6 | 7 | ## Project Structure 8 | 9 | - `data_processing.py`: Functions for loading and processing metadata, event data, and position data. 10 | - `visualization.py`: Functions for visualizing the processed data. 11 | - `data_summary.ipynb`: Jupyter notebook to replicate the descriptive statistics and visualizations presented in the paper. 12 | 13 | ## Data Source and Characteristics 14 | 15 | - Soccer matches from the [German Bundesliga](https://www.dfl.de/de/) (1st and 2nd divisions) 16 | - Size: 7 full matches 17 | - Official metadata (match information) 18 | - Official event data. 19 | - Official position data captured by [TRACAB](https://tracab.com/products/tracab-technologies/) 20 | 21 | ## License 22 | The data are provided with authorization of the [Deutsche Fussball Liga (DFL)](https://www.dfl.de/de/). The dataset 23 | is licensed under [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). You must therefore give appropriate credit 24 | when using this dataset by 25 | 1) naming the *Deutsche Fußball Liga (DFL)* 26 | 2) citing this [publication](https://doi.org/10.1038/s41597-025-04505-y) 27 | 28 | ## Usage 29 | 30 | ### Data Processing and visualization 31 | 32 | 1. Download the raw data [here](https://doi.org/10.6084/m9.figshare.28196177) 33 | 2. Open the `data_summary.ipynb` notebook. 34 | 3. Define the path to your dataset directory in the `path` variable. 35 | 4. Run the cells to load and process the data. 36 | 5. The processed data summary will be displayed. 37 | 38 | ## Citation 39 | ```BibTeX 40 | @article{bassek2025integrated, 41 | title={An integrated dataset of spatiotemporal and event data in elite soccer}, 42 | author={Bassek, Manuel and Rein, Robert and Weber, Hendrik and Memmert, Daniel}, 43 | journal={Scientific Data}, 44 | volume={12}, 45 | number={1}, 46 | pages={195}, 47 | year={2025}, 48 | publisher={Nature Publishing Group UK London} 49 | } 50 | ``` 51 | --- 52 | 53 | ## Funding 54 | This project has been kindly supported by the [Institute of Exercise Training and Sport 55 | Informatics](https://www.dshs-koeln.de/en/institut-fuer-trainingswissenschaft-und-sportinformatik/) at the German Sport 56 | University Cologne, under supervision of Prof. Daniel Memmert. Funding was provided by the 57 | [German Research Foundation (DFG)](https://www.dfg.de/en) 58 | ([*floodlight*](https://gepris.dfg.de/gepris/projekt/522904388?context=projekt&task=showDetail&id=522904388&)). 59 | -------------------------------------------------------------------------------- /data_processing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from floodlight.io.dfl import read_position_data_xml, read_event_data_xml, read_teamsheets_from_mat_info_xml 4 | 5 | # Define the path to the dataset 6 | path = "C:\\Users\\ke6564\\Desktop\\Studium\\Promotion\\floodlight\\Benchmark_Dataset\\Data\\" 7 | 8 | # Load Team Sheets 9 | def load_team_sheets(path): 10 | info_files = [x for x in os.listdir(path) if "matchinformation" in x] 11 | team_sheets_all = pd.DataFrame() 12 | for file in info_files: 13 | team_sheets = read_teamsheets_from_mat_info_xml(os.path.join(path, file)) 14 | team_sheets_combined = pd.concat([team_sheets["Home"].teamsheet, team_sheets["Away"].teamsheet]) 15 | team_sheets_all = pd.concat([team_sheets_all, team_sheets_combined]) 16 | return team_sheets_all 17 | 18 | # Load Event Data 19 | def load_event_data(path): 20 | info_files = sorted([x for x in os.listdir(path) if "matchinformation" in x]) 21 | event_files = sorted([x for x in os.listdir(path) if "events_raw" in x]) 22 | all_events = pd.DataFrame() 23 | for events_file, info_file in zip(event_files, info_files): 24 | events, _, _ = read_event_data_xml(os.path.join(path, events_file), os.path.join(path, info_file)) 25 | events_fullmatch = pd.DataFrame() 26 | for half in events: 27 | for team in events[half]: 28 | events_fullmatch = pd.concat([events_fullmatch, events[half][team].events]) 29 | all_events = pd.concat([all_events, events_fullmatch]) 30 | return all_events 31 | 32 | # Load Position Data 33 | def load_position_data(path): 34 | info_files = [x for x in os.listdir(path) if "matchinformation" in x] 35 | position_files = [x for x in os.listdir(path) if "positions_raw" in x] 36 | n_frames = 0 37 | for position_file, info_file in zip(position_files, info_files): 38 | positions, _, _, _, _ = read_position_data_xml(os.path.join(path, position_file), os.path.join(path, info_file)) 39 | n_frames += len(positions["firstHalf"]["Home"]) + len(positions["secondHalf"]["Home"]) 40 | return n_frames 41 | 42 | # Display Data Summary 43 | def display_data_summary(path): 44 | team_sheets_all = load_team_sheets(path) 45 | all_events = load_event_data(path) 46 | n_frames = load_position_data(path) 47 | 48 | print("Unique player IDs:", team_sheets_all["pID"].nunique()) 49 | print("Unique teams:", team_sheets_all["team"].nunique()) 50 | print("Total number of events:", len(all_events)) 51 | print("Unique event ID counts:\n", all_events["eID"].value_counts()) 52 | print("Total number of position frames:", n_frames) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | floodlight==0.4.0 2 | matplotlib==3.9.1 3 | pandas==1.5.3 4 | seaborn==0.13.2 5 | -------------------------------------------------------------------------------- /visualzation.py: -------------------------------------------------------------------------------- 1 | # visualization.ipynb 2 | 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | from floodlight.io.dfl import read_position_data_xml, read_event_data_xml 6 | 7 | # Constants 8 | COL_FACE = "lightgrey" 9 | plt.style.use("ggplot") 10 | 11 | # Load Data 12 | def load_data(path, file_name_pos, file_name_infos, file_name_events): 13 | xy_objects, possession, ballstatus, teamsheets, pitch = read_position_data_xml(f"{path}{file_name_pos}", f"{path}{file_name_infos}") 14 | events, _, _ = read_event_data_xml(f"{path}{file_name_events}", f"{path}{file_name_infos}") 15 | xy_objects["firstHalf"]["Home"].rotate(180) 16 | return xy_objects, events, pitch 17 | 18 | # Count Plot for Event IDs 19 | def plot_event_count(all_events): 20 | fig, ax = plt.subplots(figsize=(16, 9), tight_layout=True) 21 | sns.countplot(all_events["eID"], order=all_events["eID"].value_counts().index, ax=ax) 22 | ax.set_xscale("log") 23 | ax.tick_params(axis="x", labelsize=14) 24 | ax.tick_params(axis="y", labelsize=14) 25 | ax.set_xlabel("Event ID", size=14) 26 | ax.set_ylabel("Count", size=14) 27 | plt.show() 28 | 29 | # KDE Plot 30 | def plot_kde(xy_objects, pitch): 31 | fig, ax = plt.subplots(2, 2, constrained_layout=True, figsize=(13, 9)) 32 | for a in ax.flat: 33 | pitch.plot(ax=a) 34 | a.set_facecolor(COL_FACE) 35 | a.set_xlim(-55, 55) 36 | a.set_ylim(-37, 37) 37 | a.annotate("Attacking Direction", xy=(-51, -25), xytext=(-51, -28), xycoords="data", fontsize=12) 38 | a.annotate("", xy=(-22, -25), xytext=(-51, -25), xycoords="data", 39 | arrowprops=dict(arrowstyle="->", connectionstyle="arc3"), ha="center", va="top") 40 | 41 | ax[0, 0].set_xlabel("TW", size=14) 42 | ax[0, 1].set_xlabel("STZ", size=14) 43 | ax[1, 0].set_xlabel("OLM", size=14) 44 | ax[1, 1].set_xlabel("IVL", size=14) 45 | 46 | sns.kdeplot(x=xy_objects["firstHalf"]["Home"].xy[:, 22], y=xy_objects["firstHalf"]["Home"].xy[:, 23], fill=True, color="red", alpha=0.5, ax=ax[0, 0]) 47 | sns.kdeplot(x=xy_objects["firstHalf"]["Home"].xy[:, 4], y=xy_objects["firstHalf"]["Home"].xy[:, 5], fill=True, color="green", alpha=0.5, ax=ax[0, 1]) 48 | sns.kdeplot(x=xy_objects["firstHalf"]["Home"].xy[:, 30], y=xy_objects["firstHalf"]["Home"].xy[:, 31], fill=True, color="blue", alpha=0.5, ax=ax[1, 0]) 49 | sns.kdeplot(x=xy_objects["firstHalf"]["Home"].xy[:, 8], y=xy_objects["firstHalf"]["Home"].xy[:, 9], fill=True, color="purple", alpha=0.5, ax=ax[1, 1]) 50 | 51 | plt.show() 52 | 53 | # Goal Positions Plot 54 | def plot_goal_positions(xy_objects, events, pitch): 55 | framerate = xy_objects["secondHalf"]["Home"].framerate 56 | events["secondHalf"]["Home"].add_frameclock(framerate) 57 | goals = events["secondHalf"]["Home"].events.loc[events["secondHalf"]["Home"].events["eID"] == "ShotAtGoal_SuccessfulShot"] 58 | first_goal = goals.iloc[0] 59 | frame_first_goal = int(first_goal["gameclock"] * framerate + 1.6 * framerate) # offset event clock, pos data 60 | second_before_goal = frame_first_goal - 5 * framerate 61 | 62 | fig, ax = plt.subplots(tight_layout=True, figsize=(16, 10)) 63 | pitch.plot(ax=ax) 64 | ax.set_facecolor(COL_FACE) 65 | 66 | xy_objects["secondHalf"]["Home"].plot(t=(second_before_goal, frame_first_goal), plot_type="trajectories", color="blue", ax=ax) 67 | xy_objects["secondHalf"]["Away"].plot(t=(second_before_goal, frame_first_goal), plot_type="trajectories", color="red", ax=ax) 68 | xy_objects["secondHalf"]["Ball"].plot(t=(second_before_goal, frame_first_goal), plot_type="trajectories", color="black", ax=ax) 69 | 70 | xy_objects["secondHalf"]["Home"].plot(t=frame_first_goal, color="blue", ax=ax) 71 | xy_objects["secondHalf"]["Away"].plot(t=frame_first_goal, color="red", ax=ax) 72 | xy_objects["secondHalf"]["Ball"].plot(t=frame_first_goal, color="black", ax=ax) 73 | 74 | plt.show() 75 | 76 | --------------------------------------------------------------------------------