├── 0000 Preprocess Data to LEM.py ├── 0001 Train Tabular LEMs.py ├── 0002 Analyzing Survey Data.ipynb ├── 0003 Benchmark Tabular LEMs.py ├── 0004 Applications of LEMs.ipynb ├── Older Versions ├── 0001 Convert to Workable CSVs.ipynb ├── 0011 Calculate Features.ipynb ├── 0100 Basic LEMs.md ├── 0111 LEM Train Model Type.ipynb ├── 0112 LEM Train Model Accuracy.ipynb ├── 0113 LEM Train Model Data.ipynb ├── 0121 Tensor Sim.ipynb ├── 0131 Learning State Values.ipynb ├── 0132 Valuing Actions From States.ipynb ├── 0200 Fine-Tuning Approaches.md ├── 0212 Finetuning Framework - Lib.ipynb ├── 0213 Tensor Sim - Lib.ipynb ├── 0300 LEMs as Language Models.md ├── 0301 Building Dataset.ipynb ├── 0330 Simple LLM.ipynb ├── 0331 Simple LLM K3.ipynb ├── 0341 LLM scores K1-lite.ipynb ├── 0341 LLM scores K1.ipynb ├── 0341 LLM scores K3.ipynb ├── lib │ ├── data_utils.py │ ├── model_utils.py │ └── simulator.py ├── models │ └── lem │ │ ├── LEMv3_MODEL_DATA_TORCH.pth │ │ ├── LEMv3_MODEL_TYPE_TORCH.pth │ │ └── LEMv4_MODEL_ACC_TORCH.pth └── readme.md ├── lib ├── glob_fix.py ├── lem.py └── theme_assets.py ├── models └── trackers │ ├── 7111.csv │ └── 7112_definitive.csv └── readme.md /0000 Preprocess Data to LEM.py: -------------------------------------------------------------------------------- 1 | """ 2 | LEM (Large Events Model) Data Preprocessing Script 3 | 4 | This script handles the preprocessing of soccer event data into the LEM standard format. 5 | It performs three main tasks: 6 | 1. Converts raw data to LEM standard format 7 | 2. Preprocesses data for tabular models 8 | 3. Preprocesses data for time series models 9 | 10 | Built for Wyscout V3 data. 11 | """ 12 | 13 | import os 14 | import numpy as np 15 | import pandas as pd 16 | from tqdm import tqdm 17 | from typing import List, Optional, Union, Dict 18 | from pathlib import Path 19 | 20 | class LEMTokenizer: 21 | """Tokenizer for event types in soccer data.""" 22 | 23 | def __init__(self): 24 | # Initialize vocabulary with numbers 0-100 25 | self.vocab = {i: i for i in range(0, 101)} 26 | self.vocab[''] = -1 27 | 28 | # List of predefined event types 29 | self.event_types_list = [ 30 | 'pass', 'long_pass', 'cross', 'touch', 'aerial_duel', 'clearance', 'interception', 31 | 'loose_ball_duel', 'defensive_duel', 'offensive_duel', 'dribble', 'carry', 32 | 'game_interruption', 'own_goal', 'throw_in', 'free_kick', 'goal_kick', 'infraction', 33 | 'corner', 'acceleration', 'offside', 'right_foot_shot', 'left_foot_shot', 'head_shot', 34 | 'goalkeeper_exit', 'save', 'shot_against', 'fairplay', 'yellow_card', 'red_card', 35 | 'first_half_end', 'game_end' 36 | ] 37 | 38 | # Build vocabularies 39 | for i, event_type in enumerate(self.event_types_list): 40 | self.vocab[event_type] = i 41 | self.reverse_vocab = {v: k for k, v in self.vocab.items()} 42 | self.UNK_TOKEN_ID = self.vocab[''] 43 | 44 | def encode_event_types(self, data: pd.Series) -> pd.Series: 45 | """Encode event types to their corresponding IDs.""" 46 | return data.map(self.vocab) 47 | 48 | def decode_event_types(self, data: pd.Series) -> pd.Series: 49 | """Decode IDs back to event type names.""" 50 | return data.map(self.reverse_vocab) 51 | 52 | def convert_to_lem_standard( 53 | competitions_path: str, 54 | seasons_path: str, 55 | matches_path: str, 56 | events_dir: str, 57 | output_path: str, 58 | areas: List[str] = None, 59 | division_levels: List[int] = None, 60 | seasons: List[str] = None 61 | ) -> None: 62 | """ 63 | Convert raw soccer data to LEM standard format. 
64 | 65 | Args: 66 | competitions_path: Path to competitions.csv 67 | seasons_path: Path to seasons.csv 68 | matches_path: Path to matches.csv 69 | events_dir: Directory containing event files 70 | output_path: Path to save the processed data 71 | areas: List of areas to include (e.g. ['Germany', 'France']) 72 | division_levels: List of division levels to include (e.g. [1, 2]) 73 | seasons: List of seasons to include (e.g. ['2022/2023']) 74 | """ 75 | if areas is None: 76 | areas = ['Germany', 'France', 'Spain', 'Portugal', 'Belgium', 'Denmark'] 77 | if division_levels is None: 78 | division_levels = [1, 2] 79 | 80 | # Load base data 81 | competitions = pd.read_csv(competitions_path) 82 | seasons = pd.read_csv(seasons_path) 83 | 84 | # Filter seasons based on criteria 85 | selected_seasons = seasons[ 86 | seasons.competition_id.isin( 87 | competitions[ 88 | competitions.area_name.isin(areas) & 89 | competitions.division_level.isin(division_levels) 90 | ].wy_id.tolist() 91 | ) 92 | ] 93 | if seasons is not None: 94 | selected_seasons = selected_seasons[selected_seasons.name.isin(seasons)] 95 | 96 | # Load matches and events 97 | matches = pd.read_csv(matches_path, low_memory=False) 98 | events = [] 99 | for season in tqdm(selected_seasons.wy_id.tolist(), desc="Loading events"): 100 | events.append(pd.read_feather(os.path.join(events_dir, f"{season}.feather"))) 101 | 102 | # Merge events with match data 103 | events = pd.concat(events).merge( 104 | matches[['wy_id', 'home_team_id', 'away_team_id', 'winner']].rename(columns={'wy_id': 'match_id'}), 105 | on='match_id' 106 | ) 107 | events['game_result'] = -1 + (events.winner == 0) + (events.winner == events.team_id) * 2 108 | 109 | # Process event types 110 | events = process_event_types(events) 111 | 112 | # Process game state variables 113 | events = process_game_state(events) 114 | 115 | # Save processed data 116 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 117 | events.to_feather(output_path) 118 | 119 | def process_event_types(events: pd.DataFrame) -> pd.DataFrame: 120 | """Process and categorize different types of events.""" 121 | 122 | # Discriminate duels 123 | events.loc[events.defensive_duel, 'type_primary'] = 'defensive_duel' 124 | events.loc[events.offensive_duel, 'type_primary'] = 'offensive_duel' 125 | events.loc[events.aerial_duel, 'type_primary'] = 'aerial_duel' 126 | events.loc[events.loose_ball_duel, 'type_primary'] = 'loose_ball_duel' 127 | events.loc[events.dribble, 'type_primary'] = 'dribble' 128 | 129 | # Discriminate crosses & long passes 130 | events.loc[events.cross, 'type_primary'] = 'cross' 131 | events.loc[events.long_pass, 'type_primary'] = 'long_pass' 132 | 133 | # Process carries 134 | events_carries = events[events.carry].copy() 135 | events_carries['type_primary'] = 'carry' 136 | 137 | # Process shots 138 | events.loc[events.shot_body_part == 'head_or_other', 'type_primary'] = 'head_shot' 139 | events.loc[events.shot_body_part == 'right_foot', 'type_primary'] = 'right_foot_shot' 140 | events.loc[events.shot_body_part == 'left_foot', 'type_primary'] = 'left_foot_shot' 141 | events.loc[events.type_primary == 'shot', 'type_primary'] = 'right_foot_shot' 142 | 143 | # Process other events 144 | events.loc[events.save, 'type_primary'] = 'save' 145 | events.loc[events.yellow_card, 'type_primary'] = 'yellow_card' 146 | events.loc[events.red_card, 'type_primary'] = 'red_card' 147 | 148 | # Process end events 149 | events_end = events.groupby(['match_id', 'match_period']).tail(1) 150 | 
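# Note: these per-period tail rows are copies of each half's final event; just below they are
# relabeled as synthetic 'first_half_end' / 'game_end' markers and concatenated back onto the event stream.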
events_end.loc[events_end.match_period == '1H', 'type_primary'] = 'first_half_end' 151 | events_end.loc[events_end.match_period == '2H', 'type_primary'] = 'game_end' 152 | 153 | # Combine all events 154 | events = pd.concat([events, events_carries, events_end]) 155 | return events.sort_values(['match_id', 'match_period', 'minute', 'second']) 156 | 157 | def process_game_state(events: pd.DataFrame) -> pd.DataFrame: 158 | """Process game state variables like goals, cards, etc.""" 159 | 160 | # Calculate time between events 161 | events['t'] = (events.minute - events.minute.shift(1).fillna(0)) * 60 + (events.second - events.second.shift(1).fillna(0)) 162 | 163 | # Calculate cumulative statistics per match 164 | for prefix, condition in [ 165 | ('hg', events.goal & events.h), 166 | ('ag', events.goal & ~events.h), 167 | ('hr', events.red_card & events.h), 168 | ('ar', events.red_card & ~events.h), 169 | ('hy', events.yellow_card & events.h), 170 | ('ay', events.yellow_card & ~events.h) 171 | ]: 172 | events[prefix] = condition.groupby(events.match_id).cumsum() 173 | 174 | # Process period indicators 175 | events['p'] = events.match_period.map({'1H': False, '2H': True}).shift(1) 176 | events.loc[events.match_id != events.match_id.shift(1), 'p'] = 0 177 | 178 | # Process minute and second 179 | for col in ['m', 's']: 180 | events[col] = events[col[0]].shift(1) 181 | events.loc[events.match_id != events.match_id.shift(1), col] = 0 182 | 183 | return events 184 | 185 | def arrange_data_for_tabular( 186 | input_path: str, 187 | seq_len: int, 188 | n_files: int, 189 | output_dir: str = None 190 | ) -> None: 191 | """ 192 | Arrange data for tabular models. 193 | 194 | Args: 195 | input_path: Path to input feather file 196 | seq_len: Sequence length to use 197 | n_files: Number of files to split the data into 198 | output_dir: Directory to save the processed files 199 | """ 200 | if output_dir is None: 201 | output_dir = input_path.replace('/raw_lem/', '/tabular_lem/').rsplit('.', 1)[0] 202 | 203 | # Load and tokenize data 204 | data = pd.read_feather(input_path) 205 | tokenizer = LEMTokenizer() 206 | data['e'] = tokenizer.encode_event_types(data['e']) 207 | 208 | # Add context for each sequence length 209 | event_vars = ['h', 'e', 'x', 'y', 't', 'a'] 210 | for i in range(1, seq_len + 1): 211 | data_context = data.shift(i).fillna(tokenizer.UNK_TOKEN_ID) 212 | data_context.loc[data_context['match_id'] != data['match_id'], event_vars] = tokenizer.UNK_TOKEN_ID 213 | data_context = data_context[event_vars].add_prefix(f'c{i}_').astype(np.int8) 214 | data = pd.concat([data, data_context], axis=1) 215 | 216 | # Prepare data for each event variable 217 | data_lst = [] 218 | for i, var in enumerate(event_vars): 219 | edit_data = data.copy() 220 | edit_data['target'] = edit_data[var].clip(0, 100) 221 | edit_data[event_vars[i:]] = tokenizer.UNK_TOKEN_ID 222 | edit_data = edit_data.drop(columns=['match_id']) 223 | data_lst.append(edit_data) 224 | 225 | # Combine and process final dataset 226 | data = pd.concat(data_lst) 227 | data = data.astype(np.int8) 228 | data = data.sample(frac=1, random_state=42) 229 | 230 | # Save data splits 231 | os.makedirs(output_dir, exist_ok=True) 232 | for i in range(n_files): 233 | start_idx = i * len(data) // n_files 234 | end_idx = (i + 1) * len(data) // n_files 235 | output_path = f"{output_dir}_sq{seq_len}_rs42_{i}.feather" 236 | data.iloc[start_idx:end_idx].to_feather(output_path) 237 | 238 | def arrange_data_for_time_series( 239 | input_path: str, 240 | seq_len: int, 241 | 
n_files: int, 242 | output_dir: str = None 243 | ) -> None: 244 | """ 245 | Arrange data for time series models. 246 | 247 | Args: 248 | input_path: Path to input feather file 249 | seq_len: Sequence length to use 250 | n_files: Number of files to split the data into 251 | output_dir: Directory to save the processed files 252 | """ 253 | if output_dir is None: 254 | output_dir = input_path.replace('/raw_lem/', '/time_series_lem/').rsplit('.', 1)[0] 255 | 256 | # Load data 257 | df = pd.read_feather(input_path) 258 | tokenizer = LEMTokenizer() 259 | 260 | # Process events 261 | event_vars = ['h', 'e', 'x', 'y', 't', 'a'] 262 | context_vars = ['p', 'm', 's', 'hg', 'ag', 'hr', 'ar', 'hy', 'ay'] 263 | 264 | # Create temporary directory for processing 265 | temp_dir = "temp" 266 | os.makedirs(temp_dir, exist_ok=True) 267 | 268 | # Process data in chunks 269 | for match_id in tqdm(df.match_id.unique(), desc="Processing matches"): 270 | match_data = df[df.match_id == match_id] 271 | data_events = match_data[event_vars] 272 | data_contexts = match_data[context_vars] 273 | 274 | for i in range(len(data_events)): 275 | event_no = i // len(event_vars) 276 | event_var_id = i % len(event_vars) 277 | 278 | # Prepare series data 279 | series = data_events.iloc[:, max(0, i-(len(event_vars) * seq_len)):i].clip(0, 100) 280 | if series.shape[1] < (len(event_vars) * seq_len): 281 | padding = pd.DataFrame( 282 | [[tokenizer.UNK_TOKEN_ID] * (len(event_vars)*seq_len - series.shape[1])] * data_events.shape[0] 283 | ) 284 | series = pd.concat([padding, series], axis=1) 285 | series.columns = [f'i{seq_len*len(event_vars) - j}' for j in range(series.shape[1])] 286 | 287 | # Prepare target and context 288 | target = data_events.iloc[:, i].rename('target').clip(0, 100) 289 | context = data_contexts.iloc[:, max(0, event_no * len(context_vars) - len(context_vars)):event_no * len(context_vars)].clip(0, 100) 290 | if context.shape[1] < len(context_vars): 291 | context = pd.DataFrame([[0]*len(context_vars)] * data_events.shape[0]) 292 | context.columns = [f'c{j}' for j in range(len(context_vars))] 293 | 294 | # Combine data 295 | combined_data = pd.concat([context, series, target], axis=1) 296 | combined_data['event_var_id'] = event_var_id 297 | combined_data = combined_data.dropna() 298 | 299 | if not combined_data.empty: 300 | for file_idx in range(n_files): 301 | temp_file = os.path.join(temp_dir, f'arrange_data_as_time_series_{file_idx}.csv') 302 | combined_data.sample(frac=1/n_files).to_csv( 303 | temp_file, 304 | mode='a', 305 | header=not os.path.exists(temp_file), 306 | index=False 307 | ) 308 | 309 | # Save final files 310 | os.makedirs(output_dir, exist_ok=True) 311 | for i in range(n_files): 312 | temp_file = os.path.join(temp_dir, f'arrange_data_as_time_series_{i}.csv') 313 | if os.path.exists(temp_file): 314 | data = pd.read_csv(temp_file) 315 | data = data.astype(np.int8) 316 | output_path = f"{output_dir}_sq{seq_len}_rs42_{i}.feather" 317 | data.to_feather(output_path) 318 | os.remove(temp_file) 319 | 320 | def main(): 321 | """Main function to run the preprocessing pipeline.""" 322 | import argparse 323 | 324 | parser = argparse.ArgumentParser(description="Preprocess soccer event data for LEM models") 325 | parser.add_argument('--data_dir', type=str, required=True, help="Directory containing the raw data files") 326 | parser.add_argument('--output_dir', type=str, required=True, help="Directory to save processed files") 327 | parser.add_argument('--seq_lengths', type=int, nargs='+', default=[1, 3, 5, 7, 9], 
help="Sequence lengths to process") 328 | parser.add_argument('--n_files', type=int, default=10, help="Number of files to split the data into") 329 | args = parser.parse_args() 330 | 331 | # Convert to LEM standard 332 | print("Converting data to LEM standard...") 333 | convert_to_lem_standard( 334 | competitions_path=os.path.join(args.data_dir, "competitions.csv"), 335 | seasons_path=os.path.join(args.data_dir, "seasons.csv"), 336 | matches_path=os.path.join(args.data_dir, "matches.csv"), 337 | events_dir=os.path.join(args.data_dir, "seasons/events"), 338 | output_path=os.path.join(args.output_dir, "raw_lem/data.feather") 339 | ) 340 | 341 | # Process for different model types 342 | raw_lem_path = os.path.join(args.output_dir, "raw_lem/data.feather") 343 | for seq_len in args.seq_lengths: 344 | print(f"Processing for sequence length {seq_len}...") 345 | 346 | print("Arranging data for tabular models...") 347 | arrange_data_for_tabular( 348 | raw_lem_path, 349 | seq_len, 350 | args.n_files, 351 | os.path.join(args.output_dir, "tabular_lem") 352 | ) 353 | 354 | # WARNING: Uncomment this when you want to process time series models as per the annexes of the paper 355 | # print("Arranging data for time series models...") 356 | # arrange_data_for_time_series( 357 | # raw_lem_path, 358 | # seq_len, 359 | # args.n_files, 360 | # os.path.join(args.output_dir, "time_series_lem") 361 | # ) 362 | 363 | if __name__ == "__main__": 364 | main() -------------------------------------------------------------------------------- /0001 Train Tabular LEMs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train Tabular LEMs (Large Events Models) 3 | 4 | This script trains various neural network architectures on soccer event data in LEM format. 5 | It supports both survey-style quick training and full training modes. 6 | The models include MLPs of various sizes, with configurable hyperparameters. 7 | 8 | The script can be run in two modes: 9 | 1. Survey mode: Quick training to compare different architectures 10 | 2. 
Full mode: Complete training of selected architectures 11 | """ 12 | 13 | import os 14 | import numpy as np 15 | import pandas as pd 16 | import torch 17 | import torch.nn as nn 18 | import torch.optim as optim 19 | from torch.utils.data import TensorDataset, DataLoader 20 | from tqdm import tqdm 21 | from typing import List, Dict, Optional, Union 22 | from datetime import datetime 23 | from pathlib import Path 24 | 25 | # Constants for CUDA setup 26 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 27 | 28 | class MLP(nn.Module): 29 | """Multi-layer Perceptron with configurable architecture.""" 30 | 31 | def __init__(self, input_size: int, hidden_sizes: List[int], output_size: int, dropout_rate: float = 0.0): 32 | super().__init__() 33 | 34 | layers = [] 35 | prev_size = input_size 36 | 37 | for hidden_size in hidden_sizes: 38 | layers.extend([ 39 | nn.Linear(prev_size, hidden_size), 40 | nn.ReLU(), 41 | nn.Dropout(dropout_rate) 42 | ]) 43 | prev_size = hidden_size 44 | 45 | layers.append(nn.Linear(prev_size, output_size)) 46 | self.model = nn.Sequential(*layers) 47 | 48 | def forward(self, x: torch.Tensor) -> torch.Tensor: 49 | return self.model(x) 50 | 51 | def write_to_tracker(filepath: str, line: str) -> None: 52 | """Write a line to the tracking file.""" 53 | os.makedirs(os.path.dirname(filepath), exist_ok=True) 54 | with open(filepath, 'a') as f: 55 | f.write(line) 56 | 57 | def instantiate_models(seq_len: int, output_size: int, mode: str = 'survey') -> List[nn.Module]: 58 | """ 59 | Instantiate models based on the specified mode. 60 | 61 | Args: 62 | seq_len: Length of input sequence 63 | output_size: Number of output classes 64 | mode: Either 'survey' for quick comparison or 'full' for complete training 65 | 66 | Returns: 67 | List of instantiated PyTorch models 68 | """ 69 | models = [] 70 | 71 | if mode == 'survey': 72 | # Survey mode includes a wider range of architectures 73 | models.extend([ 74 | MLP(seq_len, [80], output_size), 75 | MLP(seq_len, [96, 96, 96], output_size), 76 | MLP(seq_len, [196, 196, 196], output_size), 77 | MLP(seq_len, [360, 360, 360], output_size), 78 | MLP(seq_len, [682, 682, 682], output_size), 79 | MLP(seq_len, [1200, 1200, 1200], output_size), 80 | MLP(seq_len, [2220, 2220, 2220], output_size, dropout_rate=0.3) 81 | ]) 82 | else: 83 | # Full mode focuses on selected architectures with dropout 84 | models.extend([ 85 | MLP(seq_len, [196, 196, 196], output_size, dropout_rate=0.3), 86 | MLP(seq_len, [360, 360, 360], output_size, dropout_rate=0.3), 87 | MLP(seq_len, [682, 682, 682], output_size, dropout_rate=0.3), 88 | MLP(seq_len, [1200, 1200, 1200], output_size, dropout_rate=0.3), 89 | MLP(seq_len, [2220, 2220, 2220], output_size, dropout_rate=0.3) 90 | ]) 91 | 92 | return models 93 | 94 | def load_data(data_path: str, val_samples: int = 100_000) -> tuple: 95 | """ 96 | Load and prepare validation data. 
97 | 98 | Args: 99 | data_path: Path to validation data file 100 | val_samples: Number of validation samples to use 101 | 102 | Returns: 103 | Tuple of (validation dataset, output size) 104 | """ 105 | val_data = pd.read_feather(data_path) 106 | val_data = val_data.sample(val_samples, random_state=42) 107 | 108 | X_val = torch.tensor(val_data.drop(columns=['target']).astype(int).values, dtype=torch.float32).to(DEVICE) 109 | Y_val = torch.tensor(pd.get_dummies(val_data['target']).astype(int).values, dtype=torch.float32).to(DEVICE) 110 | 111 | val_dataset = TensorDataset(X_val, Y_val) 112 | output_size = Y_val.shape[1] 113 | 114 | return val_dataset, output_size 115 | 116 | def train_models( 117 | seq_len: int, 118 | mode: str, 119 | data_dir: str, 120 | output_dir: str, 121 | learning_rate: float, 122 | batch_size: int, 123 | n_epochs: int, 124 | n_files_train: int, 125 | checkpoints: List[int], 126 | val_samples: int = 100_000 127 | ) -> None: 128 | """ 129 | Train models on the specified data. 130 | 131 | Args: 132 | seq_len: Length of input sequence 133 | mode: Either 'survey' for quick comparison or 'full' for complete training 134 | data_dir: Directory containing the data files 135 | output_dir: Directory to save model outputs 136 | learning_rate: Learning rate for optimization 137 | batch_size: Batch size for training 138 | n_epochs: Number of epochs to train 139 | n_files_train: Number of training files to use 140 | checkpoints: List of batch numbers at which to evaluate 141 | val_samples: Number of validation samples to use 142 | """ 143 | # Load validation data 144 | val_data_path = os.path.join(data_dir, f'val_extensive_2223_sq{seq_len}_rs42_0.feather') 145 | val_dataset, output_size = load_data(val_data_path, val_samples) 146 | val_loader = DataLoader(val_dataset, batch_size=batch_size*4, shuffle=False, drop_last=False) 147 | 148 | # Initialize models 149 | models = instantiate_models(seq_len, output_size, mode) 150 | 151 | # Training loop 152 | for model in models: 153 | model_name = model.__class__.__name__ 154 | model_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 155 | print(f"Training {model_name} with {model_params} parameters") 156 | 157 | model = model.to(DEVICE) 158 | criterion = nn.BCEWithLogitsLoss() 159 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 160 | 161 | best_loss = float('inf') 162 | batch_counter = 0 163 | 164 | for epoch in range(n_epochs): 165 | for train_set_id in tqdm(range(n_files_train), desc=f"Epoch {epoch + 1}/{n_epochs}"): 166 | # Load training data 167 | train_data_path = os.path.join( 168 | data_dir, 169 | f'train_extensive_1516_2122_sq{seq_len}_rs42_{train_set_id}.feather' 170 | ) 171 | train_data = pd.read_feather(train_data_path) 172 | 173 | X_train = torch.tensor(train_data.drop(columns=['target']).astype(int).values, dtype=torch.float32) 174 | Y_train = torch.tensor(pd.get_dummies(train_data['target']).astype(int).values, dtype=torch.float32) 175 | 176 | train_dataset = TensorDataset(X_train, Y_train) 177 | train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) 178 | 179 | # Training 180 | model.train() 181 | train_losses = [] 182 | 183 | for X_batch, Y_batch in train_loader: 184 | X_batch, Y_batch = X_batch.to(DEVICE), Y_batch.to(DEVICE) 185 | 186 | optimizer.zero_grad() 187 | Y_pred = model(X_batch) 188 | loss = criterion(Y_pred, Y_batch) 189 | loss.backward() 190 | optimizer.step() 191 | 192 | train_losses.append(loss.item()) 193 | batch_counter += 1 194 | 195 | 
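# Checkpointed evaluation: once the cumulative batch counter reaches one of the pre-set checkpoints,
# the model is scored on the validation loader, train/validation losses are appended to the tracker CSV,
# and (in 'full' mode) the weights are saved whenever the validation loss improves on the best seen so far.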
if batch_counter in checkpoints: 196 | # Validation 197 | model.eval() 198 | with torch.no_grad(): 199 | val_losses = [] 200 | for X_val, Y_val in val_loader: 201 | Y_pred = model(X_val) 202 | val_loss = criterion(Y_pred, Y_val) 203 | val_losses.append(val_loss.item()) 204 | 205 | val_loss = np.mean(val_losses) 206 | train_loss = np.mean(train_losses) 207 | 208 | # Log results 209 | tracker_path = os.path.join(output_dir, f'trackers/71{"12" if mode == "full" else "11"}_definitive.csv') 210 | log_line = f'{model_name},{seq_len},{model_params},{epoch},{batch_counter},{train_loss},{val_loss},{datetime.now()}\\n' 211 | write_to_tracker(tracker_path, log_line) 212 | 213 | # Save best model 214 | if val_loss < best_loss and mode == 'full': 215 | best_loss = val_loss 216 | model_path = os.path.join( 217 | output_dir, 218 | f'models/7112_{model_name}_{model_params}_{seq_len}_e{epoch}.pt' 219 | ) 220 | torch.save(model.state_dict(), model_path) 221 | 222 | model.train() 223 | train_losses = [] 224 | 225 | def main(): 226 | """Main function to run the training pipeline.""" 227 | import argparse 228 | 229 | parser = argparse.ArgumentParser(description="Train Tabular LEMs") 230 | parser.add_argument('--mode', type=str, choices=['survey', 'full'], required=True, 231 | help="Training mode: 'survey' for quick comparison or 'full' for complete training") 232 | parser.add_argument('--data_dir', type=str, required=True, 233 | help="Directory containing the data files") 234 | parser.add_argument('--output_dir', type=str, required=True, 235 | help="Directory to save model outputs") 236 | parser.add_argument('--seq_lengths', type=int, nargs='+', default=[1, 3, 5, 7, 9], 237 | help="Sequence lengths to process") 238 | 239 | args = parser.parse_args() 240 | 241 | # Set hyperparameters based on mode 242 | if args.mode == 'survey': 243 | config = { 244 | 'learning_rate': 0.01, 245 | 'batch_size': 1024, 246 | 'n_epochs': 1, 247 | 'n_files_train': 8, 248 | 'checkpoints': sorted([10 ** i for i in range(2, 10)] + [3 * (10 ** i) for i in range(2, 10)]) 249 | } 250 | else: 251 | config = { 252 | 'learning_rate': 0.001, 253 | 'batch_size': 1024, 254 | 'n_epochs': 4, 255 | 'n_files_train': 30, 256 | 'checkpoints': [i * (10 ** 4) for i in range(1, 100)] 257 | } 258 | 259 | # Create output directories 260 | os.makedirs(os.path.join(args.output_dir, 'models'), exist_ok=True) 261 | os.makedirs(os.path.join(args.output_dir, 'trackers'), exist_ok=True) 262 | 263 | # Train models for each sequence length 264 | for seq_len in args.seq_lengths: 265 | print(f"Training models for sequence length {seq_len}") 266 | train_models(seq_len, args.mode, args.data_dir, args.output_dir, **config) 267 | 268 | if __name__ == "__main__": 269 | main() -------------------------------------------------------------------------------- /0003 Benchmark Tabular LEMs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Benchmark Tabular LEMs (Large Events Models) 3 | 4 | This script performs comprehensive benchmarking of trained LEM models, including: 5 | 1. Model performance metrics (accuracy, F1-score) 6 | 2. Distribution analysis of predictions vs real data 7 | 3. Simulation analysis for game outcomes 8 | 4. Visualization of results 9 | 10 | The script generates various plots and metrics to evaluate model quality 11 | and compare different architectures. 
12 | """ 13 | 14 | import os 15 | import copy 16 | import numpy as np 17 | import pandas as pd 18 | import torch 19 | import torch.nn as nn 20 | from torch.utils.data import TensorDataset 21 | import matplotlib.pyplot as plt 22 | from typing import List, Dict, Tuple, Optional, Union 23 | from datetime import datetime 24 | from pathlib import Path 25 | from sklearn.metrics import accuracy_score, f1_score 26 | from tqdm import tqdm 27 | 28 | from lib.lem import MLP, LEMTokenizer, simulate_game, DEVICE 29 | 30 | class ModelBenchmarker: 31 | """Class to handle model benchmarking operations.""" 32 | 33 | def __init__(self, seq_len: int = 3, output_size: int = 101): 34 | """ 35 | Initialize the benchmarker. 36 | 37 | Args: 38 | seq_len: Length of input sequence 39 | output_size: Number of output classes 40 | """ 41 | self.seq_len = seq_len 42 | self.output_size = output_size 43 | self.tokenizer = LEMTokenizer() 44 | 45 | # Set plotting style 46 | plt.rcParams['font.family'] = ["Times New Roman"] 47 | plt.rcParams['figure.figsize'] = [12, 3.5] 48 | 49 | def load_models_for_testing(self, base_models: List[List[int]], model_dir: str) -> List[Dict]: 50 | """ 51 | Load models for testing. 52 | 53 | Args: 54 | base_models: List of hidden layer configurations 55 | model_dir: Directory containing model files 56 | 57 | Returns: 58 | List of dictionaries containing model information 59 | """ 60 | models_for_testing = [] 61 | 62 | for model_architecture in [ 63 | MLP(self.seq_len, hidden_sizes, self.output_size, dropout_rate=0.3) 64 | for hidden_sizes in base_models 65 | ]: 66 | n_params = sum(p.numel() for p in model_architecture.parameters()) 67 | for epoch in range(4): 68 | models_for_testing.append({ 69 | 'model': copy.deepcopy(model_architecture), 70 | 'n_params': n_params, 71 | 'dir': os.path.join(model_dir, f'7112_MLP_{n_params}_{self.seq_len}_e{epoch}.pt') 72 | }) 73 | 74 | return models_for_testing 75 | 76 | def load_validation_data(self, data_path: str) -> pd.DataFrame: 77 | """ 78 | Load validation data for benchmarking. 79 | 80 | Args: 81 | data_path: Path to validation data file 82 | 83 | Returns: 84 | DataFrame containing validation data 85 | """ 86 | return pd.read_feather(data_path) 87 | 88 | def get_target_distributions(self, raw_data_path: str) -> Dict[str, pd.Series]: 89 | """ 90 | Calculate target distributions from raw data. 
91 | 92 | Args: 93 | raw_data_path: Path to raw data file 94 | 95 | Returns: 96 | Dictionary containing various target distributions 97 | """ 98 | df = pd.read_feather(raw_data_path) 99 | 100 | # Calculate goal-related distributions 101 | goals_delta = df.groupby('match_id')['hg'].max() - df.groupby('match_id')['ag'].max() 102 | goals_delta = (goals_delta.clip(-5, 5).value_counts().sort_index() / goals_delta.value_counts().sum()) 103 | 104 | home_goals = (df.groupby('match_id')['hg'].max().clip(-5, 5).value_counts().sort_index() 105 | / df.groupby('match_id')['hg'].max().value_counts().sum()) 106 | 107 | away_goals = (df.groupby('match_id')['ag'].max().clip(-5, 5).value_counts().sort_index() 108 | / df.groupby('match_id')['ag'].max().value_counts().sum()) 109 | 110 | # Calculate event-related distributions 111 | event_type_dist = df['e'].value_counts() / len(df) 112 | event_type_dist = event_type_dist.sort_index() 113 | 114 | x_dist = df['x'].value_counts() / len(df) 115 | x_dist = x_dist.sort_index() 116 | 117 | y_dist = df['y'].value_counts() / len(df) 118 | y_dist = y_dist.sort_index() 119 | 120 | t_dist = df['t'].value_counts() / len(df) 121 | t_dist = t_dist.sort_index() 122 | 123 | return { 124 | 'goals_delta': goals_delta, 125 | 'home_goals': home_goals, 126 | 'away_goals': away_goals, 127 | 'event_type': event_type_dist, 128 | 'x': x_dist, 129 | 'y': y_dist, 130 | 't': t_dist 131 | } 132 | 133 | def run_simulations( 134 | self, 135 | models: List[Dict], 136 | target_distributions: Dict[str, pd.Series], 137 | n_sims: int = 10000 138 | ) -> pd.DataFrame: 139 | """ 140 | Run game simulations for each model and compare with target distributions. 141 | 142 | Args: 143 | models: List of model dictionaries 144 | target_distributions: Dictionary of target distributions 145 | n_sims: Number of simulations to run 146 | 147 | Returns: 148 | DataFrame containing simulation results 149 | """ 150 | results = [] 151 | base_tensor = torch.Tensor([ 152 | [self.tokenizer.UNK_TOKEN_ID] * 6 + [0] * 9 + [1, 0, 50, 50, 0, 1] + 153 | [self.tokenizer.UNK_TOKEN_ID] * 6 * (self.seq_len - 1) 154 | ]) 155 | context_tensor = base_tensor.repeat(n_sims, 1).to(DEVICE) 156 | 157 | for model_data in tqdm(models, desc="Running simulations"): 158 | if 'e3' not in model_data['dir']: # Only use final epoch models 159 | continue 160 | 161 | model = model_data['model'] 162 | model.load_state_dict(torch.load(model_data['dir'], weights_only=True)) 163 | model.eval() 164 | model.to(DEVICE) 165 | 166 | # Run simulation 167 | res_goals_delta, res_goals_home, res_goals_away, n_sims, inspect_e, inspect_x, inspect_y, inspect_t, \ 168 | inspect_uncertainty, inspect_shots, inspect_xg = simulate_game( 169 | model, context_tensor, max_sims=2500, return_type='results+inspect' 170 | ) 171 | 172 | # Calculate distribution differences 173 | diffs = self._calculate_distribution_differences( 174 | res_goals_delta, res_goals_home, res_goals_away, 175 | inspect_e, inspect_x, inspect_y, inspect_t, 176 | target_distributions 177 | ) 178 | 179 | # Calculate additional metrics 180 | uncertainties = [np.mean(np.concatenate(u)) for u in inspect_uncertainty] 181 | shots = [np.mean(s) for s in inspect_shots] 182 | xg = [np.mean(np.concatenate(x)) for x in inspect_xg] 183 | 184 | results.append([ 185 | model_data['dir'], 186 | *diffs, 187 | *uncertainties, 188 | *shots, 189 | *xg 190 | ]) 191 | 192 | columns = [ 193 | 'model_size', 194 | 'gdd', 'hgd', 'agd', 195 | 'eventdistdiff', 'x_dist', 'y_dist', 't_dist', 196 | 'uncertainty_h', 'uncertainty_e', 
'uncertainty_x', 'uncertainty_y', 'uncertainty_t', 'uncertainty_a', 197 | 'shots_h', 'shots_a', 'xg_h', 'xg_a' 198 | ] 199 | 200 | return pd.DataFrame(results, columns=columns) 201 | 202 | def _calculate_distribution_differences( 203 | self, 204 | goals_delta: np.ndarray, 205 | goals_home: np.ndarray, 206 | goals_away: np.ndarray, 207 | events: List[np.ndarray], 208 | x_coords: List[np.ndarray], 209 | y_coords: List[np.ndarray], 210 | times: List[np.ndarray], 211 | target_distributions: Dict[str, pd.Series] 212 | ) -> List[float]: 213 | """Calculate differences between simulated and target distributions.""" 214 | 215 | # Process goals distributions 216 | goal_delta_dist = pd.Series(goals_delta).clip( 217 | lower=target_distributions['goals_delta'].index.min(), 218 | upper=target_distributions['goals_delta'].index.max() 219 | ).value_counts(normalize=True).sort_index() 220 | 221 | home_goals_dist = pd.Series(goals_home).clip( 222 | lower=target_distributions['home_goals'].index.min(), 223 | upper=target_distributions['home_goals'].index.max() 224 | ).value_counts(normalize=True).sort_index() 225 | 226 | away_goals_dist = pd.Series(goals_away).clip( 227 | lower=target_distributions['away_goals'].index.min(), 228 | upper=target_distributions['away_goals'].index.max() 229 | ).value_counts(normalize=True).sort_index() 230 | 231 | # Process event distributions 232 | event_dist = self.tokenizer.decode_event_types(pd.Series(np.concatenate(events))) 233 | event_dist = event_dist.value_counts(normalize=True).sort_index() 234 | 235 | x_dist = pd.Series(np.concatenate(x_coords)).value_counts(normalize=True).sort_index() 236 | y_dist = pd.Series(np.concatenate(y_coords)).value_counts(normalize=True).sort_index() 237 | t_dist = pd.Series(np.concatenate(times)).value_counts(normalize=True).sort_index() 238 | 239 | # Calculate absolute differences 240 | return [ 241 | (target_distributions['goals_delta'] - goal_delta_dist).abs().sum(), 242 | (target_distributions['home_goals'] - home_goals_dist).abs().sum(), 243 | (target_distributions['away_goals'] - away_goals_dist).abs().sum(), 244 | (target_distributions['event_type'] - event_dist).abs().sum(), 245 | (target_distributions['x'] - x_dist).abs().sum(), 246 | (target_distributions['y'] - y_dist).abs().sum(), 247 | (target_distributions['t'] - t_dist).abs().sum() 248 | ] 249 | 250 | def plot_goal_difference_distribution( 251 | self, 252 | results: pd.DataFrame, 253 | target_distribution: pd.Series, 254 | output_path: str 255 | ) -> None: 256 | """ 257 | Plot goal difference distributions comparison. 
258 | 259 | Args: 260 | results: DataFrame containing simulation results 261 | target_distribution: Target goal difference distribution 262 | output_path: Path to save the plot 263 | """ 264 | x_values = range(-5, 6) 265 | 266 | plt.figure(figsize=(12, 3.5)) 267 | plt.bar(x_values, target_distribution, label='Real Distribution', zorder=-1, color='#003049') 268 | 269 | # Plot model distributions by size 270 | model_sizes = { 271 | '10M': '#ff99ac', 272 | '3M': '#fcbf49', 273 | '1M': '#eae2b7', 274 | '300k': '#f77f00', 275 | '100k': '#d62828' 276 | } 277 | 278 | for size_name, color in model_sizes.items(): 279 | size_results = results[results['model_size'].str.contains(size_name)] 280 | for i, row in enumerate(size_results.itertuples()): 281 | plt.bar( 282 | np.array(x_values) + 0.4 - (i + 0.5) * 0.8 / len(results), 283 | row.goal_dist, 284 | alpha=0.7, 285 | color=color, 286 | width=0.8 / len(results), 287 | label=f'{size_name} Model' 288 | ) 289 | 290 | plt.ylim(0, 0.5) 291 | plt.yticks([0, 0.1, 0.2, 0.3, 0.4, 0.5], fontsize=16) 292 | plt.ylabel('Probability', fontsize=20) 293 | plt.xlabel('Goal Difference', fontsize=20) 294 | plt.xticks(x_values, fontsize=16) 295 | 296 | handles, labels = plt.gca().get_legend_handles_labels() 297 | by_label = dict(zip(labels, handles)) 298 | plt.legend(by_label.values(), by_label.keys(), loc='upper left', fontsize=16) 299 | 300 | plt.box(False) 301 | plt.grid(True, which='both', axis='both', linestyle='--', linewidth=0.5, alpha=0.3) 302 | plt.tight_layout() 303 | 304 | plt.savefig(output_path, bbox_inches='tight') 305 | plt.close() 306 | 307 | def plot_expected_goals_distribution( 308 | self, 309 | models: List[Dict], 310 | validation_data: pd.DataFrame, 311 | competitions_data: pd.DataFrame, 312 | output_path: str 313 | ) -> None: 314 | """ 315 | Plot expected goals distribution comparison. 
316 | 317 | Args: 318 | models: List of model dictionaries 319 | validation_data: Validation dataset 320 | competitions_data: Real competition data for comparison 321 | output_path: Path to save the plot 322 | """ 323 | plt.figure(figsize=(12, 12)) 324 | 325 | # Filter shots data 326 | df_shots = validation_data[ 327 | validation_data['e'].isin([21, 22, 23]) & 328 | (validation_data['t'] != -1) & 329 | (validation_data['a'] == -1) 330 | ] 331 | 332 | # Get benchmark distribution 333 | sample_for_hist = competitions_data[ 334 | ~competitions_data.shot_body_part.isna() 335 | ].sample(df_shots.shape[0]).shot_xg 336 | 337 | # Plot for each model size 338 | for spid in range(1, 6): 339 | plt.subplot(5, 1, spid) 340 | plt.hist( 341 | sample_for_hist, 342 | bins=[i/100 for i in range(101)], 343 | histtype='step', 344 | label='Benchmark xG Distribution', 345 | linewidth=1, 346 | ls='--', 347 | color='black', 348 | zorder=-1 349 | ) 350 | 351 | plt.xlim(0, 1.5) 352 | plt.yscale('log') 353 | plt.box(False) 354 | plt.xticks([0, 0.2, 0.4, 0.6, 0.8, 1.0], fontsize=16) 355 | plt.yticks([1e0, 1e1, 1e2, 1e3], fontsize=16) 356 | 357 | if spid == 5: 358 | plt.ylabel('Number of Shots', fontsize=20) 359 | plt.xlabel('Expected Goals', fontsize=20) 360 | 361 | # Calculate benchmark histogram 362 | sample_hist, _ = np.histogram( 363 | sample_for_hist, 364 | bins=[i/100 for i in range(101)], 365 | density=True 366 | ) 367 | 368 | # Plot model predictions 369 | for model_data in models: 370 | model = model_data['model'] 371 | model.load_state_dict(torch.load(model_data['dir'], weights_only=True)) 372 | model = model.to(DEVICE) 373 | model.eval() 374 | 375 | pred_proba = model.predict_proba(df_shots.drop(columns=['target']))[:, 1] 376 | pred_hist, _ = np.histogram( 377 | pred_proba, 378 | bins=[i/100 for i in range(101)], 379 | density=True 380 | ) 381 | 382 | distance = round(abs(sample_hist - pred_hist).sum()) 383 | f1 = round( 384 | f1_score( 385 | df_shots['target'], 386 | model.predict(df_shots.drop(columns=['target'])), 387 | average="weighted" 388 | ), 389 | 3 390 | ) 391 | 392 | # Determine subplot and style based on model size 393 | if '104961' in model_data['dir']: 394 | plt.subplot(5, 1, 1) 395 | plt.ylabel('MLP 100k', fontsize=24) 396 | elif '310781' in model_data['dir']: 397 | plt.subplot(5, 1, 2) 398 | plt.ylabel('MLP 300k', fontsize=24) 399 | elif '1027875' in model_data['dir']: 400 | plt.subplot(5, 1, 3) 401 | plt.ylabel('MLP 1M', fontsize=24) 402 | elif '3051701' in model_data['dir']: 403 | plt.subplot(5, 1, 4) 404 | plt.ylabel('MLP 3M', fontsize=24) 405 | elif '10174361' in model_data['dir']: 406 | plt.subplot(5, 1, 5) 407 | plt.ylabel('MLP 10M', fontsize=24) 408 | 409 | # Plot with different styles based on epoch 410 | if '_e0' in model_data['dir']: 411 | plt.hist( 412 | pred_proba, 413 | bins=[i/100 for i in range(101)], 414 | alpha=0.3, 415 | histtype='step', 416 | label=f'Epoch 1 - D: {distance} - F1: {f1}', 417 | linewidth=3, 418 | color='#fcbf49' 419 | ) 420 | elif '_e1' in model_data['dir']: 421 | plt.hist( 422 | pred_proba, 423 | bins=[i/100 for i in range(101)], 424 | alpha=0.4, 425 | histtype='step', 426 | label=f'Epoch 2 - D: {distance} - F1: {f1}', 427 | linewidth=3, 428 | color='#f77f00' 429 | ) 430 | elif '_e2' in model_data['dir']: 431 | plt.hist( 432 | pred_proba, 433 | bins=[i/100 for i in range(101)], 434 | alpha=0.5, 435 | histtype='step', 436 | label=f'Epoch 3 - D: {distance} - F1: {f1}', 437 | linewidth=3, 438 | color='#d62828' 439 | ) 440 | elif '_e3' in model_data['dir']: 
441 | plt.hist( 442 | pred_proba, 443 | bins=[i/100 for i in range(101)], 444 | alpha=0.7, 445 | histtype='step', 446 | label=f'Epoch 4 - D: {distance} - F1: {f1}', 447 | linewidth=3, 448 | color='#003049' 449 | ) 450 | 451 | # Add legends 452 | for spid in range(1, 6): 453 | plt.subplot(5, 1, spid) 454 | plt.legend(loc='upper right', fontsize=14) 455 | 456 | plt.tight_layout() 457 | plt.savefig(output_path, bbox_inches='tight') 458 | plt.close() 459 | 460 | def main(): 461 | """Main function to run the benchmarking pipeline.""" 462 | import argparse 463 | 464 | parser = argparse.ArgumentParser(description="Benchmark Tabular LEMs") 465 | parser.add_argument('--data_dir', type=str, required=True, 466 | help="Directory containing the data files") 467 | parser.add_argument('--model_dir', type=str, required=True, 468 | help="Directory containing the model files") 469 | parser.add_argument('--output_dir', type=str, required=True, 470 | help="Directory to save benchmark results") 471 | parser.add_argument('--seq_len', type=int, default=3, 472 | help="Sequence length used in the models") 473 | parser.add_argument('--n_sims', type=int, default=10000, 474 | help="Number of simulations to run") 475 | 476 | args = parser.parse_args() 477 | 478 | # Initialize benchmarker 479 | benchmarker = ModelBenchmarker(seq_len=args.seq_len) 480 | 481 | # Define model architectures to test 482 | base_models = [ 483 | [196, 196, 196], 484 | [360, 360, 360], 485 | [682, 682, 682], 486 | [1200, 1200, 1200], 487 | [2220, 2220, 2220] 488 | ] 489 | 490 | # Load models 491 | models = benchmarker.load_models_for_testing(base_models, args.model_dir) 492 | 493 | # Load data 494 | val_data = benchmarker.load_validation_data( 495 | os.path.join(args.data_dir, f'tabular_lem/val_extensive_2223_sq{args.seq_len}_rs42_0.feather') 496 | ) 497 | target_distributions = benchmarker.get_target_distributions( 498 | os.path.join(args.data_dir, 'raw_lem/val_extensive_2223.feather') 499 | ) 500 | 501 | # Run simulations 502 | results = benchmarker.run_simulations(models, target_distributions, args.n_sims) 503 | 504 | # Save results 505 | os.makedirs(args.output_dir, exist_ok=True) 506 | results.to_csv(os.path.join(args.output_dir, '7120_sim_inspect.csv'), index=False) 507 | 508 | # Generate plots 509 | benchmarker.plot_goal_difference_distribution( 510 | results, 511 | target_distributions['goals_delta'], 512 | os.path.join(args.output_dir, '7120_sim_inspect_goals_delta.pdf') 513 | ) 514 | 515 | # Load competition data for xG plot 516 | competitions = pd.read_csv(os.path.join(args.data_dir, 'competitions.csv')) 517 | seasons = pd.read_csv(os.path.join(args.data_dir, 'seasons.csv')) 518 | selected_seasons = seasons[ 519 | seasons.competition_id.isin( 520 | competitions[ 521 | competitions.area_name.isin(['Germany', 'France', 'Spain', 'Portugal', 'Belgium', 'Denmark']) & 522 | competitions.division_level.isin([1, 2]) 523 | ].wy_id.tolist() 524 | ) & 525 | (seasons.name == '2022/2023') 526 | ] 527 | 528 | competition_events = [] 529 | for season_id in selected_seasons.wy_id: 530 | competition_events.append( 531 | pd.read_feather(os.path.join(args.data_dir, f'seasons/events/{season_id}.feather')) 532 | ) 533 | competition_events = pd.concat(competition_events) 534 | 535 | benchmarker.plot_expected_goals_distribution( 536 | models, 537 | val_data, 538 | competition_events, 539 | os.path.join(args.output_dir, '7120_sim_inspect_expected_goals.pdf') 540 | ) 541 | 542 | if __name__ == "__main__": 543 | main() 
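A minimal sketch of how the three scripts above chain together. The directory names below are placeholders rather than paths shipped with the repository, and the training script expects its input feathers to follow the train_extensive_*/val_extensive_* naming used by its loaders, so the preprocessing output may need to be renamed or relocated to match.

# Hypothetical end-to-end run (all paths illustrative):
#   python "0000 Preprocess Data to LEM.py" --data_dir data/wyscout --output_dir data/lem
#   python "0001 Train Tabular LEMs.py" --mode survey --data_dir data/lem/tabular_lem --output_dir out
#   python "0001 Train Tabular LEMs.py" --mode full --data_dir data/lem/tabular_lem --output_dir out
#   python "0003 Benchmark Tabular LEMs.py" --data_dir data/lem --model_dir out/models --output_dir out/benchmarks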
-------------------------------------------------------------------------------- /Older Versions/0011 Calculate Features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import json\n", 12 | "\n", 13 | "from lib.glob_fix import glob" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 10, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "FIELD_SIZE = {'x':1.05, 'y':0.68}" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 11, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/html": [ 33 | "
\n", 34 | "\n", 47 | "\n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 
| " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | "
idmatch_idmatch_namematch_winnerhome_team_idaway_team_idabsolute_secminutesecondperiod...yellowsecond_yellowthroughfairplaylostneutralwonaccuratenot_accurategoal_mouth_placement
01779591712499719Arsenal - Leicester City1609160916312.8021...NaNNaNNaNNaNNaNNaNNaN1.0NaNNaN
11779591722499719Arsenal - Leicester City1609160916314.9041...NaNNaNNaNNaNNaNNaNNaN1.0NaNNaN
21779591732499719Arsenal - Leicester City1609160916316.5061...NaNNaNNaNNaNNaNNaNNaN1.0NaNNaN
31779591742499719Arsenal - Leicester City1609160916318.1081...NaNNaNNaNNaNNaNNaNNaN1.0NaNNaN
41779591752499719Arsenal - Leicester City16091609163110.30101...NaNNaNNaNNaNNaNNaNNaN1.0NaNNaN
..................................................................
6431452515964092500098West Ham United - Everton1633163316232796.746362...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6431462515962322500098West Ham United - Everton1633163316232829.84792...NaNNaNNaNNaNNaNNaNNaN1.0NaNNaN
6431472515964102500098West Ham United - Everton1633163316232831.247112...NaNNaNNaNNaN1.0NaNNaNNaN1.0NaN
6431482515962342500098West Ham United - Everton1633163316232832.447122...NaNNaNNaNNaNNaNNaN1.01.0NaNNaN
6431492515962362500098West Ham United - Everton1633163316232834.147142...NaNNaNNaNNaNNaNNaNNaNNaN1.0otr
\n", 341 | "

643150 rows × 57 columns

\n", 342 | "
" 343 | ], 344 | "text/plain": [ 345 | " id match_id match_name match_winner \n", 346 | "0 177959171 2499719 Arsenal - Leicester City 1609 \\\n", 347 | "1 177959172 2499719 Arsenal - Leicester City 1609 \n", 348 | "2 177959173 2499719 Arsenal - Leicester City 1609 \n", 349 | "3 177959174 2499719 Arsenal - Leicester City 1609 \n", 350 | "4 177959175 2499719 Arsenal - Leicester City 1609 \n", 351 | "... ... ... ... ... \n", 352 | "643145 251596409 2500098 West Ham United - Everton 1633 \n", 353 | "643146 251596232 2500098 West Ham United - Everton 1633 \n", 354 | "643147 251596410 2500098 West Ham United - Everton 1633 \n", 355 | "643148 251596234 2500098 West Ham United - Everton 1633 \n", 356 | "643149 251596236 2500098 West Ham United - Everton 1633 \n", 357 | "\n", 358 | " home_team_id away_team_id absolute_sec minute second period ... \n", 359 | "0 1609 1631 2.8 0 2 1 ... \\\n", 360 | "1 1609 1631 4.9 0 4 1 ... \n", 361 | "2 1609 1631 6.5 0 6 1 ... \n", 362 | "3 1609 1631 8.1 0 8 1 ... \n", 363 | "4 1609 1631 10.3 0 10 1 ... \n", 364 | "... ... ... ... ... ... ... ... \n", 365 | "643145 1633 1623 2796.7 46 36 2 ... \n", 366 | "643146 1633 1623 2829.8 47 9 2 ... \n", 367 | "643147 1633 1623 2831.2 47 11 2 ... \n", 368 | "643148 1633 1623 2832.4 47 12 2 ... \n", 369 | "643149 1633 1623 2834.1 47 14 2 ... \n", 370 | "\n", 371 | " yellow second_yellow through fairplay lost neutral won accurate \n", 372 | "0 NaN NaN NaN NaN NaN NaN NaN 1.0 \\\n", 373 | "1 NaN NaN NaN NaN NaN NaN NaN 1.0 \n", 374 | "2 NaN NaN NaN NaN NaN NaN NaN 1.0 \n", 375 | "3 NaN NaN NaN NaN NaN NaN NaN 1.0 \n", 376 | "4 NaN NaN NaN NaN NaN NaN NaN 1.0 \n", 377 | "... ... ... ... ... ... ... ... ... \n", 378 | "643145 NaN NaN NaN NaN NaN NaN NaN NaN \n", 379 | "643146 NaN NaN NaN NaN NaN NaN NaN 1.0 \n", 380 | "643147 NaN NaN NaN NaN 1.0 NaN NaN NaN \n", 381 | "643148 NaN NaN NaN NaN NaN NaN 1.0 1.0 \n", 382 | "643149 NaN NaN NaN NaN NaN NaN NaN NaN \n", 383 | "\n", 384 | " not_accurate goal_mouth_placement \n", 385 | "0 NaN NaN \n", 386 | "1 NaN NaN \n", 387 | "2 NaN NaN \n", 388 | "3 NaN NaN \n", 389 | "4 NaN NaN \n", 390 | "... ... ... 
\n", 391 | "643145 NaN NaN \n", 392 | "643146 NaN NaN \n", 393 | "643147 1.0 NaN \n", 394 | "643148 NaN NaN \n", 395 | "643149 1.0 otr \n", 396 | "\n", 397 | "[643150 rows x 57 columns]" 398 | ] 399 | }, 400 | "execution_count": 11, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "df = pd.read_csv('data\\wyscout\\csv\\events\\England.csv')\n", 407 | "df" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 12, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "subevent_type_map = {\n", 417 | " 'air_duel': 1,\n", 418 | " 'ground_attacking_duel': 2,\n", 419 | " 'ground_defending_duel': 3,\n", 420 | " 'ground_loose_ball_duel': 4,\n", 421 | " 'foul': 5,\n", 422 | " 'hand_foul': 6,\n", 423 | " 'late_card_foul': 7,\n", 424 | " 'out_of_game_foul': 8,\n", 425 | " 'protest': 9,\n", 426 | " 'simulation': 10,\n", 427 | " 'time_lost_foul': 11,\n", 428 | " 'violent_foul': 12,\n", 429 | " 'corner': 13,\n", 430 | " 'free_kick': 14,\n", 431 | " 'free_kick_cross': 15,\n", 432 | " 'goal_kick': 16,\n", 433 | " 'penalty': 17,\n", 434 | " 'throw_in': 18,\n", 435 | " 'goalkeeper_leaving_line': 19,\n", 436 | " 'acceleration': 20,\n", 437 | " 'clearance': 21,\n", 438 | " 'touch': 22,\n", 439 | " 'cross': 23,\n", 440 | " 'hand_pass': 24,\n", 441 | " 'head_pass': 25,\n", 442 | " 'high_pass': 26,\n", 443 | " 'launch': 27,\n", 444 | " 'simple_pass': 28,\n", 445 | " 'smart_pass': 29,\n", 446 | " 'reflexes': 30,\n", 447 | " 'save_attempt': 31,\n", 448 | " 'free_kick_shot': 32,\n", 449 | " 'shot': 33,\n", 450 | "}\n", 451 | "\n", 452 | "event_type_map = {\n", 453 | " 'duel': 1,\n", 454 | " 'foul': 2,\n", 455 | " 'free_kick': 3,\n", 456 | " 'goalkeeper_leaving_line': 4,\n", 457 | " 'offside': 5,\n", 458 | " 'others_on_the_ball': 6,\n", 459 | " 'pass': 7,\n", 460 | " 'interruption': 8,\n", 461 | " 'save_attempt': 9,\n", 462 | " 'shot': 10,\n", 463 | "}\n", 464 | "\n", 465 | "df['subtype_id'] = df['subtype_name'].map(subevent_type_map)\n", 466 | "df['type_id'] = df['type_name'].map(event_type_map)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 13, 472 | "metadata": {}, 473 | "outputs": [ 474 | { 475 | "data": { 476 | "text/plain": [ 477 | "type_name subtype_name subtype_id\n", 478 | "pass simple_pass 28.0 251405\n", 479 | "duel ground_attacking_duel 2.0 53859\n", 480 | " ground_defending_duel 3.0 53737\n", 481 | " air_duel 1.0 37760\n", 482 | "others_on_the_ball touch 22.0 34409\n", 483 | "duel ground_loose_ball_duel 4.0 31332\n", 484 | "interruption ball_out_of_the_field 0.0 27331\n", 485 | "pass high_pass 26.0 25067\n", 486 | " head_pass 25.0 21332\n", 487 | "free_kick throw_in 18.0 17050\n", 488 | "pass cross 23.0 12251\n", 489 | "others_on_the_ball clearance 21.0 11784\n", 490 | "pass launch 27.0 10247\n", 491 | "shot shot 33.0 8451\n", 492 | "foul foul 5.0 7522\n", 493 | "free_kick free_kick 14.0 7279\n", 494 | " goal_kick 16.0 6061\n", 495 | "pass smart_pass 29.0 5881\n", 496 | "others_on_the_ball acceleration 20.0 4892\n", 497 | "free_kick corner 13.0 3910\n", 498 | "pass hand_pass 24.0 2474\n", 499 | "save_attempt reflexes 30.0 2124\n", 500 | "free_kick free_kick_cross 15.0 1693\n", 501 | "offside 0 0.0 1558\n", 502 | "goalkeeper_leaving_line goalkeeper_leaving_line 19.0 1266\n", 503 | "save_attempt save_attempt 31.0 1225\n", 504 | "free_kick free_kick_shot 32.0 350\n", 505 | "foul hand_foul 6.0 275\n", 506 | "interruption whistle 0.0 204\n", 507 | "foul out_of_game_foul 8.0 97\n", 508 | " 
protest 9.0 84\n", 509 | "free_kick penalty 17.0 80\n", 510 | "foul late_card_foul 7.0 70\n", 511 | " time_lost_foul 11.0 48\n", 512 | " simulation 10.0 28\n", 513 | " violent_foul 12.0 14\n", 514 | "Name: count, dtype: int64" 515 | ] 516 | }, 517 | "execution_count": 13, 518 | "metadata": {}, 519 | "output_type": "execute_result" 520 | } 521 | ], 522 | "source": [ 523 | "df.fillna(0)[['type_name', 'subtype_name', 'subtype_id']].value_counts()" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 14, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "# A possession starts with a pass and ends when a successful pass from the opponent is made\n", 533 | "# or when the ball goes out of play\n", 534 | "start_new_possession = (((df['type_name'] == 'pass') * df['accurate'] + (df['type_name'] == 'free_kick')) * df.team_id).replace(0, np.NaN).fillna(method='ffill')\n", 535 | "start_new_possession = (start_new_possession != start_new_possession.shift(1)).cumsum()\n", 536 | "start_new_possession = start_new_possession + ((df['type_name'] == 'interruption') | (df['type_name'] == 'foul')).shift(1).fillna(0).cumsum()\n", 537 | "df['possession_id'] = start_new_possession\n", 538 | "df['possession_type_name'] = (df['possession_id'].diff(1).fillna(1) * df['type_name']).replace('', np.NaN).fillna(method='ffill')\n", 539 | "df['possession_type_id'] = df['possession_type_name'].map(event_type_map)\n", 540 | "df['possession_team_id'] = (df['possession_id'].diff(1).fillna(1) * df['team_id']).replace(0, np.NaN).fillna(method='ffill')\n", 541 | "df['possession_start_time'] = (df['possession_id'].diff(1).fillna(1) * df['absolute_sec']).replace(0, np.NaN).fillna(method='ffill')" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 15, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "for i in range(1, 3):\n", 551 | " df[f'previous_action_type_id_{i}'] = df['type_id'].shift(i)\n", 552 | " df[f'previous_action_is_same_team_{i}'] = (df['team_id'] == df['team_id'].shift(i)).astype(int)\n", 553 | " df[f'previous_action_is_same_possession_{i}'] = (df['possession_id'] == df['possession_id'].shift(i)).astype(int)\n", 554 | " df[f'previous_action_is_same_player_{i}'] = (df['player_id'] == df['player_id'].shift(i)).astype(int)\n", 555 | " df[f'previous_action_x_{i}'] = abs((100 * (1-df[f'previous_action_is_same_team_{i}'])) - df['x'].shift(i))\n", 556 | " df[f'previous_action_y_{i}'] = abs((100 * (1-df[f'previous_action_is_same_team_{i}'])) - df['y'].shift(i))\n", 557 | " df[f'previous_action_time_since_{i}'] = df['absolute_sec'] - df['absolute_sec'].shift(i)\n", 558 | " df[f'previous_action_x_displacement_{i}'] = df['x'] - df[f'previous_action_x_{i}']\n", 559 | "\n", 560 | "df['possession_start_is_same_team'] = (df['possession_team_id'] == df['team_id']).astype(int)\n", 561 | "df['possession_start_action_x'] = (df['possession_id'].diff(1).fillna(1) * df['x']).replace(0, np.NaN).fillna(method='ffill')\n", 562 | "df['possession_start_action_y'] = (df['possession_id'].diff(1).fillna(1) * df['y']).replace(0, np.NaN).fillna(method='ffill')\n", 563 | "df['possession_start_time_since'] = df['absolute_sec'] - df['possession_start_time']\n", 564 | "df['possession_start_x_displacement'] = df['x'] - df['possession_start_action_x']" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 16, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "df['start_distance_to_goal'] = np.sqrt(((df['x'] - 100) * 
FIELD_SIZE['x'])**2 + ((df['y'] - 50) * FIELD_SIZE['y'])**2)\n", 574 | "df['start_angle_to_goal'] = abs(np.arctan2((df['y'] - 50) * FIELD_SIZE['y'], (df['x'] - 100) * FIELD_SIZE['x']))\n", 575 | "df['end_distance_to_goal'] = np.sqrt(((df['end_x'] - 100) * FIELD_SIZE['x'])**2 + ((df['end_y'] - 50) * FIELD_SIZE['y'])**2)\n", 576 | "df['end_angle_to_goal'] = abs(np.arctan2((df['end_y'] - 50) * FIELD_SIZE['y'], (df['end_x'] - 100) * FIELD_SIZE['x']))\n", 577 | "\n", 578 | "df['intent_progressive'] = ((df['type_name'] == 'pass') * (df['end_distance_to_goal'] < df['start_distance_to_goal'])).astype(int)" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": null, 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [ 587 | "df['shot_assist'] = (((df['type_name'].isin(['pass', 'free_kick']) & (df['accurate'] == 1)) & ((df['type_name'].shift(1) == 'shot') | (df['type_name'].shift(2) == 'shot'))).diff() < 0).shift(-1).fillna(0).astype(int)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "df['goal'] = df['goal'].fillna(0)\n", 597 | "\n", 598 | "actions_before_goal = None\n", 599 | "actions_before_own_goal = None\n", 600 | "for i in range(10):\n", 601 | " if actions_before_goal is None:\n", 602 | " actions_before_goal = df.goal.shift(-(i))\n", 603 | " actions_before_own_goal = -df.own_goal.shift(-(i))\n", 604 | " else:\n", 605 | " actions_before_goal += df.goal.shift(-(i))\n", 606 | " actions_before_own_goal -= df.own_goal.shift(-(i))\n", 607 | "actions_before_goal = actions_before_goal.fillna(0)\n", 608 | "actions_before_own_goal = actions_before_own_goal.fillna(0)\n", 609 | "\n", 610 | "is_same_period = (df.goal * df.period).replace(to_replace=False, method='bfill') == df.period\n", 611 | "is_same_game = (df.goal * df.match_id).replace(to_replace=False, method='bfill') == df.match_id\n", 612 | "is_team_next_goal = 2 * ((df.goal * df.team_id).replace(to_replace=False, method='bfill') == df.team_id) - 1\n", 613 | "is_team_next_goal *= actions_before_own_goal\n", 614 | "\n", 615 | "df['vaep_label_0'] = actions_before_goal * is_same_period * is_same_game * is_team_next_goal\n", 616 | "df['vaep_label_0_scoring'] = df['vaep_label_0'].clip(0, 1)\n", 617 | "df['vaep_label_0_conceding'] = abs(df['vaep_label_0'].clip(-1, 0))" 618 | ] 619 | } 620 | ], 621 | "metadata": { 622 | "kernelspec": { 623 | "display_name": "Python 3", 624 | "language": "python", 625 | "name": "python3" 626 | }, 627 | "language_info": { 628 | "codemirror_mode": { 629 | "name": "ipython", 630 | "version": 3 631 | }, 632 | "file_extension": ".py", 633 | "mimetype": "text/x-python", 634 | "name": "python", 635 | "nbconvert_exporter": "python", 636 | "pygments_lexer": "ipython3", 637 | "version": "3.10.9" 638 | }, 639 | "orig_nbformat": 4 640 | }, 641 | "nbformat": 4, 642 | "nbformat_minor": 2 643 | } 644 | -------------------------------------------------------------------------------- /Older Versions/0100 Basic LEMs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nvsclub/LargeEventsModel/fa654f556fbe02eb60e6fd7132b25bc3788c9772/Older Versions/0100 Basic LEMs.md -------------------------------------------------------------------------------- /Older Versions/0111 LEM Train Model Type.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | 
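The last cells of the feature-calculation notebook above build VAEP-style targets: for every action they look ahead up to 10 events and record whether the acting team scores (+1) or concedes (-1) the next goal, restricted to the same match and period. The vectorized shift/bfill version above is terse, so the following loop-based sketch shows the same idea on a toy frame; it is an illustration only, not repo code, and it ignores the own-goal handling and the fact that the notebook counts every goal inside the window.

```python
# Illustration of the vaep_label_0 idea (assumption: simplified, no own goals).
import pandas as pd

def next_goal_label(df: pd.DataFrame, window: int = 10) -> pd.Series:
    labels = []
    for i in range(len(df)):
        look_ahead = df.iloc[i : i + window]
        # stay inside the same match and period
        look_ahead = look_ahead[
            (look_ahead.match_id == df.match_id.iloc[i])
            & (look_ahead.period == df.period.iloc[i])
        ]
        goals = look_ahead[look_ahead.goal == 1]
        if goals.empty:
            labels.append(0)
        elif goals.team_id.iloc[0] == df.team_id.iloc[i]:
            labels.append(1)   # acting team scores the next goal
        else:
            labels.append(-1)  # acting team concedes the next goal
    return pd.Series(labels, index=df.index)

toy = pd.DataFrame({
    "match_id": [1, 1, 1, 1, 1],
    "period":   [1, 1, 1, 1, 1],
    "team_id":  [10, 10, 20, 10, 10],
    "goal":     [0, 0, 0, 0, 1],
})
print(next_goal_label(toy).tolist())  # [1, 1, -1, 1, 1]
```

`vaep_label_0_scoring` and `vaep_label_0_conceding` in the notebook are then just the positive and negative parts of this label, clipped to [0, 1].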
"execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from lib.data_utils import *\n", 11 | "from lib.model_utils import *\n", 12 | "from tqdm import tqdm\n", 13 | "import numpy as np\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "import time\n", 16 | "\n", 17 | "import torch\n", 18 | "import torch.optim as optim\n", 19 | "from torch.utils.data import DataLoader, TensorDataset\n", 20 | "\n", 21 | "from sklearn.metrics import log_loss\n", 22 | "\n", 23 | "import optuna" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "MODEL_TYPE = 'TYPE'\n", 33 | "MODEL_NAME = f'LEMv3_MODEL_{MODEL_TYPE}_TORCH'" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "device = torch.device(\"cpu\")" 43 | ] 44 | }, 45 | { 46 | "attachments": {}, 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# Loading and Preprocessing Data" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "df_train, df_train_y, df_optimization, df_optimization_y, df_test, df_test_y, complete_feature_set, features_model = load_model_training_data_template(train_sets = ['data/wyscout/csv/events/Italy.csv', 'data/wyscout/csv/events/Germany.csv', 'data/wyscout/csv/events/France.csv'], optimization_sets = ['data/wyscout/csv/events/Italy.csv',], test_sets = ['data/wyscout/csv/events/Spain.csv', 'data/wyscout/csv/events/England.csv'])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "['next_action_type_1', 'next_action_type_2', 'next_action_type_3', 'next_action_type_4', 'next_action_type_5', 'next_action_type_6', 'next_action_type_7', 'next_action_type_8', 'next_action_type_9', 'next_action_type_10', 'next_action_type_11', 'next_action_type_12', 'next_action_type_13', 'next_action_type_14', 'next_action_type_15', 'next_action_type_16', 'next_action_type_17', 'next_action_type_18', 'next_action_type_19', 'next_action_type_20', 'next_action_type_21', 'next_action_type_22', 'next_action_type_23', 'next_action_type_24', 'next_action_type_25', 'next_action_type_26', 'next_action_type_27', 'next_action_type_28', 'next_action_type_29', 'next_action_type_30', 'next_action_type_31', 'next_action_type_32', 'next_action_type_33']\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "print(list(df_train_y[MODEL_TYPE].columns))" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 6, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "['subtype_id_1', 'subtype_id_2', 'subtype_id_3', 'subtype_id_4', 'subtype_id_5', 'subtype_id_6', 'subtype_id_7', 'subtype_id_8', 'subtype_id_9', 'subtype_id_10', 'subtype_id_11', 'subtype_id_12', 'subtype_id_13', 'subtype_id_14', 'subtype_id_15', 'subtype_id_16', 'subtype_id_17', 'subtype_id_18', 'subtype_id_19', 'subtype_id_20', 'subtype_id_21', 'subtype_id_22', 'subtype_id_23', 'subtype_id_24', 'subtype_id_25', 'subtype_id_26', 'subtype_id_27', 'subtype_id_28', 'subtype_id_29', 'subtype_id_30', 'subtype_id_31', 'subtype_id_32', 'subtype_id_33', 'period', 'minute', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score']\n" 89 | ] 90 | } 91 | ], 
92 | "source": [ 93 | "features = features_model[MODEL_TYPE]\n", 94 | "print(features)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "X_train = df_train[features].astype(float).values\n", 104 | "x_optimization = df_optimization[features].astype(float).values\n", 105 | "X_test = df_test[features].astype(float).values\n", 106 | "\n", 107 | "Y_train = df_train_y[MODEL_TYPE].astype(float).values\n", 108 | "Y_optimization = df_optimization_y[MODEL_TYPE].astype(float).values\n", 109 | "Y_test = df_test_y[MODEL_TYPE].astype(float).values\n", 110 | "\n", 111 | "# Convert numpy arrays to PyTorch tensors\n", 112 | "X_train_tensor = torch.tensor(X_train, dtype=torch.float32)\n", 113 | "X_optimization_tensor = torch.tensor(x_optimization, dtype=torch.float32)\n", 114 | "X_test_tensor = torch.tensor(X_test, dtype=torch.float32)\n", 115 | "\n", 116 | "Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32)\n", 117 | "Y_optimization_tensor = torch.tensor(Y_optimization, dtype=torch.float32)\n", 118 | "Y_test_tensor = torch.tensor(Y_test, dtype=torch.float32)\n", 119 | "\n", 120 | "# Create datasets\n", 121 | "train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)\n", 122 | "optimization_dataset = TensorDataset(X_optimization_tensor, Y_optimization_tensor)\n", 123 | "test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)\n", 124 | "\n", 125 | "# Create dataloaders\n", 126 | "train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)\n", 127 | "optimization_dataloader = DataLoader(optimization_dataset, batch_size=32, shuffle=False)\n", 128 | "test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)\n", 129 | "\n", 130 | "input_size = X_train.shape[1]\n", 131 | "output_size = Y_train.shape[1]" 132 | ] 133 | }, 134 | { 135 | "attachments": {}, 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "# Tunning Model" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 8, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "ENABLE_TUNING = False\n", 149 | "TUNNING_COMPLEXITY_PENALTY = 0.001\n", 150 | "TUNNING_TRAIN_TEST_SPLIT = 0.7\n", 151 | "TUNNING_N_TRIALS = 40" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 9, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "if ENABLE_TUNING:\n", 161 | " study = optuna.create_study(direction=\"minimize\")\n", 162 | " study.optimize(lambda trial: objective(trial, X_optimization_tensor, Y_optimization_tensor, model_name=MODEL_NAME, train_test_split=TUNNING_TRAIN_TEST_SPLIT, complexity_penalty=TUNNING_COMPLEXITY_PENALTY), n_trials=TUNNING_N_TRIALS)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 10, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "if ENABLE_TUNING:\n", 172 | " trial = study.best_trial\n", 173 | " print(trial.value, trial.params, trial.datetime_start, trial.datetime_complete)\n", 174 | " \n", 175 | " model = torch.load(f'models/lem/optuna_trials/{MODEL_NAME}_{trial.number}.pt')\n", 176 | " test_log_loss = evaluate_log_loss(model, optimization_dataloader, device)\n", 177 | " print(f'Test Log Loss: {test_log_loss:.4f}')\n", 178 | "\n", 179 | " plt.rcParams[\"figure.figsize\"] = (20, 5)\n", 180 | " plt.subplot(121)\n", 181 | " probabilities = predict(model, X_optimization_tensor, device)\n", 182 | " plt.hist(probabilities, bins=50);\n", 183 | " plt.subplot(122)\n", 
184 | " plt.hist(probabilities[:,1], bins=50, color='C1')\n", 185 | " plt.yscale('log');" 186 | ] 187 | }, 188 | { 189 | "attachments": {}, 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "# Train model" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 11, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "Epoch: 1/100.. Training loss: 0.0722.. Test loss: 0.0701.. Test Log Loss: 1.5525\n", 206 | "Epoch: 2/100.. Training loss: 0.0693.. Test loss: 0.0693.. Test Log Loss: 1.5286\n", 207 | "Epoch: 3/100.. Training loss: 0.0686.. Test loss: 0.0690.. Test Log Loss: 1.5194\n", 208 | "Epoch: 4/100.. Training loss: 0.0684.. Test loss: 0.0687.. Test Log Loss: 1.5132\n", 209 | "Epoch: 5/100.. Training loss: 0.0682.. Test loss: 0.0687.. Test Log Loss: 1.5113\n", 210 | "Epoch: 6/100.. Training loss: 0.0681.. Test loss: 0.0687.. Test Log Loss: 1.5116\n", 211 | "Epoch: 7/100.. Training loss: 0.0680.. Test loss: 0.0686.. Test Log Loss: 1.5082\n", 212 | "Epoch: 8/100.. Training loss: 0.0680.. Test loss: 0.0685.. Test Log Loss: 1.5052\n", 213 | "Epoch: 9/100.. Training loss: 0.0679.. Test loss: 0.0684.. Test Log Loss: 1.5044\n", 214 | "Epoch: 10/100.. Training loss: 0.0679.. Test loss: 0.0685.. Test Log Loss: 1.5049\n", 215 | "Epoch: 11/100.. Training loss: 0.0679.. Test loss: 0.0684.. Test Log Loss: 1.5037\n", 216 | "Epoch: 12/100.. Training loss: 0.0678.. Test loss: 0.0684.. Test Log Loss: 1.5048\n", 217 | "Epoch: 13/100.. Training loss: 0.0678.. Test loss: 0.0685.. Test Log Loss: 1.5056\n", 218 | "Epoch: 14/100.. Training loss: 0.0678.. Test loss: 0.0684.. Test Log Loss: 1.5032\n", 219 | "Epoch: 15/100.. Training loss: 0.0678.. Test loss: 0.0683.. Test Log Loss: 1.5023\n", 220 | "Epoch: 16/100.. Training loss: 0.0678.. Test loss: 0.0684.. Test Log Loss: 1.5042\n", 221 | "Epoch: 17/100.. Training loss: 0.0678.. Test loss: 0.0683.. Test Log Loss: 1.5020\n", 222 | "Epoch: 18/100.. Training loss: 0.0678.. Test loss: 0.0683.. Test Log Loss: 1.5017\n", 223 | "Epoch: 19/100.. Training loss: 0.0678.. Test loss: 0.0684.. Test Log Loss: 1.5040\n", 224 | "Epoch: 20/100.. Training loss: 0.0677.. Test loss: 0.0683.. Test Log Loss: 1.5019\n", 225 | "Epoch: 21/100.. Training loss: 0.0677.. Test loss: 0.0683.. Test Log Loss: 1.5013\n", 226 | "Epoch: 22/100.. Training loss: 0.0677.. Test loss: 0.0684.. Test Log Loss: 1.5025\n", 227 | "Epoch: 23/100.. Training loss: 0.0677.. Test loss: 0.0683.. Test Log Loss: 1.5023\n", 228 | "Epoch: 24/100.. Training loss: 0.0677.. Test loss: 0.0683.. Test Log Loss: 1.5012\n", 229 | "Epoch: 25/100.. Training loss: 0.0677.. Test loss: 0.0683.. Test Log Loss: 1.5017\n", 230 | "Epoch: 26/100.. Training loss: 0.0677.. Test loss: 0.0683.. Test Log Loss: 1.5009\n", 231 | "Epoch: 27/100.. Training loss: 0.0677.. Test loss: 0.0683.. Test Log Loss: 1.5024\n", 232 | "Epoch: 28/100.. Training loss: 0.0677.. Test loss: 0.0683.. Test Log Loss: 1.5021\n", 233 | "Epoch: 29/100.. Training loss: 0.0677.. Test loss: 0.0683.. 
Test Log Loss: 1.5022\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "model = MultiLayerBinaryClassifier(input_size, [256], output_size, activation='sigmoid').to(device)\n", 239 | "learning_rate = 0.001\n", 240 | "num_epochs = 100\n", 241 | "patience = 3\n", 242 | "counter = 0\n", 243 | "best_val_loss = 1000\n", 244 | "\n", 245 | "optimizer = optim.Adam(model.parameters(), lr=learning_rate)\n", 246 | "criterion = nn.BCELoss()\n", 247 | "\n", 248 | "for epoch in range(num_epochs):\n", 249 | " train_loss = train(model, train_dataloader, criterion, optimizer, device)\n", 250 | " test_loss = evaluate(model, test_dataloader, criterion, device)\n", 251 | " test_log_loss = evaluate_log_loss(model, test_dataloader, device)\n", 252 | " print(f'Epoch: {epoch+1}/{num_epochs}.. Training loss: {train_loss:.4f}.. Test loss: {test_loss:.4f}.. Test Log Loss: {test_log_loss:.4f}')\n", 253 | "\n", 254 | " if test_log_loss < best_val_loss:\n", 255 | " best_val_loss = test_log_loss\n", 256 | " counter = 0\n", 257 | " torch.save(model, f'models/lem/{MODEL_NAME}.pth')\n", 258 | " else:\n", 259 | " counter += 1\n", 260 | " if counter >= patience:\n", 261 | " break" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 12, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAA2kAAAJGCAYAAADBBc3xAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAqyUlEQVR4nO3dfXBW9Z3w4W8ASXQVlFLCy8aitApWBISFRmXUnSi1li6z05XRrvKwvqwVdqwZreALwapgO8LS2Y1lRBGdWQT1UbctDGqjPK41HVdeZnUWcRUpVE2QcSUQbQLJef6wpkYCcgeS/JJc18yZkZPfyfne4RjzyUmOeVmWZQEAAEASenT0AAAAAPyZSAMAAEiISAMAAEiISAMAAEiISAMAAEiISAMAAEiISAMAAEiISAMAAEiISAMAAEiISAMAAEhIp4q0F198MSZPnhyDBw+OvLy8ePrpp3N+H1mWxb333hunnHJK5Ofnx5AhQ+Luu+8+8sMCAAC0Qq+OHiAXtbW1MWrUqPiHf/iH+Nu//dtWvY/rr78+nn322bj33ntj5MiR8eGHH8aHH354hCcFAABonbwsy7KOHqI18vLy4qmnnoopU6Y07aurq4tbb701Hn300fjoo4/i9NNPj5/+9Kdx3nnnRUTEpk2b4owzzojXX389Tj311I4ZHAAA4CA61Y87fpmZM2dGZWVlrFixIv7rv/4r/u7v/i6+/e1vx//8z/9ERMSvfvWrOPnkk+PXv/51nHTSSTF06NC46qqr3EkDAACS0WUibdu2bfHQQw/F448/HhMnToxhw4bFjTfeGOecc0489NBDERGxZcuW+P3vfx+PP/54PPLII7Fs2bJYt25dfP/73+/g6QEAAD7VqX4n7WBee+21aGhoiFNOOaXZ/rq6uvjKV74SERGNjY1RV1cXjzzySNO6Bx98MMaOHRubN2/2I5AAAECH6zKRtmfPnujZs2esW7cuevbs2extxx57bEREDBo0KHr16tUs5EaMGBERn96JE2kAAEBH6zKRNmbMmGhoaIgdO3bExIkTW1xz9tlnx759++Ltt9+OYcOGRUTEm2++GRERX/va19ptVgAAgAPpVE933LNnT7z11lsR8WmULVy4MM4///zo169fnHjiifH3f//38dvf/jYWLFgQY8aMiQ8++CAqKirijDPOiIsvvjgaGxvjr/7qr+LYY4+NRYsWRWNjY8yYMSP69OkTzz77bAe/OgAAgE4WaWvXro3zzz9/v/3Tpk2LZcuWxd69e+Ouu+6KRx55JN59993o379/fOtb34o77rgjRo4cGRER7733XvzTP/1TPPvss/EXf/EXcdFFF8WCBQuiX79+7f1yAAAA9tOpIg0AAKCr6zKP4AcAAOgKRBoAAEBCOsXTHRsbG+O9996L4447LvLy8jp6HAAAgJxkWRa7d++OwYMHR48eB79X1iki7b333ouioqKOHgMAAOCwbN++Pf7yL//yoGs6RaQdd9xxEfHpC+rTp08HTwMAAJCbmpqaKCoqamqbg+kUkfbZjzj26dNHpAEAAJ3Wofz6lgeHAAAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkAQAAJESkHUF/mPUfMXfu3Kh4flgMnbUqYm7fjh4JAADoZEQaAABAQnKOtBdffDEmT54cgwcPjry8vHj66acPuv7JJ5+MCy64IL761a9Gnz59ori4OJ555pnWztslDHxh40HfNnTWqvYbBgAASErOkVZbWxujRo2K8vLyQ1r/4osvxgUXXBCrV6+OdevWxfnnnx+TJ0+ODRs25DxsZ1V+7fMdPQIAANBJ9Mr1gIsuuiguuuiiQ16/aNGiZn+eN29e/Pu//3v86le/ijFjxuR6+k6v4vlhEXn/t6PHAAAAEpVzpB2uxsbG2L17d/Tr1++Aa+rq6qKurq7pzzU1Ne0xGgAAQIdr9weH3HvvvbFnz5645JJLDrhm/vz50bdv36atqKioHScEAADoOO0aacuXL4877rgjHnvssRgwYMAB182ePTt27drVtG3fvr0dpwQAAOg47fbjjitWrIirrroqHn/88SgpKTno2vz8/MjPz2+nyQA
[remainder of base64 PNG payload omitted; the rendered image is the histogram of the TYPE model's predicted probabilities on the held-out test set, produced by the plotting cell below]", 272 | "text/plain": [ 273 | "<Figure size 1060x680 with 1 Axes>
" 274 | ] 275 | }, 276 | "metadata": {}, 277 | "output_type": "display_data" 278 | } 279 | ], 280 | "source": [ 281 | "plt.rcParams[\"figure.figsize\"] = (10.6, 6.8)\n", 282 | "probabilities = predict(model, X_test_tensor, device)\n", 283 | "plt.hist(probabilities, bins=25);" 284 | ] 285 | } 286 | ], 287 | "metadata": { 288 | "kernelspec": { 289 | "display_name": "Python 3", 290 | "language": "python", 291 | "name": "python3" 292 | }, 293 | "language_info": { 294 | "codemirror_mode": { 295 | "name": "ipython", 296 | "version": 3 297 | }, 298 | "file_extension": ".py", 299 | "mimetype": "text/x-python", 300 | "name": "python", 301 | "nbconvert_exporter": "python", 302 | "pygments_lexer": "ipython3", 303 | "version": "3.10.9" 304 | }, 305 | "orig_nbformat": 4 306 | }, 307 | "nbformat": 4, 308 | "nbformat_minor": 2 309 | } 310 | -------------------------------------------------------------------------------- /Older Versions/0112 LEM Train Model Accuracy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from lib.data_utils import *\n", 11 | "from lib.model_utils import *\n", 12 | "from tqdm import tqdm\n", 13 | "import numpy as np\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "import time\n", 16 | "\n", 17 | "import torch\n", 18 | "import torch.optim as optim\n", 19 | "from torch.utils.data import DataLoader, TensorDataset\n", 20 | "\n", 21 | "from sklearn.metrics import log_loss\n", 22 | "\n", 23 | "import optuna" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "MODEL_TYPE = 'ACC'\n", 33 | "MODEL_NAME = f'LEMv4_MODEL_{MODEL_TYPE}_TORCH'" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "device = torch.device(\"cpu\")" 43 | ] 44 | }, 45 | { 46 | "attachments": {}, 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# Loading and Preprocessing Data" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "df_train, df_train_y, df_optimization, df_optimization_y, df_test, df_test_y, complete_feature_set, features_model = load_model_training_data_template(train_sets = ['data/wyscout/csv/events/Italy.csv', 'data/wyscout/csv/events/Germany.csv', 'data/wyscout/csv/events/France.csv'], optimization_sets = ['data/wyscout/csv/events/Italy.csv',], test_sets = ['data/wyscout/csv/events/Spain.csv', 'data/wyscout/csv/events/England.csv'])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "print(list(df_train_y[MODEL_TYPE].columns))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "features = features_model[MODEL_TYPE]\n", 78 | "print(features)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "X_train = df_train[features].astype(float).values\n", 88 | "x_optimization = df_optimization[features].astype(float).values\n", 89 | "X_test = df_test[features].astype(float).values\n", 90 | "\n", 91 | "Y_train = 
df_train_y[MODEL_TYPE].astype(float).values\n", 92 | "Y_optimization = df_optimization_y[MODEL_TYPE].astype(float).values\n", 93 | "Y_test = df_test_y[MODEL_TYPE].astype(float).values\n", 94 | "\n", 95 | "# Convert numpy arrays to PyTorch tensors\n", 96 | "X_train_tensor = torch.tensor(X_train, dtype=torch.float32)\n", 97 | "X_optimization_tensor = torch.tensor(x_optimization, dtype=torch.float32)\n", 98 | "X_test_tensor = torch.tensor(X_test, dtype=torch.float32)\n", 99 | "\n", 100 | "Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32)\n", 101 | "Y_optimization_tensor = torch.tensor(Y_optimization, dtype=torch.float32)\n", 102 | "Y_test_tensor = torch.tensor(Y_test, dtype=torch.float32)\n", 103 | "\n", 104 | "# Create datasets\n", 105 | "train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)\n", 106 | "optimization_dataset = TensorDataset(X_optimization_tensor, Y_optimization_tensor)\n", 107 | "test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)\n", 108 | "\n", 109 | "# Create dataloaders\n", 110 | "train_dataloader = DataLoader(train_dataset, batch_size=1024, shuffle=True)\n", 111 | "optimization_dataloader = DataLoader(optimization_dataset, batch_size=1024, shuffle=False)\n", 112 | "test_dataloader = DataLoader(test_dataset, batch_size=1024, shuffle=False)\n", 113 | "\n", 114 | "input_size = X_train.shape[1]\n", 115 | "output_size = Y_train.shape[1]" 116 | ] 117 | }, 118 | { 119 | "attachments": {}, 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "# Tunning Model" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "ENABLE_TUNING = False\n", 133 | "TUNNING_COMPLEXITY_PENALTY = 0.001\n", 134 | "TUNNING_TRAIN_TEST_SPLIT = 0.7\n", 135 | "TUNNING_N_TRIALS = 40" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "if ENABLE_TUNING:\n", 145 | " study = optuna.create_study(direction=\"minimize\")\n", 146 | " study.optimize(lambda trial: objective(trial, X_optimization_tensor, Y_optimization_tensor, model_name=MODEL_NAME, train_test_split=TUNNING_TRAIN_TEST_SPLIT, complexity_penalty=TUNNING_COMPLEXITY_PENALTY), n_trials=TUNNING_N_TRIALS)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "if ENABLE_TUNING:\n", 156 | " trial = study.best_trial\n", 157 | " print(trial.value, trial.params, trial.datetime_start, trial.datetime_complete)\n", 158 | " \n", 159 | " model = torch.load(f'models/lem/optuna_trials/{MODEL_NAME}_{trial.number}.pt')\n", 160 | " test_log_loss = evaluate_log_loss(model, optimization_dataloader, device)\n", 161 | " print(f'Test Log Loss: {test_log_loss:.4f}')\n", 162 | "\n", 163 | " plt.rcParams[\"figure.figsize\"] = (20, 5)\n", 164 | " plt.subplot(121)\n", 165 | " probabilities = predict(model, X_optimization_tensor, device)\n", 166 | " plt.hist(probabilities, bins=50);\n", 167 | " plt.subplot(122)\n", 168 | " plt.hist(probabilities[:,1], bins=50, color='C1')\n", 169 | " plt.yscale('log');" 170 | ] 171 | }, 172 | { 173 | "attachments": {}, 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "# Train model" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "model = MultiLayerBinaryClassifier(input_size, [128], 
output_size, activation='sigmoid').to(device)\n", 187 | "learning_rate = 0.0410\n", 188 | "num_epochs = 100\n", 189 | "patience = 3\n", 190 | "counter = 0\n", 191 | "best_val_loss = 1000\n", 192 | "\n", 193 | "optimizer = optim.Adam(model.parameters(), lr=learning_rate)\n", 194 | "criterion = nn.BCELoss()\n", 195 | "\n", 196 | "for epoch in range(num_epochs):\n", 197 | " train_loss = train(model, train_dataloader, criterion, optimizer, device)\n", 198 | " test_loss = evaluate(model, test_dataloader, criterion, device)\n", 199 | " test_log_loss = evaluate_log_loss(model, test_dataloader, device)\n", 200 | " print(f'Epoch: {epoch+1}/{num_epochs}.. Training loss: {train_loss:.4f}.. Test loss: {test_loss:.4f}.. Test Log Loss: {test_log_loss:.4f}')\n", 201 | "\n", 202 | " if test_log_loss < best_val_loss:\n", 203 | " best_val_loss = test_log_loss\n", 204 | " counter = 0\n", 205 | " torch.save(model, f'models/lem/{MODEL_NAME}.pth')\n", 206 | " else:\n", 207 | " counter += 1\n", 208 | " if counter >= patience:\n", 209 | " break" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "plt.rcParams[\"figure.figsize\"] = (10.6, 6.8)\n", 219 | "plt.subplot(121)\n", 220 | "probabilities = predict(model, X_test_tensor, device)\n", 221 | "plt.hist(probabilities, bins=25);\n", 222 | "plt.subplot(122)\n", 223 | "plt.hist(probabilities[:, 1], bins=25);\n", 224 | "plt.yscale('log');" 225 | ] 226 | } 227 | ], 228 | "metadata": { 229 | "kernelspec": { 230 | "display_name": "Python 3", 231 | "language": "python", 232 | "name": "python3" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | "pygments_lexer": "ipython3", 244 | "version": "3.10.9" 245 | }, 246 | "orig_nbformat": 4 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 2 250 | } 251 | -------------------------------------------------------------------------------- /Older Versions/0131 Learning State Values.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from lib.data_utils import *\n", 10 | "from lib.model_utils import *\n", 11 | "from lib.simulator import *" 12 | ] 13 | }, 14 | { 15 | "attachments": {}, 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "# Load data & models" 20 | ] 21 | }, 22 | { 23 | "attachments": {}, 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Data" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "df_train, df_train_y, df_optimization, df_optimization_y, df_test, df_test_y, complete_feature_set, features_model = load_model_training_data_template(train_sets = ['data/wyscout/csv/events/Italy.csv', 'data/wyscout/csv/events/Germany.csv', 'data/wyscout/csv/events/France.csv'], optimization_sets = ['data/wyscout/csv/events/Italy.csv',], test_sets = ['data/wyscout/csv/events/Spain.csv', 'data/wyscout/csv/events/England.csv'])" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "features = features_model['TYPE']\n", 46 | "X_test = 
df_test[features].astype(float).values" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "sim = Simulator(\n", 56 | " model_type_path='models/lem/LEMv3_MODEL_TYPE_TORCH.pth',\n", 57 | " model_acc_path='models/lem/LEMv4_MODEL_ACC_TORCH.pth',\n", 58 | " model_data_path='models/lem/LEMv3_MODEL_DATA_TORCH.pth',\n", 59 | " device = 'cuda:0')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stderr", 69 | "output_type": "stream", 70 | "text": [ 71 | " 94%|█████████▍| 1883/2000 [07:28<00:27, 4.19it/s]\n", 72 | " 94%|█████████▎| 1874/2000 [07:24<00:29, 4.22it/s]\n", 73 | " 93%|█████████▎| 1861/2000 [06:50<00:30, 4.53it/s]\n", 74 | " 93%|█████████▎| 1864/2000 [06:51<00:30, 4.53it/s]\n", 75 | " 94%|█████████▍| 1886/2000 [06:55<00:25, 4.53it/s]\n", 76 | " 94%|█████████▎| 1872/2000 [06:53<00:28, 4.53it/s]\n", 77 | " 94%|█████████▍| 1883/2000 [06:55<00:25, 4.53it/s]\n", 78 | " 94%|█████████▍| 1887/2000 [06:56<00:24, 4.53it/s]\n", 79 | " 93%|█████████▎| 1858/2000 [06:49<00:31, 4.53it/s]\n", 80 | " 94%|█████████▍| 1890/2000 [06:56<00:24, 4.53it/s]\n", 81 | " 93%|█████████▎| 1860/2000 [07:38<00:34, 4.06it/s]\n", 82 | " 94%|█████████▍| 1885/2000 [09:58<00:36, 3.15it/s]\n", 83 | " 94%|█████████▍| 1889/2000 [09:35<00:33, 3.28it/s]\n", 84 | " 93%|█████████▎| 1851/2000 [09:45<00:47, 3.16it/s]\n", 85 | " 93%|█████████▎| 1861/2000 [09:40<00:43, 3.21it/s]\n", 86 | " 92%|█████████▏| 1843/2000 [09:40<00:49, 3.18it/s]\n", 87 | " 95%|█████████▍| 1894/2000 [09:09<00:30, 3.45it/s]\n", 88 | " 95%|█████████▌| 1908/2000 [07:21<00:21, 4.32it/s]\n", 89 | " 94%|█████████▍| 1890/2000 [06:57<00:24, 4.52it/s]\n", 90 | " 93%|█████████▎| 1853/2000 [06:52<00:32, 4.49it/s]\n", 91 | " 97%|█████████▋| 1940/2000 [07:12<00:13, 4.48it/s]\n", 92 | " 95%|█████████▍| 1892/2000 [07:05<00:24, 4.45it/s]\n", 93 | " 94%|█████████▍| 1879/2000 [07:00<00:27, 4.47it/s]\n", 94 | " 93%|█████████▎| 1853/2000 [06:55<00:32, 4.46it/s]\n", 95 | " 93%|█████████▎| 1864/2000 [06:58<00:30, 4.46it/s]\n", 96 | " 95%|█████████▌| 1906/2000 [07:07<00:21, 4.46it/s]\n", 97 | " 93%|█████████▎| 1853/2000 [06:56<00:33, 4.45it/s]\n", 98 | " 93%|█████████▎| 1863/2000 [07:01<00:30, 4.42it/s]\n", 99 | " 94%|█████████▎| 1873/2000 [07:53<00:32, 3.96it/s]\n", 100 | " 92%|█████████▏| 1846/2000 [07:47<00:39, 3.95it/s]\n", 101 | " 95%|█████████▍| 1895/2000 [08:07<00:26, 3.89it/s]\n", 102 | " 94%|█████████▍| 1875/2000 [07:58<00:31, 3.92it/s]\n", 103 | " 96%|█████████▌| 1912/2000 [08:06<00:22, 3.93it/s]\n", 104 | " 93%|█████████▎| 1860/2000 [07:16<00:32, 4.26it/s]\n", 105 | " 92%|█████████▎| 1850/2000 [07:34<00:36, 4.07it/s]\n", 106 | " 93%|█████████▎| 1860/2000 [07:37<00:34, 4.07it/s]\n", 107 | " 93%|█████████▎| 1860/2000 [06:49<00:30, 4.54it/s]\n", 108 | " 95%|█████████▍| 1896/2000 [06:51<00:22, 4.61it/s]\n", 109 | " 94%|█████████▍| 1880/2000 [06:47<00:26, 4.61it/s]\n", 110 | " 94%|█████████▎| 1871/2000 [06:45<00:27, 4.61it/s]\n", 111 | " 94%|█████████▍| 1876/2000 [06:46<00:26, 4.61it/s]\n", 112 | " 94%|█████████▍| 1876/2000 [06:52<00:27, 4.55it/s]\n", 113 | " 96%|█████████▌| 1920/2000 [08:54<00:22, 3.59it/s]\n", 114 | " 94%|█████████▎| 1873/2000 [08:02<00:32, 3.88it/s]\n", 115 | " 93%|█████████▎| 1864/2000 [08:03<00:35, 3.86it/s]\n", 116 | " 93%|█████████▎| 1851/2000 [07:56<00:38, 3.89it/s]\n", 117 | " 94%|█████████▍| 1886/2000 [08:00<00:29, 3.93it/s]\n", 118 | " 95%|█████████▍| 1893/2000 [08:05<00:27, 
3.90it/s]\n", 119 | " 92%|█████████▏| 1846/2000 [07:51<00:39, 3.92it/s]\n", 120 | " 95%|█████████▌| 1901/2000 [08:00<00:25, 3.96it/s]\n", 121 | " 93%|█████████▎| 1866/2000 [06:58<00:30, 4.45it/s]\n", 122 | " 93%|█████████▎| 1867/2000 [06:48<00:29, 4.57it/s]\n", 123 | " 92%|█████████▏| 1841/2000 [06:40<00:34, 4.60it/s]\n", 124 | " 94%|█████████▍| 1889/2000 [06:49<00:24, 4.61it/s]\n", 125 | " 93%|█████████▎| 1869/2000 [06:45<00:28, 4.61it/s]\n", 126 | " 92%|█████████▏| 1842/2000 [06:39<00:34, 4.61it/s]\n", 127 | " 92%|█████████▏| 1845/2000 [06:39<00:33, 4.61it/s]\n", 128 | " 93%|█████████▎| 1861/2000 [06:43<00:30, 4.61it/s]\n", 129 | " 93%|█████████▎| 1863/2000 [06:43<00:29, 4.61it/s]\n", 130 | " 94%|█████████▎| 1871/2000 [06:45<00:27, 4.61it/s]\n", 131 | " 93%|█████████▎| 1851/2000 [06:41<00:32, 4.61it/s]\n", 132 | " 92%|█████████▏| 1835/2000 [06:37<00:35, 4.61it/s]\n", 133 | " 94%|█████████▎| 1873/2000 [06:46<00:27, 4.61it/s]\n", 134 | " 95%|█████████▍| 1891/2000 [06:49<00:23, 4.62it/s]\n", 135 | " 94%|█████████▍| 1881/2000 [06:56<00:26, 4.51it/s]\n", 136 | " 94%|█████████▍| 1887/2000 [10:17<00:37, 3.05it/s]\n", 137 | " 93%|█████████▎| 1861/2000 [09:54<00:44, 3.13it/s]\n", 138 | " 94%|█████████▎| 1871/2000 [07:54<00:32, 3.94it/s]\n", 139 | " 92%|█████████▏| 1845/2000 [07:34<00:38, 4.06it/s]\n", 140 | " 95%|█████████▌| 1903/2000 [08:09<00:24, 3.89it/s]\n", 141 | " 96%|█████████▌| 1923/2000 [08:56<00:21, 3.59it/s]\n", 142 | " 95%|█████████▍| 1896/2000 [08:42<00:28, 3.63it/s]\n", 143 | " 93%|█████████▎| 1852/2000 [08:42<00:41, 3.55it/s]\n", 144 | " 94%|█████████▍| 1878/2000 [07:23<00:28, 4.23it/s]\n", 145 | " 96%|█████████▌| 1913/2000 [07:22<00:20, 4.32it/s]\n", 146 | " 92%|█████████▏| 1847/2000 [06:47<00:33, 4.53it/s]\n", 147 | " 93%|█████████▎| 1852/2000 [06:48<00:32, 4.53it/s]\n", 148 | " 93%|█████████▎| 1866/2000 [07:43<00:33, 4.02it/s]\n", 149 | " 0%| | 4/2000 [00:02<16:05, 2.07it/s]" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "id_offset = 250 #Implement glob count\n", 155 | "for i in range(200):\n", 156 | " feature_tensor = sim.simulate(X_test, store_full_sim=False)\n", 157 | " data = pd.DataFrame(feature_tensor.cpu())\n", 158 | " data['type_id'] = data[data.columns[:33]].idxmax(axis=1) + 1\n", 159 | " data = data.drop(data.columns[:33], axis=1)\n", 160 | " data.columns = ['period', 'minute', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score', 'type_id']\n", 161 | " data.home_score = (data.home_score * 10).astype(int)\n", 162 | " data.away_score = (data.away_score * 10).astype(int)\n", 163 | " data['type_name'] = data.type_id.map({v: k for k, v in SUBEVENT_TYPE_MAP.items()})\n", 164 | " data.to_csv(f'data/lem/lem_sim_{id_offset+i}.csv', index=False)" 165 | ] 166 | } 167 | ], 168 | "metadata": { 169 | "kernelspec": { 170 | "display_name": "Python 3", 171 | "language": "python", 172 | "name": "python3" 173 | }, 174 | "language_info": { 175 | "codemirror_mode": { 176 | "name": "ipython", 177 | "version": 3 178 | }, 179 | "file_extension": ".py", 180 | "mimetype": "text/x-python", 181 | "name": "python", 182 | "nbconvert_exporter": "python", 183 | "pygments_lexer": "ipython3", 184 | "version": "3.10.9" 185 | }, 186 | "orig_nbformat": 4 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 2 190 | } 191 | -------------------------------------------------------------------------------- /Older Versions/0200 Fine-Tuning Approaches.md: -------------------------------------------------------------------------------- 
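The simulation loop that closes `0131 Learning State Values.ipynb` above treats each simulated state as a 42-value vector: a 33-way one-hot over the event subtype ids followed by the nine game-state columns, with the two score columns stored divided by 10. A self-contained sketch of the per-row decoding that the pandas code above performs in bulk (an illustration, not repo code):

```python
# Sketch (assumption): decoding one simulated state vector from the LEM simulator.
import numpy as np

STATE_COLS = ['period', 'minute', 'x', 'y', 'is_home_team',
              'accurate', 'goal', 'home_score', 'away_score']

def decode_state(vec: np.ndarray) -> dict:
    type_id = int(np.argmax(vec[:33])) + 1          # 1-based subtype id
    state = dict(zip(STATE_COLS, vec[33:]))
    state['home_score'] = int(state['home_score'] * 10)  # scores are stored /10
    state['away_score'] = int(state['away_score'] * 10)
    state['type_id'] = type_id
    return state

# Example: a simple_pass (id 28) at kick-off from the centre of the pitch.
vec = np.zeros(42)
vec[28 - 1] = 1.0
vec[33:] = [0, 0, 0.5, 0.5, 1, 1, 0, 0, 0]
print(decode_state(vec)['type_id'])   # -> 28
```

The seed state used in `0213 Tensor Sim - Lib.ipynb` further down follows the same layout: a one-hot on subtype 28 followed by `[0, 0, 0.5, 0.5, 1, 1, 0, 0, 0]`.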
https://raw.githubusercontent.com/nvsclub/LargeEventsModel/fa654f556fbe02eb60e6fd7132b25bc3788c9772/Older Versions/0200 Fine-Tuning Approaches.md -------------------------------------------------------------------------------- /Older Versions/0212 Finetuning Framework - Lib.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from lib.data_utils import *\n", 11 | "from lib.model_utils import *\n", 12 | "from tqdm import tqdm\n", 13 | "import numpy as np\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "import time\n", 16 | "import os\n", 17 | "\n", 18 | "import torch\n", 19 | "import torch.optim as optim\n", 20 | "from torch.utils.data import DataLoader, TensorDataset\n", 21 | "\n", 22 | "from lib.simulator import *" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "device = torch.device(\"cpu\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "PARAMETERS = {\n", 41 | " 'TYPE': {\n", 42 | " 'learning_rate': 0.0001,\n", 43 | " 'num_epochs': 25,\n", 44 | " 'patience': 3,\n", 45 | " },\n", 46 | " 'ACC': {\n", 47 | " 'learning_rate': 0.0041,\n", 48 | " 'num_epochs': 25,\n", 49 | " 'patience': 3,\n", 50 | " },\n", 51 | " 'DATA': {\n", 52 | " 'learning_rate': 0.00063,\n", 53 | " 'num_epochs': 25,\n", 54 | " 'patience': 3,\n", 55 | " },\n", 56 | "}" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "def prepare_data(df_base_x, team=None, player=None, remove_player=None, is_home=True):\n", 66 | " if is_home:\n", 67 | " side_selection = df_base_x.is_home_team\n", 68 | " else:\n", 69 | " side_selection = ~df_base_x.is_home_team\n", 70 | "\n", 71 | " data = []\n", 72 | " if team != None:\n", 73 | " if remove_player != None:\n", 74 | " data.append(\n", 75 | " df_base_x[(\n", 76 | " df_base_x.match_name.str.contains(team).fillna(False) & \n", 77 | " (\n", 78 | " (side_selection & df_base_x.team_name.str.contains(team).fillna(False)) | \n", 79 | " ((~side_selection) & (~df_base_x.team_name.str.contains(team).fillna(False)))\n", 80 | " ) & \n", 81 | " (~df_base_x.player_name.str.contains(remove_player).fillna(False))\n", 82 | " )].copy()\n", 83 | " )\n", 84 | " else:\n", 85 | " data.append(\n", 86 | " df_base_x[(\n", 87 | " df_base_x.match_name.str.contains(team).fillna(False) & \n", 88 | " (\n", 89 | " (side_selection & df_base_x.team_name.str.contains(team).fillna(False)) | \n", 90 | " ((~side_selection) & (~df_base_x.team_name.str.contains(team).fillna(False)))\n", 91 | " )\n", 92 | " )].copy()\n", 93 | " )\n", 94 | "\n", 95 | " if player != None:\n", 96 | " data.append(\n", 97 | " df_base_x[(\n", 98 | " df_base_x.player_name.str.contains(player).fillna(False) & \n", 99 | " side_selection\n", 100 | " )].copy()\n", 101 | " )\n", 102 | "\n", 103 | " return pd.concat(data)\n", 104 | "\n", 105 | "def load_models(base_model=None):\n", 106 | " if base_model == None:\n", 107 | " models = {\n", 108 | " 'TYPE': torch.load('models/lem/LEMv3_MODEL_TYPE_TORCH.pth').to(device),\n", 109 | " 'ACC': torch.load('models/lem/LEMv4_MODEL_ACC_TORCH.pth').to(device),\n", 110 | " 'DATA': 
torch.load('models/lem/LEMv3_MODEL_DATA_TORCH.pth').to(device),\n", 111 | " }\n", 112 | " else:\n", 113 | " models = {\n", 114 | " 'TYPE': torch.load(f'models/finetuning/team_representations/{base_model}_TYPE.pth').to(device),\n", 115 | " 'ACC': torch.load(f'models/finetuning/team_representations/{base_model}_ACC.pth').to(device),\n", 116 | " 'DATA': torch.load(f'models/finetuning/team_representations/{base_model}_DATA.pth').to(device),\n", 117 | " }\n", 118 | "\n", 119 | " models['TYPE'].eval();\n", 120 | " models['ACC'].eval();\n", 121 | " models['DATA'].eval();\n", 122 | "\n", 123 | " return models\n", 124 | "\n", 125 | "def prepare_dataloader(df_selected_data, df_base_y, model_type, features):\n", 126 | " X_train = df_selected_data[features].astype(float).values # df_original_team should be a parameters\n", 127 | " Y_train = df_base_y[model_type].loc[df_selected_data.index].astype(float).values\n", 128 | "\n", 129 | " X_train_tensor = torch.tensor(X_train, dtype=torch.float32)\n", 130 | " Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32)\n", 131 | "\n", 132 | " batch_size = int(max(min(np.log(len(Y_train)) ** 2, 256), 32)) # 5% of the data, max 512, min 32\n", 133 | "\n", 134 | " train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)\n", 135 | " train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", 136 | "\n", 137 | " input_size = X_train.shape[1]\n", 138 | " output_size = Y_train.shape[1]\n", 139 | "\n", 140 | " return train_dataloader, input_size, output_size\n", 141 | "\n", 142 | "def check_dir(directory):\n", 143 | " if not os.path.exists(directory):\n", 144 | " os.makedirs(directory)\n", 145 | "\n", 146 | "def check_if_pth_exists(model_name):\n", 147 | " return os.path.exists(f'models/finetuning/{model_name}.pth')" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "# Load data & models" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "## Data" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "df_train, df_train_y, df_optimization, df_optimization_y, df_test, df_test_y, complete_feature_set, features_model = load_model_training_data_template(train_sets = ['data/wyscout/csv/events/Italy.csv', 'data/wyscout/csv/events/Germany.csv', 'data/wyscout/csv/events/France.csv'], optimization_sets = ['data/wyscout/csv/events/Italy.csv',], test_sets = ['data/wyscout/csv/events/Spain.csv', 'data/wyscout/csv/events/England.csv'])" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "# Tests" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "PL_TEAMS_REPLACEMENT = {\n", 187 | " 'Arsenal': 'A. Iwobi', \n", 188 | " 'Leicester City': 'M. Albrighton', \n", 189 | " 'Manchester City': 'L. Sané', \n", 190 | " 'Brighton & Hove Albion': 'S. March', \n", 191 | " 'Burnley': 'A. Barnes', \n", 192 | " 'Chelsea': 'V. Moses', \n", 193 | " 'Crystal Palace': 'C. Benteke', \n", 194 | " 'Huddersfield Town': 'T. Ince', \n", 195 | " 'Everton': 'D. Calvert-Lewin', \n", 196 | " 'Stoke City': 'P. Crouch', \n", 197 | " 'Manchester United': 'R. Lukaku', \n", 198 | " 'West Ham United': 'M. Antonio', \n", 199 | " 'Tottenham Hotspur': 'Son Heung-Min', \n", 200 | " 'Newcastle United': 'Joselu', \n", 201 | " 'Swansea City': 'S. 
Clucas', \n", 202 | " 'Southampton': 'N. Redmond', \n", 203 | " 'Watford': 'A. Carrillo', \n", 204 | " 'Liverpool': 'S. Mané', \n", 205 | " 'West Bromwich Albion': 'J. Rodriguez', \n", 206 | " 'AFC Bournemouth': 'J. Ibe', \n", 207 | " }" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "# Home Only Data\n", 217 | "N_ITERATIONS = 10\n", 218 | "TESTS = {}\n", 219 | "for team in PL_TEAMS_REPLACEMENT.keys():\n", 220 | " for _ in range(N_ITERATIONS): TESTS[len(TESTS)] = {'type': 'team_representations', 'team': team, 'player': None, 'remove_player': None, 'is_home': True, 'base_model': None,}\n", 221 | "\n", 222 | "for team in PL_TEAMS_REPLACEMENT.keys():\n", 223 | " player = 'Cristiano Ronaldo'\n", 224 | " for _ in range(N_ITERATIONS): TESTS[len(TESTS)] = {'type': 'player_adding', 'team': team, 'player': player, 'remove_player': None, 'is_home': True, 'base_model': None,}\n", 225 | " for _ in range(N_ITERATIONS): TESTS[len(TESTS)] = {'type': 'player_replacement', 'team': team, 'player': player, 'remove_player': PL_TEAMS_REPLACEMENT[team], 'is_home': True, 'base_model': None,}\n", 226 | "\n", 227 | " player = 'L. Messi'\n", 228 | " for _ in range(N_ITERATIONS): TESTS[len(TESTS)] = {'type': 'player_adding', 'team': team, 'player': player, 'remove_player': None, 'is_home': True, 'base_model': None,}\n", 229 | " for _ in range(N_ITERATIONS): TESTS[len(TESTS)] = {'type': 'player_replacement', 'team': team, 'player': player, 'remove_player': PL_TEAMS_REPLACEMENT[team], 'is_home': True, 'base_model': None,}\n", 230 | "\n", 231 | "for player in ['L. Messi', 'Cristiano Ronaldo', 'T. Kroos', 'Iago Aspas', 'Dani Parejo', 'L. Suárez', 'A. Griezmann', 'Casemiro', 'Illarramendi', 'Sergio Ramos']:\n", 232 | " for _ in range(N_ITERATIONS): TESTS[len(TESTS)] = {'type': 'player_only', 'team': None, 'player': player, 'remove_player': None, 'is_home': True, 'base_model': None,}\n", 233 | "\n", 234 | "print(len(TESTS))" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "test = {}\n", 244 | "for test_id in tqdm(TESTS.keys()):\n", 245 | " if test != TESTS[test_id]:\n", 246 | " test = TESTS[test_id]\n", 247 | " \n", 248 | " df_selected_data = prepare_data(df_test, team=test['team'], player=test['player'], remove_player=test['remove_player'], is_home=test['is_home'])\n", 249 | " train_dataloader, input_size, output_size = {}, {}, {}\n", 250 | " for MODEL_TYPE in ['TYPE', 'ACC', 'DATA']:\n", 251 | " train_dataloader[MODEL_TYPE], input_size[MODEL_TYPE], output_size[MODEL_TYPE] = prepare_dataloader(df_selected_data, df_test_y, MODEL_TYPE, features_model[MODEL_TYPE])\n", 252 | "\n", 253 | " models = load_models(base_model=test['base_model'])\n", 254 | "\n", 255 | " for MODEL_TYPE in ['TYPE', 'ACC', 'DATA']:\n", 256 | " home_sign = 'H' if test['is_home'] else 'A'\n", 257 | " test_description = ''\n", 258 | " #if test['base_model'] != None:\n", 259 | " # test_description += f'{test[\"base_model\"].split(\"_\")[0]}_'\n", 260 | " test_description += str(test['team']) + '_' + str(test['player']) + '_' + str(test['remove_player']) \n", 261 | " test_description = test_description.replace('None_', '').replace('.', '').replace('_None', '')\n", 262 | " test_description += '_' + home_sign\n", 263 | "\n", 264 | " check_dir(f'models/finetuning/{test[\"type\"]}/{test_description}')\n", 265 | "\n", 266 | " if 
check_if_pth_exists(f'{test[\"type\"]}/{test_description}/LEM_V343_{test_id}_{MODEL_TYPE}'):\n", 267 | " continue\n", 268 | "\n", 269 | " optimizer = optim.Adam(models[MODEL_TYPE].parameters(), lr=PARAMETERS[MODEL_TYPE]['learning_rate'])\n", 270 | " criterion = nn.BCELoss()\n", 271 | "\n", 272 | " counter, best_val_loss = 0, 1000\n", 273 | " for epoch in range(PARAMETERS[MODEL_TYPE]['num_epochs']):\n", 274 | " train_loss = train(models[MODEL_TYPE], train_dataloader[MODEL_TYPE], criterion, optimizer, device)\n", 275 | "\n", 276 | " if train_loss < (best_val_loss - 0.00005):\n", 277 | " best_val_loss = train_loss\n", 278 | " counter = 0\n", 279 | "\n", 280 | " torch.save(models[MODEL_TYPE], f'models/finetuning/{test[\"type\"]}/{test_description}/LEM_V343_{test_id}_{MODEL_TYPE}.pth')\n", 281 | " else:\n", 282 | " counter += 1\n", 283 | " if counter >= PARAMETERS[MODEL_TYPE]['patience']:\n", 284 | " break\n", 285 | "\n", 286 | " TESTS[test_id][f'{MODEL_TYPE}_train_loss'] = best_val_loss\n", 287 | " TESTS[test_id][f'{MODEL_TYPE}_epochs'] = epoch\n", 288 | "\n", 289 | " pd.DataFrame(TESTS).T.to_csv('res/training_process_data/TESTS.csv')" 290 | ] 291 | } 292 | ], 293 | "metadata": { 294 | "kernelspec": { 295 | "display_name": "Python 3", 296 | "language": "python", 297 | "name": "python3" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 3 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython3", 309 | "version": "3.9.13" 310 | }, 311 | "orig_nbformat": 4 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 2 315 | } 316 | -------------------------------------------------------------------------------- /Older Versions/0213 Tensor Sim - Lib.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import matplotlib.pyplot as plt\n", 11 | "\n", 12 | "from lib.data_utils import *\n", 13 | "from lib.model_utils import *\n", 14 | "\n", 15 | "from lib.simulator import *\n", 16 | "\n", 17 | "from glob import glob" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "def sim_models(model_name):\n", 27 | " sim_basemodel = Simulator(\n", 28 | " model_type_path=model_name.replace('ACC', 'TYPE'),\n", 29 | " model_acc_path=model_name,\n", 30 | " model_data_path=model_name.replace('ACC', 'DATA'),\n", 31 | " device='cuda:0'\n", 32 | " )\n", 33 | "\n", 34 | " feature_tensor = sim_basemodel.simulate([1 if i == 27 else 0 for i in range(33)] + [0, 0, 0.5, 0.5, 1, 1, 0, 0, 0], store_full_sim=False, n_sims=3000, disable_tqdm=True)\n", 35 | "\n", 36 | " data = pd.DataFrame(feature_tensor.cpu())\n", 37 | " data['type_id'] = data[data.columns[:33]].idxmax(axis=1) + 1\n", 38 | " data = data.drop(data.columns[:33], axis=1)\n", 39 | " data.columns = ['period', 'minute', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score', 'type_id']\n", 40 | " data.home_score = (data.home_score * 10).astype(int)\n", 41 | " data.away_score = (data.away_score * 10).astype(int)\n", 42 | "\n", 43 | " return [\n", 44 | " (data.home_score > data.away_score).mean() * 3 + (data.home_score == data.away_score).mean(),\n", 45 | " (data.home_score < data.away_score).mean() * 3 + 
(data.home_score == data.away_score).mean(),\n", 46 | " data.home_score.mean(),\n", 47 | " data.away_score.mean(),\n", 48 | " ]" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# Simulations" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "name": "stderr", 65 | "output_type": "stream", 66 | "text": [ 67 | " 0%| | 0/1200 [00:00 \\n')\n", 95 | " elif df.iloc[i]['match_id'] != df.iloc[i+1]['match_id']:\n", 96 | " out.write(convert_to_string(df.iloc[i]) + ' \\n')\n", 97 | " elif df.iloc[i]['period'] != df.iloc[i+1]['period']:\n", 98 | " out.write(convert_to_string(df.iloc[i]) + ' \\n')\n", 99 | " else:\n", 100 | " out.write(convert_to_string(df.iloc[i]) + ' ' + convert_to_string_label(df.iloc[i+1]) + '\\n')\n", 101 | "out.close()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 17, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stderr", 111 | "output_type": "stream", 112 | "text": [ 113 | "100%|█████████▉| 1799583/1799586 [14:26<00:00, 2078.03it/s]\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "out = open('data/llm/events_train_k3.txt', 'w')\n", 119 | "k = 3\n", 120 | "for i in tqdm(range(len(df))):\n", 121 | " if i == len(df)-k:\n", 122 | " for j in range(k):\n", 123 | " out.write(convert_to_string(df.iloc[i+j]) + ' ')\n", 124 | " out.write(' \\n')\n", 125 | " break\n", 126 | " elif df.iloc[i+k-1]['match_id'] != df.iloc[i+k]['match_id']:\n", 127 | " for j in range(k):\n", 128 | " out.write(convert_to_string(df.iloc[i+j]) + ' ')\n", 129 | " out.write(' \\n')\n", 130 | " elif df.iloc[i+k-1]['period'] != df.iloc[i+k]['period']:\n", 131 | " for j in range(k):\n", 132 | " out.write(convert_to_string(df.iloc[i+j]) + ' ')\n", 133 | " out.write(' \\n')\n", 134 | " else:\n", 135 | " match_id = df.iloc[i]['match_id']\n", 136 | "\n", 137 | " string = convert_to_string(df.iloc[i])\n", 138 | " for j in range(k):\n", 139 | " if j == (k-1):\n", 140 | " string += ' ' + convert_to_string_label(df.iloc[i+j+1])\n", 141 | " else:\n", 142 | " string += ' ' + convert_to_string(df.iloc[i+j+1])\n", 143 | " if match_id != df.iloc[i+j+1]['match_id']:\n", 144 | " match_id = df.iloc[i+j+1]['match_id']\n", 145 | " string = ''\n", 146 | " for w in range(j+2):\n", 147 | " string += ' '\n", 148 | " if w != j+1:\n", 149 | " string += ' '\n", 150 | " \n", 151 | " out.write(string + '\\n')\n", 152 | "out.close()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 4, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "(1271809, 13)" 164 | ] 165 | }, 166 | "execution_count": 4, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "data = []\n", 173 | "for dataset_fname in ['data/wyscout/csv/events/England.csv', 'data/wyscout/csv/events/Spain.csv']:\n", 174 | " df = load_data(dataset_fname)\n", 175 | " df['is_home_team'] = df['team_id'] == df['home_team_id']\n", 176 | "\n", 177 | " df['home_score'] = (\n", 178 | " ((df.subtype_name == 'free_kick_shot') & (df.goal == 1) & (df.team_id == df.home_team_id)) |\n", 179 | " ((df.subtype_name == 'penalty') & (df.goal == 1) & (df.team_id == df.home_team_id)) |\n", 180 | " ((df.subtype_name == 'shot') & (df.goal == 1) & (df.team_id == df.home_team_id)) |\n", 181 | " ((df.type_name.isin(['others_on_the_ball', 'pass'])) & (df.own_goal == 1) & (df.team_id == df.away_team_id))\n", 182 | " ).cumsum()\n", 
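    "    # The cumulative sums above run over every match in the file, so goals carry over\n",
    "    # from one match into the next; subtracting each match's minimum on the next lines\n",
    "    # turns them back into a within-match running score (e.g. cumulative totals 4, 4, 5\n",
    "    # inside a match whose carry-in is 4 become 0, 0, 1).\n",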
183 | " df['home_score'] = df['home_score'] - df['match_id'].map(df.groupby('match_id')['home_score'].min())\n", 184 | " df['away_score'] = (\n", 185 | " ((df.subtype_name == 'free_kick_shot') & (df.goal == 1) & (df.team_id == df.away_team_id)) |\n", 186 | " ((df.subtype_name == 'penalty') & (df.goal == 1) & (df.team_id == df.away_team_id)) |\n", 187 | " ((df.subtype_name == 'shot') & (df.goal == 1) & (df.team_id == df.away_team_id)) |\n", 188 | " ((df.type_name.isin(['others_on_the_ball', 'pass'])) & (df.own_goal == 1) & (df.team_id == df.home_team_id))\n", 189 | " ).cumsum()\n", 190 | " df['away_score'] = df['away_score'] - df['match_id'].map(df.groupby('match_id')['away_score'].min())\n", 191 | "\n", 192 | " data.append(df[['match_id', 'subtype_name', 'period', 'minute', 'second', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score']])\n", 193 | "\n", 194 | "df = pd.concat(data)\n", 195 | "df['goal'] = df['goal'].astype(int)\n", 196 | "df['accurate'] = df['accurate'].astype(int)\n", 197 | "df['is_home_team'] = df['is_home_team'].astype(int)\n", 198 | "df.loc[df.subtype_name == 0, 'subtype_name'] = 'none'\n", 199 | "\n", 200 | "df['time_elapsed'] = (((df['minute'] * 60 + df['second']) - (df['minute'].shift(1) * 60 + df['second'].shift(1))) * (df['period'] == df['period'].shift(1))).clip(0, 100).fillna(0).astype(int)\n", 201 | "\n", 202 | "df.shape" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 8, 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "name": "stderr", 212 | "output_type": "stream", 213 | "text": [ 214 | "100%|██████████| 1271809/1271809 [05:11<00:00, 4079.94it/s]\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "out = open('data/llm/events_test.txt', 'w')\n", 220 | "for i in tqdm(range(len(df))):\n", 221 | " if i == len(df)-1:\n", 222 | " out.write(convert_to_string(df.iloc[i]) + ' \\n')\n", 223 | " elif df.iloc[i]['match_id'] != df.iloc[i+1]['match_id']:\n", 224 | " out.write(convert_to_string(df.iloc[i]) + ' \\n')\n", 225 | " elif df.iloc[i]['period'] != df.iloc[i+1]['period']:\n", 226 | " out.write(convert_to_string(df.iloc[i]) + ' \\n')\n", 227 | " else:\n", 228 | " out.write(convert_to_string(df.iloc[i]) + ' ' + convert_to_string_label(df.iloc[i+1]) + '\\n')\n", 229 | "out.close()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 5, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "name": "stderr", 239 | "output_type": "stream", 240 | "text": [ 241 | "100%|█████████▉| 1271806/1271809 [09:05<00:00, 2329.85it/s]\n" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "out = open('data/llm/events_test_k3.txt', 'w')\n", 247 | "k = 3\n", 248 | "for i in tqdm(range(len(df))):\n", 249 | " if i == len(df)-k:\n", 250 | " for j in range(k):\n", 251 | " out.write(convert_to_string(df.iloc[i+j]) + ' ')\n", 252 | " out.write(' \\n')\n", 253 | " break\n", 254 | " elif df.iloc[i+k-1]['match_id'] != df.iloc[i+k]['match_id']:\n", 255 | " for j in range(k):\n", 256 | " out.write(convert_to_string(df.iloc[i+j]) + ' ')\n", 257 | " out.write(' \\n')\n", 258 | " elif df.iloc[i+k-1]['period'] != df.iloc[i+k]['period']:\n", 259 | " for j in range(k):\n", 260 | " out.write(convert_to_string(df.iloc[i+j]) + ' ')\n", 261 | " out.write(' \\n')\n", 262 | " else:\n", 263 | " match_id = df.iloc[i]['match_id']\n", 264 | "\n", 265 | " string = convert_to_string(df.iloc[i])\n", 266 | " for j in range(k):\n", 267 | " if j == (k-1):\n", 268 | " string += ' ' + convert_to_string_label(df.iloc[i+j+1])\n", 269 | " 
else:\n", 270 | " string += ' ' + convert_to_string(df.iloc[i+j+1])\n", 271 | " if match_id != df.iloc[i+j+1]['match_id']:\n", 272 | " match_id = df.iloc[i+j+1]['match_id']\n", 273 | " string = ''\n", 274 | " for w in range(j+2):\n", 275 | " string += ' '\n", 276 | " if w != j+1:\n", 277 | " string += ' '\n", 278 | " \n", 279 | " out.write(string + '\\n')\n", 280 | "out.close()" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 10, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "df = pd.read_csv('data/llm/events_test.txt', sep=' ', header=None, dtype=str)\n", 290 | "df = df.fillna('')" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 11, 296 | "metadata": {}, 297 | "outputs": [ 298 | { 299 | "ename": "KeyError", 300 | "evalue": "18", 301 | "output_type": "error", 302 | "traceback": [ 303 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 304 | "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", 305 | "File \u001b[1;32mc:\\Users\\tiago\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3790\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 3789\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 3790\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3791\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", 306 | "File \u001b[1;32mindex.pyx:152\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n", 307 | "File \u001b[1;32mindex.pyx:181\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n", 308 | "File \u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi:2606\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[1;34m()\u001b[0m\n", 309 | "File \u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi:2630\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[1;34m()\u001b[0m\n", 310 | "\u001b[1;31mKeyError\u001b[0m: 18", 311 | "\nThe above exception was the direct cause of the following exception:\n", 312 | "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", 313 | "Cell \u001b[1;32mIn[11], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m11\u001b[39m, \u001b[38;5;241m22\u001b[39m):\n\u001b[0;32m 3\u001b[0m _df \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m----> 4\u001b[0m _df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtarget\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43m_df\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m]\u001b[49m\n\u001b[0;32m 6\u001b[0m train_df\u001b[38;5;241m.\u001b[39mappend(_df\u001b[38;5;241m.\u001b[39msample(\u001b[38;5;241m1000\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m))\n\u001b[0;32m 7\u001b[0m 
train_df[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39mto_csv(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/llm/samples/events_test_k1_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.txt\u001b[39m\u001b[38;5;124m'\u001b[39m, sep\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m, header\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n", 314 | "File \u001b[1;32mc:\\Users\\tiago\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\pandas\\core\\frame.py:3893\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 3891\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m 3892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[1;32m-> 3893\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3894\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[0;32m 3895\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", 315 | "File \u001b[1;32mc:\\Users\\tiago\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3797\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 3792\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[0;32m 3793\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[0;32m 3794\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[0;32m 3795\u001b[0m ):\n\u001b[0;32m 3796\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[1;32m-> 3797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[0;32m 3798\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[0;32m 3799\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[0;32m 3800\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[0;32m 3801\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[0;32m 3802\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", 316 | "\u001b[1;31mKeyError\u001b[0m: 18" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "train_df = []\n", 322 | "for i in range(11, 22):\n", 323 | " _df = df.copy()\n", 324 | " _df['target'] = _df[i]\n", 325 | "\n", 326 | " train_df.append(_df.sample(1000, random_state=42))\n", 327 | " train_df[-1].to_csv(f'data/llm/samples/events_test_k1_{i}.txt', sep=' ', header=False, index=False)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "df = pd.read_csv('data/llm/events_test_k3.txt', sep=' ', header=None, dtype=str)\n", 337 | "df = df.fillna('')" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "for i in range(33, 44):\n", 347 | " _df = df.copy()\n", 348 | " _df['target'] = _df[i]\n", 349 | "\n", 350 | " _df.loc[train_df[i-33].index].to_csv(f'data/llm/samples/events_test_k3_{i}.txt', sep=' ', header=False, index=False)" 351 | ] 352 | } 353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "Python 3", 357 | "language": "python", 358 | "name": "python3" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.9.13" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 2 375 | } 376 | -------------------------------------------------------------------------------- /Older Versions/0330 Simple LLM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import torch\n", 12 | "import torch.nn as nn\n", 13 | "import torch.nn.init as init\n", 14 | "import torch.optim as optim\n", 15 | "from torch.utils.data import DataLoader, TensorDataset\n", 16 | "from tqdm import tqdm\n", 17 | "from time import time\n", 18 | "import json" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import matplotlib.pyplot as plt\n", 28 | "import matplotlib.colors as mcolors\n", 29 | "\n", 30 | "colors = ['white', '#eae2b7', '#fcbf49', '#f77f00', '#d62828', '#003049']\n", 31 | "boundaries = [-1, 0.001, 0.10, 0.25, 0.5, 0.75, 1]\n", 32 | "cmap = mcolors.ListedColormap(colors)\n", 33 | "norm = mcolors.BoundaryNorm(boundaries, cmap.N, clip=True)\n", 34 | "rose = '#ff99ac'" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "data_fname = 'data/llm/events_train.txt'" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# Organizing data" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "df = pd.read_csv(data_fname, sep=' ', header=None, dtype=str)\n", 60 | "df = df.fillna('')" 61 | ] 62 | 
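The next code cell fans each row of the training text out into one example per label position. Below is a small illustrative sketch of that expansion, not taken from the repository: it assumes, purely from the loop bounds (range(11, 18)), that columns 0-10 hold the 11 tokens of the current event and columns 11-17 hold the tokens of the next event, and the ctx*/lab* token names are invented for the illustration.

import pandas as pd

# One hypothetical row: 11 context tokens followed by 7 label tokens.
row = {c: f"ctx{c}" for c in range(11)}
row.update({c: f"lab{c - 11}" for c in range(11, 18)})
toy = pd.DataFrame([row])

expanded = []
for i in range(11, 18):
    _t = toy.copy()
    _t["target"] = _t[i]       # column i becomes the prediction target
    for j in range(i, 18):
        _t[j] = ""             # blank the target and every later label token
    expanded.append(_t)
expanded = pd.concat(expanded, ignore_index=True)

print(expanded[[11, 12, "target"]])
# Row 0 predicts lab0 with all label columns hidden, row 1 predicts lab1 with
# column 11 revealed, and so on: a teacher-forced factorisation of the next event,
# which is why a single tabular classifier can generate it token by token.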
}, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "train_df = []\n", 70 | "for i in range(11, 18):\n", 71 | " _df = df.copy()\n", 72 | " _df['target'] = _df[i]\n", 73 | " for j in range(i, 18):\n", 74 | " _df[j] = ''\n", 75 | "\n", 76 | " train_df.append(_df)\n", 77 | "train_df = pd.concat(train_df, ignore_index=True)\n", 78 | "train_df = train_df[train_df['target'] != '']" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "# Tokenizing" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "tokenizer_map = {str(i): i for i in range(0, 101)}\n", 95 | "tokenizer_map.update({df[0].value_counts().index[i]: i+len(tokenizer_map) for i in range(len(df[0].value_counts()))})\n", 96 | "tokenizer_map.update({'': len(tokenizer_map)})\n", 97 | "tokenizer_map.update({'': len(tokenizer_map)})\n", 98 | "tokenizer_map.update({'': len(tokenizer_map)})\n", 99 | "detokenizer_map = {v: k for k, v in tokenizer_map.items()}" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "json.dump(tokenizer_map, open('models/llm/tokenizer_map.json', 'w'))\n", 109 | "json.dump(detokenizer_map, open('models/llm/detokenizer_map.json', 'w'))" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "for i in range(0, 18):\n", 119 | " train_df[i] = train_df[i].map(tokenizer_map)\n", 120 | "train_df['target'] = train_df['target'].map(tokenizer_map)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "X_train = train_df.drop('target', axis=1).astype(float).values\n", 130 | "Y_train = pd.get_dummies(train_df['target']).astype(float).values\n", 131 | "\n", 132 | "X_train_tensor = torch.tensor(X_train, dtype=torch.float32)\n", 133 | "Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32)\n", 134 | "\n", 135 | "train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)\n", 136 | "\n", 137 | "train_dataloader = DataLoader(train_dataset, batch_size=1024, shuffle=True)\n", 138 | "\n", 139 | "input_size = X_train.shape[1]\n", 140 | "output_size = len(tokenizer_map) - 1 # because of token not existing in the output" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "# Define Model" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n", 157 | "MODEL_NAME = 'llm_v1_tokens_v2_lite'" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "def flatten(l):\n", 167 | " return [item for sublist in l for item in sublist]\n", 168 | "\n", 169 | "class MultiLayerBinaryClassifier(nn.Module):\n", 170 | " def __init__(self, input_size, hidden_size, output_size, activation='relu'):\n", 171 | " super(MultiLayerBinaryClassifier, self).__init__()\n", 172 | "\n", 173 | " activation_dict = {\n", 174 | " 'relu': nn.ReLU,\n", 175 | " 'sigmoid': nn.Sigmoid,\n", 176 | " 'tanh': nn.Tanh,\n", 177 | " 'leaky_relu': nn.LeakyReLU,\n", 178 | " }\n", 179 | 
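    "        # One (Linear, activation) pair is appended per entry in hidden_size, followed\n",
    "        # by a Linear + Sigmoid head; hidden_size=[512, 512, 512] therefore gives three\n",
    "        # 512-unit hidden layers and one sigmoid output per possible target token\n",
    "        # (trained with BCELoss against one-hot targets in the cells below).\n",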
" layers = [\n", 180 | " nn.Linear(input_size, hidden_size[0]),\n", 181 | " activation_dict[activation]()\n", 182 | " ] + flatten([\n", 183 | " [nn.Linear(hidden_size[i], hidden_size[i+1]),\n", 184 | " activation_dict[activation]()] for i in range(len(hidden_size) - 1)\n", 185 | " ]) + [\n", 186 | " nn.Linear(hidden_size[-1], output_size),\n", 187 | " nn.Sigmoid()\n", 188 | " ]\n", 189 | "\n", 190 | " self.model = nn.Sequential(*layers)\n", 191 | " \n", 192 | " # Initialize the linear layers\n", 193 | " self.init_weights()\n", 194 | "\n", 195 | " def init_weights(self):\n", 196 | " for m in self.model.modules():\n", 197 | " if isinstance(m, nn.Linear):\n", 198 | " init.xavier_uniform_(m.weight)\n", 199 | " init.zeros_(m.bias)\n", 200 | " \n", 201 | " def forward(self, x):\n", 202 | " return self.model(x)\n", 203 | " \n", 204 | "def cyclic_cosine_annealing_lr(lr, T_max, eta_min=0, last_epoch=-1):\n", 205 | " if last_epoch == 0:\n", 206 | " return lr\n", 207 | "\n", 208 | " if last_epoch % (2 * T_max) < T_max:\n", 209 | " return (\n", 210 | " eta_min\n", 211 | " + (lr - eta_min)\n", 212 | " * (1 + torch.cos(torch.tensor(3.1415 * last_epoch / T_max)))\n", 213 | " / 2\n", 214 | " )\n", 215 | " else:\n", 216 | " return (\n", 217 | " eta_min\n", 218 | " + (lr - eta_min)\n", 219 | " * (1 + torch.cos(torch.tensor(3.1415 * (last_epoch - T_max) / T_max)))\n", 220 | " / 2\n", 221 | " )" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "# Train model" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "def train(model, dataloader, criterion, optimizer, device):\n", 238 | " model.train()\n", 239 | " running_loss = 0.0\n", 240 | "\n", 241 | " for inputs, labels in dataloader:\n", 242 | " inputs, labels = inputs.to(device), labels.to(device)\n", 243 | " optimizer.zero_grad()\n", 244 | " outputs = model(inputs)\n", 245 | " loss = criterion(outputs, labels)\n", 246 | " loss.backward()\n", 247 | " optimizer.step()\n", 248 | "\n", 249 | " running_loss += loss.item() * inputs.size(0)\n", 250 | "\n", 251 | " epoch_loss = running_loss / len(dataloader.dataset)\n", 252 | " return epoch_loss" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "def evaluate(model, dataloader, criterion, device):\n", 262 | " model.eval()\n", 263 | " running_loss = 0.0\n", 264 | "\n", 265 | " with torch.no_grad():\n", 266 | " for inputs, labels in dataloader:\n", 267 | " inputs, labels = inputs.to(device), labels.to(device)\n", 268 | "\n", 269 | " outputs = model(inputs)\n", 270 | " loss = criterion(outputs, labels)\n", 271 | "\n", 272 | " running_loss += loss.item() * inputs.size(0)\n", 273 | "\n", 274 | " epoch_loss = running_loss / len(dataloader.dataset)\n", 275 | " return epoch_loss" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "#model = MultiLayerBinaryClassifier(input_size, [256, 256], output_size).to(DEVICE)\n", 285 | "model = MultiLayerBinaryClassifier(input_size, [512, 512, 512], output_size).to(DEVICE)\n", 286 | "learning_rate_init = 0.001\n", 287 | "num_epochs = 50\n", 288 | "best_val_loss = 1000\n", 289 | "\n", 290 | "criterion = nn.BCELoss()\n", 291 | "for epoch in range(num_epochs):\n", 292 | " t0 = time()\n", 293 | " lr_update = 
cyclic_cosine_annealing_lr(learning_rate_init, num_epochs, 0, epoch)\n", 294 | " optimizer = optim.Adam(model.parameters(), lr=lr_update)\n", 295 | " train_loss = train(model, train_dataloader, criterion, optimizer, DEVICE)\n", 296 | " test_loss = evaluate(model, train_dataloader, criterion, DEVICE)\n", 297 | " print(f'Epoch: {epoch+1}/{num_epochs}. Training loss: {train_loss:.4f}. Test loss: {test_loss:.4f}. Time: {time() - t0:.2f}s')\n", 298 | "\n", 299 | " if test_loss < best_val_loss:\n", 300 | " best_val_loss = test_loss\n", 301 | " torch.save(model, f'models/llm/full_{MODEL_NAME}.pth')" 302 | ] 303 | } 304 | ], 305 | "metadata": { 306 | "kernelspec": { 307 | "display_name": "Python 3", 308 | "language": "python", 309 | "name": "python3" 310 | }, 311 | "language_info": { 312 | "codemirror_mode": { 313 | "name": "ipython", 314 | "version": 3 315 | }, 316 | "file_extension": ".py", 317 | "mimetype": "text/x-python", 318 | "name": "python", 319 | "nbconvert_exporter": "python", 320 | "pygments_lexer": "ipython3", 321 | "version": "3.9.13" 322 | } 323 | }, 324 | "nbformat": 4, 325 | "nbformat_minor": 2 326 | } 327 | -------------------------------------------------------------------------------- /Older Versions/0331 Simple LLM K3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import torch\n", 12 | "import torch.nn as nn\n", 13 | "import torch.nn.init as init\n", 14 | "import torch.optim as optim\n", 15 | "from torch.utils.data import DataLoader, TensorDataset\n", 16 | "from tqdm import tqdm\n", 17 | "from time import time\n", 18 | "import json" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import matplotlib.pyplot as plt\n", 28 | "import matplotlib.colors as mcolors\n", 29 | "\n", 30 | "colors = ['white', '#eae2b7', '#fcbf49', '#f77f00', '#d62828', '#003049']\n", 31 | "boundaries = [-1, 0.001, 0.10, 0.25, 0.5, 0.75, 1]\n", 32 | "cmap = mcolors.ListedColormap(colors)\n", 33 | "norm = mcolors.BoundaryNorm(boundaries, cmap.N, clip=True)\n", 34 | "rose = '#ff99ac'" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "data_fname = 'data/llm/events_train_k3.txt'" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# Organizing data" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "df = pd.read_csv(data_fname, sep=' ', header=None, dtype=str)\n", 60 | "df = df.fillna('')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "train_df = []\n", 70 | "for i in range(33, 40):\n", 71 | " _df = df.copy()\n", 72 | " _df['target'] = _df[i]\n", 73 | " for j in range(i, 40):\n", 74 | " _df[j] = ''\n", 75 | "\n", 76 | " train_df.append(_df)\n", 77 | "train_df = pd.concat(train_df, ignore_index=True)\n", 78 | "train_df = train_df[train_df['target'] != '']" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "# Tokenizing" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | 
"outputs": [], 93 | "source": [ 94 | "tokenizer_map = json.load(open('models/llm/tokenizer_map.json', 'r'))\n", 95 | "detokenizer_map = {v: k for k, v in tokenizer_map.items()}" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "for i in range(0, 40):\n", 105 | " train_df[i] = train_df[i].map(tokenizer_map)\n", 106 | "train_df['target'] = train_df['target'].map(tokenizer_map)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "X_train = train_df.drop('target', axis=1).astype(float).values\n", 116 | "Y_train = pd.get_dummies(train_df['target']).astype(float).values\n", 117 | "\n", 118 | "X_train_tensor = torch.tensor(X_train, dtype=torch.float32)\n", 119 | "Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32)\n", 120 | "\n", 121 | "train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)\n", 122 | "\n", 123 | "train_dataloader = DataLoader(train_dataset, batch_size=1024, shuffle=True)\n", 124 | "\n", 125 | "input_size = X_train.shape[1]\n", 126 | "output_size = len(tokenizer_map) - 1 # because of token not existing in the output" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "# Define Model" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n", 143 | "MODEL_NAME = 'llm_v1_tokens_v2_k3'" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "def flatten(l):\n", 153 | " return [item for sublist in l for item in sublist]\n", 154 | "\n", 155 | "class MultiLayerBinaryClassifier(nn.Module):\n", 156 | " def __init__(self, input_size, hidden_size, output_size, activation='relu'):\n", 157 | " super(MultiLayerBinaryClassifier, self).__init__()\n", 158 | "\n", 159 | " activation_dict = {\n", 160 | " 'relu': nn.ReLU,\n", 161 | " 'sigmoid': nn.Sigmoid,\n", 162 | " 'tanh': nn.Tanh,\n", 163 | " 'leaky_relu': nn.LeakyReLU,\n", 164 | " }\n", 165 | " layers = [\n", 166 | " nn.Linear(input_size, hidden_size[0]),\n", 167 | " activation_dict[activation]()\n", 168 | " ] + flatten([\n", 169 | " [nn.Linear(hidden_size[i], hidden_size[i+1]),\n", 170 | " activation_dict[activation]()] for i in range(len(hidden_size) - 1)\n", 171 | " ]) + [\n", 172 | " nn.Linear(hidden_size[-1], output_size),\n", 173 | " nn.Sigmoid()\n", 174 | " ]\n", 175 | "\n", 176 | " self.model = nn.Sequential(*layers)\n", 177 | " \n", 178 | " # Initialize the linear layers\n", 179 | " self.init_weights()\n", 180 | "\n", 181 | " def init_weights(self):\n", 182 | " for m in self.model.modules():\n", 183 | " if isinstance(m, nn.Linear):\n", 184 | " init.xavier_uniform_(m.weight)\n", 185 | " init.zeros_(m.bias)\n", 186 | " \n", 187 | " def forward(self, x):\n", 188 | " return self.model(x)\n", 189 | " \n", 190 | "def cyclic_cosine_annealing_lr(lr, T_max, eta_min=0, last_epoch=-1):\n", 191 | " if last_epoch == 0:\n", 192 | " return lr\n", 193 | "\n", 194 | " if last_epoch % (2 * T_max) < T_max:\n", 195 | " return (\n", 196 | " eta_min\n", 197 | " + (lr - eta_min)\n", 198 | " * (1 + torch.cos(torch.tensor(3.1415 * last_epoch / T_max)))\n", 199 | " / 2\n", 200 | " )\n", 201 | " else:\n", 202 | " return (\n", 203 | " eta_min\n", 
204 | " + (lr - eta_min)\n", 205 | " * (1 + torch.cos(torch.tensor(3.1415 * (last_epoch - T_max) / T_max)))\n", 206 | " / 2\n", 207 | " )" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "# Train model" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "def train(model, dataloader, criterion, optimizer, device):\n", 224 | " model.train()\n", 225 | " running_loss = 0.0\n", 226 | "\n", 227 | " for inputs, labels in dataloader:\n", 228 | " inputs, labels = inputs.to(device), labels.to(device)\n", 229 | " optimizer.zero_grad()\n", 230 | " outputs = model(inputs)\n", 231 | " loss = criterion(outputs, labels)\n", 232 | " loss.backward()\n", 233 | " optimizer.step()\n", 234 | "\n", 235 | "\n", 236 | " running_loss += loss.item() * inputs.size(0)\n", 237 | "\n", 238 | " epoch_loss = running_loss / len(dataloader.dataset)\n", 239 | " return epoch_loss" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "def evaluate(model, dataloader, criterion, device):\n", 249 | " model.eval()\n", 250 | " running_loss = 0.0\n", 251 | "\n", 252 | " with torch.no_grad():\n", 253 | " for inputs, labels in dataloader:\n", 254 | " inputs, labels = inputs.to(device), labels.to(device)\n", 255 | "\n", 256 | " outputs = model(inputs)\n", 257 | " loss = criterion(outputs, labels)\n", 258 | "\n", 259 | " running_loss += loss.item() * inputs.size(0)\n", 260 | "\n", 261 | " epoch_loss = running_loss / len(dataloader.dataset)\n", 262 | " return epoch_loss" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "model = MultiLayerBinaryClassifier(input_size, [512, 512, 512], output_size).to(DEVICE)\n", 272 | "learning_rate_init = 0.001\n", 273 | "num_epochs = 50\n", 274 | "best_val_loss = 1000\n", 275 | "\n", 276 | "criterion = nn.BCELoss()\n", 277 | "for epoch in range(num_epochs):\n", 278 | " t0 = time()\n", 279 | " lr_update = cyclic_cosine_annealing_lr(learning_rate_init, num_epochs, 0, epoch)\n", 280 | " optimizer = optim.Adam(model.parameters(), lr=lr_update)\n", 281 | " train_loss = train(model, train_dataloader, criterion, optimizer, DEVICE)\n", 282 | " test_loss = evaluate(model, train_dataloader, criterion, DEVICE)\n", 283 | " print(f'Epoch: {epoch+1}/{num_epochs}. Training loss: {train_loss:.4f}. Test loss: {test_loss:.4f}. 
Time: {time() - t0:.2f}s')\n", 284 | "\n", 285 | " if test_loss < best_val_loss:\n", 286 | " best_val_loss = test_loss\n", 287 | " torch.save(model, f'models/llm/full_{MODEL_NAME}.pth')" 288 | ] 289 | } 290 | ], 291 | "metadata": { 292 | "kernelspec": { 293 | "display_name": "Python 3", 294 | "language": "python", 295 | "name": "python3" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.9.13" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 2 312 | } 313 | -------------------------------------------------------------------------------- /Older Versions/lib/data_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.preprocessing import OneHotEncoder, StandardScaler 4 | 5 | FIELD_SIZE = {'x':1.05, 'y':0.68} 6 | 7 | VAEP_FEATURE_SET = [ 8 | 'absolute_sec', 'period', 9 | 'x', 'y', 'end_x', 'end_y', 10 | 'goal', 'own_goal', 'assist', 'key_pass', 11 | 'counter_attack', 'left', 'right', 'head', 'direct', 'indirect', 12 | 'dangerous_ball_lost', 'blocked', 'high', 'low', 'interception', 13 | 'clearance', 'opportunity', 'feint', 'missed ball', 'sliding_tackle', 14 | 'anticipated', 'anticipation', 'red', 'yellow', 'second_yellow', 15 | 'through', 'lost', 'neutral', 'won', 'accurate', 16 | 'not_accurate', 'subtype_id', 'type_id', 17 | 'possession_type_id', 18 | 'possession_team_id', 'previous_action_type_id_1', 19 | 'previous_action_is_same_team_1', 'previous_action_is_same_possession_1', 20 | 'previous_action_is_same_player_1', 'previous_action_x_1', 'previous_action_y_1', 21 | 'previous_action_time_since_1', 'previous_action_x_displacement_1', 22 | 'previous_action_type_id_2', 'previous_action_is_same_team_2', 23 | 'previous_action_is_same_possession_2', 'previous_action_is_same_player_2', 24 | 'previous_action_x_2', 'previous_action_y_2', 'previous_action_time_since_2', 25 | 'previous_action_x_displacement_2', 'possession_start_is_same_team', 26 | 'possession_start_action_x', 'possession_start_action_y', 27 | 'possession_start_time_since', 'possession_start_x_displacement', 28 | 'start_distance_to_goal', 'start_angle_to_goal', 'end_distance_to_goal', 29 | 'end_angle_to_goal', 'intent_progressive', 'shot_assist' 30 | ] 31 | 32 | XG_FEATURE_SET = [ 33 | 'subtype_id', 'x', 'y', 34 | 'left', 'right', 'head', 35 | 'previous_action_type_id_1', 'previous_action_is_same_team_1', 36 | 'previous_action_is_same_player_1', 'previous_action_x_1', 'previous_action_y_1', 37 | 'previous_action_time_since_1', 'previous_action_x_displacement_1', 38 | 'possession_type_id', 39 | 'possession_start_action_x', 'possession_start_action_y', 40 | 'possession_start_time_since', 'possession_start_x_displacement', 41 | 'start_distance_to_goal', 'start_angle_to_goal', 'end_distance_to_goal', 'end_angle_to_goal' 42 | ] 43 | 44 | SUBEVENT_TYPE_MAP = { 45 | 'air_duel': 1, 46 | 'ground_attacking_duel': 2, 47 | 'ground_defending_duel': 3, 48 | 'ground_loose_ball_duel': 4, 49 | 'foul': 5, 50 | 'hand_foul': 6, 51 | 'late_card_foul': 7, 52 | 'out_of_game_foul': 8, 53 | 'protest': 9, 54 | 'simulation': 10, 55 | 'time_lost_foul': 11, 56 | 'violent_foul': 12, 57 | 'corner': 13, 58 | 'free_kick': 14, 59 | 'free_kick_cross': 15, 60 | 'goal_kick': 16, 61 | 'penalty': 17, 62 | 'throw_in': 
18, 63 | 'goalkeeper_leaving_line': 19, 64 | 'acceleration': 20, 65 | 'clearance': 21, 66 | 'touch': 22, 67 | 'cross': 23, 68 | 'hand_pass': 24, 69 | 'head_pass': 25, 70 | 'high_pass': 26, 71 | 'launch': 27, 72 | 'simple_pass': 28, 73 | 'smart_pass': 29, 74 | 'reflexes': 30, 75 | 'save_attempt': 31, 76 | 'free_kick_shot': 32, 77 | 'shot': 33, 78 | } 79 | 80 | EVENT_TYPE_MAP = { 81 | 'duel': 1, 82 | 'foul': 2, 83 | 'free_kick': 3, 84 | 'goalkeeper_leaving_line': 4, 85 | 'offside': 5, 86 | 'others_on_the_ball': 6, 87 | 'pass': 7, 88 | 'interruption': 8, 89 | 'save_attempt': 9, 90 | 'shot': 10, 91 | } 92 | 93 | 94 | def load_data(path): 95 | df = pd.read_csv(path) 96 | df = df.fillna(0) 97 | return df 98 | 99 | def compute_features(df): 100 | 101 | df['subtype_id'] = df['subtype_name'].map(SUBEVENT_TYPE_MAP) 102 | df['type_id'] = df['type_name'].map(EVENT_TYPE_MAP) 103 | df = df.dropna(subset=['subtype_id']).copy() 104 | 105 | df['player_is_next_1'] = np.where((df.type_name == 'pass') & (df.team_name == df.team_name.shift(-1)), df.player_name.shift(-1), '') 106 | df['receiving_player_name'] = np.where((df.type_name == 'pass') & (df.team_name == df.team_name.shift(-2)), df.player_name.shift(-2), '') 107 | df.loc[df['receiving_player_name'] == '', 'receiving_player_name'] = df.loc[df['receiving_player_name'] == '', 'player_is_next_1'] 108 | 109 | # A possession starts with a pass and ends when a successful pass from the opponent is made 110 | # or when the ball goes out of play 111 | start_new_possession = (((df['type_name'] == 'pass') * df['accurate'] + (df['type_name'] == 'free_kick')) * df.team_id).replace(0, np.NaN).ffill() 112 | start_new_possession = (start_new_possession != start_new_possession.shift(1)).cumsum() 113 | start_new_possession = start_new_possession + ((df['type_name'] == 'interruption') | (df['type_name'] == 'foul')).shift(1).fillna(0).cumsum() 114 | df['possession_id'] = start_new_possession 115 | df['possession_type_name'] = (df['possession_id'].diff(1).fillna(1) * df['type_name']).replace('', np.NaN).ffill() 116 | df['possession_type_id'] = df['possession_type_name'].map(EVENT_TYPE_MAP) 117 | df['possession_team_id'] = (df['possession_id'].diff(1).fillna(1) * df['team_id']).replace(0, np.NaN).ffill() 118 | df['possession_start_time'] = (df['possession_id'].diff(1).fillna(1) * df['absolute_sec']).replace(0, np.NaN).ffill() 119 | 120 | for i in range(1, 3): 121 | df[f'previous_action_type_id_{i}'] = df['type_id'].shift(i) 122 | df[f'previous_action_is_same_team_{i}'] = (df['team_id'] == df['team_id'].shift(i)).astype(int) 123 | df[f'previous_action_is_same_possession_{i}'] = (df['possession_id'] == df['possession_id'].shift(i)).astype(int) 124 | df[f'previous_action_is_same_player_{i}'] = (df['player_id'] == df['player_id'].shift(i)).astype(int) 125 | df[f'previous_action_x_{i}'] = abs((100 * (1-df[f'previous_action_is_same_team_{i}'])) - df['x'].shift(i)) 126 | df[f'previous_action_y_{i}'] = abs((100 * (1-df[f'previous_action_is_same_team_{i}'])) - df['y'].shift(i)) 127 | df[f'previous_action_time_since_{i}'] = df['absolute_sec'] - df['absolute_sec'].shift(i) 128 | df[f'previous_action_x_displacement_{i}'] = df['x'] - df[f'previous_action_x_{i}'] 129 | 130 | df['possession_start_is_same_team'] = (df['possession_team_id'] == df['team_id']).astype(int) 131 | df['possession_start_action_x'] = (df['possession_id'].diff(1).fillna(1) * df['x']).replace(0, np.NaN).ffill() 132 | df['possession_start_action_y'] = (df['possession_id'].diff(1).fillna(1) * df['y']).replace(0, 
np.NaN).ffill() 133 | df['possession_start_time_since'] = df['absolute_sec'] - df['possession_start_time'] 134 | df['possession_start_x_displacement'] = df['x'] - df['possession_start_action_x'] 135 | 136 | df['start_distance_to_goal'] = np.sqrt(((df['x'] - 100) * FIELD_SIZE['x'])**2 + ((df['y'] - 50) * FIELD_SIZE['y'])**2) 137 | df['start_angle_to_goal'] = abs(np.arctan2((df['y'] - 50) * FIELD_SIZE['y'], (df['x'] - 100) * FIELD_SIZE['x'])) 138 | df['end_distance_to_goal'] = np.sqrt(((df['end_x'] - 100) * FIELD_SIZE['x'])**2 + ((df['end_y'] - 50) * FIELD_SIZE['y'])**2) 139 | df['end_angle_to_goal'] = abs(np.arctan2((df['end_y'] - 50) * FIELD_SIZE['y'], (df['end_x'] - 100) * FIELD_SIZE['x'])) 140 | 141 | df['intent_progressive'] = ((df['type_name'] == 'pass') * (df['end_distance_to_goal'] < df['start_distance_to_goal'])).astype(int) 142 | 143 | df['shot_assist'] = ((df['type_name'].isin(['pass', 'free_kick']) & (df['accurate'] == 1)) & (((df['type_name'].shift(1) == 'shot') | (df['type_name'].shift(2) == 'shot')).astype(int).diff() < 0)).shift(-1) 144 | 145 | df['home_score'] = ( 146 | ((df.type_name == 'shot') & (df.goal == 1) & (df.team_id == df.home_team_id)) | 147 | ((df.type_name.isin(['others_on_the_ball', 'pass'])) & (df.own_goal == 1) & (df.team_id == df.away_team_id)) 148 | ).cumsum() 149 | df['home_score'] = df['home_score'] - df['match_id'].map(df.groupby('match_id')['home_score'].min()) 150 | df['away_score'] = ( 151 | ((df.type_name == 'shot') & (df.goal == 1) & (df.team_id == df.away_team_id)) | 152 | ((df.type_name.isin(['others_on_the_ball', 'pass'])) & (df.own_goal == 1) & (df.team_id == df.home_team_id)) 153 | ).cumsum() 154 | df['away_score'] = df['away_score'] - df['match_id'].map(df.groupby('match_id')['away_score'].min()) 155 | 156 | df = df.fillna(0) 157 | 158 | return df 159 | 160 | def compute_labels(df, k=5): 161 | df['goal'] = df['goal'].fillna(0) 162 | 163 | actions_before_goal = None 164 | actions_before_own_goal = None 165 | for i in range(k): 166 | if actions_before_goal is None: 167 | actions_before_goal = df.goal.shift(-(i)) 168 | actions_before_own_goal = -df.own_goal.shift(-(i)) 169 | else: 170 | actions_before_goal += df.goal.shift(-(i)) 171 | actions_before_own_goal -= df.own_goal.shift(-(i)) 172 | actions_before_goal = actions_before_goal.fillna(0) 173 | actions_before_own_goal = actions_before_own_goal.fillna(0) 174 | 175 | is_same_period = (df.goal * df.period).replace(to_replace=False, method='bfill') == df.period 176 | is_same_game = (df.goal * df.match_id).replace(to_replace=False, method='bfill') == df.match_id 177 | is_team_next_goal = 2 * ((df.goal * df.team_id).replace(to_replace=False, method='bfill') == df.team_id) - 1 178 | is_team_next_goal *= actions_before_own_goal 179 | time_before_goal = ((df.goal * df.absolute_sec).replace(to_replace=False, method='bfill') - df.absolute_sec) / 60 180 | 181 | df['vaep_label_0'] = actions_before_goal * is_same_period * is_same_game * is_team_next_goal 182 | df['vaep_label_0_scoring'] = df['vaep_label_0'].clip(0, 1) 183 | df['vaep_label_0_conceding'] = abs(df['vaep_label_0'].clip(-1, 0)) 184 | 185 | action_importance = np.maximum(1 - time_before_goal, actions_before_goal) * is_same_period * is_same_game 186 | action_importance[action_importance < 0] = 0 187 | action_importance *= is_team_next_goal 188 | df['VAEP_label_regression'] = action_importance 189 | 190 | match_result = df.match_winner != 0 191 | match_result *= (df.match_winner == df.team_id).astype(int) * 2 - 1 192 | df['vaep_label_winner'] = 
match_result 193 | 194 | return df 195 | 196 | def encode_targets(df): 197 | enc = OneHotEncoder() 198 | df['next_action_type'] = df.groupby('match_id')['subtype_id'].shift(-1).fillna(28).astype(int) 199 | df['next_action_plus_seconds'] = (df.groupby('match_id')['absolute_sec'].shift(-1) - df['absolute_sec']).clip(0, 30).round().fillna(0) 200 | df['next_action_x'] = df.groupby('match_id')['x'].shift(-1).fillna(50) 201 | df['next_action_y'] = df.groupby('match_id')['y'].shift(-1).fillna(50) 202 | df['next_action_accurate'] = df.groupby('match_id')['accurate'].shift(-1).fillna(1) 203 | df['next_action_goal'] = df.groupby('match_id')['goal'].shift(-1).fillna(0) 204 | df['is_home_team'] = df['team_id'] == df['home_team_id'] 205 | df['next_action_team'] = df.groupby('match_id')['is_home_team'].shift(-1).fillna(1) 206 | targets = ['next_action_plus_seconds', 'next_action_x', 'next_action_y', 'next_action_type', 'next_action_accurate', 'next_action_team', 'next_action_goal'] 207 | 208 | enc.fit(df[targets]) 209 | df_enc = pd.DataFrame(enc.transform(df[targets]).toarray(), columns=enc.get_feature_names_out(enc.feature_names_in_)) 210 | df_enc = df_enc.rename(columns={col: col.split('.')[0] for col in df_enc.columns}) 211 | df_enc = df_enc.drop(columns=[col for col in df_enc.columns if 'nan' in col]) 212 | 213 | df_enc_type = df_enc[df_enc.columns[df_enc.columns.str.contains('next_action_type')]] 214 | df_enc_acc = df_enc[df_enc.columns[(df_enc.columns.str.contains('next_action_accurate') | df_enc.columns.str.contains('next_action_goal')) & ~df_enc.columns.str.contains('_0')]] 215 | df_enc_data = df_enc[df_enc.columns[~df_enc.columns.str.contains('next_action_type') & ~df_enc.columns.str.contains('next_action_accurate') & ~df_enc.columns.str.contains('next_action_goal') & ~df_enc.columns.str.contains('team_False')]] 216 | 217 | return df_enc_type, df_enc_acc, df_enc_data 218 | 219 | def normalize_and_encode_features(df): 220 | df['next_action_type'] = df.groupby('match_id')['subtype_id'].shift(-1).fillna(28).astype(int) 221 | df['next_action_accurate'] = df.groupby('match_id')['accurate'].shift(-1).fillna(1) 222 | df['next_action_goal'] = df.groupby('match_id')['goal'].shift(-1).fillna(0) 223 | 224 | df['x'] = df.x / 100 225 | df['y'] = df.y / 100 226 | 227 | df['home_score'] = df.home_score / 10 228 | df['away_score'] = df.away_score / 10 229 | 230 | df['minute'] = df.minute / 60 231 | df['period'] = df.period - 1 232 | 233 | df.subtype_id = df.subtype_id.astype(int) 234 | df = pd.get_dummies(df, columns=['subtype_id']) 235 | enc_type_vars = [i for i in list(df.columns) if 'subtype_id_' in i] 236 | df = pd.get_dummies(df, columns=['next_action_type']) 237 | enc_next_type_vars = [i for i in list(df.columns) if 'next_action_type_' in i] 238 | 239 | features = enc_type_vars + ['period', 'minute', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score'] + enc_next_type_vars + ['next_action_accurate', 'next_action_goal'] 240 | 241 | return df, features 242 | 243 | def encode_targets_v2(df): 244 | enc = OneHotEncoder() 245 | df['next_action_type'] = df.groupby('match_id')['subtype_id'].shift(-1).fillna(28).astype(int) 246 | df['next_action_plus_seconds'] = (df.groupby('match_id')['absolute_sec'].shift(-1) - df['absolute_sec']).clip(0, 60).round().fillna(0) 247 | df['next_action_x'] = df.groupby('match_id')['x'].shift(-1).fillna(50) 248 | df['next_action_y'] = df.groupby('match_id')['y'].shift(-1).fillna(50) 249 | df['next_action_accurate'] = 
df.groupby('match_id')['accurate'].shift(-1).fillna(1) 250 | df['next_action_goal'] = df.groupby('match_id')['goal'].shift(-1).fillna(0) 251 | df['is_home_team'] = df['team_id'] == df['home_team_id'] 252 | df['next_action_team'] = df.groupby('match_id')['is_home_team'].shift(-1).fillna(1) 253 | targets = ['next_action_plus_seconds', 'next_action_x', 'next_action_y', 'next_action_type', 'next_action_accurate', 'next_action_team', 'next_action_goal'] 254 | 255 | enc.fit(df[targets]) 256 | df_enc = pd.DataFrame(enc.transform(df[targets]).toarray(), columns=enc.get_feature_names_out(enc.feature_names_in_)) 257 | df_enc = df_enc.rename(columns={col: col.split('.')[0] for col in df_enc.columns}) 258 | df_enc = df_enc.drop(columns=[col for col in df_enc.columns if 'nan' in col]) 259 | 260 | df_enc_loc = df_enc[df_enc.columns[df_enc.columns.str.contains('next_action_x') | df_enc.columns.str.contains('next_action_y')]] 261 | df_enc_type = df_enc[df_enc.columns[df_enc.columns.str.contains('next_action_type')]] 262 | df_enc_acc = df_enc[df_enc.columns[(df_enc.columns.str.contains('next_action_accurate') | df_enc.columns.str.contains('next_action_goal')) & ~df_enc.columns.str.contains('_0')]] 263 | df_enc_data = df_enc[df_enc.columns[~df_enc.columns.str.contains('next_action_x') & ~df_enc.columns.str.contains('next_action_y') & ~df_enc.columns.str.contains('next_action_type') & ~df_enc.columns.str.contains('next_action_accurate') & ~df_enc.columns.str.contains('next_action_goal') & ~df_enc.columns.str.contains('team_False')]] 264 | 265 | df_y = { 266 | 'LOC': df_enc_loc, 267 | 'TYPE': df_enc_type, 268 | 'ACC': df_enc_acc, 269 | 'DATA': df_enc_data 270 | } 271 | 272 | return df_y 273 | 274 | def normalize_and_encode_features_v2(df): 275 | df['next_action_type'] = df.groupby('match_id')['subtype_id'].shift(-1).fillna(28).astype(int) 276 | df['next_action_accurate'] = df.groupby('match_id')['accurate'].shift(-1).fillna(1) 277 | df['next_action_goal'] = df.groupby('match_id')['goal'].shift(-1).fillna(0) 278 | df['next_action_x'] = df.groupby('match_id')['x'].shift(-1).fillna(50) 279 | df['next_action_y'] = df.groupby('match_id')['y'].shift(-1).fillna(50) 280 | 281 | df['x'] = df.x / 100 282 | df['y'] = df.y / 100 283 | df['next_action_x'] = df.next_action_x / 100 284 | df['next_action_y'] = df.next_action_y / 100 285 | 286 | df['home_score'] = df.home_score / 10 287 | df['away_score'] = df.away_score / 10 288 | 289 | df['minute'] = df.minute / 60 290 | df['period'] = df.period - 1 291 | 292 | df.subtype_id = df.subtype_id.astype(int) 293 | df = pd.get_dummies(df, columns=['subtype_id']) 294 | enc_type_vars = [i for i in list(df.columns) if 'subtype_id_' in i] 295 | df = pd.get_dummies(df, columns=['next_action_type']) 296 | enc_next_type_vars = [i for i in list(df.columns) if 'next_action_type_' in i] 297 | 298 | features = enc_type_vars + ['period', 'minute', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score'] + ['next_action_x', 'next_action_y'] + enc_next_type_vars + ['next_action_accurate', 'next_action_goal'] 299 | 300 | features_model = { 301 | 'LOC': enc_type_vars + ['period', 'minute', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score'], 302 | 'TYPE': enc_type_vars + ['period', 'minute', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score'] + ['next_action_x', 'next_action_y'], 303 | 'ACC': enc_type_vars + ['period', 'minute', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score'] + ['next_action_x', 
'next_action_y'] + enc_next_type_vars, 304 | 'DATA': enc_type_vars + ['period', 'minute', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score'] + ['next_action_x', 'next_action_y'] + enc_next_type_vars + ['next_action_accurate', 'next_action_goal'] 305 | } 306 | 307 | return df, features, features_model 308 | 309 | 310 | def encode_targets_v3(df): 311 | enc = OneHotEncoder() 312 | df['next_action_type'] = df.groupby('match_id')['subtype_id'].shift(-1).fillna(28).astype(int) 313 | df['next_action_plus_seconds'] = (df.groupby('match_id')['absolute_sec'].shift(-1) - df['absolute_sec']).clip(0, 60).round().fillna(0) 314 | df['next_action_x'] = df.groupby('match_id')['x'].shift(-1).fillna(50) 315 | df['next_action_y'] = df.groupby('match_id')['y'].shift(-1).fillna(50) 316 | df['next_action_accurate'] = df.groupby('match_id')['accurate'].shift(-1).fillna(1) 317 | df['next_action_goal'] = df.groupby('match_id')['goal'].shift(-1).fillna(0) 318 | df['is_home_team'] = df['team_id'] == df['home_team_id'] 319 | df['next_action_team'] = df.groupby('match_id')['is_home_team'].shift(-1).fillna(True) 320 | targets = ['next_action_plus_seconds', 'next_action_x', 'next_action_y', 'next_action_type', 'next_action_accurate', 'next_action_team', 'next_action_goal'] 321 | 322 | enc.fit(df[targets]) 323 | df_enc = pd.DataFrame(enc.transform(df[targets]).toarray(), columns=enc.get_feature_names_out(enc.feature_names_in_)) 324 | df_enc = df_enc.rename(columns={col: col.split('.')[0] for col in df_enc.columns}) 325 | df_enc = df_enc.drop(columns=[col for col in df_enc.columns if 'nan' in col]) 326 | 327 | df_enc_type = df_enc[df_enc.columns[df_enc.columns.str.contains('next_action_type')]] 328 | df_enc_acc = df_enc[df_enc.columns[(df_enc.columns.str.contains('next_action_accurate') | df_enc.columns.str.contains('next_action_goal')) & ~df_enc.columns.str.contains('_0')]] 329 | df_enc_data = df_enc[df_enc.columns[~df_enc.columns.str.contains('next_action_type') & ~df_enc.columns.str.contains('next_action_accurate') & ~df_enc.columns.str.contains('next_action_goal') & ~df_enc.columns.str.contains('team_False')]] 330 | 331 | df_y = { 332 | 'TYPE': df_enc_type.reset_index(drop=True), 333 | 'ACC': df_enc_acc.reset_index(drop=True), 334 | 'DATA': df_enc_data.reset_index(drop=True) 335 | } 336 | 337 | return df_y 338 | 339 | def normalize_and_encode_features_v3(df): 340 | df['next_action_type'] = df.groupby('match_id')['subtype_id'].shift(-1).fillna(28).astype(int) 341 | df['next_action_accurate'] = df.groupby('match_id')['accurate'].shift(-1).fillna(1) 342 | df['next_action_goal'] = df.groupby('match_id')['goal'].shift(-1).fillna(0) 343 | df['next_action_x'] = df.groupby('match_id')['x'].shift(-1).fillna(50) 344 | df['next_action_y'] = df.groupby('match_id')['y'].shift(-1).fillna(50) 345 | 346 | df['x'] = df.x / 100 347 | df['y'] = df.y / 100 348 | df['next_action_x'] = df.next_action_x / 100 349 | df['next_action_y'] = df.next_action_y / 100 350 | 351 | df['home_score'] = df.home_score / 10 352 | df['away_score'] = df.away_score / 10 353 | 354 | df['minute'] = df.minute / 60 355 | df['period'] = df.period - 1 356 | 357 | df.subtype_id = df.subtype_id.astype(int) 358 | df = pd.get_dummies(df, columns=['subtype_id']) 359 | enc_type_vars = [i for i in list(df.columns) if 'subtype_id_' in i] 360 | df = pd.get_dummies(df, columns=['next_action_type']) 361 | enc_next_type_vars = [i for i in list(df.columns) if 'next_action_type_' in i] 362 | 363 | features = enc_type_vars + ['period', 'minute', 'x', 'y', 
'is_home_team', 'accurate', 'goal', 'home_score', 'away_score'] + ['next_action_x', 'next_action_y'] + enc_next_type_vars + ['next_action_accurate', 'next_action_goal'] 364 | 365 | features_model = { 366 | 'TYPE': enc_type_vars + ['period', 'minute', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score'], 367 | 'ACC': enc_type_vars + ['period', 'minute', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score'] + enc_next_type_vars, 368 | 'DATA': enc_type_vars + ['period', 'minute', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score'] + enc_next_type_vars + ['next_action_accurate', 'next_action_goal'] 369 | } 370 | 371 | return df.reset_index(drop=True), features, features_model 372 | 373 | def load_model_training_data_template(train_sets, optimization_sets, test_sets): 374 | df_train = [] 375 | df_train_y = None 376 | if train_sets != []: 377 | for fname in train_sets: 378 | df_train.append(load_data(fname)) 379 | df_train[-1] = compute_features(df_train[-1]) 380 | df_train = pd.concat(df_train) 381 | df_train_y = encode_targets_v3(df_train) 382 | df_train, complete_feature_set, features_model = normalize_and_encode_features_v3(df_train) 383 | 384 | df_optimization = [] 385 | df_optimization_y = None 386 | if optimization_sets != []: 387 | for fname in optimization_sets: 388 | df_optimization.append(load_data(fname)) 389 | df_optimization[-1] = compute_features(df_optimization[-1]) 390 | df_optimization = pd.concat(df_optimization) 391 | df_optimization_y = encode_targets_v3(df_optimization) 392 | df_optimization, complete_feature_set, features_model = normalize_and_encode_features_v3(df_optimization) 393 | 394 | df_test = [] 395 | df_test_y = None 396 | if test_sets != []: 397 | for fname in test_sets: 398 | df_test.append(load_data(fname)) 399 | df_test[-1] = compute_features(df_test[-1]) 400 | df_test = pd.concat(df_test) 401 | df_test_y = encode_targets_v3(df_test) 402 | df_test, complete_feature_set, features_model = normalize_and_encode_features_v3(df_test) 403 | 404 | return df_train, df_train_y, df_optimization, df_optimization_y, df_test, df_test_y, complete_feature_set, features_model -------------------------------------------------------------------------------- /Older Versions/lib/model_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.init as init 4 | import torch.optim as optim 5 | from torch.utils.data import DataLoader, TensorDataset 6 | from sklearn.metrics import log_loss 7 | 8 | class SingleLayerBinaryClassifier(nn.Module): 9 | def __init__(self, input_size, hidden_size, output_size): 10 | super(SingleLayerBinaryClassifier, self).__init__() 11 | self.model = nn.Sequential( 12 | nn.Linear(input_size, hidden_size), 13 | nn.ReLU(), 14 | nn.Linear(hidden_size, output_size), 15 | nn.Sigmoid() 16 | ) 17 | 18 | # Initialize the linear layers 19 | self.init_weights() 20 | 21 | def init_weights(self): 22 | for m in self.model.modules(): 23 | if isinstance(m, nn.Linear): 24 | init.xavier_uniform_(m.weight) 25 | init.zeros_(m.bias) 26 | 27 | def forward(self, x): 28 | return self.model(x) 29 | 30 | class TripleLayerBinaryClassifier(nn.Module): 31 | def __init__(self, input_size, hidden_size, output_size): 32 | super(TripleLayerBinaryClassifier, self).__init__() 33 | self.model = nn.Sequential( 34 | nn.Linear(input_size, hidden_size), 35 | nn.ReLU(), 36 | nn.Linear(hidden_size, hidden_size), 37 | nn.ReLU(), 38 | 
nn.Linear(hidden_size, hidden_size), 39 | nn.ReLU(), 40 | nn.Linear(hidden_size, output_size), 41 | nn.Sigmoid() 42 | ) 43 | 44 | # Initialize the linear layers 45 | self.init_weights() 46 | 47 | def init_weights(self): 48 | for m in self.model.modules(): 49 | if isinstance(m, nn.Linear): 50 | init.xavier_uniform_(m.weight) 51 | init.zeros_(m.bias) 52 | 53 | def forward(self, x): 54 | return self.model(x) 55 | 56 | def flatten(l): 57 | return [item for sublist in l for item in sublist] 58 | 59 | class MultiLayerBinaryClassifier(nn.Module): 60 | def __init__(self, input_size, hidden_size, output_size, activation='relu'): 61 | super(MultiLayerBinaryClassifier, self).__init__() 62 | 63 | activation_dict = { 64 | 'relu': nn.ReLU, 65 | 'sigmoid': nn.Sigmoid, 66 | 'tanh': nn.Tanh, 67 | 'leaky_relu': nn.LeakyReLU, 68 | } 69 | layers = [ 70 | nn.Linear(input_size, hidden_size[0]), 71 | activation_dict[activation]() 72 | ] + flatten([ 73 | [nn.Linear(hidden_size[i], hidden_size[i+1]), 74 | activation_dict[activation]()] for i in range(len(hidden_size) - 1) 75 | ]) + [ 76 | nn.Linear(hidden_size[-1], output_size), 77 | nn.Sigmoid() 78 | ] 79 | 80 | self.model = nn.Sequential(*layers) 81 | 82 | # Initialize the linear layers 83 | self.init_weights() 84 | 85 | def init_weights(self): 86 | for m in self.model.modules(): 87 | if isinstance(m, nn.Linear): 88 | init.xavier_uniform_(m.weight) 89 | init.zeros_(m.bias) 90 | 91 | def forward(self, x): 92 | return self.model(x) 93 | 94 | class TransferModel(nn.Module): 95 | def __init__(self, base_model): 96 | super(TransferModel, self).__init__() 97 | 98 | modules = list(base_model.children())[0][:-1] 99 | self.base_layers = nn.Sequential(*modules) 100 | if 'ReLU' in str(self.base_layers[-2]): 101 | self.transfer_activation = nn.ReLU() 102 | elif 'Sigmoid' in str(self.base_layers[-2]): 103 | self.transfer_activation = nn.Sigmoid() 104 | else: 105 | self.transfer_activation = nn.Sigmoid() 106 | 107 | output_size = list(base_model.children())[0][-2].out_features 108 | self.transfer_layer = nn.Linear(output_size, output_size) 109 | self.output_sigmoid = nn.Sigmoid() 110 | 111 | def forward(self, x): 112 | x = self.base_layers(x) 113 | x = self.transfer_activation(x) 114 | x = self.transfer_layer(x) 115 | x = self.output_sigmoid(x) 116 | return x 117 | 118 | 119 | def train(model, dataloader, criterion, optimizer, device, weights=None, l1_lambda=None): 120 | model.train() 121 | running_loss = 0.0 122 | 123 | for inputs, labels in dataloader: 124 | inputs, labels = inputs.to(device), labels.to(device) 125 | 126 | optimizer.zero_grad() 127 | 128 | outputs = model(inputs) 129 | if weights == None: 130 | loss = criterion(outputs, labels) 131 | else: 132 | loss = criterion(outputs[:,0], labels[:,0]) * weights[0] 133 | for i in range(1, len(weights)): 134 | loss += criterion(outputs[:,i], labels[:,i]) * weights[i] 135 | 136 | if l1_lambda != None: 137 | l1_reg = torch.tensor(0.).to(device) 138 | for param in model.parameters(): 139 | l1_reg += torch.norm(param, 1) 140 | loss += l1_lambda * l1_reg 141 | 142 | loss.backward() 143 | optimizer.step() 144 | 145 | running_loss += loss.item() * inputs.size(0) 146 | 147 | epoch_loss = running_loss / len(dataloader.dataset) 148 | return epoch_loss 149 | 150 | def evaluate(model, dataloader, criterion, device): 151 | model.eval() 152 | running_loss = 0.0 153 | 154 | with torch.no_grad(): 155 | for inputs, labels in dataloader: 156 | inputs, labels = inputs.to(device), labels.to(device) 157 | 158 | outputs = model(inputs) 159 | 
loss = criterion(outputs, labels)
160 | 
161 |             running_loss += loss.item() * inputs.size(0)
162 | 
163 |     epoch_loss = running_loss / len(dataloader.dataset)
164 |     return epoch_loss
165 | 
166 | def evaluate_log_loss(model, dataloader, device):
167 |     model.eval()
168 |     true_labels = []
169 |     predicted_probs = []
170 | 
171 |     with torch.no_grad():
172 |         for inputs, labels in dataloader:
173 |             inputs, labels = inputs.to(device), labels.to(device)
174 | 
175 |             outputs = model(inputs).cpu().numpy()
176 |             true_labels.extend(labels.cpu().numpy().tolist())
177 |             predicted_probs.extend(outputs.tolist())
178 | 
179 |     # FIXME: sklearn's log_loss assumes each row of predicted probabilities forms a distribution summing to 1;
180 |     # the independent sigmoid outputs collected here do not, so treat this value as indicative only for multi-label heads.
181 |     epoch_log_loss = log_loss(true_labels, predicted_probs)
182 |     return epoch_log_loss
183 | 
184 | def predict(model, inputs, device):
185 |     model.eval()
186 |     inputs = inputs.to(device)
187 |     with torch.no_grad():
188 |         outputs = model(inputs)
189 |     return outputs.cpu().numpy()
190 | 
191 | 
192 | # Define the objective function for Optuna optimization
193 | def objective(trial, X_train_tensor, Y_train_tensor, model_name, device=None, train_test_split=0.7, complexity_penalty=0.0):
194 |     if device is None:
195 |         device = torch.device("cpu")
196 | 
197 |     input_size = X_train_tensor.shape[1]
198 |     output_size = Y_train_tensor.shape[1]
199 |     num_epochs = 100
200 |     patience = 3
201 |     counter = 0
202 |     best_val_loss = 1000
203 | 
204 |     # Define hyperparameters
205 |     hidden_size = trial.suggest_int("hidden_size", 1, 3)
206 |     hidden_size_list = [2 ** trial.suggest_int(f"hidden_size_{i}", 4, 8) for i in range(3)]
207 |     lr = round(trial.suggest_float("lr", 1e-4, 1e-1), 4)
208 |     batch_size = 2 ** trial.suggest_int("batch_size", 5, 10)
209 |     activation = trial.suggest_categorical("activation", ["relu", "sigmoid", "tanh"])
210 | 
211 |     # Create the neural network
212 |     model = MultiLayerBinaryClassifier(input_size, [hidden_size_list[i] for i in range(hidden_size)], output_size, activation=activation)
213 | 
214 |     # Create the optimizer
215 |     optimizer = optim.Adam(model.parameters(), lr=lr)
216 | 
217 |     # Create the loss function
218 |     criterion = nn.BCELoss()
219 | 
220 |     # Sequential train/validation split within the provided training tensors
221 |     split_id = int(train_test_split * len(X_train_tensor))
222 |     _train_dataset = TensorDataset(X_train_tensor[:split_id], Y_train_tensor[:split_id])
223 |     _test_dataset = TensorDataset(X_train_tensor[split_id:], Y_train_tensor[split_id:])
224 | 
225 |     # Create dataloaders
226 |     _train_dataloader = DataLoader(_train_dataset, batch_size=batch_size, shuffle=True)
227 |     _test_dataloader = DataLoader(_test_dataset, batch_size=batch_size, shuffle=False)
228 | 
229 |     # Train the neural network with early stopping on the validation log loss
230 |     for epoch in range(num_epochs):
231 |         train_loss = train(model, _train_dataloader, criterion, optimizer, device)
232 |         test_log_loss = evaluate_log_loss(model, _test_dataloader, device)
233 | 
234 |         if test_log_loss < best_val_loss:
235 |             best_val_loss = test_log_loss
236 |             counter = 0
237 |             torch.save(model, f'models/lem/optuna_trials/{model_name}_{trial.number}.pt')
238 |         else:
239 |             counter += 1
240 |             if counter >= patience:
241 |                 break
242 | 
243 |     with open('res/model_tunning/lem_trial_results.csv', 'a') as f:
244 |         f.write(f'{model_name},{trial.number},{round(best_val_loss, 4)},{round(best_val_loss * (1 + complexity_penalty * hidden_size), 4)},{hidden_size},{hidden_size_list},{lr},{batch_size},{activation},{epoch}\n')
245 | 
246 | 
247 |     return best_val_loss * (1 + complexity_penalty * hidden_size)
--------------------------------------------------------------------------------
/Older Versions/lib/simulator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn.functional import one_hot
3 | from tqdm import tqdm
4 | import numpy as np
5 | 
6 | class Simulator:
7 |     def __init__(self, model_type_path, model_acc_path, model_data_path, device=None):
8 | 
9 |         if device is None:
10 |             self.device = torch.device('cpu')
11 |         else:
12 |             self.device = torch.device(device)
13 | 
14 |         self.model_type = torch.load(model_type_path).to(self.device)
15 |         self.model_acc = torch.load(model_acc_path).to(self.device)
16 |         self.model_data = torch.load(model_data_path).to(self.device)
17 | 
18 |         self.model_type.eval()
19 |         self.model_acc.eval()
20 |         self.model_data.eval()
21 | 
22 |     def simulate(self, initial_state, n_sims=1000, game_length=2000, store_full_sim=False, disable_tqdm=False):
23 |         init_feature_tensor = handle_initial_state(initial_state, n_sims)
24 |         feature_tensor = init_feature_tensor.to(self.device) # shape: (1000, 42); shape comments below assume n_sims=1000
25 |         if store_full_sim:
26 |             all_sims_data = [init_feature_tensor]
27 |         for k in tqdm(range(game_length), disable=disable_tqdm):
28 |             with torch.no_grad():
29 |                 pred_type_probs = self.model_type(feature_tensor) # shape: (1000, 33)
30 |                 pred_type = torch.multinomial(pred_type_probs, 1) # shape: (1000, 1)
31 |                 pred_type = one_hot(pred_type, num_classes=pred_type_probs.shape[1]).squeeze(1) # shape: (1000, 33)
32 | 
33 |                 pred_acc_input = torch.cat([feature_tensor, pred_type], dim=-1) # shape: (1000, 75)
34 |                 pred_acc_probs = self.model_acc(pred_acc_input) # shape: (1000, 2)
35 |                 pred_acc = torch.bernoulli(pred_acc_probs) # shape: (1000, 2)
36 | 
37 |                 pred_data_input = torch.cat([pred_acc_input, pred_acc], dim=-1) # shape: (1000, 77)
38 |                 pred_data_probs = self.model_data(pred_data_input) # shape: (1000, 264)
39 |                 one_hot_probs_1 = pred_data_probs[:, :61] # shape: (1000, 61)
40 |                 one_hot_probs_1 = one_hot_probs_1 / one_hot_probs_1.sum(dim=-1, keepdim=True)
41 |                 one_hot_probs_x = pred_data_probs[:, 61:162] # shape: (1000, 101)
42 |                 one_hot_probs_x = one_hot_probs_x / one_hot_probs_x.sum(dim=-1, keepdim=True)
43 |                 one_hot_probs_y = pred_data_probs[:, 162:263] # shape: (1000, 101)
44 |                 one_hot_probs_y = one_hot_probs_y / one_hot_probs_y.sum(dim=-1, keepdim=True)
45 |                 binary_prob = pred_data_probs[:, -1] # shape: (1000,)
46 | 
47 |                 pred_next_time = torch.multinomial(one_hot_probs_1, 1) # shape: (1000, 1)
48 |                 pred_next_x = torch.multinomial(one_hot_probs_x, 1) # shape: (1000, 1)
49 |                 pred_next_y = torch.multinomial(one_hot_probs_y, 1) # shape: (1000, 1)
50 |                 pred_next_team = torch.bernoulli(binary_prob).unsqueeze(1) # shape: (1000, 1)
51 | 
52 |                 feature_tensor, all_simulations_finished = refresh_feature_tensor(feature_tensor, pred_type, pred_acc, pred_next_time, pred_next_x, pred_next_y, pred_next_team)
53 |                 if store_full_sim:
54 |                     all_sims_data.append(feature_tensor)
55 | 
56 |                 if all_simulations_finished:
57 |                     break
58 | 
59 |         if store_full_sim:
60 |             return feature_tensor, all_sims_data
61 |         else:
62 |             return feature_tensor
63 | 
64 | def repeat_init_tensor(values, k):
65 |     tensor_values = torch.tensor(values, dtype=torch.float32)
66 |     repeated_tensor = tensor_values.repeat(k, 1)
67 |     return repeated_tensor
68 | 
69 | def refresh_feature_tensor(feature_tensor, pred_type_tensor, pred_acc_tensor, pred_next_time_tensor, pred_next_x_tensor, pred_next_y_tensor, pred_next_team_tensor):
70 |     pred_time_tensor = feature_tensor[:, 33:35]
71 | 
72 |     pred_time_tensor[:, 1] = pred_time_tensor[:, 1] + pred_next_time_tensor.squeeze(1) / 60 / 60
73 | 
pred_time_tensor[:, 0][pred_time_tensor[:, 1] > 0.75] += 1 74 | pred_time_tensor[:, 1][pred_time_tensor[:, 1] > 0.75] = 0 75 | 76 | ongoing_game_tensor = (pred_time_tensor[:, 0] <= 1) 77 | 78 | pred_next_score_tensor = feature_tensor[:, 40:42] 79 | is_shot_tensor = (pred_type_tensor[:, 32] == 1) | (pred_type_tensor[:, 31] == 1) | (pred_type_tensor[:, 16] == 1) 80 | pred_next_score_tensor[:, 0] = pred_next_score_tensor[:, 0] + ongoing_game_tensor * pred_acc_tensor[:,1] * pred_next_team_tensor.squeeze(1) * is_shot_tensor / 10 81 | pred_next_score_tensor[:, 1] = pred_next_score_tensor[:, 1] + ongoing_game_tensor * pred_acc_tensor[:,1] * (pred_next_team_tensor.squeeze(1) == 0) * is_shot_tensor / 10 82 | 83 | feature_tensor = torch.cat(( 84 | pred_type_tensor, 85 | pred_time_tensor, 86 | pred_next_x_tensor / 100, 87 | pred_next_y_tensor / 100, 88 | pred_next_team_tensor, 89 | pred_acc_tensor, 90 | pred_next_score_tensor), 1) 91 | 92 | all_simulations_finished = (pred_time_tensor[:, 0] > 1).sum() == pred_time_tensor.shape[0] 93 | 94 | return feature_tensor, all_simulations_finished 95 | 96 | def handle_initial_state(initial_state, n_sims=1000): 97 | if not isinstance(initial_state, np.ndarray): 98 | initial_state = np.array(initial_state) 99 | 100 | if len(initial_state.shape) == 1: 101 | init_feature_tensor = repeat_init_tensor(initial_state, n_sims) 102 | 103 | else: 104 | init_feature_tensor = torch.tensor(initial_state, dtype=torch.float32) 105 | 106 | return init_feature_tensor -------------------------------------------------------------------------------- /Older Versions/models/lem/LEMv3_MODEL_DATA_TORCH.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nvsclub/LargeEventsModel/fa654f556fbe02eb60e6fd7132b25bc3788c9772/Older Versions/models/lem/LEMv3_MODEL_DATA_TORCH.pth -------------------------------------------------------------------------------- /Older Versions/models/lem/LEMv3_MODEL_TYPE_TORCH.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nvsclub/LargeEventsModel/fa654f556fbe02eb60e6fd7132b25bc3788c9772/Older Versions/models/lem/LEMv3_MODEL_TYPE_TORCH.pth -------------------------------------------------------------------------------- /Older Versions/models/lem/LEMv4_MODEL_ACC_TORCH.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nvsclub/LargeEventsModel/fa654f556fbe02eb60e6fd7132b25bc3788c9772/Older Versions/models/lem/LEMv4_MODEL_ACC_TORCH.pth -------------------------------------------------------------------------------- /Older Versions/readme.md: -------------------------------------------------------------------------------- 1 | # Large Events Model 2 | 3 | This repository contains the code for the paper "Large Events Model: A Foundation Events Model for Soccer" and "Estimating Player Performance in Different Contexts Using Large Events Models". 4 | 5 | ## How to use* 6 | *Note: This code was developed for WyScout v2 data, specifically the freely available datasets. It currently does not support any other data provider.* 7 | 1. Load your data to the data/wyscout/json/ folder. 8 | 2. Run the converter to CSV (0001), followed by the Calculate Features (0011). 9 | 3. *Optional* Train your own models with notebooks 0111-0113. 10 | 4. Use the examples from the remaining notebooks to implement your own applications. 
The xP+ example provided in the first paper is shown in notebook 0132. Notebooks 021* provide the framework to fine-tune specific contexts using LEMs. 11 | -------------------------------------------------------------------------------- /lib/glob_fix.py: -------------------------------------------------------------------------------- 1 | import glob as g 2 | 3 | def glob(path): 4 | return [fname.replace('\\', '/') for fname in g.glob(path)] -------------------------------------------------------------------------------- /lib/theme_assets.py: -------------------------------------------------------------------------------- 1 | from matplotlib.colors import LinearSegmentedColormap 2 | 3 | CGREEN = "rgb(15, 157, 88)" 4 | CRED = "rgb(219, 68, 55)" 5 | CBLUE = "rgb(66, 133, 244)" 6 | CYELLOW = "rgb(244, 160, 0)" 7 | CORANGE = "rgb(255, 87, 34)" 8 | CCYAN = "rgb(0, 188, 212)" 9 | CWHITE = "rgb(255, 255, 255)" 10 | 11 | 12 | def C(i): 13 | return { 14 | 0: "#4285F4", 15 | 1: "#FF6D01", 16 | 2: "#46BDC6", 17 | 3: "#F4B400", 18 | # 1: '#DB4437', 19 | # 3: '#0F9D58', 20 | }[i % 4] 21 | 22 | 23 | def DC(i): 24 | return { 25 | 0: "#2066a8", 26 | 1: "#8ec1da", 27 | 2: "#cde1ec", 28 | 3: "#ededed", 29 | 4: "#f6d6c2", 30 | 5: "#d47264", 31 | 6: "#ae282c", 32 | }[i % 7] 33 | 34 | 35 | CUSTOM_CMAP = LinearSegmentedColormap.from_list( 36 | "custom_cmap", ["#DB4437", "#0F9D58"], N=7 37 | ) 38 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # A Scalable Approach for Unified Large Events Models in Soccer 2 | 3 | This repository contains the implementation of the research presented in the paper "A Scalable Approach for Unified Large Events Models in Soccer" by Tiago Mendes-Neves, Luís Meireles, and João Mendes-Moreira from Faculdade de Engenharia da Universidade do Porto and LIAAD - INESC TEC, Portugal. 4 | 5 | ## Abstract 6 | 7 | Large Events Models (LEMs) are a class of models designed to predict and analyze the sequence of events in soccer matches, capturing the complex dynamics of the game. The original LEM framework, based on a chain of classifiers, faced challenges such as synchronization, scalability issues, and limited context utilization. This paper proposes a unified and scalable approach to model soccer events using a tabular autoregressive model. Our models demonstrate significant improvements over the original LEM, achieving higher accuracy in event prediction and better simulation quality, while also offering greater flexibility and scalability. The unified LEM framework enables a wide range of applications in soccer analytics that we display in this paper, including real-time match outcome prediction, player performance analysis, and game simulation, serving as a general solution for many problems in the field. 8 | 9 | ## Demo 10 | [Watch the demonstration video](https://youtu.be/IjThR71EZ0Y) 11 | 12 | ## Project Structure 13 | 14 | The project consists of three main scripts: 15 | 16 | 1. **Preprocess Data to LEM.py**: Handles the preprocessing of soccer event data into the LEM standard format. 17 | 2. **Train Tabular LEMs.py**: Trains various neural network architectures (MLPs) on the preprocessed data. 18 | 3. **Benchmark Tabular LEMs.py**: Evaluates model performance through comprehensive benchmarking. 19 | 20 | The remaining notebooks contain analysis and application examples. 21 | 22 | ## Usage 23 | ### 0. 
Data 24 | This implementation is built for Wyscout V3 data, which should be organized in the following structure: 25 | - competitions.csv 26 | - seasons.csv 27 | - matches.csv 28 | - seasons/events/{season_id}.feather 29 | 30 | ### 1. Preprocessing Data 31 | 32 | ```bash 33 | python "0000 Preprocess Data to LEM.py" \ 34 | --data_dir /path/to/wyscout/data \ 35 | --output_dir /path/to/processed/data \ 36 | --seq_lengths 1 3 5 7 9 37 | ``` 38 | 39 | This script performs three main tasks: 40 | - Converts raw data to LEM standard format 41 | - Preprocesses data for tabular models 42 | - Preprocesses data for time series models (commented out by default) 43 | 44 | ### 2. Training Models 45 | 46 | ```bash 47 | python "0001 Train Tabular LEMs.py" \ 48 | --mode [survey|full] \ 49 | --data_dir /path/to/processed/data \ 50 | --output_dir /path/to/model/output \ 51 | --seq_lengths 1 3 5 7 9 52 | ``` 53 | 54 | The script supports two training modes: 55 | - **survey**: Quick training to compare different architectures 56 | - **full**: Complete training of selected architectures 57 | 58 | ### 3. Benchmarking Models 59 | 60 | ```bash 61 | python "0003 Benchmark Tabular LEMs.py" \ 62 | --data_dir /path/to/data \ 63 | --model_dir /path/to/model/files \ 64 | --output_dir /path/to/benchmark/results \ 65 | --seq_len 3 \ 66 | --n_sims 10000 67 | ``` 68 | 69 | This script performs comprehensive benchmarking including: 70 | - Model performance metrics (accuracy, F1-score) 71 | - Distribution analysis of predictions vs real data 72 | - Simulation analysis for game outcomes 73 | - Visualization of results 74 | 75 | ## Papers 76 | 77 | If you use this code or find it helpful for your research, read: 78 | 79 | ``` 80 | @article{mendesneves2024, 81 | author = {Mendes-Neves, Tiago and Meireles, Luís and Mendes-Moreira, João}, 82 | title = {Towards a foundation large events model for soccer}, 83 | journal = {Machine Learning}, 84 | volume = {113}, 85 | number = {11}, 86 | pages = {8687-8709}, 87 | year = {2024}, 88 | doi = {10.1007/s10994-024-06606-y}, 89 | url = {https://doi.org/10.1007/s10994-024-06606-y}, 90 | issn = {1573-0565}, 91 | } 92 | 93 | @misc{mendesneves2024estimatingplayerperformancedifferent, 94 | title={Estimating Player Performance in Different Contexts Using Fine-tuned Large Events Models}, 95 | author={Tiago Mendes-Neves and Luís Meireles and João Mendes-Moreira}, 96 | year={2024}, 97 | eprint={2402.06815}, 98 | archivePrefix={arXiv}, 99 | primaryClass={cs.LG}, 100 | url={https://arxiv.org/abs/2402.06815}, 101 | } 102 | 103 | ``` 104 | 105 | ## License 106 | GNU Affero General Public License (AGPL) 107 | The AGPL mandates that modified source code must be made openly available when the software is distributed or used as a network service. --------------------------------------------------------------------------------
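
For a quick end-to-end check of the legacy pipeline in `Older Versions/`, the sketch below shows one way the `Simulator` class and the bundled `LEMv3`/`LEMv4` checkpoints might be driven. It is a minimal sketch under stated assumptions, not a documented API: the 42-column initial-state layout (33 event-type one-hot columns followed by period, minute, x, y, is_home_team, accurate, goal, home_score, away_score) is inferred from `refresh_feature_tensor` in `simulator.py`, the kick-off event-type index is a guess, and the paths assume the snippet is run from inside `Older Versions/`.

```python
# Hedged usage sketch (not part of the repository): drive the legacy Simulator with the
# bundled checkpoints. The feature layout and the kick-off type index are assumptions
# inferred from refresh_feature_tensor and the shape comments in simulator.py.
import numpy as np
from lib.simulator import Simulator  # assumes the working directory is "Older Versions/"

sim = Simulator(
    model_type_path='models/lem/LEMv3_MODEL_TYPE_TORCH.pth',
    model_acc_path='models/lem/LEMv4_MODEL_ACC_TORCH.pth',
    model_data_path='models/lem/LEMv3_MODEL_DATA_TORCH.pth',
    device='cpu',
)

event_type = np.zeros(33)
event_type[0] = 1.0  # assumed: index 0 encodes a generic pass at kick-off
game_state = np.array([0.0, 0.0, 0.5, 0.5, 1.0, 1.0, 0.0, 0.0, 0.0])
#                      period, minute/60, x, y, is_home_team, accurate, goal, home_score/10, away_score/10
initial_state = np.concatenate([event_type, game_state])  # 42 features in total

# Roll the match forward; each row of the result is the final state of one simulation.
final_states = sim.simulate(initial_state, n_sims=100, game_length=500, disable_tqdm=True)
print('avg home goals:', (final_states[:, -2] * 10).mean().item())
print('avg away goals:', (final_states[:, -1] * 10).mean().item())
```

The shipped `.pth` files appear to be full pickled models rather than state dicts (see `torch.load` in `Simulator.__init__`), so the classifier classes from `lib/model_utils.py` must be importable when the checkpoints are loaded.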