├── .gitignore ├── auxiliary ├── __init__.py ├── argparser_types.py ├── io_json.py ├── fix_team_names.py ├── deco_path_valid.py ├── kfold_crosseval.py └── data_processing.py ├── .flake8 ├── requirements.txt ├── settings ├── feature_extraction.json └── data_collection.json ├── feature-extraction ├── extract_features.py ├── make_standings.py └── make_features.py ├── data-collection ├── scrap_season_standings.py ├── scrap_season_results.py └── scrap_game_stats.py ├── feature-selection ├── feature_selection_pca.py ├── assessing-wrapper-methods.ipynb ├── feature_selection_wrapper.py ├── feature_selection_wrapper_sfs.py └── feature_selection_filter.py ├── descriptive-analysis ├── utils.py ├── descriptive_analysis.py └── descriptive_analysis.ipynb ├── model-selection ├── cross_validation_two_param_models.py ├── cross_validation_one_param_models.py └── gridsearch_cross_validation.py ├── README.md └── model-validation ├── benchmarks.py └── validation.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /auxiliary/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore=W504 3 | max-line-length = 80 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | scikit-learn 4 | matplotlib 5 | requests 6 | beautifulsoup4 7 | tqdm 8 | plotly 9 | -------------------------------------------------------------------------------- /settings/feature_extraction.json: -------------------------------------------------------------------------------- 1 | { 2 | "feature_dir": "features", 3 | "match_level_feature_file_prefix": "match_level_features", 4 | "team_level_feature_file_prefix": "team_level_features" 5 | } 6 | -------------------------------------------------------------------------------- /auxiliary/argparser_types.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def is_valid_parent_path(parser, x): 5 | ''' 6 | Check if input string has a valid parent path 7 | ''' 8 | parent_path = '.' if os.path.split(x)[0] == '' else os.path.split(x)[0] 9 | if not os.path.isdir(parent_path): 10 | parser.error('Parent path %s of output file not valid.' 
% parent_path)
11 |     else:
12 |         return str(x)
13 | 
--------------------------------------------------------------------------------
/auxiliary/io_json.py:
--------------------------------------------------------------------------------
1 | import json
2 | from deco_path_valid import valid_file
3 | from deco_path_valid import valid_folder
4 | 
5 | 
6 | @valid_folder
7 | def write_json(file, data):
8 |     '''
9 |     Writes data into a json file
10 |     '''
11 |     with open(file, 'w', encoding='utf8') as outfile:
12 |         json.dump(data, outfile, ensure_ascii=False)
13 |     return
14 | 
15 | 
16 | @valid_file
17 | def read_json(file):
18 |     '''
19 |     Reads data from a json file
20 |     '''
21 |     with open(file, 'r', encoding='utf8') as outfile:
22 |         data = json.load(outfile)
23 |     return data
24 | 
--------------------------------------------------------------------------------
/settings/data_collection.json:
--------------------------------------------------------------------------------
1 | {
2 |     "n_rounds": 34,
3 |     "output_dir": "data/",
4 |     "f4teams_file": "data/f4teams.json",
5 |     "game_stats": {
6 |         "url_link": "http://www.euroleague.net/main/results?gamenumber=%d&phasetypecode=RS&seasoncode=E%d",
7 |         "output_file_prefix": "euroleague_game_stats"
8 |     },
9 |     "season_results": {
10 |         "url_link": "http://www.euroleague.net/main/results?gamenumber=%d&phasetypecode=RS&seasoncode=E%d",
11 |         "output_file_prefix": "euroleague_results"
12 |     },
13 |     "season_standings": {
14 |         "url_link": "http://www.euroleague.net/main/standings?gamenumber=%d&phasetypecode=RS++++++++&seasoncode=E%d",
15 |         "output_file_prefix": "euroleague_standings"
16 |     }
17 | }
18 | 
--------------------------------------------------------------------------------
/auxiliary/fix_team_names.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def fix_team_names(df1, df2):
5 |     '''
6 |     Fix inconsistencies in team names across the seasons
7 |     '''
8 |     name_dict = {
9 |         'EA7 Emporio Armani Milan': 'AX Armani Exchange Olimpia Milan',
10 |         'Fenerbahce Istanbul': 'Fenerbahce Dogus Istanbul',
11 |         'Baskonia Vitoria Gasteiz': 'KIROLBET Baskonia Vitoria Gasteiz'
12 |     }
13 | 
14 |     for team in name_dict.keys():
15 |         print(team in df1['Home Team'].values)
16 |         df1.replace(team, name_dict[team], inplace=True)
17 | 
18 |     teams1 = np.unique(df1['Home Team'])
19 |     teams2 = np.unique(df2['Home Team'])
20 |     if not np.in1d(teams1, teams2).all():
21 |         ii = ~np.in1d(teams1, teams2)
22 |         print(teams1[ii])
23 |     if not np.in1d(teams2, teams1).all():
24 |         ii = ~np.in1d(teams2, teams1)
25 |         print(teams2[ii])
26 | 
27 |     return df1, df2
28 | 
--------------------------------------------------------------------------------
/auxiliary/deco_path_valid.py:
--------------------------------------------------------------------------------
1 | import os.path
2 | import sys
3 | 
4 | 
5 | def valid_file(func):
6 |     """
7 |     Checks the validity of the input file of a read-data function. If the
8 |     file does not exist, it exits.
9 |     """
10 | 
11 |     def wrapper(filename, *args, **kwargs):
12 |         if os.path.isfile(filename):
13 |             a = func(filename, *args, **kwargs)
14 |             return a
15 |         else:
16 |             sys.exit('File %s not found' % filename)
17 |         return
18 |     return wrapper
19 | 
20 | 
21 | def valid_folder(func):
22 |     """
23 |     Checks the validity of the output directory of a write-data function. If
24 |     the directory does not exist, it exits.
25 |     """
26 | 
27 |     def wrapper(filepath, *args, **kwargs):
28 |         if os.path.isdir(os.path.dirname(filepath)):
29 |             a = func(filepath, *args, **kwargs)
30 |             return a
31 |         else:
32 |             sys.exit('Directory %s not found' % os.path.dirname(filepath))
33 |         return
34 |     return wrapper
35 | 
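A quick illustration of how these decorators are meant to be used; the `count_lines` helper below is hypothetical (see `io_json.py` for the real call sites), and the snippet assumes it is run from the repository root with `auxiliary/` on the path:

# hypothetical usage sketch of the valid_file decorator
from deco_path_valid import valid_file

@valid_file
def count_lines(filename):
    # the decorator guarantees `filename` exists before this body runs;
    # otherwise the program exits with 'File ... not found'
    with open(filename) as f:
        return sum(1 for _ in f)

print(count_lines('requirements.txt'))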
25 | """ 26 | 27 | def wrapper(filepath, *args, **kwargs): 28 | if os.path.isdir(os.path.dirname(filepath)): 29 | a = func(filepath, *args, **kwargs) 30 | return a 31 | else: 32 | sys.exit('Directory %s not found' % filepath) 33 | return 34 | return wrapper 35 | -------------------------------------------------------------------------------- /feature-extraction/extract_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import logging 5 | import pandas as pd 6 | from make_features import make_game_features 7 | from make_features import make_team_features 8 | sys.path.append('auxiliary') # noqa: E402 9 | from io_json import read_json 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | 13 | 14 | def main(season): 15 | ''' 16 | Extract features (game and team) from the fetched data from the 17 | Euroleague's site 18 | ''' 19 | # get data settings 20 | data_settings = read_json('settings/data_collection.json') 21 | out_dir = data_settings['output_dir'] 22 | rslts_file_prefix = data_settings['season_results']['output_file_prefix'] 23 | results_file = os.path.join( 24 | out_dir, '%s_%d_%d.csv' % (rslts_file_prefix, season - 1, season)) 25 | stnds_file_prefix = data_settings['season_standings']['output_file_prefix'] 26 | standings_file = os.path.join( 27 | out_dir, '%s_%d_%d.csv' % (stnds_file_prefix, season - 1, season)) 28 | f4_file = data_settings['f4teams_file'] 29 | 30 | # get feature settings 31 | feat_settings = read_json('settings/feature_extraction.json') 32 | feature_dir = feat_settings['feature_dir'] 33 | match_level_file_ = feat_settings['match_level_feature_file_prefix'] 34 | team_level_file_ = feat_settings['team_level_feature_file_prefix'] 35 | match_level_file = os.path.join( 36 | feature_dir, '%s_%d_%d.csv' % (match_level_file_, season - 1, season)) 37 | team_level_file = os.path.join( 38 | feature_dir, '%s_%d_%d.csv' % (team_level_file_, season - 1, season)) 39 | 40 | data = pd.read_csv(results_file) 41 | standings = pd.read_csv(standings_file) 42 | f4teams = read_json(f4_file) 43 | 44 | # Specify the F4 teams of the *previous* year 45 | f4Teams = f4teams[str(season - 1)] 46 | 47 | # make game features 48 | feats = make_game_features(data, standings, f4Teams) 49 | 50 | # save features to file. 
51 |     logging.info('save match-level features')
52 |     feats.to_csv(match_level_file, index=False)
53 | 
54 |     # make team features
55 |     team_feats = make_team_features(data, standings, f4Teams)
56 |     # save features to file
57 |     logging.info('save team-level features')
58 |     team_feats.to_csv(team_level_file, index=False)
59 |     return
60 | 
61 | 
62 | if __name__ == "__main__":
63 |     parser = argparse.ArgumentParser()
64 |     parser.add_argument('-s', '--season', required=True, type=int,
65 |                         help="the ending year of the season")
66 | 
67 |     args = parser.parse_args()
68 | 
69 |     main(args.season)
70 | 
--------------------------------------------------------------------------------
/auxiliary/kfold_crosseval.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.model_selection import StratifiedKFold, GroupKFold
3 | from sklearn.naive_bayes import GaussianNB
4 | from sklearn.metrics import accuracy_score, balanced_accuracy_score
5 | 
6 | 
7 | def kfold_crosseval(X_train, y_train, df_train, nsplits, groups=None,
8 |                     model=GaussianNB(), level='match', shuffle=True):
9 |     '''
10 |     Perform k-fold cross validation using the input `model`
11 |     '''
12 |     if level == 'team':
13 |         kfold = GroupKFold(n_splits=nsplits)
14 |         folditer = kfold.split(X_train, y_train, groups)
15 |     else:
16 |         kfold = StratifiedKFold(n_splits=nsplits, shuffle=shuffle,
17 |                                 random_state=10)
18 |         folditer = kfold.split(X_train, y_train)
19 | 
20 |     accuracy = np.zeros(kfold.get_n_splits())
21 |     w_accuracy = np.zeros(kfold.get_n_splits())
22 |     i = -1
23 |     for train_index, test_index in folditer:
24 |         # loop over folds
25 |         i += 1
26 |         X_train_folds, X_test_fold = (X_train[train_index, :],
27 |                                       X_train[test_index, :])
28 |         y_train_folds, y_test_fold = y_train[train_index], y_train[test_index]
29 |         df_test_fold = df_train.iloc[test_index, :].copy()
30 | 
31 |         # fit model
32 |         model.fit(X_train_folds, y_train_folds)
33 | 
34 |         if level == 'team':
35 |             # calculate accuracy at the match level
36 |             y_pred_prob = model.predict_proba(X_test_fold)
37 |             df_test_fold['Prob'] = y_pred_prob[:, 1]
38 |             y_test_fold = []
39 |             y_pred = []
40 |             for gid in np.unique(df_test_fold['Game ID']):
41 |                 teams = df_test_fold[df_test_fold['Game ID'] == gid]
42 |                 if teams.shape[0] == 2:
43 |                     game_pred = (1 if teams.iloc[0]['Prob'] >
44 |                                  teams.iloc[1]['Prob'] else 0)
45 |                     game_resu = (1 if teams.iloc[0]['Label'] >
46 |                                  teams.iloc[1]['Label'] else 0)
47 |                     y_test_fold.append(game_resu)
48 |                     y_pred.append(game_pred)
49 |                 else:
50 |                     print('Warning: Game ID %s has missing teams' % gid)
51 |             y_test_fold = np.array(y_test_fold)
52 |             y_pred = np.array(y_pred)
53 |         else:
54 |             # predict model
55 |             y_pred = model.predict(X_test_fold)
56 | 
57 |         accuracy[i] = accuracy_score(y_test_fold, y_pred)
58 |         w_accuracy[i] = balanced_accuracy_score(y_test_fold, y_pred)
59 | 
60 |     return accuracy.mean(), w_accuracy.mean()
61 | 
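A minimal smoke test of `kfold_crosseval`; the synthetic design matrix, labels and dataframe below are hypothetical, and the snippet assumes it is run from the repository root with `auxiliary/` on the path:

import numpy as np
import pandas as pd
from kfold_crosseval import kfold_crosseval

rng = np.random.RandomState(0)
X = rng.rand(100, 4)                       # 100 toy games, 4 features
y = (X[:, 0] > 0.5).astype(int)            # toy labels
df = pd.DataFrame({'Label': y})            # 'match' level needs no groups

acc, wacc = kfold_crosseval(X, y, df, nsplits=5, level='match')
print('accuracy=%.3f weighted accuracy=%.3f' % (acc, wacc))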
--------------------------------------------------------------------------------
/data-collection/scrap_season_standings.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | import logging
5 | import re
6 | from bs4 import BeautifulSoup
7 | import requests
8 | from tqdm import trange
9 | import pandas as pd
10 | sys.path.append('auxiliary/')  # noqa: E402
11 | from io_json import read_json
12 | 
13 | logging.basicConfig(level=logging.INFO)
14 | 
15 | 
16 | def main(season, n_rounds):
17 |     '''
18 |     Scrapes the standings of the Euroleague games from the Euroleague's official
19 |     site for the input season.
20 |     Saves data to file.
21 |     '''
22 | 
23 |     # read settings
24 |     settings = read_json('settings/data_collection.json')
25 |     out_dir = settings['output_dir']
26 |     url_pattern = settings['season_standings']['url_link']
27 |     out_file_prefix = settings['season_standings']['output_file_prefix']
28 |     filename = '%s_%d_%d.csv' % (out_file_prefix, season - 1, season)
29 |     filepath = os.path.join(out_dir, filename)
30 | 
31 |     headers = ['Round', 'Position', 'Club Code', 'Club Name', 'Wins', 'Losses',
32 |                'Offence', 'Defence', 'Points Diff']
33 |     standings = []
34 |     for game_round in trange(1, n_rounds + 1):
35 | 
36 |         url = (url_pattern % (game_round, season - 1))
37 |         try:
38 |             r = requests.get(url)
39 |         except requests.exceptions.ConnectionError:
40 |             sys.exit('Connection Error. Check URL')
41 |         data = r.text
42 |         soup = BeautifulSoup(data, 'html.parser')
43 |         tbl_cls = ('table responsive fixed-cols-1 table-left-cols-1 '
44 |                    'table-expand table-striped table-hover table-noborder '
45 |                    'table-centered table-condensed')
46 |         table = soup.find('table', attrs={'class': tbl_cls})
47 |         body = table.find('tbody')
48 |         var1 = 'clubcode='
49 |         var2 = '&seasoncode=E'
50 |         for row in body.find_all('tr'):
51 |             a = row.find('a').get('href')
52 |             cc = a[a.find(var1) + len(var1): a.find(var2)]
53 |             # sc = a[a.find(var2) + len(var2):]
54 |             pos_team = row.find('a').string.strip()
55 |             pos = int(re.findall(r'\d{1,2}', pos_team)[0])
56 |             team = re.findall(r'[a-zA-Z\s-]+', pos_team)[0].strip()
57 |             stats = row.find_all('td')
58 |             wins = int(stats[1].string.strip())
59 |             losses = int(stats[2].string.strip())
60 |             points_plus = int(stats[3].string.strip())
61 |             points_minus = int(stats[4].string.strip())
62 |             points_diff = int(stats[5].string.strip())
63 |             standings.append([game_round, pos, cc, team, wins, losses,
64 |                               points_plus, points_minus, points_diff])
65 | 
66 |     logging.info('Convert to dataframe')
67 |     df = pd.DataFrame(standings, columns=headers)
68 | 
69 |     logging.info('Save to file')
70 |     df.to_csv(filepath, index=False)
71 |     return
72 | 
73 | 
74 | if __name__ == "__main__":
75 |     parser = argparse.ArgumentParser()
76 |     parser.add_argument('-s', '--season', required=True, type=int,
77 |                         help="the ending year of the season")
78 |     parser.add_argument('-n', '--n-rounds', default=34, type=int,
79 |                         help="The number of regular season rounds "
80 |                              "in the season")
81 |     args = parser.parse_args()
82 | 
83 |     main(args.season, args.n_rounds)
84 | 
--------------------------------------------------------------------------------
/feature-selection/feature_selection_pca.py:
--------------------------------------------------------------------------------
1 | '''
2 | Feature transformation methods using Principal Component Analysis (PCA).
3 | 
4 | For an increasing number of principal components, results are evaluated for
5 | a chosen algorithm using k-fold cross-validation on the training set.
6 | ''' 7 | import sys 8 | import numpy as np 9 | from matplotlib import pyplot as plt 10 | from tqdm import tqdm 11 | # from sklearn.linear_model import LogisticRegression 12 | # from sklearn.ensemble import RandomForestClassifier 13 | # from sklearn.tree import DecisionTreeClassifier 14 | # from sklearn.svm import SVC 15 | # from sklearn.ensemble import GradientBoostingClassifier 16 | from sklearn.ensemble import AdaBoostClassifier 17 | # from sklearn.naive_bayes import GaussianNB 18 | from sklearn.model_selection import StratifiedKFold 19 | from sklearn.model_selection import GridSearchCV 20 | from sklearn.decomposition import PCA 21 | sys.path.append('auxiliary/') # noqa: E402 22 | from data_processing import load_features, shape_data 23 | 24 | 25 | # %% Choose settings and classifier 26 | test_season = '2018-2019' # hold-out season for validation 27 | level = 'match' # match or team level features to use 28 | shuffle = True # whether to shuffle or not the data in k-fold cross validation 29 | norm = True # whether to normalise or not the features 30 | min_round = 5 # minimum number of first rounds to skip in every season 31 | nsplits = 5 # number of folds in k-fold cross validation 32 | random_state = 10 # random state for the classifier 33 | params = { 34 | 'n_estimators': np.arange(5, 200, 5), 35 | # 'learning_rate': np.arange(0.3, 1.5, 0.1)} 36 | } # params for the grid search 37 | model = AdaBoostClassifier(random_state=random_state) 38 | 39 | # %% load feature data 40 | df = load_features(level) 41 | 42 | # choose features 43 | if level == 'match': 44 | feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y', 45 | 'Defence_x', 'Defence_y', 46 | 'form_x', 'form_y', 47 | 'Diff_x', 'Diff_y', 48 | 'Home F4', 'Away F4'] 49 | elif level == 'team': 50 | feats = ['Home', 'Away', 'Position', 'Offence', 'Defence', 51 | 'form', 52 | 'F4', 'Diff'] 53 | n_feats = len(feats) 54 | 55 | # seasons for calibration 56 | df = df[df['Season'] != test_season] 57 | 58 | # %% Re-shape data 59 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm, 60 | min_round=min_round) 61 | 62 | # %% Apply PCA and then k-fold cross validation 63 | XX = X_train.copy() 64 | scores = np.zeros((n_feats, 2)) 65 | for n in tqdm(range(n_feats)): 66 | pca = PCA(n_components=n + 1) 67 | X_train = pca.fit_transform(XX) 68 | kfold = StratifiedKFold(n_splits=nsplits, shuffle=shuffle, random_state=10) 69 | folditer = kfold.split(X_train, y_train) 70 | clf = GridSearchCV(model, params, cv=folditer, iid=False, 71 | scoring=['accuracy', 'balanced_accuracy', 'roc_auc'], 72 | refit='accuracy') 73 | clf.fit(X_train, y_train) 74 | scores[n, 0] = np.max(clf.cv_results_['mean_test_accuracy']) 75 | scores[n, 1] = np.max(clf.cv_results_['mean_test_balanced_accuracy']) 76 | print(clf.best_score_) 77 | 78 | # %% Plots 79 | x = np.arange(1, n_feats + 1, dtype=int) 80 | plt.figure() 81 | plt.plot(x, scores[:, 0], label='Accuracy') 82 | plt.plot(x, scores[:, 1], label='W-Accuracy') 83 | plt.xlabel('Number of components') 84 | plt.ylabel('Score') 85 | plt.xticks(x, x) 86 | plt.grid() 87 | plt.legend() 88 | plt.show() 89 | -------------------------------------------------------------------------------- /descriptive-analysis/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import plotly.graph_objs as go 3 | 4 | 5 | def find_probs(df): 6 | ''' 7 | Probability of Winning when scoring in interval [x, x+4] 8 | ''' 9 | min_value = (df['Score'].min() // 5) * 5 10 | 
max_value = (df['Score'].max() // 5 + 1) * 5
11 |     x = np.arange(min_value, max_value + 1, 5, dtype=int)
12 |     prob = np.zeros(x.shape[0] - 1)
13 |     for i, u in enumerate(x[:-1]):
14 |         ii = (df['Score'] >= x[i]) & (df['Score'] < x[i + 1])
15 |         num = np.sum((df[ii]['Team Result'] == 'W'))
16 |         den = np.sum(ii)
17 |         prob[i] = num / den if den > 0 else 0
18 |     return prob, x[:-1]
19 | 
20 | 
21 | def find_probs_at_least_n_points(df, step=1):
22 |     '''
23 |     Probability of winning when scoring at least N points
24 |     '''
25 |     min_value = step * (df['Score'].min() // step)
26 |     max_value = step * (df['Score'].max() // step)
27 |     x = np.arange(min_value, max_value + 1, step, dtype=int)
28 |     prob = np.zeros(x.shape[0] - 1)
29 |     for i, u in enumerate(x[:-1]):
30 |         num = ((df['Score'] >= u) & (df['Team Result'] == 'W')).sum()
31 |         den = (df['Score'] >= u).sum()
32 |         prob[i] = num / den
33 |     return prob, x[:-1]
34 | 
35 | 
36 | def make_x_interv(x):
37 |     ans = [str(x[i]) + '-' + str(x[i + 1] - 1) for i in range(x.shape[0] - 1)]
38 |     ans.append(str(x[-1]) + '-' + str(x[-1] + 4))
39 |     return ans
40 | 
41 | 
42 | def make_bar_plot(dfs, names, title=''):
43 |     data = []
44 |     for df, name in zip(dfs, names):
45 |         prob, x = find_probs(df)
46 |         data.append(go.Bar(x=make_x_interv(x), y=prob, name=name))
47 | 
48 |     layout = go.Layout(title=title,
49 |                        xaxis={'title': 'Score', 'showgrid': True},
50 |                        yaxis={'title': 'Probability', 'showgrid': True})
51 |     fig = go.Figure(data, layout)
52 |     fig.show()
53 |     return
54 | 
55 | 
56 | def make_scatter_plot(dfs, names, title=''):
57 |     data = []
58 |     for df, name in zip(dfs, names):
59 |         prob, x = find_probs(df)
60 |         data.append(go.Scatter(x=x, y=prob,
61 |                                # fill='tozeroy',
62 |                                line={'shape': 'hv'},
63 |                                name=name))
64 | 
65 |     layout = go.Layout(title=title,
66 |                        xaxis={'title': 'Score',
67 |                               'showgrid': True,
68 |                               'gridcolor': 'rgb(200, 200, 200)',
69 |                               'type': 'category'},
70 |                        yaxis={'title': 'Probability',
71 |                               'showgrid': True,
72 |                               'gridcolor': 'rgb(200, 200, 200)'})
73 |     fig = go.Figure(data, layout)
74 |     fig.show()
75 |     return
76 | 
77 | 
78 | def make_scatter_plot_at_least_n_points(dfs, names, title=''):
79 |     data = []
80 |     for df, name in zip(dfs, names):
81 |         prob, x = find_probs_at_least_n_points(df, 5)
82 |         data.append(go.Scatter(x=x, y=prob,
83 |                                # fill='tozeroy',
84 |                                line={'shape': 'hv'},
85 |                                name=name))
86 | 
87 |     layout = go.Layout(title=title,
88 |                        xaxis={'title': 'Score',
89 |                               'showgrid': True,
90 |                               'gridcolor': 'rgb(200, 200, 200)',
91 |                               'type': 'category'},
92 |                        yaxis={'title': 'Probability',
93 |                               'showgrid': True,
94 |                               'gridcolor': 'rgb(200, 200, 200)'})
95 |     fig = go.Figure(data, layout)
96 |     fig.show()
97 |     return
98 | 
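A toy check of `find_probs`; the six-row dataframe below is hypothetical, and scores are binned into 5-point intervals:

import pandas as pd
from utils import find_probs, make_x_interv

df_toy = pd.DataFrame({'Score': [68, 72, 75, 81, 84, 90],
                       'Team Result': ['L', 'L', 'W', 'W', 'W', 'W']})
prob, x = find_probs(df_toy)
for interval, p in zip(make_x_interv(x), prob):
    print(interval, round(p, 2))   # e.g. '65-69 0.0', '75-79 1.0'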
--------------------------------------------------------------------------------
/data-collection/scrap_season_results.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | import logging
5 | import re
6 | from datetime import datetime
7 | from tqdm import trange
8 | from bs4 import BeautifulSoup
9 | import requests
10 | import pandas as pd
11 | sys.path.append('auxiliary/')  # noqa: E402
12 | from io_json import read_json
13 | 
14 | logging.basicConfig(level=logging.INFO)
15 | 
16 | 
17 | def main(season, n_rounds):
18 |     '''
19 |     Scrapes the results of the Euroleague games from the Euroleague's official
20 |     site for the input season.
21 |     Saves data to file.
22 |     '''
23 | 
24 |     # read settings
25 |     settings = read_json('settings/data_collection.json')
26 |     out_dir = settings['output_dir']
27 |     url_pattern = settings['season_results']['url_link']
28 |     out_file_prefix = settings['season_results']['output_file_prefix']
29 |     filename = '%s_%d_%d.csv' % (out_file_prefix, season - 1, season)
30 |     filepath = os.path.join(out_dir, filename)
31 | 
32 |     headers = ['Season', 'Round', 'GameID', 'Date', 'Home Team', 'Away Team',
33 |                'Home Score', 'Away Score']
34 |     results = []
35 |     regex = re.compile(r'score [a-z\s]*pts[a-z\s]*')
36 |     season_str = '%d-%d' % (season - 1, season)
37 |     for game_round in trange(1, n_rounds + 1):
38 | 
39 |         url = (url_pattern % (game_round, season - 1))
40 |         try:
41 |             r = requests.get(url)
42 |         except requests.exceptions.ConnectionError:
43 |             sys.exit('Connection Error. Check URL')
44 |         data = r.text
45 |         soup = BeautifulSoup(data, 'html.parser')
46 |         for game in soup.find_all('div', attrs={'class': 'game played'}):
47 |             data_code = game.attrs['data-code']
48 |             gameid = '%d_%d_%d_%s' % (season - 1, season,
49 |                                       game_round, data_code)
50 |             home_team = game.find_all('span', attrs={'class': 'name'})[0].string
51 |             away_team = game.find_all('span', attrs={'class': 'name'})[1].string
52 |             scores = game.find_all('span', attrs={'class': regex})
53 | 
54 |             home_score = int(scores[0]['data-score'] if
55 |                              scores[0].has_attr('data-score') else
56 |                              scores[0].string)
57 |             away_score = int(scores[1]['data-score'] if
58 |                              scores[1].has_attr('data-score') else
59 |                              scores[1].string)
60 | 
61 |             date_str = game.find('span', attrs={'class': 'date'}).string
62 |             date = datetime.strptime(date_str, '%B %d %H:%M CET')
63 |             yr = season - 1 if date.month > 8 else season
64 |             date = date.replace(year=yr)
65 |             date_str = datetime.strftime(date, '%Y-%m-%d %H:%M:%S')
66 | 
67 |             results.append([season_str, game_round, gameid, date_str,
68 |                             home_team, away_team,
69 |                             home_score, away_score])
70 | 
71 |     logging.info('Convert to dataframe')
72 |     df = pd.DataFrame(results, columns=headers)
73 | 
74 |     logging.info('Save to file')
75 |     df.to_csv(filepath, index=False)
76 | 
77 |     return
78 | 
79 | 
80 | if __name__ == "__main__":
81 |     parser = argparse.ArgumentParser()
82 |     parser.add_argument('-s', '--season', required=True, type=int,
83 |                         help="the ending year of the season")
84 |     parser.add_argument('-n', '--n-rounds', default=34,
85 |                         type=int,
86 |                         help="The number of regular season rounds "
87 |                              "in the season")
88 |     args = parser.parse_args()
89 | 
90 |     main(args.season, args.n_rounds)
91 | 
--------------------------------------------------------------------------------
/feature-selection/assessing-wrapper-methods.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import numpy as np\n",
10 |     "\n",
11 |     "import plotly.graph_objs as go"
12 |    ]
13 |   },
14 |   {
15 |    "cell_type": "code",
16 |    "execution_count": 2,
17 |    "metadata": {},
18 |    "outputs": [],
19 |    "source": [
20 |     "scores_obj = np.load('../output/wrapper_ada2_n_121_rate_1.npz', allow_pickle=True)\n",
21 |     "scores = scores_obj['scores']\n",
22 |     "featute_sets = scores_obj['features']\n",
23 |     "\n",
24 |     "accuracy = scores[:, 0]\n",
25 |     "w_accuracy = scores[:, 1]"
26 |    ]
27 |   },
28 |   {
29 |    "cell_type": "code",
30 |    "execution_count": 3,
31 |    "metadata": {},
32 |    "outputs": [
33 |     {
34 |      "data": {
35 |       "text/plain": [
36 | 
"['Position_x',\n", 37 | " 'Offence_x',\n", 38 | " 'Offence_y',\n", 39 | " 'Defence_y',\n", 40 | " 'Diff_y',\n", 41 | " 'Home F4',\n", 42 | " 'Away F4']" 43 | ] 44 | }, 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "featute_sets[2815]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "'temp-plot.html'" 63 | ] 64 | }, 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "indices = np.argsort(accuracy)[::-1][:10]\n", 72 | "# x = [', '.join(u) for u in featute_sets[indices]]\n", 73 | "xx = [[featute_sets[-1].index(u) for u in feats] for feats in featute_sets[indices]]\n", 74 | "x = [str(u) for u in xx]\n", 75 | "data = [go.Bar(x=x, y=np.sort(accuracy)[::-1][:10], name='accuracy')]\n", 76 | "\n", 77 | "layout = go.Layout(yaxis={'title': 'Accuracy'})\n", 78 | "fig = go.Figure(data, layout)\n", 79 | "fig.update_yaxes(range=[0.706, 0.718])\n", 80 | "fig.show()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "'temp-plot.html'" 92 | ] 93 | }, 94 | "execution_count": 5, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "indices = np.argsort(w_accuracy)[::-1][:10]\n", 101 | "xx = [[featute_sets[-1].index(u) for u in feats] for feats in featute_sets[indices]]\n", 102 | "x = [str(u) for u in xx]\n", 103 | "\n", 104 | "data = [go.Bar(x=x, y=np.sort(w_accuracy)[::-1][:10])]\n", 105 | "layout = go.Layout(yaxis={'title': 'Weighted Accuracy'})\n", 106 | "fig = go.Figure(data, layout)\n", 107 | "fig.update_yaxes(range=[0.675, 0.685])\n", 108 | "fig.show()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "Python 3", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.7.4" 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 4 140 | } 141 | -------------------------------------------------------------------------------- /auxiliary/data_processing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from glob import glob 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.preprocessing import MinMaxScaler 7 | sys.path.append('auxiliary/') # noqa: E402 8 | from io_json import read_json 9 | 10 | 11 | def normalise(X): 12 | ''' 13 | Normalise the features of the input design matrix `X` across the x=0 axis. 14 | ''' 15 | x_norm = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) 16 | return x_norm 17 | 18 | 19 | def shape_data_scaler(df, feats, norm=True, min_round=5): 20 | ''' 21 | Shape input data in `df` by selecting the `feats`, excluding rounds and 22 | normalising if `norm=True`. 
23 | 
24 |     Returns five variables:
25 |     * X_train
26 |     * y_train
27 |     * df (the new df)
28 |     * groups (for defining groups of matches)
29 |     * scaler (the scaler object from normalisation)
30 |     '''
31 |     # ignore early games in the season, as they do not contain the 'form'
32 |     # feature.
33 |     ii = df['Round'] > min_round
34 | 
35 |     # filter out the games ignored
36 |     df = df[ii]
37 |     df.reset_index(drop=True, inplace=True)
38 | 
39 |     # make the Design table
40 |     X_train = df[feats].values
41 | 
42 |     # normalise the Design table if required
43 |     if isinstance(norm, bool) and norm:
44 |         scaler = MinMaxScaler()
45 |         X_train = scaler.fit_transform(X_train)
46 |         # X_train = normalise(X_train)
47 |     elif isinstance(norm, MinMaxScaler):
48 |         X_train = norm.transform(X_train)
49 |         scaler = norm
50 | 
51 |     # extract the tags
52 |     y_train = df['Label'].values
53 | 
54 |     # if labels are 1 and 2, set them to 0-1
55 |     if 2 in np.unique(y_train):
56 |         y_train = y_train - 1
57 | 
58 |     # define the groups of matches if processing 'team' level classification
59 |     groups = df['Game ID'].values if 'Game ID' in df.keys() else None
60 | 
61 |     return X_train, y_train, df, groups, scaler
62 | 
63 | 
64 | def shape_data(df, feats, norm=True, min_round=5):
65 |     '''
66 |     Shape input data in `df` by selecting the `feats`, excluding rounds and
67 |     normalising if `norm=True`.
68 | 
69 |     Returns four variables:
70 |     * X_train
71 |     * y_train
72 |     * df (the new df)
73 |     * groups (for defining groups of matches)
74 |     '''
75 | 
76 |     # ignore early games in the season, as they do not contain the 'form'
77 |     # feature.
78 |     ii = df['Round'] > min_round
79 | 
80 |     # filter out the games ignored
81 |     df = df[ii]
82 |     df.reset_index(drop=True, inplace=True)
83 | 
84 |     # make the Design table
85 |     X_train = df[feats].values
86 | 
87 |     # normalise the Design table if required
88 |     if norm:
89 |         X_train = normalise(X_train)
90 | 
91 |     # extract the tags
92 |     y_train = df['Label'].values
93 | 
94 |     # if labels are 1 and 2, set them to 0-1
95 |     if 2 in np.unique(y_train):
96 |         y_train = y_train - 1
97 | 
98 |     # define the groups of matches if processing 'team' level classification
99 |     groups = df['Game ID'].values if 'Game ID' in df.keys() else None
100 | 
101 |     return X_train, y_train, df, groups
102 | 
103 | 
104 | def load_features(level):
105 |     '''load features'''
106 | 
107 |     settings = read_json('settings/feature_extraction.json')
108 |     feature_dir = settings['feature_dir']
109 | 
110 |     if level == 'match':
111 |         file_pattern = settings['match_level_feature_file_prefix']
112 |     elif level == 'team':
113 |         file_pattern = settings['team_level_feature_file_prefix']
114 |     else:
115 |         raise ValueError('Invalid level of analysis: %s' % level)
116 | 
117 |     filepath = os.path.join(feature_dir, file_pattern)
118 |     feature_files = glob('%s*.csv' % filepath)
119 |     list_dfs = [pd.read_csv(file) for file in feature_files]
120 |     df = pd.concat(list_dfs, ignore_index=False)
121 | 
122 |     df.reset_index(drop=True, inplace=True)
123 |     return df
124 | 
--------------------------------------------------------------------------------
/descriptive-analysis/descriptive_analysis.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import numpy as np
3 | import pandas as pd
4 | 
5 | import plotly.express as px
6 | 
7 | from utils import make_scatter_plot, make_bar_plot
8 | from utils import make_scatter_plot_at_least_n_points
9 | 
10 | 
11 | # %% Load Data
12 | 
13 | files_pattern = 'data/euroleague_results*csv'
14 | data_list_files = 
glob.glob(files_pattern) 15 | 16 | df = pd.concat([pd.read_csv(f) for f in data_list_files], ignore_index=True) 17 | df.reset_index(drop=True, inplace=True) 18 | 19 | df['Game Result'] = np.where(df['Home Score'] > df['Away Score'], 1, 2) 20 | df['Score Difference'] = np.abs(df['Home Score'] - df['Away Score']) 21 | 22 | # %% Reshape the data 23 | 24 | df_flat = pd.melt(df, id_vars=['Season', 'Round', 'Game Result'], 25 | value_vars=['Home Score', 'Away Score'], 26 | var_name='Loc', value_name='Score') 27 | df_flat['Loc'] = df_flat['Loc'].apply(lambda x: x.split(' ')[0]) 28 | df_flat['Team Result'] = np.where(((df_flat['Game Result'] == 1) & 29 | (df_flat['Loc'] == 'Home')) | 30 | ((df_flat['Game Result'] == 2) & 31 | (df_flat['Loc'] == 'Away')), 'W', 'L') 32 | df_flat['Season_int'] = df_flat['Season'].apply(lambda x: int(x[-4:])) 33 | 34 | # %% Stat Table 35 | 36 | dfgroup = df_flat.groupby(['Season', 'Loc'])['Score'].mean().unstack('Loc') 37 | dfgroup.columns = ['Away Mean Score', 'Home Mean Score'] 38 | dfgroup.reset_index(inplace=True) 39 | 40 | dff = (df.groupby(['Season', 'Game Result'])['Game Result']. 41 | count().unstack('Game Result')) 42 | dff.columns = ['Home Wins', 'Away Wins'] 43 | dff.reset_index(inplace=True) 44 | 45 | dfgroup = dfgroup.merge(dff, on='Season') 46 | 47 | dff = (df_flat.groupby(['Season', 'Game Result'])['Score']. 48 | mean().unstack('Game Result')) 49 | dff.columns = ['Home Win Mean Score', 'Away Win Mean Score'] 50 | dff.reset_index(inplace=True) 51 | 52 | dfgroup = dfgroup.merge(dff, on='Season') 53 | 54 | print(dfgroup) 55 | 56 | # %% Plots: Home/Away Scores 57 | 58 | fig = px.box(df_flat, x="Season", y="Score", color="Loc", notched=True) 59 | fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'}) 60 | fig.show() 61 | 62 | fig = px.box(df, x="Season", y="Home Score", color="Game Result", notched=True) 63 | fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'}) 64 | fig.show() 65 | 66 | fig = px.box(df, x="Season", y="Away Score", color="Game Result", notched=True) 67 | fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'}) 68 | fig.show() 69 | 70 | fig = px.box(df, x="Season", y="Score Difference", color="Game Result", 71 | notched=True) 72 | fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'}) 73 | fig.show() 74 | 75 | 76 | # %% Scatter plots - probability of winning when scoring at least N points 77 | make_scatter_plot_at_least_n_points([df_flat, df_flat[df_flat['Loc'] == 'Home'], 78 | df_flat[df_flat['Loc'] == 'Away']], 79 | ['All', 'Home', 'Away']) 80 | 81 | make_scatter_plot_at_least_n_points([df_flat[df_flat['Season_int'] == 2017], 82 | df_flat[df_flat['Season_int'] == 2018], 83 | df_flat[df_flat['Season_int'] == 2019]], 84 | ['2017', '2018', '2019']) 85 | 86 | # %% Scatter plots - probability of winning when scoring points in a range. 
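# (implemented by the bar and scatter plot sections below, which use
#  find_probs to bin scores into 5-point intervals)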
87 | 
88 | # %% Bar plots
89 | 
90 | make_bar_plot([df_flat, df_flat[df_flat['Loc'] == 'Home'],
91 |                df_flat[df_flat['Loc'] == 'Away']], ['All', 'Home', 'Away'])
92 | 
93 | make_bar_plot([df_flat[df_flat['Season_int'] == 2017],
94 |                df_flat[df_flat['Season_int'] == 2018],
95 |                df_flat[df_flat['Season_int'] == 2019]],
96 |               ['2017', '2018', '2019'])
97 | 
98 | # %% Scatter plots
99 | 
100 | make_scatter_plot([df_flat, df_flat[df_flat['Loc'] == 'Home'],
101 |                    df_flat[df_flat['Loc'] == 'Away']],
102 |                   ['All', 'Home', 'Away'])
103 | 
104 | make_scatter_plot([df_flat[df_flat['Season_int'] == 2017],
105 |                    df_flat[df_flat['Season_int'] == 2018],
106 |                    df_flat[df_flat['Season_int'] == 2019]],
107 |                   ['2017', '2018', '2019'])
108 | 
--------------------------------------------------------------------------------
/feature-selection/feature_selection_wrapper.py:
--------------------------------------------------------------------------------
1 | '''
2 | Wrapper method for feature selection, i.e. subsets of features are
3 | generated and evaluated using a chosen algorithm and its hyper-parameters.
4 | 
5 | Here, as the number of features is relatively small, we are able to generate
6 | all possible combinations of features. If the number of features grows large,
7 | a different approach should be adopted, such as Sequential Forward Selection;
8 | see the `feature_selection_wrapper_sfs.py` script.
9 | '''
10 | import sys
11 | from itertools import combinations
12 | import numpy as np
13 | from matplotlib import pyplot as plt
14 | from tqdm import tqdm
15 | # from sklearn.linear_model import LogisticRegression
16 | # from sklearn.ensemble import RandomForestClassifier
17 | # from sklearn.tree import DecisionTreeClassifier
18 | # from sklearn.svm import SVC
19 | # from sklearn.ensemble import GradientBoostingClassifier
20 | from sklearn.ensemble import AdaBoostClassifier
21 | # from sklearn.naive_bayes import GaussianNB
22 | sys.path.append('auxiliary/')  # noqa: E402
23 | from data_processing import load_features, shape_data
24 | from kfold_crosseval import kfold_crosseval
25 | 
26 | 
27 | # %% Choose settings and classifier
28 | test_season = '2018-2019'  # hold-out season for validation
29 | level = 'match'  # match or team level features to use
30 | shuffle = True  # whether to shuffle or not the data in k-fold cross validation
31 | norm = True  # whether to normalise or not the features
32 | min_round = 5  # minimum number of first rounds to skip in every season
33 | nsplits = 5  # number of folds in k-fold cross validation
34 | nestimators = 188  # this is a classifier-specific setting
35 | rate = 1.2  # this is a classifier-specific setting
36 | random_state = 10  # random state for the classifier
37 | model = AdaBoostClassifier(n_estimators=nestimators, random_state=random_state,
38 |                            learning_rate=rate)
39 | # name and path of the output file in which we store the performance results
40 | # of the feature sets
41 | out_file = 'output/wrapper_ada2_n_{}_rate_{}'.format(nestimators, rate)
42 | 
43 | # %% load feature data
44 | df = load_features(level)
45 | 
46 | # %% choose features
47 | if level == 'match':
48 |     feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y',
49 |              'Defence_x', 'Defence_y',
50 |              'form_x', 'form_y',
51 |              'Diff_x', 'Diff_y',
52 |              'Home F4', 'Away F4']
53 | elif level == 'team':
54 |     feats = ['Home', 'Away', 'Position', 'Offence', 'Defence',
55 |              'form', 'F4', 'Diff']
56 | n_feats = len(feats)
57 | 
58 | # seasons for calibration
59 | df = df[df['Season'] != test_season]
60 | 
61 | # %% Re-shape data
62 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm,
63 |                                           min_round=min_round)
64 | 
65 | # %% Embedded feature selection (combinations of features)
66 | 
67 | # create all possible combinations of features.
68 | allcombs = []
69 | for u in range(1, n_feats + 1):
70 |     combs = combinations(feats, u)
71 |     for c in combs:
72 |         if list(c) != []:
73 |             allcombs.append(list(c))
74 | 
75 | scores = np.zeros((len(allcombs), 2))
76 | nc = 0
77 | for ii, comb in enumerate(tqdm(allcombs)):
78 | 
79 |     if len(comb) > nc:
80 |         tqdm.write('Number of features: %d' % len(comb))
81 |         nc = len(comb)
82 |         indx, feats = [], []
83 | 
84 |     X_train = df[comb].values
85 | 
86 |     scores[ii, 0], scores[ii, 1] = kfold_crosseval(X_train, y_train, df[comb],
87 |                                                    nsplits, groups=groups,
88 |                                                    model=model,
89 |                                                    level=level,
90 |                                                    shuffle=shuffle)
91 |     # save results
92 |     np.savez(out_file, scores=scores, features=np.array(allcombs))
93 | 
94 | # %% Plot results
95 | # Sort best combinations
96 | ll = np.argsort(scores[:, 0])[::-1]
97 | sortcombs = [allcombs[u] for u in ll]
98 | 
99 | x = np.arange(1, len(allcombs) + 1, dtype=int)
100 | plt.figure()
101 | plt.plot(x, scores[:, 0], label='Accuracy')
102 | plt.plot(x, scores[:, 1], label='W-Accuracy')
103 | plt.legend()
104 | plt.show()
105 | 
106 | plt.figure()
107 | plt.bar(x[:15], scores[ll, 0][:15])
108 | plt.xticks(x[:15], sortcombs[:15], rotation='vertical')
109 | plt.show()
110 | 
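The saved `.npz` can be inspected afterwards, as done in `assessing-wrapper-methods.ipynb`; a minimal sketch, assuming the script above has produced `output/wrapper_ada2_n_188_rate_1.2.npz`:

import numpy as np

obj = np.load('output/wrapper_ada2_n_188_rate_1.2.npz', allow_pickle=True)
scores, feature_sets = obj['scores'], obj['features']
best = np.argmax(scores[:, 0])  # column 0: accuracy, column 1: weighted accuracy
print('best accuracy %.4f with features %s' % (scores[best, 0], feature_sets[best]))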
--------------------------------------------------------------------------------
/model-selection/cross_validation_two_param_models.py:
--------------------------------------------------------------------------------
1 | '''
2 | Hyper-parameter tuning using k-fold cross-validation for two-hyper-parameter
3 | models via a nested-loop grid search. This script is kept for legacy; see
4 | also `gridsearch_cross_validation.py`, which covers multiple hyper-parameter
5 | models.
6 | '''
7 | import sys
8 | import numpy as np
9 | from tqdm import tqdm
10 | from matplotlib import pyplot as plt
11 | from sklearn.svm import SVC
12 | from sklearn.ensemble import AdaBoostClassifier
13 | sys.path.append('auxiliary/')  # noqa: E402
14 | from data_processing import load_features, shape_data
15 | from kfold_crosseval import kfold_crosseval
16 | 
17 | import warnings
18 | warnings.filterwarnings("ignore")
19 | 
20 | 
21 | # %% Choose settings and classifier
22 | test_season = '2018-2019'  # hold-out season for validation
23 | level = 'match'  # match or team level features to use
24 | shuffle = True  # whether to shuffle or not the data in k-fold cross validation
25 | norm = True  # whether to normalise or not the features
26 | min_round = 5  # minimum number of first rounds to skip in every season
27 | nsplits = 5  # number of folds in k-fold cross validation
28 | method = 'svm-rbf'  # method for grid search hyper-parameter training, see list
29 | # methods: 'log-reg', 'svm-linear', 'svm-rbf', 'decision-tree', 'random-forest',
30 | # 'naive-bayes', 'gradient-boosting', 'ada', 'ada2', 'knn',
31 | # 'discriminant-analysis'
32 | 
33 | print('norm: %r - shuffle: %r - method: %s' % (norm, shuffle, method))
34 | 
35 | # %% load feature data
36 | df = load_features(level)
37 | 
38 | # choose features
39 | if level == 'match':
40 |     feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y',
41 |              'Defence_x', 'Defence_y', 'form_x', 'form_y', 'Diff_x', 'Diff_y',
42 |              'Home F4', 'Away F4']
43 | elif level == 'team':
44 |     feats = ['Home', 'Away', 'Position',
45 |              'Offence', 'Defence', 'form', 'F4', 'Diff']
46 | 
47 | # seasons for calibration
48 | df = df[df['Season'] != test_season]
49 | 
50 | # %% Re-shape data
51 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm,
52 |                                           min_round=min_round)
53 | 
54 | print('Number of features:', X_train.shape[1], feats)
55 | print('Number of obs:', X_train.shape[0])
56 | 
57 | # %% Set parameters
58 | if method == 'svm-rbf':
59 |     params1 = np.sort(np.concatenate((np.logspace(-5, 8, 14),
60 |                                       5 * np.logspace(-5, 8, 14)), axis=0))
61 |     params2 = np.sort(np.concatenate((np.logspace(-5, 8, 14),
62 |                                       5 * np.logspace(-5, 8, 14)), axis=0))
63 | elif method == 'ada':
64 |     params1 = np.arange(5, 200, 1)
65 |     params2 = np.arange(0.3, 1.5, 0.1)
66 | else:
67 |     sys.exit('Method not recognised')
68 | 
69 | # %% Tune parameters
70 | accuracy = np.zeros((params1.shape[0], params2.shape[0]))
71 | w_accuracy = np.zeros((params1.shape[0], params2.shape[0]))
72 | 
73 | for i, param1 in enumerate(tqdm(params1, desc='1st loop')):
74 |     for j, param2 in enumerate(tqdm(params2, desc='2nd loop')):
75 | 
76 |         if method == 'svm-rbf':
77 |             model = SVC(C=param1, kernel='rbf', gamma=param2,
78 |                         class_weight='balanced', probability=True,
79 |                         max_iter=400)
80 |         elif method == 'ada':
81 |             model = AdaBoostClassifier(n_estimators=param1, random_state=10,
82 |                                        learning_rate=param2)
83 | 
84 |         # apply k-fold cross validation
85 |         accuracy[i, j], w_accuracy[i, j] = kfold_crosseval(X_train, y_train,
86 |                                                            df, nsplits,
87 |                                                            groups=groups,
88 |                                                            model=model,
89 |                                                            level=level,
90 |                                                            shuffle=shuffle)
91 |     np.savez('output/%s' % method, accuracy=accuracy, w_accuracy=w_accuracy,
92 |              params1=params1, params2=params2)
93 | 
94 | print('Accuracy: ', np.round(np.max(accuracy), 4))
95 | print('Weighted Accuracy: ', np.round(np.max(w_accuracy), 4))
96 | 
97 | plt.imshow(accuracy)
98 | plt.colorbar()
99 | plt.show()
100 | 
101 | plt.figure()
102 | plt.imshow(w_accuracy)
103 | plt.colorbar()
104 | plt.show()
105 | 
--------------------------------------------------------------------------------
/feature-selection/feature_selection_wrapper_sfs.py:
--------------------------------------------------------------------------------
1 | '''
2 | Wrapper method for feature selection using Sequential Forward Selection (SFS)
3 | with a chosen algorithm and its hyper-parameters.
4 | '''
5 | import sys
6 | import numpy as np
7 | from matplotlib import pyplot as plt
8 | # from sklearn.linear_model import LogisticRegression
9 | # from sklearn.ensemble import RandomForestClassifier
10 | # from sklearn.tree import DecisionTreeClassifier
11 | # from sklearn.svm import SVC
12 | # from sklearn.ensemble import GradientBoostingClassifier
13 | from sklearn.ensemble import AdaBoostClassifier
14 | # from sklearn.naive_bayes import GaussianNB
15 | 
16 | sys.path.append('auxiliary/')  # noqa: E402
17 | from data_processing import load_features, shape_data
18 | from kfold_crosseval import kfold_crosseval
19 | 
20 | # %% Choose settings and classifier
21 | test_season = '2018-2019'  # hold-out season for validation
22 | level = 'match'  # match or team level features to use
23 | shuffle = True  # whether to shuffle or not the data in k-fold cross validation
24 | norm = True  # whether to normalise or not the features
25 | min_round = 5  # minimum number of first rounds to skip in every season
26 | nsplits = 5  # number of folds in k-fold cross validation
27 | nestimators = 115  # this is a classifier-specific setting
28 | rate = 1.1  # this is a classifier-specific setting
29 | random_state = 10  # random state for the classifier
30 | model = AdaBoostClassifier(n_estimators=nestimators, random_state=random_state,
31 |                            learning_rate=rate)
32 | 
33 | # %% load feature data
34 | df = load_features(level)
35 | 
36 | # choose features
37 | if level == 'match':
38 |     feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y',
39 |              'Defence_x', 'Defence_y',
40 |              'form_x', 'form_y',
41 |              'Diff_x', 'Diff_y',
42 |              'Home F4', 'Away F4']
43 | elif level == 'team':
44 |     feats = ['Home', 'Away', 'Position', 'Offence', 'Defence',
45 |              'form', 'F4', 'Diff']
46 | n_feats = len(feats)
47 | 
48 | # seasons for calibration
49 | df = df[df['Season'] != test_season]
50 | 
51 | # %% Re-shape data
52 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm,
53 |                                           min_round=min_round)
54 | 
55 | # %% Embedded feature selection (combinations of features)
56 | # add features one by one
57 | 
58 | # create a copy of the initial X_train.
59 | XX = X_train.copy()
60 | 
61 | # create the indices of the features
62 | allfeats = np.arange(n_feats, dtype=int)
63 | # lists to keep the best features and their accuracy scores.
64 | bestfeats = []
65 | accuracy = []
66 | w_accuracy = []
67 | while len(allfeats) > 0:
68 |     # number of remaining features
69 |     n_temp_feat = len(allfeats)
70 |     print('Number of features to process from:', n_temp_feat)
71 |     # indices of current best features
72 |     c_best = np.array(bestfeats.copy(), dtype=int)
73 |     temp_acc = np.zeros(n_temp_feat)
74 |     temp_wacc = np.zeros(n_temp_feat)
75 |     for n in range(n_temp_feat):
76 |         # append current best features with the features remaining in the list
77 |         # (one by one)
78 |         cfeat = np.append(c_best, allfeats[n])
79 |         print('Indices of features under process:', cfeat)
80 |         # select these features from the total design matrix.
81 | X_train = XX[:, cfeat] 82 | # run k-fold cross validation 83 | temp_acc[n], temp_wacc[n] = kfold_crosseval(X_train, y_train, df, 84 | nsplits, groups=groups, 85 | model=model, 86 | level=level, 87 | shuffle=shuffle) 88 | # find index of max accuracy 89 | nn = np.argmax(temp_acc) 90 | # append list of indices of best features with the index of the new best 91 | # feature 92 | bestfeats.append(allfeats[nn]) 93 | # similarly for accuracy scores 94 | accuracy.append(temp_acc[nn]) 95 | w_accuracy.append(temp_wacc[nn]) 96 | allfeats = np.delete(allfeats, nn) 97 | print('Best Features:', bestfeats) 98 | 99 | print([feats[u] for u in bestfeats]) 100 | 101 | # %% Plots 102 | x = np.arange(1, n_feats + 1) 103 | plt.figure() 104 | plt.plot(x, accuracy, label='Accuracy') 105 | plt.plot(x, w_accuracy, label='W-Accuracy') 106 | plt.xticks(x, x) 107 | plt.minorticks_on() 108 | plt.grid(which='major', linestyle='-') 109 | plt.grid(which='minor', linestyle='--') 110 | # plt.tight_layout() 111 | plt.show() 112 | -------------------------------------------------------------------------------- /feature-extraction/make_standings.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from itertools import permutations 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def make_standings(results, nround): 8 | 9 | if nround < 1: 10 | sys.exit('Game round must be greater than 0') 11 | 12 | results = results[results['Game Round'] <= nround].copy() 13 | home_points = np.ones(results.shape[0], dtype=int) 14 | away_points = np.ones(results.shape[0], dtype=int) 15 | jj = results['Home Score'] > results['Away Score'] 16 | home_points[jj] = 2 17 | away_points[np.logical_not(jj)] = 2 18 | 19 | results['Home Points'] = home_points 20 | results['Away Points'] = away_points 21 | 22 | home = results.groupby(['Home Team ID'])['Home Points', 23 | 'Home Score Regular Period', 24 | 'Away Score Regular Period'].sum() 25 | away = results.groupby(['Away Team ID'])['Away Points', 26 | 'Away Score Regular Period', 27 | 'Home Score Regular Period'].sum() 28 | 29 | groupby = home.merge(away, how='outer', left_index=True, right_index=True) 30 | groupby.fillna(0, inplace=True) 31 | 32 | teamids = np.concatenate((results['Home Team ID'].values, 33 | results['Away Team ID'].values), axis=0) 34 | teams = np.concatenate((results['Home Team'].values, 35 | results['Away Team'].values), axis=0) 36 | dct = dict(zip(teamids, teams)) 37 | 38 | standing = pd.DataFrame() 39 | standing['Team ID'] = groupby.index 40 | standing['Team'] = [dct[u] for u in standing['Team ID'].values] 41 | standing['Points'] = (groupby['Home Points'].values + 42 | groupby['Away Points'].values) 43 | standing['Score+'] = (groupby['Home Score Regular Period_x'].values + 44 | groupby['Away Score Regular Period_y'].values) 45 | standing['Score-'] = (groupby['Away Score Regular Period_x'].values + 46 | groupby['Home Score Regular Period_y'].values) 47 | standing['Score Diff'] = standing['Score+'] - standing['Score-'] 48 | standing.sort_values(by=['Points', 'Score Diff', 'Score+'], inplace=True, 49 | ascending=False) 50 | standing.reset_index(drop=True, inplace=True) 51 | 52 | intcols = ['Team ID', 'Points', 'Score+', 'Score-', 'Score Diff'] 53 | standing[intcols] = standing[intcols].astype(int) 54 | 55 | if nround < standing.shape[0]: 56 | return standing 57 | 58 | nteams = standing.shape[0] 59 | secondary_points = np.zeros(nteams, dtype=int) 60 | score_diffs = np.zeros(nteams, dtype=int) 61 | for p in 
np.unique(standing['Points'].values):
62 |         if np.sum(standing['Points'].values == p) > 1:
63 | 
64 |             # there are ties
65 |             kk = standing['Points'].values == p
66 |             teams = standing['Team ID'].iloc[kk]
67 |             ii = (np.in1d(results['Home Team ID'].values, teams) &
68 |                   np.in1d(results['Away Team ID'].values, teams))
69 |             minichamp = results.iloc[ii]
70 |             home = minichamp.groupby(['Home Team ID'])[
71 |                 'Home Points', 'Home Score Regular Period',
72 |                 'Away Score Regular Period'].sum()
73 |             away = minichamp.groupby(['Away Team ID'])[
74 |                 'Away Points', 'Away Score Regular Period',
75 |                 'Home Score Regular Period'].sum()
76 |             groupby = home.merge(away, how='outer', left_index=True,
77 |                                  right_index=True)
78 | 
79 |             # only those tied teams that have played
80 |             # all against each other twice are ordered
81 |             # by their head-to-head matches.
82 |             flag = True
83 |             for h, a in permutations(groupby.index, 2):
84 |                 if any((results['Home Team ID'].values == h) &
85 |                        (results['Away Team ID'].values == a)):
86 |                     pass
87 |                 else:
88 |                     flag = False
89 | 
90 |             if flag is False:
91 |                 continue
92 | 
93 |             groupby.fillna(0, inplace=True)
94 | 
95 |             teamid = groupby.index
96 |             points = (groupby['Home Points'].values +
97 |                       groupby['Away Points'].values)
98 |             scoreplus = (groupby['Home Score Regular Period_x'].values +
99 |                          groupby['Away Score Regular Period_y'].values)
100 |             scoreminus = (groupby['Away Score Regular Period_x'].values +
101 |                           groupby['Home Score Regular Period_y'].values)
102 |             scores = scoreplus - scoreminus
103 | 
104 |             for team, point, score in zip(teamid, points, scores):
105 |                 secondary_points[standing['Team ID'] == team] = point
106 |                 score_diffs[standing['Team ID'] == team] = score
107 | 
108 |     standing['Secondary Points'] = secondary_points
109 |     standing['Secondary Score Diff'] = score_diffs
110 |     standing.sort_values(by=['Points', 'Secondary Points',
111 |                              'Secondary Score Diff', 'Score Diff'],
112 |                          inplace=True, ascending=False)
113 |     standing.reset_index(drop=True, inplace=True)
114 | 
115 |     return standing
116 | 
--------------------------------------------------------------------------------
README.md:
--------------------------------------------------------------------------------
1 | # Euroleague Basketball Data Analysis and Prediction
2 | 
3 | This repository includes an end-to-end methodology for building machine learning algorithms for predicting Euroleague Basketball game outcomes.
4 | 
5 | The methodology and results are discussed in detail in this article published on arxiv.org, entitled "[Descriptive and Predictive Analysis of Euroleague Basketball Games and the Wisdom of Basketball Crowds](https://arxiv.org/abs/2002.08465)"
6 | 
7 | The repository consists of the following modules (which represent the logical steps in the modelling process):
8 | 
9 | * `data-collection`
10 | * `feature-extraction`
11 | * `descriptive-analysis`
12 | * `model-selection`
13 | * `feature-selection`
14 | * `model-validation`
15 | 
16 | Data extraction and storage settings are specified in the `settings/` directory.
17 | 
18 | ## Data Collection
19 | 
20 | Data is collected by scraping [Euroleague](https://www.euroleague.net/)'s official website.
21 | 
22 | In data collection, there are three scripts for collecting three types of data:
23 | 
24 | * Team statistics per game, such as offence and defence scores, rebounds, steals, assists, etc., for each team (row) in every game in a season.
25 | * Game results. Each row corresponds to a game in a season. Teams and final scores are given.
26 | * Standing data. Each row corresponds to the standing of a team in the table at the end of a round. All rounds are included.
27 | 
28 | To collect the data for a season, the user should run the script with the console argument set to the end year of the season, e.g. for season 2017-2018, execute
29 | 
30 | `$ python data-collection/scrap_game_stats.py -s 2018`
31 | 
32 | Similarly for the collection of the other data types.
33 | 
34 | Data is stored in the directory specified in the `settings/data_collection.json` file.
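A collected file can then be inspected directly with pandas, e.g. (a sketch, assuming season 2017-2018 has already been scraped; the file name follows the `<prefix>_<start>_<end>.csv` pattern used by the scripts):

```python
import pandas as pd

df = pd.read_csv('data/euroleague_results_2017_2018.csv', parse_dates=['Date'])
print(df[['Round', 'Home Team', 'Away Team', 'Home Score', 'Away Score']].head())
```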
35 | 
36 | ## Feature Extraction
37 | 
38 | Features are extracted from the data collected. Features are split into two main categories:
39 | 
40 | * Match-level features. Every observation (row) corresponds to a match in a season. Features include average offence, average defence, form, etc., for each team in a game.
41 | * Team-level features. Every observation (row) corresponds to a team in a game in a season. Features include average offence, average defence, etc., for each team in a game.
42 | 
43 | To extract features, run the script with the console argument set to the end year of the season, e.g. for season 2017-2018, execute
44 | 
45 | `$ python feature-extraction/extract_features.py -s 2018`
46 | 
47 | Feature files are stored in the directory and with the name patterns specified in the `settings/feature_extraction.json` file.
48 | 
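The extracted features can be loaded back through the auxiliary helpers (a sketch, assuming the repository root is the working directory and features have already been extracted):

```python
import sys
sys.path.append('auxiliary/')
from data_processing import load_features, shape_data

df = load_features('match')  # or 'team'
feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y']  # any feature subset
X, y, df, groups = shape_data(df, feats, norm=True, min_round=5)
print(X.shape, y.shape)
```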
63 | ## Feature Selection 64 | 65 | Different methods of feature selection are performed in this module. 66 | 67 | * Filter methods, including the mutual information, chi-squared and ANOVA F-statistic measures. Results are evaluated for a chosen algorithm using k-fold cross-validation on the training set. 68 | 69 | * Feature transformation methods using Principal Component Analysis (PCA). Results for increasing numbers of principal components are evaluated for a chosen algorithm using k-fold cross-validation on the training set. 70 | 71 | * Wrapper method for feature selection, i.e. subsets of features are generated and evaluated using a chosen algorithm and its hyper-parameters. Here, as the number of features is relatively small, we are able to generate all possible combinations of features. If the number of features grows large, a different approach should be adopted, such as Sequential Forward Selection; see the 72 | `feature_selection_wrapper_sfs.py` script. 73 | 74 | As before, some setting parameters are hard-coded, so the user should edit those before running the script. 75 | 76 | ## Model Validation 77 | 78 | After choosing the best performing algorithm, selecting features and tuning the optimal hyper-parameters, we validate the final model(s) on the test set. 79 | 80 | The validation of the model is performed in a Jupyter notebook, see `validation.ipynb`. The notebook also includes an assessment of, and a comparison with, the wisdom-of-the-crowds model. 81 | 82 | The directory also includes a script for assessing the performance of a few benchmark models (a scoring sketch is given after the run command below). These are: 83 | 1. Home team always wins. 84 | 2. F4 teams (i.e. teams that reached the F4 in the previous season) always win when playing against a non-F4 team; otherwise the home team always wins. 85 | 3. Persistence model: teams that won in the previous round win; if both teams won, the home team wins. 86 | 4. Standings model: the team higher in the standings wins. 87 | 5. Panathinaikos always wins; otherwise the home team always wins. 88 | 6. Random model. 89 | 90 | To run the benchmark models, execute 91 | 92 | `$ python model-validation/benchmarks.py` 93 |
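For illustration, a minimal sketch of how the simplest benchmark ("home team always wins") is scored; the tiny table below is made up, whereas real runs read the scraped results CSVs:

```python
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# toy results table; real data comes from the euroleague_results CSV files
data = pd.DataFrame({'Home Score': [80, 71, 90, 66],
                     'Away Score': [75, 78, 85, 70]})

# encode outcomes as in benchmarks.py: 1 = home win, 2 = away win
data['Actual'] = np.where(data['Home Score'] > data['Away Score'], 1, 2)
data['Home Wins'] = np.ones(data.shape[0], dtype=int)  # benchmark prediction

print('accuracy:', accuracy_score(data['Actual'], data['Home Wins']))
print('weighted accuracy:',
      balanced_accuracy_score(data['Actual'], data['Home Wins']))
```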
-------------------------------------------------------------------------------- /model-selection/cross_validation_one_param_models.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Hyper-parameter tuning using k-fold cross-validation for one-hyper-parameter 3 | models via a loop-based grid search. This script is kept for legacy reasons; see 4 | also `gridsearch_cross_validation.py`, which covers multiple-hyper-parameter 5 | models. 6 | ''' 7 | import sys 8 | import numpy as np 9 | from matplotlib import pyplot as plt 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.tree import DecisionTreeClassifier 13 | from sklearn.svm import SVC 14 | from sklearn.naive_bayes import GaussianNB 15 | from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier 16 | from sklearn.neighbors import KNeighborsClassifier 17 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 18 | sys.path.append('auxiliary/') # noqa: E402 19 | from data_processing import load_features, shape_data 20 | from kfold_crosseval import kfold_crosseval 21 | 22 | 23 | # %% Choose settings and classifier 24 | test_season = '2018-2019' # hold-out season for validation 25 | level = 'team' # match or team level features to use 26 | shuffle = True # whether to shuffle or not the data in k-fold cross validation 27 | norm = True # whether to normalise or not the features 28 | min_round = 5 # minimum number of first rounds to skip in every season 29 | nsplits = 5 # number of folds in k-fold cross validation 30 | method = 'log-reg' # method for grid search hyper-parameter training, see list 31 | # methods: 'log-reg', 'svm-linear', 'svm-rbf', 'decision-tree', 'random-forest', 32 | # 'naive-bayes', 'gradient-boosting', 'ada', 'ada2', 'knn', 33 | # 'discriminant-analysis' 34 | 35 | print('level: %s - norm: %r - shuffle: %r - method: %s' % 36 | (level, norm, shuffle, method)) 37 | 38 | # %% load feature data 39 | df = load_features(level) 40 | 41 | # choose features 42 | if level == 'match': 43 | feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y', 44 | 'Defence_x', 'Defence_y', 'form_x', 'form_y', 'Diff_x', 'Diff_y', 45 | 'Home F4', 'Away F4'] 46 | elif level == 'team': 47 | feats = ['Home', 'Away', 'Position', 48 | 'Offence', 'Defence', 'form', 'F4', 'Diff'] 49 | 50 | # seasons for calibration 51 | df = df[df['Season'] != test_season] 52 | 53 | # %% Re-shape data 54 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm, 55 | min_round=min_round) 56 | print('Number of features:', X_train.shape[1], feats) 57 | print('Number of obs:', X_train.shape[0]) 58 | 59 | # %% Set parameters 60 | if method == 'log-reg': 61 | params = np.sort(np.concatenate((np.logspace(-5, 8, 14), 62 | 5 * np.logspace(-5, 8, 14)), axis=0)) 63 | elif method == 'svm-linear': 64 | params = np.sort(np.concatenate((np.logspace(-5, 6, 12), 65 | 5 * np.logspace(-5, 6, 12)), axis=0)) 66 | elif method == 'decision-tree': 67 | params = np.array([0]) 68 | elif method == 'random-forest': 69 | params = np.arange(10, 100, 5) 70 | elif method == 'naive-bayes': 71 | params = np.array([0]) 72 | elif method == 'gradient-boosting': 73 | params = np.arange(10, 200, 10) 74 | elif method == 'ada': 75 | params = np.arange(5, 200, 3) 76 | elif method == 'knn': 77 | params = np.arange(3, 30, 2) 78 | elif method == 'discriminant-analysis': 79 | params = np.array([0]) 80 | else: 81 | sys.exit('Method not recognised') 82 | 83 | # %% Tune parameters 84 | accuracy = np.zeros(params.shape[0]) 85 | w_accuracy = np.zeros(params.shape[0]) 86 | 87 | for j, param in enumerate(params): 88 | 89 | # update model's parameters 90 | if method == 'log-reg': 91 | model = LogisticRegression(C=param, solver='liblinear', 92 | class_weight='balanced') 93 | elif method == 'svm-linear': 94 | model = SVC(C=param, kernel='linear', class_weight='balanced', 95 | probability=True) 96 | elif method == 'decision-tree': 97 |
model = DecisionTreeClassifier(class_weight='balanced', random_state=10) 98 | elif method == 'random-forest': 99 | model = RandomForestClassifier(n_estimators=param, 100 | class_weight='balanced', 101 | random_state=10) 102 | elif method == 'naive-bayes': 103 | model = GaussianNB() 104 | elif method == 'gradient-boosting': 105 | model = GradientBoostingClassifier(n_estimators=param, random_state=10) 106 | elif method == 'ada': 107 | model = AdaBoostClassifier(n_estimators=param, random_state=10, 108 | learning_rate=0.6) 109 | elif method == 'knn': 110 | model = KNeighborsClassifier(n_neighbors=param) 111 | elif method == 'discriminant-analysis': 112 | model = QuadraticDiscriminantAnalysis() 113 | else: 114 | sys.exit('method name is not valid') 115 | 116 | # apply k-fold cross validation 117 | accuracy[j], w_accuracy[j] = kfold_crosseval(X_train, y_train, 118 | df, nsplits, groups=groups, 119 | model=model, level=level, 120 | shuffle=shuffle) 121 | 122 | # %% Plots 123 | if params.shape[0] > 1: 124 | print('Accuracy: ', np.round(np.max(accuracy), 4)) 125 | print('Weighted Accuracy: ', np.round(np.max(w_accuracy), 4)) 126 | plt.figure() 127 | plt.plot(params, accuracy, label='accuracy') 128 | plt.plot(params, w_accuracy, label='w_accuracy') 129 | if method in ['log-reg', 'svm-linear']: 130 | plt.xscale('log') 131 | plt.xlabel('parameter') 132 | plt.ylabel('Score') 133 | plt.legend() 134 | plt.title(method) 135 | plt.show() 136 | else: 137 | print('Accuracy: ', accuracy.mean(axis=0)) 138 | print('Weighted Accuracy: ', w_accuracy.mean(axis=0)) 139 | -------------------------------------------------------------------------------- /feature-selection/feature_selection_filter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Filter methods for feature selection. Measures used are: 3 | 1) Mutual information 4 | 2) Chi square 5 | 3) ANOVA F-statistic 6 | 7 | Results are evaluated for a chosen algorithm using k-fold 8 | cross-validation on the training set.
9 | ''' 10 | import sys 11 | import numpy as np 12 | from matplotlib import pyplot as plt 13 | from tqdm import tqdm 14 | # from sklearn.linear_model import LogisticRegression 15 | # from sklearn.ensemble import RandomForestClassifier 16 | # from sklearn.tree import DecisionTreeClassifier 17 | # from sklearn.svm import SVC 18 | # from sklearn.ensemble import GradientBoostingClassifier 19 | from sklearn.ensemble import AdaBoostClassifier 20 | # from sklearn.naive_bayes import GaussianNB 21 | from sklearn.feature_selection import SelectKBest, f_classif, chi2 22 | from sklearn.feature_selection import mutual_info_classif 23 | sys.path.append('auxiliary/') # noqa: E402 24 | from data_processing import load_features, shape_data 25 | from kfold_crosseval import kfold_crosseval 26 | 27 | 28 | def plot_accuracy(x, accuracy, w_accuracy, title=''): 29 | plt.figure() 30 | plt.plot(x, accuracy, label='Accuracy') 31 | plt.plot(x, w_accuracy, label='W-Accuracy') 32 | plt.xlabel('Number of features') 33 | plt.ylabel('Score') 34 | plt.xticks(x, x) 35 | plt.minorticks_on() 36 | plt.grid(which='major', linestyle='-') 37 | plt.grid(which='minor', linestyle='--') 38 | plt.title(title) 39 | plt.legend() 40 | plt.show() 41 | return 42 | 43 | 44 | def mutual_info_classif2(X, y, discrete_features='auto', n_neighbors=3, 45 | copy=True, random_state=10): 46 | return mutual_info_classif(X, y, discrete_features=discrete_features, 47 | n_neighbors=n_neighbors, copy=copy, 48 | random_state=random_state) 49 | 50 | 51 | # %% Choose settings and classifier 52 | test_season = '2018-2019' # hold-out season for validation 53 | level = 'match' # match or team level features to use 54 | shuffle = True # whether to shuffle or not the data in k-fold cross validation 55 | norm = True # whether to normalise or not the features 56 | min_round = 5 # minimum number of first rounds to skip in every season 57 | nsplits = 5 # number of folds in k-fold cross validation 58 | random_state = 10 # random state for the classifier 59 | model = AdaBoostClassifier(n_estimators=121, random_state=random_state, 60 | learning_rate=1.0) 61 | 62 | # %% load feature data 63 | df = load_features(level) 64 | 65 | # choose features 66 | if level == 'match': 67 | feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y', 68 | 'Defence_x', 'Defence_y', 69 | 'form_x', 'form_y', 70 | 'Diff_x', 'Diff_y', 71 | 'Home F4', 'Away F4'] 72 | elif level == 'team': 73 | feats = ['Home', 'Away', 'Position', 'Offence', 'Defence', 74 | 'form', 'F4', 'Diff'] 75 | n_feats = len(feats) 76 | 77 | # seasons for calibration 78 | df = df[df['Season'] != test_season] 79 | 80 | # %% Re-shape data 81 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm, 82 | min_round=min_round) 83 | 84 | # %% Filter based feature selection (model independent) 85 | fscores, ps = f_classif(X_train, y_train) 86 | mscores = mutual_info_classif(X_train, y_train, random_state=10) 87 | chiscores, _ = chi2(X_train, y_train) 88 | ordered_fc = [feats[u] for u in np.argsort(fscores)[::-1]] 89 | ordered_mi = [feats[u] for u in np.argsort(mscores)[::-1]] 90 | ordered_ch = [feats[u] for u in np.argsort(chiscores)[::-1]] 91 | print('F scores', ordered_fc) 92 | print('MI scores', ordered_mi) 93 | print('Chi scores', ordered_ch) 94 | 95 | # %% 96 | accuracy = np.zeros((n_feats, 3)) 97 | w_accuracy = np.zeros((n_feats, 3)) 98 | feats_fs = [] 99 | feats_mi = [] 100 | for i, n in enumerate(tqdm(range(1, n_feats + 1))): 101 | kk = n if n < n_feats else 'all' 102 | skb_fc = SelectKBest(f_classif, k=kk) 
103 | skb_mi = SelectKBest(mutual_info_classif2, k=kk) 104 | skb_ch = SelectKBest(chi2, k=kk) 105 | X_fc = skb_fc.fit_transform(X_train, y_train) 106 | X_mi = skb_mi.fit_transform(X_train, y_train) 107 | X_ch = skb_ch.fit_transform(X_train, y_train) 108 | 109 | # print('MI:', skb_mi.scores_) 110 | # print(skb_mi.get_support()) 111 | 112 | accuracy[i, 0], w_accuracy[i, 0] = kfold_crosseval(X_fc, y_train, df, 113 | nsplits, groups=groups, 114 | model=model, 115 | level=level, 116 | shuffle=shuffle) 117 | accuracy[i, 1], w_accuracy[i, 1] = kfold_crosseval(X_mi, y_train, df, 118 | nsplits, groups=groups, 119 | model=model, 120 | level=level, 121 | shuffle=shuffle) 122 | accuracy[i, 2], w_accuracy[i, 2] = kfold_crosseval(X_ch, y_train, df, 123 | nsplits, groups=groups, 124 | model=model, 125 | level=level, 126 | shuffle=shuffle) 127 | 128 | # %% Plots 129 | 130 | x = np.arange(1, n_feats + 1) 131 | plot_accuracy(x, accuracy[:, 0], w_accuracy[:, 0], title='ANOVA') 132 | plot_accuracy(x, accuracy[:, 1], w_accuracy[:, 1], title='MI') 133 | plot_accuracy(x, accuracy[:, 2], w_accuracy[:, 2], title='Chi2') 134 | 135 | scores = np.concatenate((fscores[:, None], mscores[:, None], 136 | chiscores[:, None]), axis=1) 137 | order = np.argsort(scores, axis=0) 138 | ranks = order.argsort(axis=0) 139 | 140 | plt.figure() 141 | plt.imshow((scores.shape[0] - ranks).T) 142 | plt.yticks(ticks=[0, 1, 2], labels=['ANOVA', 'MI', 'Chi2']) 143 | plt.xticks(ticks=np.arange(len(feats)), labels=feats, rotation='vertical') 144 | plt.colorbar(orientation='horizontal', pad=0.3) 145 | plt.show() 146 | -------------------------------------------------------------------------------- /data-collection/scrap_game_stats.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import logging 5 | import re 6 | from datetime import datetime 7 | from tqdm import trange 8 | from bs4 import BeautifulSoup 9 | import requests 10 | import pandas as pd 11 | sys.path.append('auxiliary/') # noqa: E402 12 | from io_json import read_json 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | 16 | 17 | def main(season, n_rounds): 18 | ''' 19 | Extract game stats for all matches in a given season 20 | ''' 21 | 22 | # read settings 23 | settings = read_json('settings/data_collection.json') 24 | out_dir = settings['output_dir'] 25 | url_pattern = settings['game_stats']['url_link'] 26 | out_file_prefix = settings['game_stats']['output_file_prefix'] 27 | filename = '%s_%d_%d.csv' % (out_file_prefix, season - 1, season) 28 | filepath = os.path.join(out_dir, filename) 29 | 30 | regex = re.compile(r'score [a-z\s]*pts[a-z\s]*') 31 | allteamstats = [] 32 | season_str = '%d-%d' % (season - 1, season) 33 | header = ['Season', 'Round', 'GameID', 'Date', 'Team', 'Where', 34 | 'Offence', 'Defence'] 35 | 36 | for game_round in trange(1, n_rounds + 1): 37 | url = (url_pattern % (game_round, season - 1)) 38 | try: 39 | r = requests.get(url) 40 | # note: requests raises its own ConnectionError, not the builtin one 41 | except requests.exceptions.ConnectionError: 42 | sys.exit('Connection Error.
Check URL') 43 | data = r.text 44 | soup = BeautifulSoup(data, 'html.parser') 45 | 46 | for game in soup.find_all('div', attrs={'class': 'game played'}): 47 | data_code = game.attrs['data-code'] 48 | gameid = '%d_%d_%d_%s' % (season - 1, season, 49 | game_round, data_code) 50 | home_team = game.find_all('span', attrs={'class': 'name'})[0].string 51 | away_team = game.find_all('span', attrs={'class': 'name'})[1].string 52 | 53 | scores = game.find_all('span', attrs={'class': regex}) 54 | home_score = int(scores[0]['data-score'] if 55 | scores[0].has_attr('data-score') else 56 | scores[0].string) 57 | away_score = int(scores[1]['data-score'] if 58 | scores[1].has_attr('data-score') else 59 | scores[1].string) 60 | 61 | date_str = game.find('span', attrs={'class': 'date'}).string 62 | date = datetime.strptime(date_str, '%B %d %H:%M CET') 63 | # games from September to December belong to the season's starting year 64 | yr = season - 1 if date.month > 8 else season 65 | date = date.replace(year=yr) 66 | date_str = datetime.strftime(date, '%Y-%m-%d %H:%M:%S') 67 | 68 | home = {'Season': season_str, 69 | 'Round': game_round, 70 | 'GameID': gameid, 71 | 'Date': date_str, 'Team': home_team, 'Where': 'Home', 72 | 'Offence': home_score, 'Defence': away_score} 73 | away = {'Season': season_str, 74 | 'Round': game_round, 75 | 'GameID': gameid, 76 | 'Date': date_str, 'Team': away_team, 'Where': 'Away', 77 | 'Offence': away_score, 'Defence': home_score} 78 | 79 | # follow the game-centre link 80 | link = (game.find_all('a', attrs={'class': 'game-link'})[0] 81 | .attrs['href']) 82 | fulllink = 'http://www.euroleague.net/' + link 83 | try: 84 | r = requests.get(fulllink) 85 | except requests.exceptions.ConnectionError: 86 | sys.exit('Connection Error. Check Game URL') 87 | gamedata = r.text 88 | gamesoup = BeautifulSoup(gamedata, 'html.parser') 89 | totals = gamesoup.find_all('tr', attrs={'class': 'TotalFooter'}) 90 | for i, t in enumerate(totals): 91 | if i == 0: 92 | # home team stats 93 | dics = home.copy() 94 | elif i == 1: 95 | # away team stats 96 | dics = away.copy() 97 | else: 98 | err_msg = 'Totals field returned invalid number of teams' 99 | raise ValueError(err_msg) 100 | stats = t.find_all('span') 101 | for stat in stats: 102 | # ignore total time played field 103 | fullfield = stat.attrs['id'] 104 | if 'TotalTimePlayed' not in fullfield: 105 | ii = fullfield.find('_lbl') 106 | field = fullfield[ii + 9:] 107 | string = stat.contents[0] 108 | if string.isnumeric(): 109 | f = int(string) 110 | dics[field] = f 111 | if field not in header: 112 | header.append(field) 113 | elif '/' in string: 114 | made, attmp = string.split('/') 115 | dics[field + '-Made'] = int(made) 116 | dics[field + '-Attempted'] = int(attmp) 117 | if field + '-Made' not in header: 118 | header.append(field + '-Made') 119 | if field + '-Attempted' not in header: 120 | header.append(field + '-Attempted') 121 | else: 122 | raise ValueError('Invalid field value') 123 | allteamstats.append(dics) 124 | 125 | logging.info('Convert to dataframe') 126 | df = pd.DataFrame(allteamstats, columns=header) 127 | 128 | logging.info('Save to file') 129 | df.to_csv(filepath, index=False) 130 | 131 | return df 132 | 133 | 134 | if __name__ == "__main__": 135 | parser = argparse.ArgumentParser() 136 | parser.add_argument('-s', '--season', required=True, type=int, 137 | help="the ending year of the season") 138 | parser.add_argument('-n', '--n-rounds', default=34, 139 | type=int, 140 | help="The number of regular season rounds " 141 | "in the season") 142 | args = parser.parse_args() 143 | 144 | main(args.season,
args.n_rounds) 145 | -------------------------------------------------------------------------------- /model-validation/benchmarks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Performs simple analysis and evaluates the scoring of simple benchmark models: 3 | 1) Home team always wins 4 | 2) F4 teams always win when playing against a non-F4 team, 5 | otherwise the home team always wins. 6 | 3) Persistence model: teams that won in the previous round win; 7 | if both teams have won, the home team wins. 8 | 4) Standings model: the team higher in the standings wins. 9 | 5) Panathinaikos always wins; otherwise the home team always wins. 10 | 6) Random model. 11 | """ 12 | import sys 13 | import os 14 | import argparse 15 | import numpy as np 16 | import pandas as pd 17 | from sklearn.metrics import accuracy_score, balanced_accuracy_score 18 | from sklearn.metrics import roc_auc_score 19 | sys.path.append('auxiliary') # noqa: E402 20 | from io_json import read_json 21 | 22 | 23 | def main(season): 24 | 25 | # get settings 26 | settings = read_json('settings/data_collection.json') 27 | out_dir = settings['output_dir'] 28 | rslts_file_prefix = settings['season_results']['output_file_prefix'] 29 | rslts_filename = '%s_%d_%d.csv' % (rslts_file_prefix, season - 1, season) 30 | stnds_file_prefix = settings['season_standings']['output_file_prefix'] 31 | stnds_filename = '%s_%d_%d.csv' % (stnds_file_prefix, season - 1, season) 32 | 33 | # read input data (results and standings) 34 | rslts_filepath = os.path.join(out_dir, rslts_filename) 35 | stnds_filepath = os.path.join(out_dir, stnds_filename) 36 | data = pd.read_csv(rslts_filepath) 37 | standings = pd.read_csv(stnds_filepath) 38 | f4teams = read_json(settings['f4teams_file']) 39 | 40 | # Specify the F4 teams of the previous year 41 | f4Teams = f4teams[str(season - 1)] 42 | 43 | # Checks 44 | flag = False 45 | stand_teams = np.unique(standings['Club Name']) 46 | resul_teams = np.unique(data['Home Team']) 47 | if not np.in1d(stand_teams, resul_teams).all(): 48 | ii = ~np.in1d(stand_teams, resul_teams) 49 | print(stand_teams[ii]) 50 | flag = True 51 | if not np.in1d(resul_teams, stand_teams).all(): 52 | ii = ~np.in1d(resul_teams, stand_teams) 53 | print(resul_teams[ii]) 54 | flag = True 55 | 56 | if flag: 57 | sys.exit('Fix inconsistencies in team names') 58 | 59 | nmatches = data.shape[0] 60 | 61 | data['Actual'] = np.where(data['Home Score'] > data['Away Score'], 1, 2) 62 | data['Home Wins'] = np.ones(nmatches, dtype=int) 63 | 64 | # f4 model: the F4 teams of the previous year always win. 65 | # If neither or both teams in a game are F4 teams, home always wins. 66 | f4wins = np.ones(nmatches, dtype=int) 67 | hmf4 = np.in1d(data['Home Team'], f4Teams) 68 | awf4 = np.in1d(data['Away Team'], f4Teams) 69 | f4wins[awf4 & (~hmf4)] = 2 70 | data['F4 Wins'] = f4wins 71 | 72 | # persistence model: a team that won its previous game wins. If neither or 73 | # both teams won their last game, home always wins. 74 | # standings model: the team that is higher in the standings wins.
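    # Note: for round r, the standings model reads the table after round
    # r - 1, while the persistence model infers each team's previous result
    # by comparing its 'Wins' count after round r - 1 with that after round
    # r - 2 (for r == 2, a team counts as having won if it has any win after
    # round 1).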
75 | persistence = np.ones(nmatches, dtype=int) 76 | stand = np.ones(nmatches, dtype=int) 77 | for r in np.unique(data['Round']): 78 | if r == 1: 79 | continue 80 | 81 | # standings model 82 | s = standings[standings['Round'] == r - 1] 83 | d = data[data['Round'] == r] 84 | 85 | home_stands = np.array([s[s['Club Name'] == u]['Position'].iloc[0] 86 | for u in d['Home Team']]) 87 | away_stands = np.array([s[s['Club Name'] == u]['Position'].iloc[0] 88 | for u in d['Away Team']]) 89 | stand[data['Round'] == r] = np.where(home_stands < away_stands, 1, 2) 90 | 91 | # persistence model 92 | if r == 2: 93 | home_won = np.array([1 if s[s['Club Name'] == u]['Wins'].iloc[0] > 0 94 | else 0 for u in d['Home Team']]) 95 | away_won = np.array([1 if s[s['Club Name'] == u]['Wins'].iloc[0] > 0 96 | else 0 for u in d['Away Team']]) 97 | else: 98 | s_prev = standings[standings['Round'] == r - 2] 99 | home_won = np.array([1 if s[s['Club Name'] == u]['Wins'].iloc[0] > 100 | s_prev[s_prev['Club Name'] == u]['Wins'] 101 | .iloc[0] 102 | else 0 for u in d['Home Team']]) 103 | away_won = np.array([1 if s[s['Club Name'] == u]['Wins'].iloc[0] > 104 | s_prev[s_prev['Club Name'] == u]['Wins'] 105 | .iloc[0] 106 | else 0 for u in d['Away Team']]) 107 | persistence[data['Round'] == r] = np.where(away_won > home_won, 2, 1) 108 | 109 | data['Standings'] = stand 110 | data['Persistence'] = persistence 111 | # Pana model: Pana always wins; in any other game, home always wins 112 | data['Pana'] = np.where(data['Away Team'] == 113 | 'Panathinaikos Superfoods Athens', 2, 1) 114 | 115 | # Random model, for 1000 iterations, randomly assign the results of 116 | # the games 117 | random = np.zeros(1000) 118 | for i in range(1000): 119 | rand = np.random.randint(1, 3, nmatches) 120 | random[i] = np.sum(data['Actual'].values == rand) 121 | 122 | rounds_excl = [1] 123 | print('Excluded rounds from evaluation:', rounds_excl) 124 | data = data[~np.in1d(data['Round'], rounds_excl)] 125 | 126 | print('Number of games:', data.shape[0]) 127 | print('Home wins :', np.sum(data['Actual'] == data['Home Wins'])) 128 | print('Top4 wins :', np.sum(data['Actual'] == data['F4 Wins'])) 129 | print('Persistence:', np.sum(data['Actual'] == data['Persistence'])) 130 | print('Standing :', np.sum(data['Actual'] == data['Standings'])) 131 | print('Pana :', np.sum(data['Actual'] == data['Pana'])) 132 | print('Random :', np.round(np.mean(random), 0)) 133 | print('Home wins : accuracy: %f, weighted accuracy: %f, auc: %f:' 134 | % (accuracy_score(data['Actual'].values, data['Home Wins'].values), 135 | balanced_accuracy_score(data['Actual'].values, 136 | data['Home Wins'].values), 137 | roc_auc_score(data['Actual'].values, data['Home Wins'].values))) 138 | print('Top4 wins : accuracy: %f, weighted accuracy: %f, auc: %f:' 139 | % (accuracy_score(data['Actual'].values, data['F4 Wins'].values), 140 | balanced_accuracy_score(data['Actual'].values, 141 | data['F4 Wins'].values), 142 | roc_auc_score(data['Actual'].values, data['F4 Wins'].values))) 143 | print('Standing : accuracy: %f, weighted accuracy: %f, auc: %f:' 144 | % (accuracy_score(data['Actual'].values, data['Standings'].values), 145 | balanced_accuracy_score(data['Actual'].values, 146 | data['Standings'].values), 147 | roc_auc_score(data['Actual'].values, data['Standings'].values))) 148 | return 149 | 150 | 151 | if __name__ == "__main__": 152 | parser = argparse.ArgumentParser() 153 | parser.add_argument('-s', '--season', type=int, 154 | help="the ending year of the season") 155 | args =
parser.parse_args() 156 | 157 | if args.season is None: 158 | parser.print_help() 159 | else: 160 | main(args.season) 161 | -------------------------------------------------------------------------------- /descriptive-analysis/descriptive_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Descriptive analysis of the Euroleague data. The analysis focuses on home/away scores and wins and estimates the probability of winning when scoring at least N points." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Change directory (`cd`) to the project root directory" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "cd .." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import glob\n", 33 | "import numpy as np\n", 34 | "import pandas as pd\n", 35 | "\n", 36 | "import plotly.express as px\n", 37 | "\n", 38 | "from utils import make_scatter_plot, make_bar_plot, make_scatter_plot_at_least_n_points" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "### Load Data" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "files_pattern = 'data/euroleague_results*csv'" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "data_list_files = glob.glob(files_pattern)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "df = pd.concat([pd.read_csv(f) for f in data_list_files], ignore_index=True)\n", 73 | "df.reset_index(drop=True, inplace=True)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "df['Game Result'] = np.where(df['Home Score'] > df['Away Score'], 1, 2)\n", 83 | "df['Score Difference'] = np.abs(df['Home Score'] - df['Away Score'])" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "### Reshape the data" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "df_flat = pd.melt(df, id_vars=['Season', 'Round', 'Game Result'],\n", 100 | " value_vars=['Home Score', 'Away Score'],\n", 101 | " var_name='Loc', value_name='Score')\n", 102 | "df_flat['Loc'] = df_flat['Loc'].apply(lambda x: x.split(' ')[0])\n", 103 | "df_flat['Team Result'] = np.where(((df_flat['Game Result'] == 1) &\n", 104 | " (df_flat['Loc'] == 'Home')) |\n", 105 | " ((df_flat['Game Result'] == 2) &\n", 106 | " (df_flat['Loc'] == 'Away')), 'W', 'L')\n", 107 | "\n", 108 | "df_flat['Season_int'] = df_flat['Season'].apply(lambda x: int(x[-4:]))" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Stat Table" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "dfgroup = df_flat.groupby(['Season', 'Loc'])['Score'].mean().unstack('Loc')\n", 125 | "dfgroup.columns = ['Away Mean Score', 'Home Mean Score']\n", 126 |
"dfgroup.reset_index(inplace=True)\n", 127 | "\n", 128 | "dff = (df.groupby(['Season', 'Game Result'])['Game Result'].\n", 129 | " count().unstack('Game Result'))\n", 130 | "dff.columns = ['Home Wins', 'Away Wins']\n", 131 | "dff.reset_index(inplace=True)\n", 132 | "\n", 133 | "dfgroup = dfgroup.merge(dff, on='Season')\n", 134 | "\n", 135 | "dff = (df_flat.groupby(['Season', 'Game Result'])['Score'].\n", 136 | " mean().unstack('Game Result'))\n", 137 | "dff.columns = ['Home Win Mean Score', 'Away Win Mean Score']\n", 138 | "dff.reset_index(inplace=True)\n", 139 | "\n", 140 | "dfgroup = dfgroup.merge(dff, on='Season')\n", 141 | "\n", 142 | "print(dfgroup)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### Plots: Home/Away Scores" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "fig = px.box(df_flat, x=\"Season\", y=\"Score\", color=\"Loc\", notched=True)\n", 159 | "fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'})\n", 160 | "fig.show()\n", 161 | "\n", 162 | "fig = px.box(df, x=\"Season\", y=\"Home Score\", color=\"Game Result\", notched=True)\n", 163 | "fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'})\n", 164 | "fig.show()\n", 165 | "\n", 166 | "fig = px.box(df, x=\"Season\", y=\"Away Score\", color=\"Game Result\", notched=True)\n", 167 | "fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'})\n", 168 | "fig.show()\n", 169 | "\n", 170 | "fig = px.box(df, x=\"Season\", y=\"Score Difference\", color=\"Game Result\",\n", 171 | " notched=True)\n", 172 | "fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'})\n", 173 | "fig.show()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "### Scatter plots - probability of winning when scoring at least N points" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "make_scatter_plot_at_least_n_points([df_flat, df_flat[df_flat['Loc'] == 'Home'],\n", 190 | " df_flat[df_flat['Loc'] == 'Away']],\n", 191 | " ['All', 'Home', 'Away'])\n", 192 | "\n", 193 | "make_scatter_plot_at_least_n_points([df_flat[df_flat['Season_int'] == 2017],\n", 194 | " df_flat[df_flat['Season_int'] == 2018],\n", 195 | " df_flat[df_flat['Season_int'] == 2019]],\n", 196 | " ['2017', '2018', '2019'])" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "### Scatter plots - probability of winning when scoring points in a range." 
204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "#### Bar plots" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "make_bar_plot([df_flat, df_flat[df_flat['Loc'] == 'Home'],\n", 220 | " df_flat[df_flat['Loc'] == 'Away']],\n", 221 | " ['All', 'Home', 'Away'])\n", 222 | "\n", 223 | "make_bar_plot([df_flat[df_flat['Season_int'] == 2017],\n", 224 | " df_flat[df_flat['Season_int'] == 2018],\n", 225 | " df_flat[df_flat['Season_int'] == 2019]],\n", 226 | " ['2017', '2018', '2019'])" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "#### Scatter plots" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "make_scatter_plot([df_flat, df_flat[df_flat['Loc'] == 'Home'],\n", 243 | " df_flat[df_flat['Loc'] == 'Away']],\n", 244 | " ['All', 'Home', 'Away'])\n", 245 | "\n", 246 | "make_scatter_plot([df_flat[df_flat['Season_int'] == 2017],\n", 247 | " df_flat[df_flat['Season_int'] == 2018],\n", 248 | " df_flat[df_flat['Season_int'] == 2019]],\n", 249 | " ['2017', '2018', '2019'])" 250 | ] 251 | } 252 | ], 253 | "metadata": { 254 | "anaconda-cloud": {}, 255 | "kernelspec": { 256 | "display_name": "Python 3", 257 | "language": "python", 258 | "name": "python3" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 3 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython3", 270 | "version": "3.7.4" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 4 275 | } 276 | -------------------------------------------------------------------------------- /model-selection/gridsearch_cross_validation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Hyper-parameter tuning using k-fold cross-validation for any number of 3 | parameters using scikit-learn's grid search. This script covers both the 4 | `cross_validation_one_param_models.py` and 5 | `cross_validation_two_param_models.py` scripts.
6 | ''' 7 | import sys 8 | import numpy as np 9 | from matplotlib import pyplot as plt 10 | from sklearn.model_selection import StratifiedKFold, GroupKFold 11 | from sklearn.linear_model import LogisticRegression 12 | from sklearn.ensemble import RandomForestClassifier 13 | from sklearn.tree import DecisionTreeClassifier 14 | from sklearn.svm import SVC 15 | from sklearn.naive_bayes import GaussianNB 16 | from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier 17 | from sklearn.neighbors import KNeighborsClassifier 18 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 19 | from sklearn.model_selection import GridSearchCV 20 | sys.path.append('auxiliary/') # noqa: E402 21 | from data_processing import load_features, shape_data 22 | 23 | 24 | # %% Choose settings and classifier 25 | test_season = '2018-2019' # hold-out season for validation 26 | level = 'match' # match or team level features to use 27 | shuffle = True # whether to shuffle or not the data in k-fold cross validation 28 | norm = True # whether to normalise or not the features 29 | min_round = 5 # minimum number of first rounds to skip in every season 30 | nsplits = 5 # number of folds in k-fold cross validation 31 | method = 'ada2' # method for grid search hyper-parameter training, see list 32 | # methods: 'log-reg', 'svm-linear', 'svm-rbf', 'decision-tree', 'random-forest', 33 | # 'naive-bayes', 'gradient-boosting', 'ada', 'ada2', 'knn', 34 | # 'discriminant-analysis' 35 | random_state = 10 36 | 37 | print('level: %s - norm: %r - shuffle: %r - method: %s' % 38 | (level, norm, shuffle, method)) 39 | 40 | # %% load feature data 41 | df = load_features(level) 42 | 43 | # choose features 44 | if level == 'match': 45 | # feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y', 46 | # 'Defence_x', 'Defence_y', 'form_x', 'form_y', 'Diff_x', 'Diff_y', 47 | # 'Home F4', 'Away F4'] 48 | # feats = ['Position_x', 'Offence_x', 'Offence_y', 'Defence_y', 49 | # 'Diff_y', 'Home F4', 'Away F4'] 50 | feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y', 51 | 'Defence_y', 'Diff_y', 'Away F4'] 52 | elif level == 'team': 53 | feats = ['Home', 'Away', 'Position', 'Offence', 'Defence', 54 | 'form', 'F4', 'Diff'] 55 | 56 | # seasons for calibration 57 | df = df[df['Season'] != test_season] 58 | 59 | # %% Re-shape data 60 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm, 61 | min_round=min_round) 62 | print('Number of features:', X_train.shape[1], feats) 63 | print('Number of obs:', X_train.shape[0]) 64 | 65 | if level == 'team': 66 | kfold = GroupKFold(n_splits=nsplits) 67 | folditer = kfold.split(X_train, y_train, groups) 68 | else: 69 | kfold = StratifiedKFold(n_splits=nsplits, shuffle=shuffle, 70 | random_state=random_state) 71 | folditer = kfold.split(X_train, y_train) 72 | 73 | # %% Set parameters 74 | if method == 'log-reg': 75 | params = {'C': np.sort(np.concatenate((np.logspace(-5, 8, 14), 76 | 5 * np.logspace(-5, 8, 14)), axis=0))} 77 | model = LogisticRegression(solver='liblinear', class_weight='balanced') 78 | elif method == 'svm-linear': 79 | params = {'C': np.sort(np.concatenate( 80 | (np.logspace(-5, 8, 14), 5 * np.logspace(-5, 8, 14)), axis=0))} 81 | model = SVC(kernel='linear', class_weight='balanced', 82 | random_state=random_state, max_iter=1000) 83 | elif method == 'svm-rbf': 84 | params = {'C': np.sort(np.concatenate((np.logspace(-5, 6, 12), 85 | 5 * np.logspace(-5, 6, 12)), axis=0)), 86 | 'gamma': np.sort(np.concatenate((np.logspace(-5, 6, 12), 87 | 5 *
np.logspace(-5, 6, 12)), axis=0))} 88 | model = SVC(kernel='rbf', class_weight='balanced', 89 | random_state=random_state, max_iter=1000) 90 | elif method == 'decision-tree': 91 | params = {} 92 | model = DecisionTreeClassifier(class_weight='balanced', 93 | random_state=random_state) 94 | elif method == 'random-forest': 95 | params = {'n_estimators': np.arange(10, 100, 5)} 96 | model = RandomForestClassifier(class_weight='balanced', 97 | random_state=random_state) 98 | elif method == 'naive-bayes': 99 | params = {} 100 | model = GaussianNB() 101 | elif method == 'gradient-boosting': 102 | params = {'n_estimators': np.arange(10, 200, 10)} 103 | model = GradientBoostingClassifier(random_state=random_state) 104 | elif method == 'ada': 105 | params = {'n_estimators': np.arange(5, 200, 1)} 106 | model = AdaBoostClassifier(random_state=random_state, learning_rate=1.) 107 | elif method == 'ada2': 108 | params = {'n_estimators': np.arange(5, 200, 1), 109 | 'learning_rate': np.concatenate(([0.01, 0.05], 110 | np.arange(0.1, 2.1, 0.1)))} 111 | model = AdaBoostClassifier(random_state=random_state) 112 | elif method == 'ada3': 113 | params = {'n_estimators': np.arange(5, 200, 2), 114 | 'learning_rate': np.arange(0.2, 2.1, 0.2), 115 | 'base_estimator': [DecisionTreeClassifier(max_depth=1), 116 | DecisionTreeClassifier(max_depth=5), 117 | DecisionTreeClassifier(max_depth=10), 118 | DecisionTreeClassifier(max_depth=15), 119 | DecisionTreeClassifier(max_depth=20), 120 | DecisionTreeClassifier(max_depth=25), 121 | DecisionTreeClassifier(max_depth=30)]} 122 | model = AdaBoostClassifier(random_state=random_state) 123 | elif method == 'knn': 124 | params = {'n_neighbors': np.arange(3, 50, 2)} 125 | model = KNeighborsClassifier() 126 | elif method == 'discriminant-analysis': 127 | params = {} 128 | model = QuadraticDiscriminantAnalysis() 129 | else: 130 | sys.exit('Method not recognised') 131 | 132 | # %% Tune parameters 133 | 134 | clf = GridSearchCV(model, params, cv=folditer, verbose=1, iid=False, 135 | scoring=['accuracy', 'balanced_accuracy', 'roc_auc'], 136 | refit='accuracy', n_jobs=-1) 137 | clf.fit(X_train, y_train) 138 | 139 | if hasattr(clf.best_estimator_, 'feature_importances_'): 140 | imp = clf.best_estimator_.feature_importances_ 141 | ii = np.argsort(imp)[::-1] 142 | print('Feature Importance') 143 | print([(feats[u], imp[u]) for u in ii]) 144 | 145 | # %% Plots 146 | accuracy = clf.cv_results_['mean_test_accuracy'] 147 | w_accuracy = clf.cv_results_['mean_test_balanced_accuracy'] 148 | roc_auc = clf.cv_results_['mean_test_roc_auc'] 149 | if len(params.keys()) == 0: 150 | print('Accuracy: ', accuracy[0]) 151 | print('Weighted Accuracy: ', w_accuracy[0]) 152 | print('ROC-AUC: ', roc_auc[0]) 153 | elif len(params.keys()) == 1: 154 | tmp = list(clf.param_grid) 155 | params = clf.param_grid[tmp[0]] 156 | print('Accuracy: %.4f at %.4g' % 157 | (np.max(accuracy), params[np.argmax(accuracy)])) 158 | print('Weighted Accuracy: %.4f at %.4g' % 159 | (np.max(w_accuracy), params[np.argmax(w_accuracy)])) 160 | print('ROC-AUC: %.4f at %.4g' % 161 | (np.max(roc_auc), params[np.argmax(roc_auc)])) 162 | plt.figure() 163 | plt.plot(params, accuracy, label='accuracy') 164 | plt.plot(params, w_accuracy, label='w_accuracy') 165 | plt.plot(params, roc_auc, label='ROC-AUC') 166 | if method in ['log-reg', 'svm-linear']: 167 | plt.xscale('log') 168 | plt.xlabel('parameter') 169 | plt.ylabel('Score') 170 | plt.legend() 171 | plt.title(method) 172 | plt.show() 173 | elif len(params.keys()) == 2: 174 | 175 | # according 
to scikit-learn's ParameterGrid, which sorts the parameter names, 176 | # GridSearchCV() evaluates combinations in alphabetical order of the parameters. 177 | tmp = sorted(list(clf.param_grid.keys())) 178 | shape = (clf.param_grid[tmp[0]].shape[0], 179 | clf.param_grid[tmp[1]].shape[0]) 180 | accuracy = accuracy.reshape(shape) 181 | w_accuracy = w_accuracy.reshape(shape) 182 | np.savez('output/%s_feat_comb_index_2543' % method, accuracy=accuracy, 183 | w_accuracy=w_accuracy, 184 | params1=clf.param_grid[tmp[0]], params2=clf.param_grid[tmp[1]]) 185 | 186 | print('Accuracy: %.4f at %s=%.4g and %s=%.4g' % 187 | (clf.best_score_, tmp[0], clf.best_params_[tmp[0]], 188 | tmp[1], clf.best_params_[tmp[1]])) 189 | 190 | inds = np.unravel_index(np.argmax(accuracy), shape) 191 | print('Accuracy: %.4f at %s=%.4g and %s=%.4g' % 192 | (np.max(accuracy), 193 | tmp[0], clf.param_grid[tmp[0]][inds[0]], 194 | tmp[1], clf.param_grid[tmp[1]][inds[1]])) 195 | 196 | inds = np.unravel_index(np.argmax(w_accuracy), shape) 197 | print('Weighted Accuracy: %.4f at %s=%.4g and %s=%.4g' % 198 | (np.max(w_accuracy), 199 | tmp[0], clf.param_grid[tmp[0]][inds[0]], 200 | tmp[1], clf.param_grid[tmp[1]][inds[1]])) 201 | 202 | print('ROC-AUC: %.4f' % np.max(roc_auc)) 203 | 204 | plt.figure() 205 | plt.imshow(accuracy) 206 | plt.colorbar() 207 | plt.show() 208 | 209 | plt.figure() 210 | plt.imshow(w_accuracy) 211 | plt.colorbar() 212 | plt.show() 213 | elif len(params.keys()) == 3: 214 | print('Accuracy: %.4f at %s' % (clf.best_score_, clf.best_estimator_)) 215 | np.savez('output/%s' % method, 216 | accuracy=clf.cv_results_['mean_test_accuracy'], 217 | w_accuracy=clf.cv_results_['mean_test_balanced_accuracy'], 218 | params=clf.cv_results_['params']) 219 | -------------------------------------------------------------------------------- /feature-extraction/make_features.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | import pandas as pd 4 | from make_standings import make_standings 5 | 6 | 7 | def find_form(df, game_round, team_id): 8 | ''' 9 | Finds the form of a team, i.e. the fraction of games won over the last 5 10 | games.
11 | ''' 12 | form = np.nan 13 | team_df = df[((df['Home Team ID'] == team_id) | 14 | (df['Away Team ID'] == team_id)) & 15 | ((df['Game Round'] < game_round) & 16 | (df['Game Round'] >= game_round - 5))] 17 | n_games = team_df.shape[0] 18 | 19 | if n_games == 0: 20 | return np.nan 21 | 22 | home_games = team_df['Home Team ID'] == team_id 23 | away_games = team_df['Away Team ID'] == team_id 24 | wins = np.sum(team_df['Home Score'][home_games] > 25 | team_df['Away Score'][home_games]) 26 | wins += np.sum(team_df['Home Score'][away_games] < 27 | team_df['Away Score'][away_games]) 28 | 29 | form = wins / n_games 30 | return form 31 | 32 | 33 | def make_game_features(data, standings, f4teams=[]): 34 | '''game-level features: 35 | standing of home team 36 | standing of away team 37 | avg scoring points of home team 38 | avg scoring points of away team 39 | avg against points of home team 40 | avg against points of away team 41 | wins to losses of home team 42 | wins to losses of away team 43 | form of home team (wins over the last 5 games) 44 | form of away team (wins over the last 5 games) 45 | ''' 46 | logger = logging.getLogger(__name__) 47 | logger.info('make match-level features') 48 | 49 | stands = standings.copy() 50 | stands['Round'] += 1 51 | data['Home F4'] = np.where(data['Home Team'].isin(f4teams), 1, 0) 52 | data['Away F4'] = np.where(data['Away Team'].isin(f4teams), 1, 0) 53 | data['Label'] = np.where(data['Home Score'] > data['Away Score'], 1, 2) 54 | new_df = data.merge(stands, how='left', 55 | left_on=['Round', 'Home Team'], 56 | right_on=['Round', 'Club Name']) 57 | new_df = new_df.merge(stands, how='left', 58 | left_on=['Round', 'Away Team'], 59 | right_on=['Round', 'Club Name']) 60 | 61 | tmp = new_df[['Offence_x', 'Offence_y', 'Defence_x', 'Defence_y']].values 62 | tmp /= np.repeat((new_df['Round'].values - 1)[:, np.newaxis], tmp.shape[1], 63 | axis=1) 64 | 65 | new_df[['Offence_x', 'Offence_y', 'Defence_x', 'Defence_y']] = tmp 66 | new_df['Diff_x'] = new_df['Offence_x'] - new_df['Defence_x'] 67 | new_df['Diff_y'] = new_df['Offence_y'] - new_df['Defence_y'] 68 | 69 | # tmp = new_df[['Wins_x', 'Losses_x']].values 70 | # new_df['Wins_to_Losses_x'] = tmp[:, 0] / tmp[:, 1] 71 | # tmp = new_df[['Wins_y', 'Losses_y']].values 72 | # new_df['Wins_to_Losses_y'] = tmp[:, 0] / tmp[:, 1] 73 | 74 | forms_home = np.zeros(new_df.shape[0]) 75 | forms_away = np.zeros(new_df.shape[0]) 76 | n_form_games = 5 77 | for index, row in new_df.iterrows(): 78 | g_round = row['Round'] 79 | home_team = row['Home Team'] 80 | away_team = row['Away Team'] 81 | form_home = 0. 82 | form_away = 0. 83 | den = 1 84 | if g_round > n_form_games + 1: 85 | # index of home team at previous round 86 | ii1 = ((standings['Club Name'] == home_team) & 87 | (standings['Round'] == g_round - 1)) 88 | # index of home team at `n_form_games` rounds ago. 89 | ii2 = ((standings['Club Name'] == home_team) & 90 | (standings['Round'] == g_round - n_form_games - 1)) 91 | form_home = (standings[ii1]['Wins'].values[0] - 92 | standings[ii2]['Wins'].values[0]) 93 | 94 | # index of away team at previous round 95 | ii1 = ((standings['Club Name'] == away_team) & 96 | (standings['Round'] == g_round - 1)) 97 | # index of away team at `n_form_games` rounds ago. 
98 | ii2 = ((standings['Club Name'] == away_team) & 99 | (standings['Round'] == g_round - n_form_games - 1)) 100 | form_away = (standings[ii1]['Wins'].values[0] - 101 | standings[ii2]['Wins'].values[0]) 102 | den = n_form_games 103 | elif g_round > 1: 104 | # index of home team at previous round 105 | ii1 = ((standings['Club Name'] == home_team) & 106 | (standings['Round'] == g_round - 1)) 107 | form_home = standings[ii1]['Wins'].values[0] 108 | # index of away team at previous round 109 | ii1 = ((standings['Club Name'] == away_team) & 110 | (standings['Round'] == g_round - 1)) 111 | form_away = standings[ii1]['Wins'].values[0] 112 | den = g_round - 1 113 | # print(g_round, form_home, form_away) 114 | forms_home[index] = form_home / den 115 | forms_away[index] = form_away / den 116 | 117 | new_df['form_x'] = forms_home 118 | new_df['form_y'] = forms_away 119 | 120 | new_df = new_df[['Season', 'Round', 'Home Team', 'Away Team', 'Label', 121 | 'Position_x', 'Position_y', 122 | 'Offence_x', 'Offence_y', 123 | 'Defence_x', 'Defence_y', 124 | # 'Wins_to_Losses_x', 'Wins_to_Losses_y', 125 | 'form_x', 'form_y', 126 | 'Diff_x', 'Diff_y', 127 | 'Home F4', 'Away F4']] 128 | 129 | return new_df 130 | 131 | 132 | def make_game_features_v0(df, standings=None): 133 | '''game-level features: 134 | standing of home team 135 | standing of away team 136 | form of home team (wins over the last 5 games) 137 | form of away team (wins over the last 5 games) 138 | avg scoring points of home team 139 | avg scoring points of away team 140 | avg against points of home team 141 | avg against points of away team 142 | ''' 143 | teams = np.unique(df['Home Team ID'].values) 144 | 145 | if standings is None: 146 | standings = {} 147 | for i in range(1, 31): 148 | standings[i] = make_standings(df, i) 149 | 150 | f4 = [3514, 3501, 3540, 6663] 151 | top8 = [3508, 3515, 3553] 152 | 153 | n_features = 14 + 32 154 | features = np.zeros((df.shape[0], n_features)) 155 | for row in range(df.shape[0]): 156 | 157 | game_round = df['Game Round'].iloc[row] 158 | home_team = df['Home Team ID'].iloc[row] 159 | away_team = df['Away Team ID'].iloc[row] 160 | 161 | if game_round == 1: 162 | features[row, :] = -1 * np.ones(n_features) 163 | continue 164 | 165 | standing = standings[game_round - 1] 166 | 167 | standing_home_team = standing[standing['Team ID'] == 168 | home_team].index[0] + 1 169 | standing_away_team = standing[standing['Team ID'] == 170 | away_team].index[0] + 1 171 | 172 | form_home_team = find_form(df, game_round, home_team) 173 | form_away_team = find_form(df, game_round, away_team) 174 | 175 | avg_attack_home_team = standing[standing['Team ID'] == 176 | home_team]['Score+'] / game_round 177 | avg_attack_away_team = standing[standing['Team ID'] == 178 | away_team]['Score+'] / game_round 179 | 180 | avg_defence_home_team = standing[standing['Team ID'] == 181 | home_team]['Score-'] / game_round 182 | avg_defence_away_team = standing[standing['Team ID'] == 183 | away_team]['Score-'] / game_round 184 | home_team_inf4 = 1 if home_team in f4 else 0 185 | home_team_intop8 = 1 if home_team in top8 else 0 186 | home_team_inrest = 0 if (home_team_inf4 or home_team_intop8) else 1 187 | away_team_inf4 = 1 if away_team in f4 else 0 188 | away_team_intop8 = 1 if away_team in top8 else 0 189 | away_team_inrest = 0 if (away_team_inf4 or away_team_intop8) else 1 190 | 191 | features[row, 0] = standing_home_team 192 | features[row, 1] = standing_away_team 193 | features[row, 2] = form_home_team 194 | features[row, 3] = form_away_team 195 
| features[row, 4] = avg_attack_home_team 196 | features[row, 5] = avg_attack_away_team 197 | features[row, 6] = avg_defence_home_team 198 | features[row, 7] = avg_defence_away_team 199 | features[row, 8] = home_team_inf4 200 | features[row, 9] = home_team_intop8 201 | features[row, 10] = home_team_inrest 202 | features[row, 11] = away_team_inf4 203 | features[row, 12] = away_team_intop8 204 | features[row, 13] = away_team_inrest 205 | features[row, 14:30] = (teams == home_team).astype(int) 206 | features[row, 30:] = (teams == away_team).astype(int) 207 | 208 | headers = ['standing-home-team', 'standing-away-team', 209 | 'form-home-team', 'form-away-team', 210 | 'avg-attack-home-team', 'avg-attack-away-team', 211 | 'avg-defence-home-team', 'avg-defence-away-team', 212 | 'home-team-f4', 'home-team-top8', 'home-team-rest', 213 | 'away-team-f4', 'away-team-top8', 'away-team-rest'] 214 | headers.extend([str(t) + '-home' for t in teams]) 215 | headers.extend([str(t) + '-away' for t in teams]) 216 | 217 | df = pd.DataFrame(data=features, columns=headers) 218 | # df = df.astype(dtype={'standing-home-team': int, 219 | # 'standing-away-team': int, 220 | # 'form-home-team': float, 'form-away-team': float, 221 | # 'avg-attack-home-team': float, 222 | # 'avg-attack-away-team': float, 223 | # 'avg-defence-home-team': float, 224 | # 'avg-defence-away-team': float, 225 | # 'home-team-f4': int, 'home-team-top8': int, 226 | # 'home-team-rest': int, 227 | # 'away-team-f4': int, 'away-team-top8': int, 228 | # 'away-team-rest': int}) 229 | headers_dict = dict(zip(headers, [int] * features.shape[1])) 230 | headers_dict['form-home-team'] = float 231 | headers_dict['form-away-team'] = float 232 | headers_dict['avg-attack-home-team'] = float 233 | headers_dict['avg-attack-away-team'] = float 234 | headers_dict['avg-defence-home-team'] = float 235 | headers_dict['avg-defence-away-team'] = float 236 | df = df.astype(dtype=headers_dict) 237 | return df 238 | 239 | 240 | def make_team_features(data, standings, f4Teams=[]): 241 | logger = logging.getLogger(__name__) 242 | logger.info('make team-level features') 243 | game_feats = make_game_features(data, standings, f4Teams) 244 | 245 | cols = ['Season', 'Round', 'Game ID', 'Team', 'Label', 'Home', 'Away', 246 | 'Position', 'Offence', 'Defence', 'form', 'F4', 'Diff'] 247 | 248 | game_feats['Game ID'] = data['GameID'] 249 | 250 | home = game_feats[['Season', 'Round', 'Game ID', 'Home Team', 251 | 'Position_x', 'Offence_x', 'Defence_x', 'form_x', 252 | 'Home F4']] 253 | home = home.rename(index=str, columns={'Home Team': 'Team', 254 | 'Position_x': 'Position', 255 | 'Offence_x': 'Offence', 256 | 'Defence_x': 'Defence', 257 | 'form_x': 'form', 258 | 'Home F4': 'F4'}) 259 | home['Diff'] = home['Offence'] - home['Defence'] 260 | home['Label'] = np.where(game_feats['Label'].values == 1, 1, 0) 261 | home['Home'] = 1 262 | home['Away'] = 0 263 | # rearrange feature columns 264 | home = home[cols] 265 | 266 | away = game_feats[['Season', 'Round', 'Game ID', 'Away Team', 267 | 'Position_y', 'Offence_y', 'Defence_y', 'form_y', 268 | 'Away F4']] 269 | away = away.rename(index=str, columns={'Away Team': 'Team', 270 | 'Position_y': 'Position', 271 | 'Offence_y': 'Offence', 272 | 'Defence_y': 'Defence', 273 | 'form_y': 'form', 274 | 'Away F4': 'F4'}) 275 | away['Diff'] = away['Offence'] - away['Defence'] 276 | away['Label'] = np.where(game_feats['Label'].values == 2, 1, 0) 277 | away['Home'] = 0 278 | away['Away'] = 1 279 | # rearrange feature columns 280 | away = away[cols] 281 | 
282 | team_feats = pd.concat([home, away]) 283 | team_feats.sort_values(by=['Round', 'Team'], inplace=True) 284 | 285 | return team_feats 286 | -------------------------------------------------------------------------------- /model-validation/validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Training and validation of the final model(s) per round and comparison to the wisdom of the crowd." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "cd .." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import sys\n", 26 | "import os\n", 27 | "import numpy as np\n", 28 | "import pandas as pd\n", 29 | "from tqdm import tqdm\n", 30 | "from scipy.stats import linregress\n", 31 | "\n", 32 | "from matplotlib import pyplot as plt\n", 33 | "import plotly.graph_objs as go\n", 34 | "\n", 35 | "from sklearn.ensemble import AdaBoostClassifier\n", 36 | "from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score\n", 37 | "\n", 38 | "sys.path.append('auxiliary/')\n", 39 | "from data_processing import load_features, shape_data, shape_data_scaler" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "### Choose settings for the final model validation" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "test_season = '2018-2019' # hold-out season for validation\n", 56 | "level = 'match' # match or team level features to use\n", 57 | "min_round_train = 5 # minimum number of first rounds to skip in every season (train set)\n", 58 | "min_round_test = 5 # minimum number of first rounds to skip in every season (test set)\n", 59 | "norm = True # whether to normalise or not the features\n", 60 | "random_state = 10 # random state for the classifier" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### Choose model hyper-parameters and feature sets for the models to validate\n", 68 | "Adjust hyper-parameters and feature sets to reflect the optimal options from analysis in previous steps." 
69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "params = [\n", 78 | " {'features': ['Position_x', 'Offence_x', 'Offence_y', 'Defence_y',\n", 79 | " 'Diff_y', 'Home F4', 'Away F4'],\n", 80 | " 'n_estimators': 115, \n", 81 | " 'learning_rate': 0.7},\n", 82 | " {'features': ['Position_x', 'Position_y', 'Offence_x', 'Offence_y',\n", 83 | " 'Defence_y', 'Diff_y', 'Away F4'],\n", 84 | " 'n_estimators': 141, \n", 85 | " 'learning_rate': 0.7},\n", 86 | " {'features': ['Position_x', 'Position_y', 'Offence_x', 'Offence_y',\n", 87 | " 'Defence_x', 'Defence_y', 'form_x', 'form_y',\n", 88 | " 'Diff_x', 'Diff_y', 'Home F4', 'Away F4'],\n", 89 | " 'n_estimators': 121, \n", 90 | " 'learning_rate': 1.0}\n", 91 | "]" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "### Load Features" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "df = load_features(level)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "### Train and Predict progressively" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# Every round has its own model\n", 124 | "rounds = np.arange(2, 31, dtype=int)\n", 125 | "print('Rounds for validation:', rounds)\n", 126 | "accuracy = np.zeros((rounds.shape[0], len(params)))\n", 127 | "waccuracy = np.zeros((rounds.shape[0], len(params)))\n", 128 | "models_results = pd.DataFrame({'game_round': rounds.repeat(8)})\n", 129 | "for j, param in enumerate(tqdm(params)):\n", 130 | " features = param['features']\n", 131 | " n_estimators = param['n_estimators']\n", 132 | " learning_rate = param['learning_rate']\n", 133 | " model = AdaBoostClassifier(n_estimators=n_estimators, random_state=random_state,\n", 134 | " learning_rate=learning_rate)\n", 135 | "\n", 136 | " y_pred_all = np.array([])\n", 137 | " y_test_all = np.array([])\n", 138 | " for i, game_round in enumerate(rounds):\n", 139 | " train_inds = (df['Season'] != test_season) | ((df['Season'] == test_season) & (df['Round'] < game_round))\n", 140 | " test_inds = ~ train_inds\n", 141 | " X_train, y_train, df_train, _, scaler = shape_data_scaler(df[train_inds], features,\n", 142 | " norm=norm, min_round=1)\n", 143 | " model.fit(X_train, y_train)\n", 144 | "\n", 145 | " X_test, y_test, df_test, _, _ = shape_data_scaler(df[test_inds], features,\n", 146 | " norm=scaler, min_round=1)\n", 147 | "\n", 148 | " y_pred = model.predict(X_test)\n", 149 | " \n", 150 | " accur = accuracy_score(y_test, y_pred)\n", 151 | " w_accur = balanced_accuracy_score(y_test, y_pred)\n", 152 | " \n", 153 | " # store the predictions, actuals of the current round\n", 154 | " y_pred_all = np.concatenate((y_pred_all, y_pred[:8]))\n", 155 | " y_test_all = np.concatenate((y_test_all, y_test[:8]))\n", 156 | "\n", 157 | " accuracy[i, j] = accur\n", 158 | " waccuracy[i, j] = w_accur\n", 159 | " \n", 160 | " if 'Actual' not in models_results.columns:\n", 161 | " models_results['Actual'] = y_test_all.astype(int)\n", 162 | " models_results['Pred_%d' % j] = y_pred_all.astype(int)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "models_results['Pred_comb'] = np.where(models_results[['Pred_0', 'Pred_1',
174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "models_results['Pred_Majority'] = np.zeros(models_results.shape[0], dtype=int)  # majority-class baseline: always predict label 0" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "### Print Scores" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "model_list = [u for u in models_results.columns if u.startswith('Pred')]" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "print('Accuracy scores')\n", 206 | "for col in model_list:\n", 207 | " print('%s:' % col, \n", 208 | " accuracy_score(models_results['Actual'], \n", 209 | " models_results[col]))" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "print('Weighted accuracy scores')\n", 219 | "for col in model_list:\n", 220 | " print('%s:' % col, \n", 221 | " balanced_accuracy_score(models_results['Actual'], \n", 222 | " models_results[col]))" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "print('ROC-AUC scores')\n", 232 | "for col in model_list:\n", 233 | " print('%s:' % col, roc_auc_score(models_results['Actual'], models_results[col]))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "### Plot Accuracy per round" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "uniq_rounds = np.unique(models_results['game_round'].values)\n", 250 | "n_rounds = uniq_rounds.shape[0]\n", 251 | "round_accuracy = np.zeros(n_rounds)\n", 252 | "n_correct = np.zeros(n_rounds)\n", 253 | "for i, u in enumerate(uniq_rounds):\n", 254 | " ii = models_results['game_round'] == u\n", 255 | " n_correct[i] = (models_results.loc[ii, 'Actual'].values == models_results.loc[ii, 'Pred_1'].values).sum()\n", 256 | " round_accuracy[i] = accuracy_score(models_results.loc[ii, 'Actual'].values, models_results.loc[ii, 'Pred_1'].values)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "data = go.Bar(x=uniq_rounds, y=n_correct)\n", 266 | "layout = go.Layout(yaxis={'title': 'Number of Correctly Predicted Games'},\n", 267 | " xaxis={'title': 'Game Round'})\n", 268 | "fig = go.Figure(data, layout)\n", 269 | "fig.show()" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "slope, interc, _, _, _ = linregress(uniq_rounds, round_accuracy)\n", 279 | "y = slope * uniq_rounds + interc\n", 280 | "data = [\n", 281 | " go.Scatter(x=uniq_rounds, y=round_accuracy, mode='markers'),\n", 282 | " go.Scatter(x=uniq_rounds, y=y)\n", 283 | "]\n", 284 | "layout = go.Layout(yaxis={'title': 'Accuracy'}, xaxis={'title': 'Game Round'}, showlegend=False)\n", 285 | "fig = go.Figure(data, layout)\n", 286 | "fig.show()" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "# The Wisdom of the Crowd\n", 294 | "The data for this task is available upon request." 295 | ] 296 | },
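{ "cell_type": "markdown", "metadata": {}, "source": [ "Each `predictions_day_<round>.csv` file is assumed to hold one row per participant, with columns `game_1` to `game_8` carrying that participant's pick for each game of the round; the column-wise mode taken below is therefore the crowd's majority pick per game." ] },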
297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "predict_files_pattern = os.path.expanduser('~/Documents/mia_syn_mia_app/output/2018-2019/predictions_day_%d.csv')" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "woc_results = np.array([])\n", 313 | "for i in rounds:\n", 314 | " try:\n", 315 | " woc_df = pd.read_csv(predict_files_pattern % i)\n", 316 | " xx = woc_df[['game_%d' % u for u in range(1, 9)]].mode().values[0, :].flatten()\n", 317 | " except FileNotFoundError:\n", 318 | " print('File not found: round', i)\n", 319 | " xx = np.full(8, np.nan)\n", 320 | " woc_results = np.concatenate((woc_results, xx))" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# WoC predictions: shift the crowd labels by one to match the models' 0/1 encoding\n", 330 | "models_results['Pred_WoC'] = woc_results - 1" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "if 'Pred_WoC' not in model_list:\n", 340 | " model_list.append('Pred_WoC')" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "### Comparison of results without the missing round(s)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "# exclude the missing round(s) (if any)\n", 357 | "ii = pd.notna(models_results['Pred_WoC'])" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "print('Accuracy Scores')\n", 367 | "for col in model_list:\n", 368 | " print('%s: \\t' % col, \n", 369 | " accuracy_score(models_results.loc[ii, 'Actual'].values,\n", 370 | " models_results.loc[ii, col].values)\n", 371 | " )" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "print('Weighted-Accuracy Scores')\n", 381 | "for col in model_list:\n", 382 | " print('%s: \\t' % col, \n", 383 | " balanced_accuracy_score(models_results.loc[ii, 'Actual'].values, \n", 384 | " models_results.loc[ii, col].values)\n", 385 | " )" 386 | ] 387 | } 388 | ], 389 | "metadata": { 390 | "anaconda-cloud": {}, 391 | "kernelspec": { 392 | "display_name": "Python 3", 393 | "language": "python", 394 | "name": "python3" 395 | }, 396 | "language_info": { 397 | "codemirror_mode": { 398 | "name": "ipython", 399 | "version": 3 400 | }, 401 | "file_extension": ".py", 402 | "mimetype": "text/x-python", 403 | "name": "python", 404 | "nbconvert_exporter": "python", 405 | "pygments_lexer": "ipython3", 406 | "version": "3.7.4" 407 | } 408 | }, 409 | "nbformat": 4, 410 | "nbformat_minor": 4 411 | } 412 | --------------------------------------------------------------------------------