├── .gitignore ├── auxiliary ├── __init__.py ├── argparser_types.py ├── io_json.py ├── fix_team_names.py ├── deco_path_valid.py ├── kfold_crosseval.py └── data_processing.py ├── .flake8 ├── requirements.txt ├── settings ├── feature_extraction.json └── data_collection.json ├── feature-extraction ├── extract_features.py ├── make_standings.py └── make_features.py ├── data-collection ├── scrap_season_standings.py ├── scrap_season_results.py └── scrap_game_stats.py ├── feature-selection ├── feature_selection_pca.py ├── assessing-wrapper-methods.ipynb ├── feature_selection_wrapper.py ├── feature_selection_wrapper_sfs.py └── feature_selection_filter.py ├── descriptive-analysis ├── utils.py ├── descriptive_analysis.py └── descriptive_analysis.ipynb ├── model-selection ├── cross_validation_two_param_models.py ├── cross_validation_one_param_models.py └── gridsearch_cross_validation.py ├── README.md └── model-validation ├── benchmarks.py └── validation.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /auxiliary/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore=W504 3 | max-line-length = 80 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | scikit-learn 4 | matplotlib 5 | requests 6 | beautifulsoup4 7 | tqdm 8 | plotly 9 | -------------------------------------------------------------------------------- /settings/feature_extraction.json: -------------------------------------------------------------------------------- 1 | { 2 | "feature_dir": "features", 3 | "match_level_feature_file_prefix": "match_level_features", 4 | "team_level_feature_file_prefix": "team_level_features" 5 | } 6 | -------------------------------------------------------------------------------- /auxiliary/argparser_types.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def is_valid_parent_path(parser, x): 5 | ''' 6 | Check if input string has a valid parent path 7 | ''' 8 | parent_path = '.' if os.path.split(x)[0] == '' else os.path.split(x)[0] 9 | if not os.path.isdir(parent_path): 10 | parser.error('Parent path %s of output file not valid.' 
% parent_path)
11 |     else:
12 |         return str(x)
13 | 
--------------------------------------------------------------------------------
/auxiliary/io_json.py:
--------------------------------------------------------------------------------
1 | import json
2 | from deco_path_valid import valid_file
3 | from deco_path_valid import valid_folder
4 | 
5 | 
6 | @valid_folder
7 | def write_json(file, data):
8 |     '''
9 |     Writes data into a json file
10 |     '''
11 |     with open(file, 'w', encoding='utf8') as outfile:
12 |         json.dump(data, outfile, ensure_ascii=False)
13 |     return
14 | 
15 | 
16 | @valid_file
17 | def read_json(file):
18 |     '''
19 |     Reads data from a json file
20 |     '''
21 |     with open(file, 'r', encoding='utf8') as outfile:
22 |         data = json.load(outfile)
23 |     return data
24 | 
--------------------------------------------------------------------------------
/settings/data_collection.json:
--------------------------------------------------------------------------------
1 | {
2 |     "n_rounds": 34,
3 |     "output_dir": "data/",
4 |     "f4teams_file": "data/f4teams.json",
5 |     "game_stats": {
6 |         "url_link": "http://www.euroleague.net/main/results?gamenumber=%d&phasetypecode=RS&seasoncode=E%d",
7 |         "output_file_prefix": "euroleague_game_stats"
8 |     },
9 |     "season_results": {
10 |         "url_link": "http://www.euroleague.net/main/results?gamenumber=%d&phasetypecode=RS&seasoncode=E%d",
11 |         "output_file_prefix": "euroleague_results"
12 |     },
13 |     "season_standings": {
14 |         "url_link": "http://www.euroleague.net/main/standings?gamenumber=%d&phasetypecode=RS++++++++&seasoncode=E%d",
15 |         "output_file_prefix": "euroleague_standings"
16 |     }
17 | }
18 | 
--------------------------------------------------------------------------------
/auxiliary/fix_team_names.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def fix_team_names(df1, df2):
5 |     '''
6 |     Fix inconsistencies in team names across the seasons
7 |     '''
8 |     name_dict = {
9 |         'EA7 Emporio Armani Milan': 'AX Armani Exchange Olimpia Milan',
10 |         'Fenerbahce Istanbul': 'Fenerbahce Dogus Istanbul',
11 |         'Baskonia Vitoria Gasteiz': 'KIROLBET Baskonia Vitoria Gasteiz'
12 |     }
13 | 
14 |     for team in name_dict.keys():
15 |         print(team in df1['Home Team'].values)
16 |         df1.replace(team, name_dict[team], inplace=True)
17 | 
18 |     teams1 = np.unique(df1['Home Team'])
19 |     teams2 = np.unique(df2['Home Team'])
20 |     if not np.in1d(teams1, teams2).all():
21 |         ii = ~np.in1d(teams1, teams2)
22 |         print(teams1[ii])
23 |     if not np.in1d(teams2, teams1).all():
24 |         ii = ~np.in1d(teams2, teams1)
25 |         print(teams2[ii])
26 | 
27 |     return df1, df2
28 | 
--------------------------------------------------------------------------------
/auxiliary/deco_path_valid.py:
--------------------------------------------------------------------------------
1 | import os.path
2 | import sys
3 | 
4 | 
5 | def valid_file(func):
6 |     """
7 |     Checks the validity of the input file of a read-data function. If the
8 |     file does not exist, it exits.
9 |     """
10 | 
11 |     def wrapper(filename, *args, **kwargs):
12 |         if os.path.isfile(filename):
13 |             a = func(filename, *args, **kwargs)
14 |             return a
15 |         else:
16 |             sys.exit('File %s not found' % filename)
17 |         return
18 |     return wrapper
19 | 
20 | 
21 | def valid_folder(func):
22 |     """
23 |     Checks the validity of the output directory of a write-data function. If
24 |     the directory does not exist, it exits.
25 |     """
26 | 
27 |     def wrapper(filepath, *args, **kwargs):
28 |         if os.path.isdir(os.path.dirname(filepath)):
29 |             a = func(filepath, *args, **kwargs)
30 |             return a
31 |         else:
32 |             sys.exit('Directory %s not found' % os.path.dirname(filepath))
33 |         return
34 |     return wrapper
35 | 
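A quick illustration of how these decorators are meant to be used; the `count_lines` helper below is hypothetical (see `io_json.py` for the real call sites), and the snippet assumes it is run from the repository root with `auxiliary/` on the path:

# hypothetical usage sketch of the valid_file decorator
from deco_path_valid import valid_file

@valid_file
def count_lines(filename):
    # the decorator guarantees `filename` exists before this body runs;
    # otherwise the program exits with 'File ... not found'
    with open(filename) as f:
        return sum(1 for _ in f)

print(count_lines('requirements.txt'))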
25 | """ 26 | 27 | def wrapper(filepath, *args, **kwargs): 28 | if os.path.isdir(os.path.dirname(filepath)): 29 | a = func(filepath, *args, **kwargs) 30 | return a 31 | else: 32 | sys.exit('Directory %s not found' % filepath) 33 | return 34 | return wrapper 35 | -------------------------------------------------------------------------------- /feature-extraction/extract_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import logging 5 | import pandas as pd 6 | from make_features import make_game_features 7 | from make_features import make_team_features 8 | sys.path.append('auxiliary') # noqa: E402 9 | from io_json import read_json 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | 13 | 14 | def main(season): 15 | ''' 16 | Extract features (game and team) from the fetched data from the 17 | Euroleague's site 18 | ''' 19 | # get data settings 20 | data_settings = read_json('settings/data_collection.json') 21 | out_dir = data_settings['output_dir'] 22 | rslts_file_prefix = data_settings['season_results']['output_file_prefix'] 23 | results_file = os.path.join( 24 | out_dir, '%s_%d_%d.csv' % (rslts_file_prefix, season - 1, season)) 25 | stnds_file_prefix = data_settings['season_standings']['output_file_prefix'] 26 | standings_file = os.path.join( 27 | out_dir, '%s_%d_%d.csv' % (stnds_file_prefix, season - 1, season)) 28 | f4_file = data_settings['f4teams_file'] 29 | 30 | # get feature settings 31 | feat_settings = read_json('settings/feature_extraction.json') 32 | feature_dir = feat_settings['feature_dir'] 33 | match_level_file_ = feat_settings['match_level_feature_file_prefix'] 34 | team_level_file_ = feat_settings['team_level_feature_file_prefix'] 35 | match_level_file = os.path.join( 36 | feature_dir, '%s_%d_%d.csv' % (match_level_file_, season - 1, season)) 37 | team_level_file = os.path.join( 38 | feature_dir, '%s_%d_%d.csv' % (team_level_file_, season - 1, season)) 39 | 40 | data = pd.read_csv(results_file) 41 | standings = pd.read_csv(standings_file) 42 | f4teams = read_json(f4_file) 43 | 44 | # Specify the F4 teams of the *previous* year 45 | f4Teams = f4teams[str(season - 1)] 46 | 47 | # make game features 48 | feats = make_game_features(data, standings, f4Teams) 49 | 50 | # save features to file. 
51 |     logging.info('save match-level features')
52 |     feats.to_csv(match_level_file, index=False)
53 | 
54 |     # make team features
55 |     team_feats = make_team_features(data, standings, f4Teams)
56 |     # save features to file
57 |     logging.info('save team-level features')
58 |     team_feats.to_csv(team_level_file, index=False)
59 |     return
60 | 
61 | 
62 | if __name__ == "__main__":
63 |     parser = argparse.ArgumentParser()
64 |     parser.add_argument('-s', '--season', required=True, type=int,
65 |                         help="the ending year of the season")
66 | 
67 |     args = parser.parse_args()
68 | 
69 |     main(args.season)
70 | 
--------------------------------------------------------------------------------
/auxiliary/kfold_crosseval.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.model_selection import StratifiedKFold, GroupKFold
3 | from sklearn.naive_bayes import GaussianNB
4 | from sklearn.metrics import accuracy_score, balanced_accuracy_score
5 | 
6 | 
7 | def kfold_crosseval(X_train, y_train, df_train, nsplits, groups=None,
8 |                     model=GaussianNB(), level='match', shuffle=True):
9 |     '''
10 |     Perform k-fold cross validation using the input `model`
11 |     '''
12 |     if level == 'team':
13 |         kfold = GroupKFold(n_splits=nsplits)
14 |         folditer = kfold.split(X_train, y_train, groups)
15 |     else:
16 |         kfold = StratifiedKFold(n_splits=nsplits, shuffle=shuffle,
17 |                                 random_state=10)
18 |         folditer = kfold.split(X_train, y_train)
19 | 
20 |     accuracy = np.zeros(kfold.get_n_splits())
21 |     w_accuracy = np.zeros(kfold.get_n_splits())
22 |     i = -1
23 |     for train_index, test_index in folditer:
24 |         # loop over folds
25 |         i += 1
26 |         X_train_folds, X_test_fold = (X_train[train_index, :],
27 |                                       X_train[test_index, :])
28 |         y_train_folds, y_test_fold = y_train[train_index], y_train[test_index]
29 |         df_test_fold = df_train.iloc[test_index, :].copy()
30 | 
31 |         # fit model
32 |         model.fit(X_train_folds, y_train_folds)
33 | 
34 |         if level == 'team':
35 |             # calculate accuracy at the match level
36 |             y_pred_prob = model.predict_proba(X_test_fold)
37 |             df_test_fold['Prob'] = y_pred_prob[:, 1]
38 |             y_test_fold = []
39 |             y_pred = []
40 |             for gid in np.unique(df_test_fold['Game ID']):
41 |                 teams = df_test_fold[df_test_fold['Game ID'] == gid]
42 |                 if teams.shape[0] == 2:
43 |                     game_pred = (1 if teams.iloc[0]['Prob'] >
44 |                                  teams.iloc[1]['Prob'] else 0)
45 |                     game_resu = (1 if teams.iloc[0]['Label'] >
46 |                                  teams.iloc[1]['Label'] else 0)
47 |                     y_test_fold.append(game_resu)
48 |                     y_pred.append(game_pred)
49 |                 else:
50 |                     print('Warning: Game ID %s has missing teams' % gid)
51 |             y_test_fold = np.array(y_test_fold)
52 |             y_pred = np.array(y_pred)
53 |         else:
54 |             # predict model
55 |             y_pred = model.predict(X_test_fold)
56 | 
57 |         accuracy[i] = accuracy_score(y_test_fold, y_pred)
58 |         w_accuracy[i] = balanced_accuracy_score(y_test_fold, y_pred)
59 | 
60 |     return accuracy.mean(), w_accuracy.mean()
61 | 
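A minimal smoke test of `kfold_crosseval`; the synthetic design matrix, labels and dataframe below are hypothetical, and the snippet assumes it is run from the repository root with `auxiliary/` on the path:

import numpy as np
import pandas as pd
from kfold_crosseval import kfold_crosseval

rng = np.random.RandomState(0)
X = rng.rand(100, 4)                       # 100 toy games, 4 features
y = (X[:, 0] > 0.5).astype(int)            # toy labels
df = pd.DataFrame({'Label': y})            # 'match' level needs no groups

acc, wacc = kfold_crosseval(X, y, df, nsplits=5, level='match')
print('accuracy=%.3f weighted accuracy=%.3f' % (acc, wacc))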
--------------------------------------------------------------------------------
/data-collection/scrap_season_standings.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | import logging
5 | import re
6 | from bs4 import BeautifulSoup
7 | import requests
8 | from tqdm import trange
9 | import pandas as pd
10 | sys.path.append('auxiliary/')  # noqa: E402
11 | from io_json import read_json
12 | 
13 | logging.basicConfig(level=logging.INFO)
14 | 
15 | 
16 | def main(season, n_rounds):
17 |     '''
18 |     Scrapes the standings of the Euroleague games from the Euroleague's official
19 |     site for the input season.
20 |     Saves data to file.
21 |     '''
22 | 
23 |     # read settings
24 |     settings = read_json('settings/data_collection.json')
25 |     out_dir = settings['output_dir']
26 |     url_pattern = settings['season_standings']['url_link']
27 |     out_file_prefix = settings['season_standings']['output_file_prefix']
28 |     filename = '%s_%d_%d.csv' % (out_file_prefix, season - 1, season)
29 |     filepath = os.path.join(out_dir, filename)
30 | 
31 |     headers = ['Round', 'Position', 'Club Code', 'Club Name', 'Wins', 'Losses',
32 |                'Offence', 'Defence', 'Points Diff']
33 |     standings = []
34 |     for game_round in trange(1, n_rounds + 1):
35 | 
36 |         url = (url_pattern % (game_round, season - 1))
37 |         try:
38 |             r = requests.get(url)
39 |         except requests.exceptions.ConnectionError:
40 |             sys.exit('Connection Error. Check URL')
41 |         data = r.text
42 |         soup = BeautifulSoup(data, 'html.parser')
43 |         tbl_cls = ('table responsive fixed-cols-1 table-left-cols-1 '
44 |                    'table-expand table-striped table-hover table-noborder '
45 |                    'table-centered table-condensed')
46 |         table = soup.find('table', attrs={'class': tbl_cls})
47 |         body = table.find('tbody')
48 |         var1 = 'clubcode='
49 |         var2 = '&seasoncode=E'
50 |         for row in body.find_all('tr'):
51 |             a = row.find('a').get('href')
52 |             cc = a[a.find(var1) + len(var1): a.find(var2)]
53 |             # sc = a[a.find(var2) + len(var2):]
54 |             pos_team = row.find('a').string.strip()
55 |             pos = int(re.findall(r'\d{1,2}', pos_team)[0])
56 |             team = re.findall(r'[a-zA-Z\s-]+', pos_team)[0].strip()
57 |             stats = row.find_all('td')
58 |             wins = int(stats[1].string.strip())
59 |             losses = int(stats[2].string.strip())
60 |             points_plus = int(stats[3].string.strip())
61 |             points_minus = int(stats[4].string.strip())
62 |             points_diff = int(stats[5].string.strip())
63 |             standings.append([game_round, pos, cc, team, wins, losses,
64 |                               points_plus, points_minus, points_diff])
65 | 
66 |     logging.info('Convert to dataframe')
67 |     df = pd.DataFrame(standings, columns=headers)
68 | 
69 |     logging.info('Save to file')
70 |     df.to_csv(filepath, index=False)
71 |     return
72 | 
73 | 
74 | if __name__ == "__main__":
75 |     parser = argparse.ArgumentParser()
76 |     parser.add_argument('-s', '--season', required=True, type=int,
77 |                         help="the ending year of the season")
78 |     parser.add_argument('-n', '--n-rounds', default=34, type=int,
79 |                         help="The number of regular season rounds "
80 |                              "in the season")
81 |     args = parser.parse_args()
82 | 
83 |     main(args.season, args.n_rounds)
84 | 
--------------------------------------------------------------------------------
/feature-selection/feature_selection_pca.py:
--------------------------------------------------------------------------------
1 | '''
2 | Feature transformation methods using Principal Component Analysis (PCA).
3 | 
4 | For an increasing number of principal components, results are evaluated for
5 | a chosen algorithm using k-fold cross-validation on the training set.
6 | ''' 7 | import sys 8 | import numpy as np 9 | from matplotlib import pyplot as plt 10 | from tqdm import tqdm 11 | # from sklearn.linear_model import LogisticRegression 12 | # from sklearn.ensemble import RandomForestClassifier 13 | # from sklearn.tree import DecisionTreeClassifier 14 | # from sklearn.svm import SVC 15 | # from sklearn.ensemble import GradientBoostingClassifier 16 | from sklearn.ensemble import AdaBoostClassifier 17 | # from sklearn.naive_bayes import GaussianNB 18 | from sklearn.model_selection import StratifiedKFold 19 | from sklearn.model_selection import GridSearchCV 20 | from sklearn.decomposition import PCA 21 | sys.path.append('auxiliary/') # noqa: E402 22 | from data_processing import load_features, shape_data 23 | 24 | 25 | # %% Choose settings and classifier 26 | test_season = '2018-2019' # hold-out season for validation 27 | level = 'match' # match or team level features to use 28 | shuffle = True # whether to shuffle or not the data in k-fold cross validation 29 | norm = True # whether to normalise or not the features 30 | min_round = 5 # minimum number of first rounds to skip in every season 31 | nsplits = 5 # number of folds in k-fold cross validation 32 | random_state = 10 # random state for the classifier 33 | params = { 34 | 'n_estimators': np.arange(5, 200, 5), 35 | # 'learning_rate': np.arange(0.3, 1.5, 0.1)} 36 | } # params for the grid search 37 | model = AdaBoostClassifier(random_state=random_state) 38 | 39 | # %% load feature data 40 | df = load_features(level) 41 | 42 | # choose features 43 | if level == 'match': 44 | feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y', 45 | 'Defence_x', 'Defence_y', 46 | 'form_x', 'form_y', 47 | 'Diff_x', 'Diff_y', 48 | 'Home F4', 'Away F4'] 49 | elif level == 'team': 50 | feats = ['Home', 'Away', 'Position', 'Offence', 'Defence', 51 | 'form', 52 | 'F4', 'Diff'] 53 | n_feats = len(feats) 54 | 55 | # seasons for calibration 56 | df = df[df['Season'] != test_season] 57 | 58 | # %% Re-shape data 59 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm, 60 | min_round=min_round) 61 | 62 | # %% Apply PCA and then k-fold cross validation 63 | XX = X_train.copy() 64 | scores = np.zeros((n_feats, 2)) 65 | for n in tqdm(range(n_feats)): 66 | pca = PCA(n_components=n + 1) 67 | X_train = pca.fit_transform(XX) 68 | kfold = StratifiedKFold(n_splits=nsplits, shuffle=shuffle, random_state=10) 69 | folditer = kfold.split(X_train, y_train) 70 | clf = GridSearchCV(model, params, cv=folditer, iid=False, 71 | scoring=['accuracy', 'balanced_accuracy', 'roc_auc'], 72 | refit='accuracy') 73 | clf.fit(X_train, y_train) 74 | scores[n, 0] = np.max(clf.cv_results_['mean_test_accuracy']) 75 | scores[n, 1] = np.max(clf.cv_results_['mean_test_balanced_accuracy']) 76 | print(clf.best_score_) 77 | 78 | # %% Plots 79 | x = np.arange(1, n_feats + 1, dtype=int) 80 | plt.figure() 81 | plt.plot(x, scores[:, 0], label='Accuracy') 82 | plt.plot(x, scores[:, 1], label='W-Accuracy') 83 | plt.xlabel('Number of components') 84 | plt.ylabel('Score') 85 | plt.xticks(x, x) 86 | plt.grid() 87 | plt.legend() 88 | plt.show() 89 | -------------------------------------------------------------------------------- /descriptive-analysis/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import plotly.graph_objs as go 3 | 4 | 5 | def find_probs(df): 6 | ''' 7 | Probability of Winning when scoring in interval [x, x+4] 8 | ''' 9 | min_value = (df['Score'].min() // 5) * 5 10 | 
max_value = (df['Score'].max() // 5 + 1) * 5
11 |     x = np.arange(min_value, max_value + 1, 5, dtype=int)
12 |     prob = np.zeros(x.shape[0] - 1)
13 |     for i, u in enumerate(x[:-1]):
14 |         ii = (df['Score'] >= x[i]) & (df['Score'] < x[i + 1])
15 |         num = np.sum((df[ii]['Team Result'] == 'W'))
16 |         den = np.sum(ii)
17 |         prob[i] = num / den if den > 0 else 0
18 |     return prob, x[:-1]
19 | 
20 | 
21 | def find_probs_at_least_n_points(df, step=1):
22 |     '''
23 |     Probability of winning when scoring at least N points
24 |     '''
25 |     min_value = step * (df['Score'].min() // step)
26 |     max_value = step * (df['Score'].max() // step)
27 |     x = np.arange(min_value, max_value + 1, step, dtype=int)
28 |     prob = np.zeros(x.shape[0] - 1)
29 |     for i, u in enumerate(x[:-1]):
30 |         num = ((df['Score'] >= u) & (df['Team Result'] == 'W')).sum()
31 |         den = (df['Score'] >= u).sum()
32 |         prob[i] = num / den
33 |     return prob, x[:-1]
34 | 
35 | 
36 | def make_x_interv(x):
37 |     ans = [str(x[i]) + '-' + str(x[i + 1] - 1) for i in range(x.shape[0] - 1)]
38 |     ans.append(str(x[-1]) + '-' + str(x[-1] + 4))
39 |     return ans
40 | 
41 | 
42 | def make_bar_plot(dfs, names, title=''):
43 |     data = []
44 |     for df, name in zip(dfs, names):
45 |         prob, x = find_probs(df)
46 |         data.append(go.Bar(x=make_x_interv(x), y=prob, name=name))
47 | 
48 |     layout = go.Layout(title=title,
49 |                        xaxis={'title': 'Score', 'showgrid': True},
50 |                        yaxis={'title': 'Probability', 'showgrid': True})
51 |     fig = go.Figure(data, layout)
52 |     fig.show()
53 |     return
54 | 
55 | 
56 | def make_scatter_plot(dfs, names, title=''):
57 |     data = []
58 |     for df, name in zip(dfs, names):
59 |         prob, x = find_probs(df)
60 |         data.append(go.Scatter(x=x, y=prob,
61 |                                # fill='tozeroy',
62 |                                line={'shape': 'hv'},
63 |                                name=name))
64 | 
65 |     layout = go.Layout(title=title,
66 |                        xaxis={'title': 'Score',
67 |                               'showgrid': True,
68 |                               'gridcolor': 'rgb(200, 200, 200)',
69 |                               'type': 'category'},
70 |                        yaxis={'title': 'Probability',
71 |                               'showgrid': True,
72 |                               'gridcolor': 'rgb(200, 200, 200)'})
73 |     fig = go.Figure(data, layout)
74 |     fig.show()
75 |     return
76 | 
77 | 
78 | def make_scatter_plot_at_least_n_points(dfs, names, title=''):
79 |     data = []
80 |     for df, name in zip(dfs, names):
81 |         prob, x = find_probs_at_least_n_points(df, 5)
82 |         data.append(go.Scatter(x=x, y=prob,
83 |                                # fill='tozeroy',
84 |                                line={'shape': 'hv'},
85 |                                name=name))
86 | 
87 |     layout = go.Layout(title=title,
88 |                        xaxis={'title': 'Score',
89 |                               'showgrid': True,
90 |                               'gridcolor': 'rgb(200, 200, 200)',
91 |                               'type': 'category'},
92 |                        yaxis={'title': 'Probability',
93 |                               'showgrid': True,
94 |                               'gridcolor': 'rgb(200, 200, 200)'})
95 |     fig = go.Figure(data, layout)
96 |     fig.show()
97 |     return
98 | 
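A toy check of `find_probs`; the six-row dataframe below is hypothetical, and scores are binned into 5-point intervals:

import pandas as pd
from utils import find_probs, make_x_interv

df_toy = pd.DataFrame({'Score': [68, 72, 75, 81, 84, 90],
                       'Team Result': ['L', 'L', 'W', 'W', 'W', 'W']})
prob, x = find_probs(df_toy)
for interval, p in zip(make_x_interv(x), prob):
    print(interval, round(p, 2))   # e.g. '65-69 0.0', '75-79 1.0'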
--------------------------------------------------------------------------------
/data-collection/scrap_season_results.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | import logging
5 | import re
6 | from datetime import datetime
7 | from tqdm import trange
8 | from bs4 import BeautifulSoup
9 | import requests
10 | import pandas as pd
11 | sys.path.append('auxiliary/')  # noqa: E402
12 | from io_json import read_json
13 | 
14 | logging.basicConfig(level=logging.INFO)
15 | 
16 | 
17 | def main(season, n_rounds):
18 |     '''
19 |     Scrapes the results of the Euroleague games from the Euroleague's official
20 |     site for the input season.
21 |     Saves data to file.
22 |     '''
23 | 
24 |     # read settings
25 |     settings = read_json('settings/data_collection.json')
26 |     out_dir = settings['output_dir']
27 |     url_pattern = settings['season_results']['url_link']
28 |     out_file_prefix = settings['season_results']['output_file_prefix']
29 |     filename = '%s_%d_%d.csv' % (out_file_prefix, season - 1, season)
30 |     filepath = os.path.join(out_dir, filename)
31 | 
32 |     headers = ['Season', 'Round', 'GameID', 'Date', 'Home Team', 'Away Team',
33 |                'Home Score', 'Away Score']
34 |     results = []
35 |     regex = re.compile(r'score [a-z\s]*pts[a-z\s]*')
36 |     season_str = '%d-%d' % (season - 1, season)
37 |     for game_round in trange(1, n_rounds + 1):
38 | 
39 |         url = (url_pattern % (game_round, season - 1))
40 |         try:
41 |             r = requests.get(url)
42 |         except requests.exceptions.ConnectionError:
43 |             sys.exit('Connection Error. Check URL')
44 |         data = r.text
45 |         soup = BeautifulSoup(data, 'html.parser')
46 |         for game in soup.find_all('div', attrs={'class': 'game played'}):
47 |             data_code = game.attrs['data-code']
48 |             gameid = '%d_%d_%d_%s' % (season - 1, season,
49 |                                       game_round, data_code)
50 |             home_team = game.find_all('span', attrs={'class': 'name'})[0].string
51 |             away_team = game.find_all('span', attrs={'class': 'name'})[1].string
52 |             scores = game.find_all('span', attrs={'class': regex})
53 | 
54 |             home_score = int(scores[0]['data-score'] if
55 |                              scores[0].has_attr('data-score') else
56 |                              scores[0].string)
57 |             away_score = int(scores[1]['data-score'] if
58 |                              scores[1].has_attr('data-score') else
59 |                              scores[1].string)
60 | 
61 |             date_str = game.find('span', attrs={'class': 'date'}).string
62 |             date = datetime.strptime(date_str, '%B %d %H:%M CET')
63 |             yr = season - 1 if date.month > 8 else season
64 |             date = date.replace(year=yr)
65 |             date_str = datetime.strftime(date, '%Y-%m-%d %H:%M:%S')
66 | 
67 |             results.append([season_str, game_round, gameid, date_str,
68 |                             home_team, away_team,
69 |                             home_score, away_score])
70 | 
71 |     logging.info('Convert to dataframe')
72 |     df = pd.DataFrame(results, columns=headers)
73 | 
74 |     logging.info('Save to file')
75 |     df.to_csv(filepath, index=False)
76 | 
77 |     return
78 | 
79 | 
80 | if __name__ == "__main__":
81 |     parser = argparse.ArgumentParser()
82 |     parser.add_argument('-s', '--season', required=True, type=int,
83 |                         help="the ending year of the season")
84 |     parser.add_argument('-n', '--n-rounds', default=34,
85 |                         type=int,
86 |                         help="The number of regular season rounds "
87 |                              "in the season")
88 |     args = parser.parse_args()
89 | 
90 |     main(args.season, args.n_rounds)
91 | 
--------------------------------------------------------------------------------
/feature-selection/assessing-wrapper-methods.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import numpy as np\n",
10 |     "\n",
11 |     "import plotly.graph_objs as go"
12 |    ]
13 |   },
14 |   {
15 |    "cell_type": "code",
16 |    "execution_count": 2,
17 |    "metadata": {},
18 |    "outputs": [],
19 |    "source": [
20 |     "scores_obj = np.load('../output/wrapper_ada2_n_121_rate_1.npz', allow_pickle=True)\n",
21 |     "scores = scores_obj['scores']\n",
22 |     "featute_sets = scores_obj['features']\n",
23 |     "\n",
24 |     "accuracy = scores[:, 0]\n",
25 |     "w_accuracy = scores[:, 1]"
26 |    ]
27 |   },
28 |   {
29 |    "cell_type": "code",
30 |    "execution_count": 3,
31 |    "metadata": {},
32 |    "outputs": [
33 |     {
34 |      "data": {
35 |       "text/plain": [
36 | 
"['Position_x',\n", 37 | " 'Offence_x',\n", 38 | " 'Offence_y',\n", 39 | " 'Defence_y',\n", 40 | " 'Diff_y',\n", 41 | " 'Home F4',\n", 42 | " 'Away F4']" 43 | ] 44 | }, 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "featute_sets[2815]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "'temp-plot.html'" 63 | ] 64 | }, 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "indices = np.argsort(accuracy)[::-1][:10]\n", 72 | "# x = [', '.join(u) for u in featute_sets[indices]]\n", 73 | "xx = [[featute_sets[-1].index(u) for u in feats] for feats in featute_sets[indices]]\n", 74 | "x = [str(u) for u in xx]\n", 75 | "data = [go.Bar(x=x, y=np.sort(accuracy)[::-1][:10], name='accuracy')]\n", 76 | "\n", 77 | "layout = go.Layout(yaxis={'title': 'Accuracy'})\n", 78 | "fig = go.Figure(data, layout)\n", 79 | "fig.update_yaxes(range=[0.706, 0.718])\n", 80 | "fig.show()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "'temp-plot.html'" 92 | ] 93 | }, 94 | "execution_count": 5, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "indices = np.argsort(w_accuracy)[::-1][:10]\n", 101 | "xx = [[featute_sets[-1].index(u) for u in feats] for feats in featute_sets[indices]]\n", 102 | "x = [str(u) for u in xx]\n", 103 | "\n", 104 | "data = [go.Bar(x=x, y=np.sort(w_accuracy)[::-1][:10])]\n", 105 | "layout = go.Layout(yaxis={'title': 'Weighted Accuracy'})\n", 106 | "fig = go.Figure(data, layout)\n", 107 | "fig.update_yaxes(range=[0.675, 0.685])\n", 108 | "fig.show()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "Python 3", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.7.4" 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 4 140 | } 141 | -------------------------------------------------------------------------------- /auxiliary/data_processing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from glob import glob 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.preprocessing import MinMaxScaler 7 | sys.path.append('auxiliary/') # noqa: E402 8 | from io_json import read_json 9 | 10 | 11 | def normalise(X): 12 | ''' 13 | Normalise the features of the input design matrix `X` across the x=0 axis. 14 | ''' 15 | x_norm = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) 16 | return x_norm 17 | 18 | 19 | def shape_data_scaler(df, feats, norm=True, min_round=5): 20 | ''' 21 | Shape input data in `df` by selecting the `feats`, excluding rounds and 22 | normalising if `norm=True`. 
23 | 
24 |     Returns five variables:
25 |     * X_train
26 |     * y_train
27 |     * df (the new df)
28 |     * groups (for defining groups of matches)
29 |     * scaler (the scaler object from normalisation)
30 |     '''
31 |     # ignore early games in the season, as they do not contain the 'form'
32 |     # feature.
33 |     ii = df['Round'] > min_round
34 | 
35 |     # filter out the games ignored
36 |     df = df[ii]
37 |     df.reset_index(drop=True, inplace=True)
38 | 
39 |     # make the Design table
40 |     X_train = df[feats].values
41 | 
42 |     # normalise the Design table if required
43 |     if isinstance(norm, bool) and norm:
44 |         scaler = MinMaxScaler()
45 |         X_train = scaler.fit_transform(X_train)
46 |         # X_train = normalise(X_train)
47 |     elif isinstance(norm, MinMaxScaler):
48 |         X_train = norm.transform(X_train)
49 |         scaler = norm
50 | 
51 |     # extract the tags
52 |     y_train = df['Label'].values
53 | 
54 |     # if labels are 1 and 2, set them to 0-1
55 |     if 2 in np.unique(y_train):
56 |         y_train = y_train - 1
57 | 
58 |     # define the groups of matches if processing 'team' level classification
59 |     groups = df['Game ID'].values if 'Game ID' in df.keys() else None
60 | 
61 |     return X_train, y_train, df, groups, scaler
62 | 
63 | 
64 | def shape_data(df, feats, norm=True, min_round=5):
65 |     '''
66 |     Shape input data in `df` by selecting the `feats`, excluding rounds and
67 |     normalising if `norm=True`.
68 | 
69 |     Returns four variables:
70 |     * X_train
71 |     * y_train
72 |     * df (the new df)
73 |     * groups (for defining groups of matches)
74 |     '''
75 | 
76 |     # ignore early games in the season, as they do not contain the 'form'
77 |     # feature.
78 |     ii = df['Round'] > min_round
79 | 
80 |     # filter out the games ignored
81 |     df = df[ii]
82 |     df.reset_index(drop=True, inplace=True)
83 | 
84 |     # make the Design table
85 |     X_train = df[feats].values
86 | 
87 |     # normalise the Design table if required
88 |     if norm:
89 |         X_train = normalise(X_train)
90 | 
91 |     # extract the tags
92 |     y_train = df['Label'].values
93 | 
94 |     # if labels are 1 and 2, set them to 0-1
95 |     if 2 in np.unique(y_train):
96 |         y_train = y_train - 1
97 | 
98 |     # define the groups of matches if processing 'team' level classification
99 |     groups = df['Game ID'].values if 'Game ID' in df.keys() else None
100 | 
101 |     return X_train, y_train, df, groups
102 | 
103 | 
104 | def load_features(level):
105 |     '''load features'''
106 | 
107 |     settings = read_json('settings/feature_extraction.json')
108 |     feature_dir = settings['feature_dir']
109 | 
110 |     if level == 'match':
111 |         file_pattern = settings['match_level_feature_file_prefix']
112 |     elif level == 'team':
113 |         file_pattern = settings['team_level_feature_file_prefix']
114 |     else:
115 |         raise ValueError('Invalid level of analysis: %s' % level)
116 | 
117 |     filepath = os.path.join(feature_dir, file_pattern)
118 |     feature_files = glob('%s*.csv' % filepath)
119 |     list_dfs = [pd.read_csv(file) for file in feature_files]
120 |     df = pd.concat(list_dfs, ignore_index=False)
121 | 
122 |     df.reset_index(drop=True, inplace=True)
123 |     return df
124 | 
--------------------------------------------------------------------------------
/descriptive-analysis/descriptive_analysis.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import numpy as np
3 | import pandas as pd
4 | 
5 | import plotly.express as px
6 | 
7 | from utils import make_scatter_plot, make_bar_plot
8 | from utils import make_scatter_plot_at_least_n_points
9 | 
10 | 
11 | # %% Load Data
12 | 
13 | files_pattern = 'data/euroleague_results*csv'
14 | data_list_files = 
glob.glob(files_pattern) 15 | 16 | df = pd.concat([pd.read_csv(f) for f in data_list_files], ignore_index=True) 17 | df.reset_index(drop=True, inplace=True) 18 | 19 | df['Game Result'] = np.where(df['Home Score'] > df['Away Score'], 1, 2) 20 | df['Score Difference'] = np.abs(df['Home Score'] - df['Away Score']) 21 | 22 | # %% Reshape the data 23 | 24 | df_flat = pd.melt(df, id_vars=['Season', 'Round', 'Game Result'], 25 | value_vars=['Home Score', 'Away Score'], 26 | var_name='Loc', value_name='Score') 27 | df_flat['Loc'] = df_flat['Loc'].apply(lambda x: x.split(' ')[0]) 28 | df_flat['Team Result'] = np.where(((df_flat['Game Result'] == 1) & 29 | (df_flat['Loc'] == 'Home')) | 30 | ((df_flat['Game Result'] == 2) & 31 | (df_flat['Loc'] == 'Away')), 'W', 'L') 32 | df_flat['Season_int'] = df_flat['Season'].apply(lambda x: int(x[-4:])) 33 | 34 | # %% Stat Table 35 | 36 | dfgroup = df_flat.groupby(['Season', 'Loc'])['Score'].mean().unstack('Loc') 37 | dfgroup.columns = ['Away Mean Score', 'Home Mean Score'] 38 | dfgroup.reset_index(inplace=True) 39 | 40 | dff = (df.groupby(['Season', 'Game Result'])['Game Result']. 41 | count().unstack('Game Result')) 42 | dff.columns = ['Home Wins', 'Away Wins'] 43 | dff.reset_index(inplace=True) 44 | 45 | dfgroup = dfgroup.merge(dff, on='Season') 46 | 47 | dff = (df_flat.groupby(['Season', 'Game Result'])['Score']. 48 | mean().unstack('Game Result')) 49 | dff.columns = ['Home Win Mean Score', 'Away Win Mean Score'] 50 | dff.reset_index(inplace=True) 51 | 52 | dfgroup = dfgroup.merge(dff, on='Season') 53 | 54 | print(dfgroup) 55 | 56 | # %% Plots: Home/Away Scores 57 | 58 | fig = px.box(df_flat, x="Season", y="Score", color="Loc", notched=True) 59 | fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'}) 60 | fig.show() 61 | 62 | fig = px.box(df, x="Season", y="Home Score", color="Game Result", notched=True) 63 | fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'}) 64 | fig.show() 65 | 66 | fig = px.box(df, x="Season", y="Away Score", color="Game Result", notched=True) 67 | fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'}) 68 | fig.show() 69 | 70 | fig = px.box(df, x="Season", y="Score Difference", color="Game Result", 71 | notched=True) 72 | fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'}) 73 | fig.show() 74 | 75 | 76 | # %% Scatter plots - probability of winning when scoring at least N points 77 | make_scatter_plot_at_least_n_points([df_flat, df_flat[df_flat['Loc'] == 'Home'], 78 | df_flat[df_flat['Loc'] == 'Away']], 79 | ['All', 'Home', 'Away']) 80 | 81 | make_scatter_plot_at_least_n_points([df_flat[df_flat['Season_int'] == 2017], 82 | df_flat[df_flat['Season_int'] == 2018], 83 | df_flat[df_flat['Season_int'] == 2019]], 84 | ['2017', '2018', '2019']) 85 | 86 | # %% Scatter plots - probability of winning when scoring points in a range. 
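# (implemented by the bar and scatter plot sections below, which use
#  find_probs to bin scores into 5-point intervals)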
87 | 
88 | # %% Bar plots
89 | 
90 | make_bar_plot([df_flat, df_flat[df_flat['Loc'] == 'Home'],
91 |                df_flat[df_flat['Loc'] == 'Away']], ['All', 'Home', 'Away'])
92 | 
93 | make_bar_plot([df_flat[df_flat['Season_int'] == 2017],
94 |                df_flat[df_flat['Season_int'] == 2018],
95 |                df_flat[df_flat['Season_int'] == 2019]],
96 |               ['2017', '2018', '2019'])
97 | 
98 | # %% Scatter plots
99 | 
100 | make_scatter_plot([df_flat, df_flat[df_flat['Loc'] == 'Home'],
101 |                    df_flat[df_flat['Loc'] == 'Away']],
102 |                   ['All', 'Home', 'Away'])
103 | 
104 | make_scatter_plot([df_flat[df_flat['Season_int'] == 2017],
105 |                    df_flat[df_flat['Season_int'] == 2018],
106 |                    df_flat[df_flat['Season_int'] == 2019]],
107 |                   ['2017', '2018', '2019'])
108 | 
--------------------------------------------------------------------------------
/feature-selection/feature_selection_wrapper.py:
--------------------------------------------------------------------------------
1 | '''
2 | Wrapper method for feature selection, i.e. subsets of features are
3 | generated and evaluated using a chosen algorithm and its hyper-parameters.
4 | 
5 | Here, as the number of features is relatively small, we are able to generate
6 | all possible combinations of features. If the number of features grows large,
7 | a different approach should be adopted, such as Sequential Forward Selection;
8 | see the `feature_selection_wrapper_sfs.py` script.
9 | '''
10 | import sys
11 | from itertools import combinations
12 | import numpy as np
13 | from matplotlib import pyplot as plt
14 | from tqdm import tqdm
15 | # from sklearn.linear_model import LogisticRegression
16 | # from sklearn.ensemble import RandomForestClassifier
17 | # from sklearn.tree import DecisionTreeClassifier
18 | # from sklearn.svm import SVC
19 | # from sklearn.ensemble import GradientBoostingClassifier
20 | from sklearn.ensemble import AdaBoostClassifier
21 | # from sklearn.naive_bayes import GaussianNB
22 | sys.path.append('auxiliary/')  # noqa: E402
23 | from data_processing import load_features, shape_data
24 | from kfold_crosseval import kfold_crosseval
25 | 
26 | 
27 | # %% Choose settings and classifier
28 | test_season = '2018-2019'  # hold-out season for validation
29 | level = 'match'  # match or team level features to use
30 | shuffle = True  # whether to shuffle or not the data in k-fold cross validation
31 | norm = True  # whether to normalise or not the features
32 | min_round = 5  # minimum number of first rounds to skip in every season
33 | nsplits = 5  # number of folds in k-fold cross validation
34 | nestimators = 188  # this is a classifier-specific setting
35 | rate = 1.2  # this is a classifier-specific setting
36 | random_state = 10  # random state for the classifier
37 | model = AdaBoostClassifier(n_estimators=nestimators, random_state=random_state,
38 |                            learning_rate=rate)
39 | # name and path of the output file in which we store the performance results
40 | # of the feature sets
41 | out_file = 'output/wrapper_ada2_n_{}_rate_{}'.format(nestimators, rate)
42 | 
43 | # %% load feature data
44 | df = load_features(level)
45 | 
46 | # %% choose features
47 | if level == 'match':
48 |     feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y',
49 |              'Defence_x', 'Defence_y',
50 |              'form_x', 'form_y',
51 |              'Diff_x', 'Diff_y',
52 |              'Home F4', 'Away F4']
53 | elif level == 'team':
54 |     feats = ['Home', 'Away', 'Position', 'Offence', 'Defence',
55 |              'form', 'F4', 'Diff']
56 | n_feats = len(feats)
57 | 
58 | # seasons for calibration
59 | df = df[df['Season'] != test_season]
60 | 
61 | # %% Re-shape data
62 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm,
63 |                                           min_round=min_round)
64 | 
65 | # %% Embedded feature selection (combinations of features)
66 | 
67 | # create all possible combinations of features.
68 | allcombs = []
69 | for u in range(1, n_feats + 1):
70 |     combs = combinations(feats, u)
71 |     for c in combs:
72 |         if list(c) != []:
73 |             allcombs.append(list(c))
74 | 
75 | scores = np.zeros((len(allcombs), 2))
76 | nc = 0
77 | for ii, comb in enumerate(tqdm(allcombs)):
78 | 
79 |     if len(comb) > nc:
80 |         tqdm.write('Number of features: %d' % len(comb))
81 |         nc = len(comb)
82 |         indx, feats = [], []
83 | 
84 |     X_train = df[comb].values
85 | 
86 |     scores[ii, 0], scores[ii, 1] = kfold_crosseval(X_train, y_train, df[comb],
87 |                                                    nsplits, groups=groups,
88 |                                                    model=model,
89 |                                                    level=level,
90 |                                                    shuffle=shuffle)
91 |     # save results
92 |     np.savez(out_file, scores=scores, features=np.array(allcombs))
93 | 
94 | # %% Plot results
95 | # Sort best combinations
96 | ll = np.argsort(scores[:, 0])[::-1]
97 | sortcombs = [allcombs[u] for u in ll]
98 | 
99 | x = np.arange(1, len(allcombs) + 1, dtype=int)
100 | plt.figure()
101 | plt.plot(x, scores[:, 0], label='Accuracy')
102 | plt.plot(x, scores[:, 1], label='W-Accuracy')
103 | plt.legend()
104 | plt.show()
105 | 
106 | plt.figure()
107 | plt.bar(x[:15], scores[ll, 0][:15])
108 | plt.xticks(x[:15], sortcombs[:15], rotation='vertical')
109 | plt.show()
110 | 
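The saved `.npz` can be inspected afterwards, as done in `assessing-wrapper-methods.ipynb`; a minimal sketch, assuming the script above has produced `output/wrapper_ada2_n_188_rate_1.2.npz`:

import numpy as np

obj = np.load('output/wrapper_ada2_n_188_rate_1.2.npz', allow_pickle=True)
scores, feature_sets = obj['scores'], obj['features']
best = np.argmax(scores[:, 0])  # column 0: accuracy, column 1: weighted accuracy
print('best accuracy %.4f with features %s' % (scores[best, 0], feature_sets[best]))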
--------------------------------------------------------------------------------
/model-selection/cross_validation_two_param_models.py:
--------------------------------------------------------------------------------
1 | '''
2 | Hyper-parameter tuning using k-fold cross-validation for two-hyper-parameter
3 | models via a nested-loop grid search. This script is kept for legacy; see
4 | also `gridsearch_cross_validation.py`, which covers multiple hyper-parameter
5 | models.
6 | '''
7 | import sys
8 | import numpy as np
9 | from tqdm import tqdm
10 | from matplotlib import pyplot as plt
11 | from sklearn.svm import SVC
12 | from sklearn.ensemble import AdaBoostClassifier
13 | sys.path.append('auxiliary/')  # noqa: E402
14 | from data_processing import load_features, shape_data
15 | from kfold_crosseval import kfold_crosseval
16 | 
17 | import warnings
18 | warnings.filterwarnings("ignore")
19 | 
20 | 
21 | # %% Choose settings and classifier
22 | test_season = '2018-2019'  # hold-out season for validation
23 | level = 'match'  # match or team level features to use
24 | shuffle = True  # whether to shuffle or not the data in k-fold cross validation
25 | norm = True  # whether to normalise or not the features
26 | min_round = 5  # minimum number of first rounds to skip in every season
27 | nsplits = 5  # number of folds in k-fold cross validation
28 | method = 'svm-rbf'  # method for grid search hyper-parameter training, see list
29 | # methods: 'log-reg', 'svm-linear', 'svm-rbf', 'decision-tree', 'random-forest',
30 | # 'naive-bayes', 'gradient-boosting', 'ada', 'ada2', 'knn',
31 | # 'discriminant-analysis'
32 | 
33 | print('norm: %r - shuffle: %r - method: %s' % (norm, shuffle, method))
34 | 
35 | # %% load feature data
36 | df = load_features(level)
37 | 
38 | # choose features
39 | if level == 'match':
40 |     feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y',
41 |              'Defence_x', 'Defence_y', 'form_x', 'form_y', 'Diff_x', 'Diff_y',
42 |              'Home F4', 'Away F4']
43 | elif level == 'team':
44 |     feats = ['Home', 'Away', 'Position',
45 |              'Offence', 'Defence', 'form', 'F4', 'Diff']
46 | 
47 | # seasons for calibration
48 | df = df[df['Season'] != test_season]
49 | 
50 | # %% Re-shape data
51 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm,
52 |                                           min_round=min_round)
53 | 
54 | print('Number of features:', X_train.shape[1], feats)
55 | print('Number of obs:', X_train.shape[0])
56 | 
57 | # %% Set parameters
58 | if method == 'svm-rbf':
59 |     params1 = np.sort(np.concatenate((np.logspace(-5, 8, 14),
60 |                                       5 * np.logspace(-5, 8, 14)), axis=0))
61 |     params2 = np.sort(np.concatenate((np.logspace(-5, 8, 14),
62 |                                       5 * np.logspace(-5, 8, 14)), axis=0))
63 | elif method == 'ada':
64 |     params1 = np.arange(5, 200, 1)
65 |     params2 = np.arange(0.3, 1.5, 0.1)
66 | else:
67 |     sys.exit('Method not recognised')
68 | 
69 | # %% Tune parameters
70 | accuracy = np.zeros((params1.shape[0], params2.shape[0]))
71 | w_accuracy = np.zeros((params1.shape[0], params2.shape[0]))
72 | 
73 | for i, param1 in enumerate(tqdm(params1, desc='1st loop')):
74 |     for j, param2 in enumerate(tqdm(params2, desc='2nd loop')):
75 | 
76 |         if method == 'svm-rbf':
77 |             model = SVC(C=param1, kernel='rbf', gamma=param2,
78 |                         class_weight='balanced', probability=True,
79 |                         max_iter=400)
80 |         elif method == 'ada':
81 |             model = AdaBoostClassifier(n_estimators=param1, random_state=10,
82 |                                        learning_rate=param2)
83 | 
84 |         # apply k-fold cross validation
85 |         accuracy[i, j], w_accuracy[i, j] = kfold_crosseval(X_train, y_train,
86 |                                                            df, nsplits,
87 |                                                            groups=groups,
88 |                                                            model=model,
89 |                                                            level=level,
90 |                                                            shuffle=shuffle)
91 |     np.savez('output/%s' % method, accuracy=accuracy, w_accuracy=w_accuracy,
92 |              params1=params1, params2=params2)
93 | 
94 | print('Accuracy: ', np.round(np.max(accuracy), 4))
95 | print('Weighted Accuracy: ', np.round(np.max(w_accuracy), 4))
96 | 
97 | plt.imshow(accuracy)
98 | plt.colorbar()
99 | plt.show()
100 | 
101 | plt.figure()
102 | plt.imshow(w_accuracy)
103 | plt.colorbar()
104 | plt.show()
105 | 
--------------------------------------------------------------------------------
/feature-selection/feature_selection_wrapper_sfs.py:
--------------------------------------------------------------------------------
1 | '''
2 | Wrapper method for feature selection using Sequential Forward Selection (SFS)
3 | with a chosen algorithm and its hyper-parameters.
4 | '''
5 | import sys
6 | import numpy as np
7 | from matplotlib import pyplot as plt
8 | # from sklearn.linear_model import LogisticRegression
9 | # from sklearn.ensemble import RandomForestClassifier
10 | # from sklearn.tree import DecisionTreeClassifier
11 | # from sklearn.svm import SVC
12 | # from sklearn.ensemble import GradientBoostingClassifier
13 | from sklearn.ensemble import AdaBoostClassifier
14 | # from sklearn.naive_bayes import GaussianNB
15 | 
16 | sys.path.append('auxiliary/')  # noqa: E402
17 | from data_processing import load_features, shape_data
18 | from kfold_crosseval import kfold_crosseval
19 | 
20 | # %% Choose settings and classifier
21 | test_season = '2018-2019'  # hold-out season for validation
22 | level = 'match'  # match or team level features to use
23 | shuffle = True  # whether to shuffle or not the data in k-fold cross validation
24 | norm = True  # whether to normalise or not the features
25 | min_round = 5  # minimum number of first rounds to skip in every season
26 | nsplits = 5  # number of folds in k-fold cross validation
27 | nestimators = 115  # this is a classifier-specific setting
28 | rate = 1.1  # this is a classifier-specific setting
29 | random_state = 10  # random state for the classifier
30 | model = AdaBoostClassifier(n_estimators=nestimators, random_state=random_state,
31 |                            learning_rate=rate)
32 | 
33 | # %% load feature data
34 | df = load_features(level)
35 | 
36 | # choose features
37 | if level == 'match':
38 |     feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y',
39 |              'Defence_x', 'Defence_y',
40 |              'form_x', 'form_y',
41 |              'Diff_x', 'Diff_y',
42 |              'Home F4', 'Away F4']
43 | elif level == 'team':
44 |     feats = ['Home', 'Away', 'Position', 'Offence', 'Defence',
45 |              'form', 'F4', 'Diff']
46 | n_feats = len(feats)
47 | 
48 | # seasons for calibration
49 | df = df[df['Season'] != test_season]
50 | 
51 | # %% Re-shape data
52 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm,
53 |                                           min_round=min_round)
54 | 
55 | # %% Embedded feature selection (combinations of features)
56 | # add features one by one
57 | 
58 | # create a copy of the initial X_train.
59 | XX = X_train.copy()
60 | 
61 | # create the indices of the features
62 | allfeats = np.arange(n_feats, dtype=int)
63 | # lists to keep the best features and their accuracy scores.
64 | bestfeats = []
65 | accuracy = []
66 | w_accuracy = []
67 | while len(allfeats) > 0:
68 |     # number of remaining features
69 |     n_temp_feat = len(allfeats)
70 |     print('Number of features to process from:', n_temp_feat)
71 |     # indices of current best features
72 |     c_best = np.array(bestfeats.copy(), dtype=int)
73 |     temp_acc = np.zeros(n_temp_feat)
74 |     temp_wacc = np.zeros(n_temp_feat)
75 |     for n in range(n_temp_feat):
76 |         # append current best features with the features remaining in the list
77 |         # (one by one)
78 |         cfeat = np.append(c_best, allfeats[n])
79 |         print('Indices of features under process:', cfeat)
80 |         # select these features from the total design matrix.
81 | X_train = XX[:, cfeat] 82 | # run k-fold cross validation 83 | temp_acc[n], temp_wacc[n] = kfold_crosseval(X_train, y_train, df, 84 | nsplits, groups=groups, 85 | model=model, 86 | level=level, 87 | shuffle=shuffle) 88 | # find index of max accuracy 89 | nn = np.argmax(temp_acc) 90 | # append list of indices of best features with the index of the new best 91 | # feature 92 | bestfeats.append(allfeats[nn]) 93 | # similarly for accuracy scores 94 | accuracy.append(temp_acc[nn]) 95 | w_accuracy.append(temp_wacc[nn]) 96 | allfeats = np.delete(allfeats, nn) 97 | print('Best Features:', bestfeats) 98 | 99 | print([feats[u] for u in bestfeats]) 100 | 101 | # %% Plots 102 | x = np.arange(1, n_feats + 1) 103 | plt.figure() 104 | plt.plot(x, accuracy, label='Accuracy') 105 | plt.plot(x, w_accuracy, label='W-Accuracy') 106 | plt.xticks(x, x) 107 | plt.minorticks_on() 108 | plt.grid(which='major', linestyle='-') 109 | plt.grid(which='minor', linestyle='--') 110 | # plt.tight_layout() 111 | plt.show() 112 | -------------------------------------------------------------------------------- /feature-extraction/make_standings.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from itertools import permutations 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def make_standings(results, nround): 8 | 9 | if nround < 1: 10 | sys.exit('Game round must be greater than 0') 11 | 12 | results = results[results['Game Round'] <= nround].copy() 13 | home_points = np.ones(results.shape[0], dtype=int) 14 | away_points = np.ones(results.shape[0], dtype=int) 15 | jj = results['Home Score'] > results['Away Score'] 16 | home_points[jj] = 2 17 | away_points[np.logical_not(jj)] = 2 18 | 19 | results['Home Points'] = home_points 20 | results['Away Points'] = away_points 21 | 22 | home = results.groupby(['Home Team ID'])['Home Points', 23 | 'Home Score Regular Period', 24 | 'Away Score Regular Period'].sum() 25 | away = results.groupby(['Away Team ID'])['Away Points', 26 | 'Away Score Regular Period', 27 | 'Home Score Regular Period'].sum() 28 | 29 | groupby = home.merge(away, how='outer', left_index=True, right_index=True) 30 | groupby.fillna(0, inplace=True) 31 | 32 | teamids = np.concatenate((results['Home Team ID'].values, 33 | results['Away Team ID'].values), axis=0) 34 | teams = np.concatenate((results['Home Team'].values, 35 | results['Away Team'].values), axis=0) 36 | dct = dict(zip(teamids, teams)) 37 | 38 | standing = pd.DataFrame() 39 | standing['Team ID'] = groupby.index 40 | standing['Team'] = [dct[u] for u in standing['Team ID'].values] 41 | standing['Points'] = (groupby['Home Points'].values + 42 | groupby['Away Points'].values) 43 | standing['Score+'] = (groupby['Home Score Regular Period_x'].values + 44 | groupby['Away Score Regular Period_y'].values) 45 | standing['Score-'] = (groupby['Away Score Regular Period_x'].values + 46 | groupby['Home Score Regular Period_y'].values) 47 | standing['Score Diff'] = standing['Score+'] - standing['Score-'] 48 | standing.sort_values(by=['Points', 'Score Diff', 'Score+'], inplace=True, 49 | ascending=False) 50 | standing.reset_index(drop=True, inplace=True) 51 | 52 | intcols = ['Team ID', 'Points', 'Score+', 'Score-', 'Score Diff'] 53 | standing[intcols] = standing[intcols].astype(int) 54 | 55 | if nround < standing.shape[0]: 56 | return standing 57 | 58 | nteams = standing.shape[0] 59 | secondary_points = np.zeros(nteams, dtype=int) 60 | score_diffs = np.zeros(nteams, dtype=int) 61 | for p in 
np.unique(standing['Points'].values):
62 |         if np.sum(standing['Points'].values == p) > 1:
63 | 
64 |             # there are ties
65 |             kk = standing['Points'].values == p
66 |             teams = standing['Team ID'].iloc[kk]
67 |             ii = (np.in1d(results['Home Team ID'].values, teams) &
68 |                   np.in1d(results['Away Team ID'].values, teams))
69 |             minichamp = results.iloc[ii]
70 |             home = minichamp.groupby(['Home Team ID'])[
71 |                 'Home Points', 'Home Score Regular Period',
72 |                 'Away Score Regular Period'].sum()
73 |             away = minichamp.groupby(['Away Team ID'])[
74 |                 'Away Points', 'Away Score Regular Period',
75 |                 'Home Score Regular Period'].sum()
76 |             groupby = home.merge(away, how='outer', left_index=True,
77 |                                  right_index=True)
78 | 
79 |             # only those tied teams that have played
80 |             # all against each other twice are ordered
81 |             # by their head-to-head matches.
82 |             flag = True
83 |             for h, a in permutations(groupby.index, 2):
84 |                 if any((results['Home Team ID'].values == h) &
85 |                        (results['Away Team ID'].values == a)):
86 |                     pass
87 |                 else:
88 |                     flag = False
89 | 
90 |             if flag is False:
91 |                 continue
92 | 
93 |             groupby.fillna(0, inplace=True)
94 | 
95 |             teamid = groupby.index
96 |             points = (groupby['Home Points'].values +
97 |                       groupby['Away Points'].values)
98 |             scoreplus = (groupby['Home Score Regular Period_x'].values +
99 |                          groupby['Away Score Regular Period_y'].values)
100 |             scoreminus = (groupby['Away Score Regular Period_x'].values +
101 |                           groupby['Home Score Regular Period_y'].values)
102 |             scores = scoreplus - scoreminus
103 | 
104 |             for team, point, score in zip(teamid, points, scores):
105 |                 secondary_points[standing['Team ID'] == team] = point
106 |                 score_diffs[standing['Team ID'] == team] = score
107 | 
108 |     standing['Secondary Points'] = secondary_points
109 |     standing['Secondary Score Diff'] = score_diffs
110 |     standing.sort_values(by=['Points', 'Secondary Points',
111 |                              'Secondary Score Diff', 'Score Diff'],
112 |                          inplace=True, ascending=False)
113 |     standing.reset_index(drop=True, inplace=True)
114 | 
115 |     return standing
116 | 
--------------------------------------------------------------------------------
README.md:
--------------------------------------------------------------------------------
1 | # Euroleague Basketball Data Analysis and Prediction
2 | 
3 | This repository includes an end-to-end methodology for building machine learning algorithms for predicting Euroleague Basketball game outcomes.
4 | 
5 | The methodology and results are discussed in detail in this article published on arxiv.org, entitled "[Descriptive and Predictive Analysis of Euroleague Basketball Games and the Wisdom of Basketball Crowds](https://arxiv.org/abs/2002.08465)"
6 | 
7 | The repository consists of the following modules (which represent the logical steps in the modelling process):
8 | 
9 | * `data-collection`
10 | * `feature-extraction`
11 | * `descriptive-analysis`
12 | * `model-selection`
13 | * `feature-selection`
14 | * `model-validation`
15 | 
16 | Data extraction and storage settings are specified in the `settings/` directory.
17 | 
18 | ## Data Collection
19 | 
20 | Data is collected by scraping [Euroleague](https://www.euroleague.net/)'s official website.
21 | 
22 | In data collection, there are three scripts for collecting three types of data:
23 | 
24 | * Team statistics per game, such as offence and defence scores, rebounds, steals, assists, etc., for each team (row) in every game in a season.
25 | * Game results. Each row corresponds to a game in a season. Teams and final scores are given.
26 | * Standing data. Each row corresponds to the standing of a team in the table at the end of a round. All rounds are included.
27 | 
28 | To collect the data for a season, the user should run the script with the console argument set to the end year of the season, e.g. for season 2017-2018, execute
29 | 
30 | `$ python data-collection/scrap_game_stats.py -s 2018`
31 | 
32 | Similarly for the collection of the other data types.
33 | 
34 | Data is stored in the directory specified in the `settings/data_collection.json` file.
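A collected file can then be inspected directly with pandas, e.g. (a sketch, assuming season 2017-2018 has already been scraped; the file name follows the `<prefix>_<start>_<end>.csv` pattern used by the scripts):

```python
import pandas as pd

df = pd.read_csv('data/euroleague_results_2017_2018.csv', parse_dates=['Date'])
print(df[['Round', 'Home Team', 'Away Team', 'Home Score', 'Away Score']].head())
```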
35 | 
36 | ## Feature Extraction
37 | 
38 | Features are extracted from the data collected. Features are split into two main categories:
39 | 
40 | * Match-level features. Every observation (row) corresponds to a match in a season. Features include average offence, average defence, form, etc., for each team in a game.
41 | * Team-level features. Every observation (row) corresponds to a team in a game in a season. Features include average offence, average defence, etc., for each team in a game.
42 | 
43 | To extract features, run the script with the console argument set to the end year of the season, e.g. for season 2017-2018, execute
44 | 
45 | `$ python feature-extraction/extract_features.py -s 2018`
46 | 
47 | Feature files are stored in the directory and with the name patterns specified in the `settings/feature_extraction.json` file.
48 | 
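The extracted features can be loaded back through the auxiliary helpers (a sketch, assuming the repository root is the working directory and features have already been extracted):

```python
import sys
sys.path.append('auxiliary/')
from data_processing import load_features, shape_data

df = load_features('match')  # or 'team'
feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y']  # any feature subset
X, y, df, groups = shape_data(df, feats, norm=True, min_round=5)
print(X.shape, y.shape)
```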
63 | ## Feature Selection 64 | 65 | Different methods of feature selection are performed in this module. 66 | 67 | * Filter methods, including the mutual information, chi-squared and ANOVA F-statistic measures. Results are evaluated for a chosen algorithm using k-fold cross-validation on the training set. 68 | 69 | * Feature transformation methods using Principal Component Analysis (PCA). Results for increasing numbers of principal components are evaluated for a chosen algorithm using k-fold cross-validation on the training set. 70 | 71 | * Wrapper method for feature selection, i.e. subsets of features are generated and evaluated using a chosen algorithm and its hyper-parameters. Here, as the number of features is relatively small, we are able to generate all possible combinations of features. If the number of features grows large, a different approach should be adopted, such as Sequential Forward Selection; see the 72 | `feature_selection_wrapper_sfs.py` script. 73 | 74 | As before, some setting parameters are hard-coded, so the user should edit those before running the script. 75 | 76 | ## Model Validation 77 | 78 | After choosing the best performing algorithm, selecting features and tuning the optimal hyper-parameters, we validate the final model(s) on the test set. 79 | 80 | The validation of the model is performed in a Jupyter notebook, see `validation.ipynb`. The notebook also includes an assessment of, and a comparison with, the wisdom-of-the-crowds model. 81 | 82 | The directory also includes a script for assessing the performance of a few benchmark models (a scoring sketch is given after the run command below). These are: 83 | 1. Home team always wins. 84 | 2. F4 teams (i.e. teams that reached the F4 in the previous season) always win when playing against a non-F4 team; otherwise the home team always wins. 85 | 3. Persistence model: teams that won in the previous round win; if both teams won, the home team wins. 86 | 4. Standings model: the team higher in the standings wins. 87 | 5. Panathinaikos always wins; otherwise the home team always wins. 88 | 6. Random model. 89 | 90 | To run the benchmark models, execute 91 | 92 | `$ python model-validation/benchmarks.py` 93 |
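For illustration, a minimal sketch of how the simplest benchmark ("home team always wins") is scored; the tiny table below is made up, whereas real runs read the scraped results CSVs:

```python
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# toy results table; real data comes from the euroleague_results CSV files
data = pd.DataFrame({'Home Score': [80, 71, 90, 66],
                     'Away Score': [75, 78, 85, 70]})

# encode outcomes as in benchmarks.py: 1 = home win, 2 = away win
data['Actual'] = np.where(data['Home Score'] > data['Away Score'], 1, 2)
data['Home Wins'] = np.ones(data.shape[0], dtype=int)  # benchmark prediction

print('accuracy:', accuracy_score(data['Actual'], data['Home Wins']))
print('weighted accuracy:',
      balanced_accuracy_score(data['Actual'], data['Home Wins']))
```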
-------------------------------------------------------------------------------- /model-selection/cross_validation_one_param_models.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Hyper-parameter tuning using k-fold cross-validation for one-hyper-parameter 3 | models via a loop-based grid search. This script is kept for legacy reasons; see 4 | also `gridsearch_cross_validation.py`, which covers multiple-hyper-parameter 5 | models. 6 | ''' 7 | import sys 8 | import numpy as np 9 | from matplotlib import pyplot as plt 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.tree import DecisionTreeClassifier 13 | from sklearn.svm import SVC 14 | from sklearn.naive_bayes import GaussianNB 15 | from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier 16 | from sklearn.neighbors import KNeighborsClassifier 17 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 18 | sys.path.append('auxiliary/') # noqa: E402 19 | from data_processing import load_features, shape_data 20 | from kfold_crosseval import kfold_crosseval 21 | 22 | 23 | # %% Choose settings and classifier 24 | test_season = '2018-2019' # hold-out season for validation 25 | level = 'team' # match or team level features to use 26 | shuffle = True # whether to shuffle or not the data in k-fold cross validation 27 | norm = True # whether to normalise or not the features 28 | min_round = 5 # minimum number of first rounds to skip in every season 29 | nsplits = 5 # number of folds in k-fold cross validation 30 | method = 'log-reg' # method for grid search hyper-parameter training, see list 31 | # methods: 'log-reg', 'svm-linear', 'svm-rbf', 'decision-tree', 'random-forest', 32 | # 'naive-bayes', 'gradient-boosting', 'ada', 'ada2', 'knn', 33 | # 'discriminant-analysis' 34 | 35 | print('level: %s - norm: %r - shuffle: %r - method: %s' % 36 | (level, norm, shuffle, method)) 37 | 38 | # %% load feature data 39 | df = load_features(level) 40 | 41 | # choose features 42 | if level == 'match': 43 | feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y', 44 | 'Defence_x', 'Defence_y', 'form_x', 'form_y', 'Diff_x', 'Diff_y', 45 | 'Home F4', 'Away F4'] 46 | elif level == 'team': 47 | feats = ['Home', 'Away', 'Position', 48 | 'Offence', 'Defence', 'form', 'F4', 'Diff'] 49 | 50 | # seasons for calibration 51 | df = df[df['Season'] != test_season] 52 | 53 | # %% Re-shape data 54 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm, 55 | min_round=min_round) 56 | print('Number of features:', X_train.shape[1], feats) 57 | print('Number of obs:', X_train.shape[0]) 58 | 59 | # %% Set parameters 60 | if method == 'log-reg': 61 | params = np.sort(np.concatenate((np.logspace(-5, 8, 14), 62 | 5 * np.logspace(-5, 8, 14)), axis=0)) 63 | elif method == 'svm-linear': 64 | params = np.sort(np.concatenate((np.logspace(-5, 6, 12), 65 | 5 * np.logspace(-5, 6, 12)), axis=0)) 66 | elif method == 'decision-tree': 67 | params = np.array([0]) 68 | elif method == 'random-forest': 69 | params = np.arange(10, 100, 5) 70 | elif method == 'naive-bayes': 71 | params = np.array([0]) 72 | elif method == 'gradient-boosting': 73 | params = np.arange(10, 200, 10) 74 | elif method == 'ada': 75 | params = np.arange(5, 200, 3) 76 | elif method == 'knn': 77 | params = np.arange(3, 30, 2) 78 | elif method == 'discriminant-analysis': 79 | params = np.array([0]) 80 | else: 81 | sys.exit('Method not recognised') 82 | 83 | # %% Tune parameters 84 | accuracy = np.zeros(params.shape[0]) 85 | w_accuracy = np.zeros(params.shape[0]) 86 | 87 | for j, param in enumerate(params): 88 | 89 | # update model's parameters 90 | if method == 'log-reg': 91 | model = LogisticRegression(C=param, solver='liblinear', 92 | class_weight='balanced') 93 | elif method == 'svm-linear': 94 | model = SVC(C=param, kernel='linear', class_weight='balanced', 95 | probability=True) 96 | elif method == 'decision-tree': 97 |
model = DecisionTreeClassifier(class_weight='balanced', random_state=10) 98 | elif method == 'random-forest': 99 | model = RandomForestClassifier(n_estimators=param, 100 | class_weight='balanced', 101 | random_state=10) 102 | elif method == 'naive-bayes': 103 | model = GaussianNB() 104 | elif method == 'gradient-boosting': 105 | model = GradientBoostingClassifier(n_estimators=param, random_state=10) 106 | elif method == 'ada': 107 | model = AdaBoostClassifier(n_estimators=param, random_state=10, 108 | learning_rate=0.6) 109 | elif method == 'knn': 110 | model = KNeighborsClassifier(n_neighbors=param) 111 | elif method == 'discriminant-analysis': 112 | model = QuadraticDiscriminantAnalysis() 113 | else: 114 | sys.exit('method name is not valid') 115 | 116 | # apply k-fold cross validation 117 | accuracy[j], w_accuracy[j] = kfold_crosseval(X_train, y_train, 118 | df, nsplits, groups=groups, 119 | model=model, level=level, 120 | shuffle=shuffle) 121 | 122 | # %% Plots 123 | if params.shape[0] > 1: 124 | print('Accuracy: ', np.round(np.max(accuracy), 4)) 125 | print('Weighted Accuracy: ', np.round(np.max(w_accuracy), 4)) 126 | plt.figure() 127 | plt.plot(params, accuracy, label='accuracy') 128 | plt.plot(params, w_accuracy, label='w_accuracy') 129 | if method in ['log-reg', 'svm-linear']: 130 | plt.xscale('log') 131 | plt.xlabel('parameter') 132 | plt.ylabel('Score') 133 | plt.legend() 134 | plt.title(method) 135 | plt.show() 136 | else: 137 | print('Accuracy: ', accuracy.mean(axis=0)) 138 | print('Weighted Accuracy: ', w_accuracy.mean(axis=0)) 139 | -------------------------------------------------------------------------------- /feature-selection/feature_selection_filter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Filter methods for feature selection. Measures used are: 3 | 1) Mutual information 4 | 2) Chi square 5 | 3) ANOVA F-statistic 6 | 7 | Results are evaluated for a chosen algorithm using k-fold 8 | cross-validation on the training set.
9 | ''' 10 | import sys 11 | import numpy as np 12 | from matplotlib import pyplot as plt 13 | from tqdm import tqdm 14 | # from sklearn.linear_model import LogisticRegression 15 | # from sklearn.ensemble import RandomForestClassifier 16 | # from sklearn.tree import DecisionTreeClassifier 17 | # from sklearn.svm import SVC 18 | # from sklearn.ensemble import GradientBoostingClassifier 19 | from sklearn.ensemble import AdaBoostClassifier 20 | # from sklearn.naive_bayes import GaussianNB 21 | from sklearn.feature_selection import SelectKBest, f_classif, chi2 22 | from sklearn.feature_selection import mutual_info_classif 23 | sys.path.append('auxiliary/') # noqa: E402 24 | from data_processing import load_features, shape_data 25 | from kfold_crosseval import kfold_crosseval 26 | 27 | 28 | def plot_accuracy(x, accuracy, w_accuracy, title=''): 29 | plt.figure() 30 | plt.plot(x, accuracy, label='Accuracy') 31 | plt.plot(x, w_accuracy, label='W-Accuracy') 32 | plt.xlabel('Number of features') 33 | plt.ylabel('Score') 34 | plt.xticks(x, x) 35 | plt.minorticks_on() 36 | plt.grid(which='major', linestyle='-') 37 | plt.grid(which='minor', linestyle='--') 38 | plt.title(title) 39 | plt.legend() 40 | plt.show() 41 | return 42 | 43 | 44 | def mutual_info_classif2(X, y, discrete_features='auto', n_neighbors=3, 45 | copy=True, random_state=10): 46 | return mutual_info_classif(X, y, discrete_features=discrete_features, 47 | n_neighbors=n_neighbors, copy=copy, 48 | random_state=random_state) 49 | 50 | 51 | # %% Choose settings and classifier 52 | test_season = '2018-2019' # hold-out season for validation 53 | level = 'match' # match or team level features to use 54 | shuffle = True # whether to shuffle or not the data in k-fold cross validation 55 | norm = True # whether to normalise or not the features 56 | min_round = 5 # minimum number of first rounds to skip in every season 57 | nsplits = 5 # number of folds in k-fold cross validation 58 | random_state = 10 # random state for the classifier 59 | model = AdaBoostClassifier(n_estimators=121, random_state=random_state, 60 | learning_rate=1.0) 61 | 62 | # %% load feature data 63 | df = load_features(level) 64 | 65 | # choose features 66 | if level == 'match': 67 | feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y', 68 | 'Defence_x', 'Defence_y', 69 | 'form_x', 'form_y', 70 | 'Diff_x', 'Diff_y', 71 | 'Home F4', 'Away F4'] 72 | elif level == 'team': 73 | feats = ['Home', 'Away', 'Position', 'Offence', 'Defence', 74 | 'form', 'F4', 'Diff'] 75 | n_feats = len(feats) 76 | 77 | # seasons for calibration 78 | df = df[df['Season'] != test_season] 79 | 80 | # %% Re-shape data 81 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm, 82 | min_round=min_round) 83 | 84 | # %% Filter based feature selection (model independent) 85 | fscores, ps = f_classif(X_train, y_train) 86 | mscores = mutual_info_classif(X_train, y_train, random_state=10) 87 | chiscores, _ = chi2(X_train, y_train) 88 | ordered_fc = [feats[u] for u in np.argsort(fscores)[::-1]] 89 | ordered_mi = [feats[u] for u in np.argsort(mscores)[::-1]] 90 | ordered_ch = [feats[u] for u in np.argsort(chiscores)[::-1]] 91 | print('F scores', ordered_fc) 92 | print('MI scores', ordered_mi) 93 | print('Chi scores', ordered_ch) 94 | 95 | # %% 96 | accuracy = np.zeros((n_feats, 3)) 97 | w_accuracy = np.zeros((n_feats, 3)) 98 | feats_fs = [] 99 | feats_mi = [] 100 | for i, n in enumerate(tqdm(range(1, n_feats + 1))): 101 | kk = n if n < n_feats else 'all' 102 | skb_fc = SelectKBest(f_classif, k=kk) 
103 | skb_mi = SelectKBest(mutual_info_classif2, k=kk) 104 | skb_ch = SelectKBest(chi2, k=kk) 105 | X_fc = skb_fc.fit_transform(X_train, y_train) 106 | X_mi = skb_mi.fit_transform(X_train, y_train) 107 | X_ch = skb_ch.fit_transform(X_train, y_train) 108 | 109 | # print('MI:', skb_mi.scores_) 110 | # print(skb_mi.get_support()) 111 | 112 | accuracy[i, 0], w_accuracy[i, 0] = kfold_crosseval(X_fc, y_train, df, 113 | nsplits, groups=groups, 114 | model=model, 115 | level=level, 116 | shuffle=shuffle) 117 | accuracy[i, 1], w_accuracy[i, 1] = kfold_crosseval(X_mi, y_train, df, 118 | nsplits, groups=groups, 119 | model=model, 120 | level=level, 121 | shuffle=shuffle) 122 | accuracy[i, 2], w_accuracy[i, 2] = kfold_crosseval(X_ch, y_train, df, 123 | nsplits, groups=groups, 124 | model=model, 125 | level=level, 126 | shuffle=shuffle) 127 | 128 | # %% Plots 129 | 130 | x = np.arange(1, n_feats + 1) 131 | plot_accuracy(x, accuracy[:, 0], w_accuracy[:, 0], title='ANOVA') 132 | plot_accuracy(x, accuracy[:, 1], w_accuracy[:, 1], title='MI') 133 | plot_accuracy(x, accuracy[:, 2], w_accuracy[:, 2], title='Chi2') 134 | 135 | scores = np.concatenate((fscores[:, None], mscores[:, None], 136 | chiscores[:, None]), axis=1) 137 | order = np.argsort(scores, axis=0) 138 | ranks = order.argsort(axis=0) 139 | 140 | plt.figure() 141 | plt.imshow((scores.shape[0] - ranks).T) 142 | plt.yticks(ticks=[0, 1, 2], labels=['ANOVA', 'MI', 'Chi2']) 143 | plt.xticks(ticks=np.arange(len(feats)), labels=feats, rotation='vertical') 144 | plt.colorbar(orientation='horizontal', pad=0.3) 145 | plt.show() 146 | -------------------------------------------------------------------------------- /data-collection/scrap_game_stats.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import logging 5 | import re 6 | from datetime import datetime 7 | from tqdm import trange 8 | from bs4 import BeautifulSoup 9 | import requests 10 | import pandas as pd 11 | sys.path.append('auxiliary/') # noqa: E402 12 | from io_json import read_json 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | 16 | 17 | def main(season, n_rounds): 18 | ''' 19 | Extract game stats for all matches in a given season 20 | ''' 21 | 22 | # read settings 23 | settings = read_json('settings/data_collection.json') 24 | out_dir = settings['output_dir'] 25 | url_pattern = settings['game_stats']['url_link'] 26 | out_file_prefix = settings['game_stats']['output_file_prefix'] 27 | filename = '%s_%d_%d.csv' % (out_file_prefix, season - 1, season) 28 | filepath = os.path.join(out_dir, filename) 29 | 30 | regex = re.compile(r'score [a-z\s]*pts[a-z\s]*') 31 | allteamstats = [] 32 | season_str = '%d-%d' % (season - 1, season) 33 | header = ['Season', 'Round', 'GameID', 'Date', 'Team', 'Where', 34 | 'Offence', 'Defence'] 35 | 36 | for game_round in trange(1, n_rounds + 1): 37 | url = (url_pattern % (game_round, season - 1)) 38 | try: 39 | r = requests.get(url) 40 | # note: requests raises its own ConnectionError, not the builtin one 41 | except requests.exceptions.ConnectionError: 42 | sys.exit('Connection Error.
Check URL') 43 | data = r.text 44 | soup = BeautifulSoup(data, 'html.parser') 45 | 46 | for game in soup.find_all('div', attrs={'class': 'game played'}): 47 | data_code = game.attrs['data-code'] 48 | gameid = '%d_%d_%d_%s' % (season - 1, season, 49 | game_round, data_code) 50 | home_team = game.find_all('span', attrs={'class': 'name'})[0].string 51 | away_team = game.find_all('span', attrs={'class': 'name'})[1].string 52 | 53 | scores = game.find_all('span', attrs={'class': regex}) 54 | home_score = int(scores[0]['data-score'] if 55 | scores[0].has_attr('data-score') else 56 | scores[0].string) 57 | away_score = int(scores[1]['data-score'] if 58 | scores[1].has_attr('data-score') else 59 | scores[1].string) 60 | 61 | date_str = game.find('span', attrs={'class': 'date'}).string 62 | date = datetime.strptime(date_str, '%B %d %H:%M CET') 63 | # games from September to December belong to the season's starting year 64 | yr = season - 1 if date.month > 8 else season 65 | date = date.replace(year=yr) 66 | date_str = datetime.strftime(date, '%Y-%m-%d %H:%M:%S') 67 | 68 | home = {'Season': season_str, 69 | 'Round': game_round, 70 | 'GameID': gameid, 71 | 'Date': date_str, 'Team': home_team, 'Where': 'Home', 72 | 'Offence': home_score, 'Defence': away_score} 73 | away = {'Season': season_str, 74 | 'Round': game_round, 75 | 'GameID': gameid, 76 | 'Date': date_str, 'Team': away_team, 'Where': 'Away', 77 | 'Offence': away_score, 'Defence': home_score} 78 | 79 | # follow the game-centre link 80 | link = (game.find_all('a', attrs={'class': 'game-link'})[0] 81 | .attrs['href']) 82 | fulllink = 'http://www.euroleague.net/' + link 83 | try: 84 | r = requests.get(fulllink) 85 | except requests.exceptions.ConnectionError: 86 | sys.exit('Connection Error. Check Game URL') 87 | gamedata = r.text 88 | gamesoup = BeautifulSoup(gamedata, 'html.parser') 89 | totals = gamesoup.find_all('tr', attrs={'class': 'TotalFooter'}) 90 | for i, t in enumerate(totals): 91 | if i == 0: 92 | # home team stats 93 | dics = home.copy() 94 | elif i == 1: 95 | # away team stats 96 | dics = away.copy() 97 | else: 98 | err_msg = 'Totals field returned invalid number of teams' 99 | raise ValueError(err_msg) 100 | stats = t.find_all('span') 101 | for stat in stats: 102 | # ignore total time played field 103 | fullfield = stat.attrs['id'] 104 | if 'TotalTimePlayed' not in fullfield: 105 | ii = fullfield.find('_lbl') 106 | field = fullfield[ii + 9:] 107 | string = stat.contents[0] 108 | if string.isnumeric(): 109 | f = int(string) 110 | dics[field] = f 111 | if field not in header: 112 | header.append(field) 113 | elif '/' in string: 114 | made, attmp = string.split('/') 115 | dics[field + '-Made'] = int(made) 116 | dics[field + '-Attempted'] = int(attmp) 117 | if field + '-Made' not in header: 118 | header.append(field + '-Made') 119 | if field + '-Attempted' not in header: 120 | header.append(field + '-Attempted') 121 | else: 122 | raise ValueError('Invalid field value') 123 | allteamstats.append(dics) 124 | 125 | logging.info('Convert to dataframe') 126 | df = pd.DataFrame(allteamstats, columns=header) 127 | 128 | logging.info('Save to file') 129 | df.to_csv(filepath, index=False) 130 | 131 | return df 132 | 133 | 134 | if __name__ == "__main__": 135 | parser = argparse.ArgumentParser() 136 | parser.add_argument('-s', '--season', required=True, type=int, 137 | help="the ending year of the season") 138 | parser.add_argument('-n', '--n-rounds', default=34, 139 | type=int, 140 | help="The number of regular season rounds " 141 | "in the season") 142 | args = parser.parse_args() 143 | 144 | main(args.season,
args.n_rounds) 145 | -------------------------------------------------------------------------------- /model-validation/benchmarks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Performs simple analysis and evaluates the scoring of simple benchmark models: 3 | 1) Home team always wins 4 | 2) F4 teams always win when playing against a non-F4 team, 5 | otherwise the home team always wins. 6 | 3) Persistence model: teams that won in the previous round win; 7 | if both teams have won, the home team wins. 8 | 4) Standings model: the team higher in the standings wins. 9 | 5) Panathinaikos always wins; otherwise the home team always wins. 10 | 6) Random model. 11 | """ 12 | import sys 13 | import os 14 | import argparse 15 | import numpy as np 16 | import pandas as pd 17 | from sklearn.metrics import accuracy_score, balanced_accuracy_score 18 | from sklearn.metrics import roc_auc_score 19 | sys.path.append('auxiliary') # noqa: E402 20 | from io_json import read_json 21 | 22 | 23 | def main(season): 24 | 25 | # get settings 26 | settings = read_json('settings/data_collection.json') 27 | out_dir = settings['output_dir'] 28 | rslts_file_prefix = settings['season_results']['output_file_prefix'] 29 | rslts_filename = '%s_%d_%d.csv' % (rslts_file_prefix, season - 1, season) 30 | stnds_file_prefix = settings['season_standings']['output_file_prefix'] 31 | stnds_filename = '%s_%d_%d.csv' % (stnds_file_prefix, season - 1, season) 32 | 33 | # read input data (results and standings) 34 | rslts_filepath = os.path.join(out_dir, rslts_filename) 35 | stnds_filepath = os.path.join(out_dir, stnds_filename) 36 | data = pd.read_csv(rslts_filepath) 37 | standings = pd.read_csv(stnds_filepath) 38 | f4teams = read_json(settings['f4teams_file']) 39 | 40 | # Specify the F4 teams of the previous year 41 | f4Teams = f4teams[str(season - 1)] 42 | 43 | # Checks 44 | flag = False 45 | stand_teams = np.unique(standings['Club Name']) 46 | resul_teams = np.unique(data['Home Team']) 47 | if not np.in1d(stand_teams, resul_teams).all(): 48 | ii = ~np.in1d(stand_teams, resul_teams) 49 | print(stand_teams[ii]) 50 | flag = True 51 | if not np.in1d(resul_teams, stand_teams).all(): 52 | ii = ~np.in1d(resul_teams, stand_teams) 53 | print(resul_teams[ii]) 54 | flag = True 55 | 56 | if flag: 57 | sys.exit('Fix inconsistencies in team names') 58 | 59 | nmatches = data.shape[0] 60 | 61 | data['Actual'] = np.where(data['Home Score'] > data['Away Score'], 1, 2) 62 | data['Home Wins'] = np.ones(nmatches, dtype=int) 63 | 64 | # f4 model: the F4 teams of the previous year always win. 65 | # If neither or both teams in a game are F4 teams, home always wins. 66 | f4wins = np.ones(nmatches, dtype=int) 67 | hmf4 = np.in1d(data['Home Team'], f4Teams) 68 | awf4 = np.in1d(data['Away Team'], f4Teams) 69 | f4wins[awf4 & (~hmf4)] = 2 70 | data['F4 Wins'] = f4wins 71 | 72 | # persistence model: a team that won its previous game wins. If neither or 73 | # both teams won their last game, home always wins. 74 | # standings model: the team that is higher in the standings wins.
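    # Note: for round r, the standings model reads the table after round
    # r - 1, while the persistence model infers each team's previous result
    # by comparing its 'Wins' count after round r - 1 with that after round
    # r - 2 (for r == 2, a team counts as having won if it has any win after
    # round 1).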
75 | persistence = np.ones(nmatches, dtype=int) 76 | stand = np.ones(nmatches, dtype=int) 77 | for r in np.unique(data['Round']): 78 | if r == 1: 79 | continue 80 | 81 | # standings model 82 | s = standings[standings['Round'] == r - 1] 83 | d = data[data['Round'] == r] 84 | 85 | home_stands = np.array([s[s['Club Name'] == u]['Position'].iloc[0] 86 | for u in d['Home Team']]) 87 | away_stands = np.array([s[s['Club Name'] == u]['Position'].iloc[0] 88 | for u in d['Away Team']]) 89 | stand[data['Round'] == r] = np.where(home_stands < away_stands, 1, 2) 90 | 91 | # persistence model 92 | if r == 2: 93 | home_won = np.array([1 if s[s['Club Name'] == u]['Wins'].iloc[0] > 0 94 | else 0 for u in d['Home Team']]) 95 | away_won = np.array([1 if s[s['Club Name'] == u]['Wins'].iloc[0] > 0 96 | else 0 for u in d['Away Team']]) 97 | else: 98 | s_prev = standings[standings['Round'] == r - 2] 99 | home_won = np.array([1 if s[s['Club Name'] == u]['Wins'].iloc[0] > 100 | s_prev[s_prev['Club Name'] == u]['Wins'] 101 | .iloc[0] 102 | else 0 for u in d['Home Team']]) 103 | away_won = np.array([1 if s[s['Club Name'] == u]['Wins'].iloc[0] > 104 | s_prev[s_prev['Club Name'] == u]['Wins'] 105 | .iloc[0] 106 | else 0 for u in d['Away Team']]) 107 | persistence[data['Round'] == r] = np.where(away_won > home_won, 2, 1) 108 | 109 | data['Standings'] = stand 110 | data['Persistence'] = persistence 111 | # Pana model: Pana always wins; in any other game, home always wins 112 | data['Pana'] = np.where(data['Away Team'] == 113 | 'Panathinaikos Superfoods Athens', 2, 1) 114 | 115 | # Random model, for 1000 iterations, randomly assign the results of 116 | # the games 117 | random = np.zeros(1000) 118 | for i in range(1000): 119 | rand = np.random.randint(1, 3, nmatches) 120 | random[i] = np.sum(data['Actual'].values == rand) 121 | 122 | rounds_excl = [1] 123 | print('Excluded rounds from evaluation:', rounds_excl) 124 | data = data[~np.in1d(data['Round'], rounds_excl)] 125 | 126 | print('Number of games:', data.shape[0]) 127 | print('Home wins :', np.sum(data['Actual'] == data['Home Wins'])) 128 | print('Top4 wins :', np.sum(data['Actual'] == data['F4 Wins'])) 129 | print('Persistence:', np.sum(data['Actual'] == data['Persistence'])) 130 | print('Standing :', np.sum(data['Actual'] == data['Standings'])) 131 | print('Pana :', np.sum(data['Actual'] == data['Pana'])) 132 | print('Random :', np.round(np.mean(random), 0)) 133 | print('Home wins : accuracy: %f, weighted accuracy: %f, auc: %f:' 134 | % (accuracy_score(data['Actual'].values, data['Home Wins'].values), 135 | balanced_accuracy_score(data['Actual'].values, 136 | data['Home Wins'].values), 137 | roc_auc_score(data['Actual'].values, data['Home Wins'].values))) 138 | print('Top4 wins : accuracy: %f, weighted accuracy: %f, auc: %f:' 139 | % (accuracy_score(data['Actual'].values, data['F4 Wins'].values), 140 | balanced_accuracy_score(data['Actual'].values, 141 | data['F4 Wins'].values), 142 | roc_auc_score(data['Actual'].values, data['F4 Wins'].values))) 143 | print('Standing : accuracy: %f, weighted accuracy: %f, auc: %f:' 144 | % (accuracy_score(data['Actual'].values, data['Standings'].values), 145 | balanced_accuracy_score(data['Actual'].values, 146 | data['Standings'].values), 147 | roc_auc_score(data['Actual'].values, data['Standings'].values))) 148 | return 149 | 150 | 151 | if __name__ == "__main__": 152 | parser = argparse.ArgumentParser() 153 | parser.add_argument('-s', '--season', type=int, 154 | help="the ending year of the season") 155 | args =
parser.parse_args() 156 | 157 | if args.season is None: 158 | parser.print_help() 159 | else: 160 | main(args.season) 161 | -------------------------------------------------------------------------------- /descriptive-analysis/descriptive_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Descriptive analysis of the Euroleague data. The analysis focuses on home/away scores and wins and estimates the probability of winning when scoring at least N points." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Change directory (`cd`) to the project root directory" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "cd .." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import glob\n", 33 | "import numpy as np\n", 34 | "import pandas as pd\n", 35 | "\n", 36 | "import plotly.express as px\n", 37 | "\n", 38 | "from utils import make_scatter_plot, make_bar_plot, make_scatter_plot_at_least_n_points" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "### Load Data" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "files_pattern = 'data/euroleague_results*csv'" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "data_list_files = glob.glob(files_pattern)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "df = pd.concat([pd.read_csv(f) for f in data_list_files], ignore_index=True)\n", 73 | "df.reset_index(drop=True, inplace=True)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "df['Game Result'] = np.where(df['Home Score'] > df['Away Score'], 1, 2)\n", 83 | "df['Score Difference'] = np.abs(df['Home Score'] - df['Away Score'])" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "### Reshape the data" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "df_flat = pd.melt(df, id_vars=['Season', 'Round', 'Game Result'],\n", 100 | " value_vars=['Home Score', 'Away Score'],\n", 101 | " var_name='Loc', value_name='Score')\n", 102 | "df_flat['Loc'] = df_flat['Loc'].apply(lambda x: x.split(' ')[0])\n", 103 | "df_flat['Team Result'] = np.where(((df_flat['Game Result'] == 1) &\n", 104 | " (df_flat['Loc'] == 'Home')) |\n", 105 | " ((df_flat['Game Result'] == 2) &\n", 106 | " (df_flat['Loc'] == 'Away')), 'W', 'L')\n", 107 | "\n", 108 | "df_flat['Season_int'] = df_flat['Season'].apply(lambda x: int(x[-4:]))" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Stat Table" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "dfgroup = df_flat.groupby(['Season', 'Loc'])['Score'].mean().unstack('Loc')\n", 125 | "dfgroup.columns = ['Away Mean Score', 'Home Mean Score']\n", 126 |
"dfgroup.reset_index(inplace=True)\n", 127 | "\n", 128 | "dff = (df.groupby(['Season', 'Game Result'])['Game Result'].\n", 129 | " count().unstack('Game Result'))\n", 130 | "dff.columns = ['Home Wins', 'Away Wins']\n", 131 | "dff.reset_index(inplace=True)\n", 132 | "\n", 133 | "dfgroup = dfgroup.merge(dff, on='Season')\n", 134 | "\n", 135 | "dff = (df_flat.groupby(['Season', 'Game Result'])['Score'].\n", 136 | " mean().unstack('Game Result'))\n", 137 | "dff.columns = ['Home Win Mean Score', 'Away Win Mean Score']\n", 138 | "dff.reset_index(inplace=True)\n", 139 | "\n", 140 | "dfgroup = dfgroup.merge(dff, on='Season')\n", 141 | "\n", 142 | "print(dfgroup)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### Plots: Home/Away Scores" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "fig = px.box(df_flat, x=\"Season\", y=\"Score\", color=\"Loc\", notched=True)\n", 159 | "fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'})\n", 160 | "fig.show()\n", 161 | "\n", 162 | "fig = px.box(df, x=\"Season\", y=\"Home Score\", color=\"Game Result\", notched=True)\n", 163 | "fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'})\n", 164 | "fig.show()\n", 165 | "\n", 166 | "fig = px.box(df, x=\"Season\", y=\"Away Score\", color=\"Game Result\", notched=True)\n", 167 | "fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'})\n", 168 | "fig.show()\n", 169 | "\n", 170 | "fig = px.box(df, x=\"Season\", y=\"Score Difference\", color=\"Game Result\",\n", 171 | " notched=True)\n", 172 | "fig.layout.yaxis.update({'showgrid': True, 'gridcolor': 'rgb(200, 200, 200)'})\n", 173 | "fig.show()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "### Scatter plots - probability of winning when scoring at least N points" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "make_scatter_plot_at_least_n_points([df_flat, df_flat[df_flat['Loc'] == 'Home'],\n", 190 | " df_flat[df_flat['Loc'] == 'Away']],\n", 191 | " ['All', 'Home', 'Away'])\n", 192 | "\n", 193 | "make_scatter_plot_at_least_n_points([df_flat[df_flat['Season_int'] == 2017],\n", 194 | " df_flat[df_flat['Season_int'] == 2018],\n", 195 | " df_flat[df_flat['Season_int'] == 2019]],\n", 196 | " ['2017', '2018', '2019'])" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "### Scatter plots - probability of winning when scoring points in a range." 
204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "#### Bar plots" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "make_bar_plot([df_flat, df_flat[df_flat['Loc'] == 'Home'],\n", 220 | " df_flat[df_flat['Loc'] == 'Away']],\n", 221 | " ['All', 'Home', 'Away'])\n", 222 | "\n", 223 | "make_bar_plot([df_flat[df_flat['Season_int'] == 2017],\n", 224 | " df_flat[df_flat['Season_int'] == 2018],\n", 225 | " df_flat[df_flat['Season_int'] == 2019]],\n", 226 | " ['2017', '2018', '2019'])" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "#### Scatter plots" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "make_scatter_plot([df_flat, df_flat[df_flat['Loc'] == 'Home'],\n", 243 | " df_flat[df_flat['Loc'] == 'Away']],\n", 244 | " ['All', 'Home', 'Away'])\n", 245 | "\n", 246 | "make_scatter_plot([df_flat[df_flat['Season_int'] == 2017],\n", 247 | " df_flat[df_flat['Season_int'] == 2018],\n", 248 | " df_flat[df_flat['Season_int'] == 2019]],\n", 249 | " ['2017', '2018', '2019'])" 250 | ] 251 | } 252 | ], 253 | "metadata": { 254 | "anaconda-cloud": {}, 255 | "kernelspec": { 256 | "display_name": "Python 3", 257 | "language": "python", 258 | "name": "python3" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 3 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython3", 270 | "version": "3.7.4" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 4 275 | } 276 | -------------------------------------------------------------------------------- /model-selection/gridsearch_cross_validation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Hyper-parameter tuning using k-fold cross-validation for any number of 3 | parameters using scikit-learn's grid search. This script covers both the 4 | `cross_validation_one_param_models.py` and 5 | `cross_validation_two_param_models.py` scripts.
6 | ''' 7 | import sys 8 | import numpy as np 9 | from matplotlib import pyplot as plt 10 | from sklearn.model_selection import StratifiedKFold, GroupKFold 11 | from sklearn.linear_model import LogisticRegression 12 | from sklearn.ensemble import RandomForestClassifier 13 | from sklearn.tree import DecisionTreeClassifier 14 | from sklearn.svm import SVC 15 | from sklearn.naive_bayes import GaussianNB 16 | from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier 17 | from sklearn.neighbors import KNeighborsClassifier 18 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 19 | from sklearn.model_selection import GridSearchCV 20 | sys.path.append('auxiliary/') # noqa: E402 21 | from data_processing import load_features, shape_data 22 | 23 | 24 | # %% Choose settings and classifier 25 | test_season = '2018-2019' # hold-out season for validation 26 | level = 'match' # match or team level features to use 27 | shuffle = True # whether to shuffle or not the data in k-fold cross validation 28 | norm = True # whether to normalise or not the features 29 | min_round = 5 # minimum number of first rounds to skip in every season 30 | nsplits = 5 # number of folds in k-fold cross validation 31 | method = 'ada2' # method for grid search hyper-parameter training, see list 32 | # methods: 'log-reg', 'svm-linear', 'svm-rbf', 'decision-tree', 'random-forest', 33 | # 'naive-bayes', 'gradient-boosting', 'ada', 'ada2', 'knn', 34 | # 'discriminant-analysis' 35 | random_state = 10 36 | 37 | print('level: %s - norm: %r - shuffle: %r - method: %s' % 38 | (level, norm, shuffle, method)) 39 | 40 | # %% load feature data 41 | df = load_features(level) 42 | 43 | # choose features 44 | if level == 'match': 45 | # feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y', 46 | # 'Defence_x', 'Defence_y', 'form_x', 'form_y', 'Diff_x', 'Diff_y', 47 | # 'Home F4', 'Away F4'] 48 | # feats = ['Position_x', 'Offence_x', 'Offence_y', 'Defence_y', 49 | # 'Diff_y', 'Home F4', 'Away F4'] 50 | feats = ['Position_x', 'Position_y', 'Offence_x', 'Offence_y', 51 | 'Defence_y', 'Diff_y', 'Away F4'] 52 | elif level == 'team': 53 | feats = ['Home', 'Away', 'Position', 'Offence', 'Defence', 54 | 'form', 'F4', 'Diff'] 55 | 56 | # seasons for calibration 57 | df = df[df['Season'] != test_season] 58 | 59 | # %% Re-shape data 60 | X_train, y_train, df, groups = shape_data(df, feats, norm=norm, 61 | min_round=min_round) 62 | print('Number of features:', X_train.shape[1], feats) 63 | print('Number of obs:', X_train.shape[0]) 64 | 65 | if level == 'team': 66 | kfold = GroupKFold(n_splits=nsplits) 67 | folditer = kfold.split(X_train, y_train, groups) 68 | else: 69 | kfold = StratifiedKFold(n_splits=nsplits, shuffle=shuffle, 70 | random_state=random_state) 71 | folditer = kfold.split(X_train, y_train) 72 | 73 | # %% Set parameters 74 | if method == 'log-reg': 75 | params = {'C': np.sort(np.concatenate((np.logspace(-5, 8, 14), 76 | 5 * np.logspace(-5, 8, 14)), axis=0))} 77 | model = LogisticRegression(solver='liblinear', class_weight='balanced') 78 | elif method == 'svm-linear': 79 | params = {'C': np.sort(np.concatenate( 80 | (np.logspace(-5, 8, 14), 5 * np.logspace(-5, 8, 14)), axis=0))} 81 | model = SVC(kernel='linear', class_weight='balanced', 82 | random_state=random_state, max_iter=1000) 83 | elif method == 'svm-rbf': 84 | params = {'C': np.sort(np.concatenate((np.logspace(-5, 6, 12), 85 | 5 * np.logspace(-5, 6, 12)), axis=0)), 86 | 'gamma': np.sort(np.concatenate((np.logspace(-5, 6, 12), 87 | 5 *
np.logspace(-5, 6, 12)), axis=0))} 88 | model = SVC(kernel='rbf', class_weight='balanced', 89 | random_state=random_state, max_iter=1000) 90 | elif method == 'decision-tree': 91 | params = {} 92 | model = DecisionTreeClassifier(class_weight='balanced', 93 | random_state=random_state) 94 | elif method == 'random-forest': 95 | params = {'n_estimators': np.arange(10, 100, 5)} 96 | model = RandomForestClassifier(class_weight='balanced', 97 | random_state=random_state) 98 | elif method == 'naive-bayes': 99 | params = {} 100 | model = GaussianNB() 101 | elif method == 'gradient-boosting': 102 | params = {'n_estimators': np.arange(10, 200, 10)} 103 | model = GradientBoostingClassifier(random_state=random_state) 104 | elif method == 'ada': 105 | params = {'n_estimators': np.arange(5, 200, 1)} 106 | model = AdaBoostClassifier(random_state=random_state, learning_rate=1.) 107 | elif method == 'ada2': 108 | params = {'n_estimators': np.arange(5, 200, 1), 109 | 'learning_rate': np.concatenate(([0.01, 0.05], 110 | np.arange(0.1, 2.1, 0.1)))} 111 | model = AdaBoostClassifier(random_state=random_state) 112 | elif method == 'ada3': 113 | params = {'n_estimators': np.arange(5, 200, 2), 114 | 'learning_rate': np.arange(0.2, 2.1, 0.2), 115 | 'base_estimator': [DecisionTreeClassifier(max_depth=1), 116 | DecisionTreeClassifier(max_depth=5), 117 | DecisionTreeClassifier(max_depth=10), 118 | DecisionTreeClassifier(max_depth=15), 119 | DecisionTreeClassifier(max_depth=20), 120 | DecisionTreeClassifier(max_depth=25), 121 | DecisionTreeClassifier(max_depth=30)]} 122 | model = AdaBoostClassifier(random_state=random_state) 123 | elif method == 'knn': 124 | params = {'n_neighbors': np.arange(3, 50, 2)} 125 | model = KNeighborsClassifier() 126 | elif method == 'discriminant-analysis': 127 | params = {} 128 | model = QuadraticDiscriminantAnalysis() 129 | else: 130 | sys.exit('Method not recognised') 131 | 132 | # %% Tune parameters 133 | 134 | clf = GridSearchCV(model, params, cv=folditer, verbose=1, iid=False, 135 | scoring=['accuracy', 'balanced_accuracy', 'roc_auc'], 136 | refit='accuracy', n_jobs=-1) 137 | clf.fit(X_train, y_train) 138 | 139 | if hasattr(clf.best_estimator_, 'feature_importances_'): 140 | imp = clf.best_estimator_.feature_importances_ 141 | ii = np.argsort(imp)[::-1] 142 | print('Feature Importance') 143 | print([(feats[u], imp[u]) for u in ii]) 144 | 145 | # %% Plots 146 | accuracy = clf.cv_results_['mean_test_accuracy'] 147 | w_accuracy = clf.cv_results_['mean_test_balanced_accuracy'] 148 | roc_auc = clf.cv_results_['mean_test_roc_auc'] 149 | if len(params.keys()) == 0: 150 | print('Accuracy: ', accuracy[0]) 151 | print('Weighted Accuracy: ', w_accuracy[0]) 152 | print('ROC-AUC: ', roc_auc[0]) 153 | elif len(params.keys()) == 1: 154 | tmp = list(clf.param_grid) 155 | params = clf.param_grid[tmp[0]] 156 | print('Accuracy: %.4f at %.4g' % 157 | (np.max(accuracy), params[np.argmax(accuracy)])) 158 | print('Weighted Accuracy: %.4f at %.4g' % 159 | (np.max(w_accuracy), params[np.argmax(w_accuracy)])) 160 | print('ROC-AUC: %.4f at %.4g' % 161 | (np.max(roc_auc), params[np.argmax(roc_auc)])) 162 | plt.figure() 163 | plt.plot(params, accuracy, label='accuracy') 164 | plt.plot(params, w_accuracy, label='w_accuracy') 165 | plt.plot(params, roc_auc, label='ROC-AUC') 166 | if method in ['log-reg', 'svm-linear']: 167 | plt.xscale('log') 168 | plt.xlabel('parameter') 169 | plt.ylabel('Score') 170 | plt.legend() 171 | plt.title(method) 172 | plt.show() 173 | elif len(params.keys()) == 2: 174 | 175 | # according 
to scikit-learn's ParameterGrid, which sorts the parameter names, 176 | # GridSearchCV() evaluates combinations in alphabetical order of the parameters. 177 | tmp = sorted(list(clf.param_grid.keys())) 178 | shape = (clf.param_grid[tmp[0]].shape[0], 179 | clf.param_grid[tmp[1]].shape[0]) 180 | accuracy = accuracy.reshape(shape) 181 | w_accuracy = w_accuracy.reshape(shape) 182 | np.savez('output/%s_feat_comb_index_2543' % method, accuracy=accuracy, 183 | w_accuracy=w_accuracy, 184 | params1=clf.param_grid[tmp[0]], params2=clf.param_grid[tmp[1]]) 185 | 186 | print('Accuracy: %.4f at %s=%.4g and %s=%.4g' % 187 | (clf.best_score_, tmp[0], clf.best_params_[tmp[0]], 188 | tmp[1], clf.best_params_[tmp[1]])) 189 | 190 | inds = np.unravel_index(np.argmax(accuracy), shape) 191 | print('Accuracy: %.4f at %s=%.4g and %s=%.4g' % 192 | (np.max(accuracy), 193 | tmp[0], clf.param_grid[tmp[0]][inds[0]], 194 | tmp[1], clf.param_grid[tmp[1]][inds[1]])) 195 | 196 | inds = np.unravel_index(np.argmax(w_accuracy), shape) 197 | print('Weighted Accuracy: %.4f at %s=%.4g and %s=%.4g' % 198 | (np.max(w_accuracy), 199 | tmp[0], clf.param_grid[tmp[0]][inds[0]], 200 | tmp[1], clf.param_grid[tmp[1]][inds[1]])) 201 | 202 | print('ROC-AUC: %.4f' % np.max(roc_auc)) 203 | 204 | plt.figure() 205 | plt.imshow(accuracy) 206 | plt.colorbar() 207 | plt.show() 208 | 209 | plt.figure() 210 | plt.imshow(w_accuracy) 211 | plt.colorbar() 212 | plt.show() 213 | elif len(params.keys()) == 3: 214 | print('Accuracy: %.4f at %s' % (clf.best_score_, clf.best_estimator_)) 215 | np.savez('output/%s' % method, 216 | accuracy=clf.cv_results_['mean_test_accuracy'], 217 | w_accuracy=clf.cv_results_['mean_test_balanced_accuracy'], 218 | params=clf.cv_results_['params']) 219 | -------------------------------------------------------------------------------- /feature-extraction/make_features.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | import pandas as pd 4 | from make_standings import make_standings 5 | 6 | 7 | def find_form(df, game_round, team_id): 8 | ''' 9 | Finds the form of a team, i.e. the fraction of games won over the last 5 10 | games.
11 | ''' 12 | form = np.nan 13 | team_df = df[((df['Home Team ID'] == team_id) | 14 | (df['Away Team ID'] == team_id)) & 15 | ((df['Game Round'] < game_round) & 16 | (df['Game Round'] >= game_round - 5))] 17 | n_games = team_df.shape[0] 18 | 19 | if n_games == 0: 20 | return np.nan 21 | 22 | home_games = team_df['Home Team ID'] == team_id 23 | away_games = team_df['Away Team ID'] == team_id 24 | wins = np.sum(team_df['Home Score'][home_games] > 25 | team_df['Away Score'][home_games]) 26 | wins += np.sum(team_df['Home Score'][away_games] < 27 | team_df['Away Score'][away_games]) 28 | 29 | form = wins / n_games 30 | return form 31 | 32 | 33 | def make_game_features(data, standings, f4teams=[]): 34 | '''game-level features: 35 | standing of home team 36 | standing of away team 37 | avg scoring points of home team 38 | avg scoring points of away team 39 | avg against points of home team 40 | avg against points of away team 41 | wins to losses of home team 42 | wins to losses of away team 43 | form of home team (wins over the last 5 games) 44 | form of away team (wins over the last 5 games) 45 | ''' 46 | logger = logging.getLogger(__name__) 47 | logger.info('make match-level features') 48 | 49 | stands = standings.copy() 50 | stands['Round'] += 1 51 | data['Home F4'] = np.where(data['Home Team'].isin(f4teams), 1, 0) 52 | data['Away F4'] = np.where(data['Away Team'].isin(f4teams), 1, 0) 53 | data['Label'] = np.where(data['Home Score'] > data['Away Score'], 1, 2) 54 | new_df = data.merge(stands, how='left', 55 | left_on=['Round', 'Home Team'], 56 | right_on=['Round', 'Club Name']) 57 | new_df = new_df.merge(stands, how='left', 58 | left_on=['Round', 'Away Team'], 59 | right_on=['Round', 'Club Name']) 60 | 61 | tmp = new_df[['Offence_x', 'Offence_y', 'Defence_x', 'Defence_y']].values 62 | tmp /= np.repeat((new_df['Round'].values - 1)[:, np.newaxis], tmp.shape[1], 63 | axis=1) 64 | 65 | new_df[['Offence_x', 'Offence_y', 'Defence_x', 'Defence_y']] = tmp 66 | new_df['Diff_x'] = new_df['Offence_x'] - new_df['Defence_x'] 67 | new_df['Diff_y'] = new_df['Offence_y'] - new_df['Defence_y'] 68 | 69 | # tmp = new_df[['Wins_x', 'Losses_x']].values 70 | # new_df['Wins_to_Losses_x'] = tmp[:, 0] / tmp[:, 1] 71 | # tmp = new_df[['Wins_y', 'Losses_y']].values 72 | # new_df['Wins_to_Losses_y'] = tmp[:, 0] / tmp[:, 1] 73 | 74 | forms_home = np.zeros(new_df.shape[0]) 75 | forms_away = np.zeros(new_df.shape[0]) 76 | n_form_games = 5 77 | for index, row in new_df.iterrows(): 78 | g_round = row['Round'] 79 | home_team = row['Home Team'] 80 | away_team = row['Away Team'] 81 | form_home = 0. 82 | form_away = 0. 83 | den = 1 84 | if g_round > n_form_games + 1: 85 | # index of home team at previous round 86 | ii1 = ((standings['Club Name'] == home_team) & 87 | (standings['Round'] == g_round - 1)) 88 | # index of home team at `n_form_games` rounds ago. 89 | ii2 = ((standings['Club Name'] == home_team) & 90 | (standings['Round'] == g_round - n_form_games - 1)) 91 | form_home = (standings[ii1]['Wins'].values[0] - 92 | standings[ii2]['Wins'].values[0]) 93 | 94 | # index of away team at previous round 95 | ii1 = ((standings['Club Name'] == away_team) & 96 | (standings['Round'] == g_round - 1)) 97 | # index of away team at `n_form_games` rounds ago. 
98 | ii2 = ((standings['Club Name'] == away_team) & 99 | (standings['Round'] == g_round - n_form_games - 1)) 100 | form_away = (standings[ii1]['Wins'].values[0] - 101 | standings[ii2]['Wins'].values[0]) 102 | den = n_form_games 103 | elif g_round > 1: 104 | # index of home team at previous round 105 | ii1 = ((standings['Club Name'] == home_team) & 106 | (standings['Round'] == g_round - 1)) 107 | form_home = standings[ii1]['Wins'].values[0] 108 | # index of away team at previous round 109 | ii1 = ((standings['Club Name'] == away_team) & 110 | (standings['Round'] == g_round - 1)) 111 | form_away = standings[ii1]['Wins'].values[0] 112 | den = g_round - 1 113 | # print(g_round, form_home, form_away) 114 | forms_home[index] = form_home / den 115 | forms_away[index] = form_away / den 116 | 117 | new_df['form_x'] = forms_home 118 | new_df['form_y'] = forms_away 119 | 120 | new_df = new_df[['Season', 'Round', 'Home Team', 'Away Team', 'Label', 121 | 'Position_x', 'Position_y', 122 | 'Offence_x', 'Offence_y', 123 | 'Defence_x', 'Defence_y', 124 | # 'Wins_to_Losses_x', 'Wins_to_Losses_y', 125 | 'form_x', 'form_y', 126 | 'Diff_x', 'Diff_y', 127 | 'Home F4', 'Away F4']] 128 | 129 | return new_df 130 | 131 | 132 | def make_game_features_v0(df, standings=None): 133 | '''game-level features: 134 | standing of home team 135 | standing of away team 136 | form of home team (wins over the last 5 games) 137 | form of away team (wins over the last 5 games) 138 | avg scoring points of home team 139 | avg scoring points of away team 140 | avg against points of home team 141 | avg against points of away team 142 | ''' 143 | teams = np.unique(df['Home Team ID'].values) 144 | 145 | if standings is None: 146 | standings = {} 147 | for i in range(1, 31): 148 | standings[i] = make_standings(df, i) 149 | 150 | f4 = [3514, 3501, 3540, 6663] 151 | top8 = [3508, 3515, 3553] 152 | 153 | n_features = 14 + 32 154 | features = np.zeros((df.shape[0], n_features)) 155 | for row in range(df.shape[0]): 156 | 157 | game_round = df['Game Round'].iloc[row] 158 | home_team = df['Home Team ID'].iloc[row] 159 | away_team = df['Away Team ID'].iloc[row] 160 | 161 | if game_round == 1: 162 | features[row, :] = -1 * np.ones(n_features) 163 | continue 164 | 165 | standing = standings[game_round - 1] 166 | 167 | standing_home_team = standing[standing['Team ID'] == 168 | home_team].index[0] + 1 169 | standing_away_team = standing[standing['Team ID'] == 170 | away_team].index[0] + 1 171 | 172 | form_home_team = find_form(df, game_round, home_team) 173 | form_away_team = find_form(df, game_round, away_team) 174 | 175 | avg_attack_home_team = standing[standing['Team ID'] == 176 | home_team]['Score+'] / game_round 177 | avg_attack_away_team = standing[standing['Team ID'] == 178 | away_team]['Score+'] / game_round 179 | 180 | avg_defence_home_team = standing[standing['Team ID'] == 181 | home_team]['Score-'] / game_round 182 | avg_defence_away_team = standing[standing['Team ID'] == 183 | away_team]['Score-'] / game_round 184 | home_team_inf4 = 1 if home_team in f4 else 0 185 | home_team_intop8 = 1 if home_team in top8 else 0 186 | home_team_inrest = 0 if (home_team_inf4 or home_team_intop8) else 1 187 | away_team_inf4 = 1 if away_team in f4 else 0 188 | away_team_intop8 = 1 if away_team in top8 else 0 189 | away_team_inrest = 0 if (away_team_inf4 or away_team_intop8) else 1 190 | 191 | features[row, 0] = standing_home_team 192 | features[row, 1] = standing_away_team 193 | features[row, 2] = form_home_team 194 | features[row, 3] = form_away_team 195 
| features[row, 4] = avg_attack_home_team 196 | features[row, 5] = avg_attack_away_team 197 | features[row, 6] = avg_defence_home_team 198 | features[row, 7] = avg_defence_away_team 199 | features[row, 8] = home_team_inf4 200 | features[row, 9] = home_team_intop8 201 | features[row, 10] = home_team_inrest 202 | features[row, 11] = away_team_inf4 203 | features[row, 12] = away_team_intop8 204 | features[row, 13] = away_team_inrest 205 | features[row, 14:30] = (teams == home_team).astype(int) 206 | features[row, 30:] = (teams == away_team).astype(int) 207 | 208 | headers = ['standing-home-team', 'standing-away-team', 209 | 'form-home-team', 'form-away-team', 210 | 'avg-attack-home-team', 'avg-attack-away-team', 211 | 'avg-defence-home-team', 'avg-defence-away-team', 212 | 'home-team-f4', 'home-team-top8', 'home-team-rest', 213 | 'away-team-f4', 'away-team-top8', 'away-team-rest'] 214 | headers.extend([str(t) + '-home' for t in teams]) 215 | headers.extend([str(t) + '-away' for t in teams]) 216 | 217 | df = pd.DataFrame(data=features, columns=headers) 218 | # df = df.astype(dtype={'standing-home-team': int, 219 | # 'standing-away-team': int, 220 | # 'form-home-team': float, 'form-away-team': float, 221 | # 'avg-attack-home-team': float, 222 | # 'avg-attack-away-team': float, 223 | # 'avg-defence-home-team': float, 224 | # 'avg-defence-away-team': float, 225 | # 'home-team-f4': int, 'home-team-top8': int, 226 | # 'home-team-rest': int, 227 | # 'away-team-f4': int, 'away-team-top8': int, 228 | # 'away-team-rest': int}) 229 | headers_dict = dict(zip(headers, [int] * features.shape[1])) 230 | headers_dict['form-home-team'] = float 231 | headers_dict['form-away-team'] = float 232 | headers_dict['avg-attack-home-team'] = float 233 | headers_dict['avg-attack-away-team'] = float 234 | headers_dict['avg-defence-home-team'] = float 235 | headers_dict['avg-defence-away-team'] = float 236 | df = df.astype(dtype=headers_dict) 237 | return df 238 | 239 | 240 | def make_team_features(data, standings, f4Teams=[]): 241 | logger = logging.getLogger(__name__) 242 | logger.info('make team-level features') 243 | game_feats = make_game_features(data, standings, f4Teams) 244 | 245 | cols = ['Season', 'Round', 'Game ID', 'Team', 'Label', 'Home', 'Away', 246 | 'Position', 'Offence', 'Defence', 'form', 'F4', 'Diff'] 247 | 248 | game_feats['Game ID'] = data['GameID'] 249 | 250 | home = game_feats[['Season', 'Round', 'Game ID', 'Home Team', 251 | 'Position_x', 'Offence_x', 'Defence_x', 'form_x', 252 | 'Home F4']] 253 | home = home.rename(index=str, columns={'Home Team': 'Team', 254 | 'Position_x': 'Position', 255 | 'Offence_x': 'Offence', 256 | 'Defence_x': 'Defence', 257 | 'form_x': 'form', 258 | 'Home F4': 'F4'}) 259 | home['Diff'] = home['Offence'] - home['Defence'] 260 | home['Label'] = np.where(game_feats['Label'].values == 1, 1, 0) 261 | home['Home'] = 1 262 | home['Away'] = 0 263 | # rearrange feature columns 264 | home = home[cols] 265 | 266 | away = game_feats[['Season', 'Round', 'Game ID', 'Away Team', 267 | 'Position_y', 'Offence_y', 'Defence_y', 'form_y', 268 | 'Away F4']] 269 | away = away.rename(index=str, columns={'Away Team': 'Team', 270 | 'Position_y': 'Position', 271 | 'Offence_y': 'Offence', 272 | 'Defence_y': 'Defence', 273 | 'form_y': 'form', 274 | 'Away F4': 'F4'}) 275 | away['Diff'] = away['Offence'] - away['Defence'] 276 | away['Label'] = np.where(game_feats['Label'].values == 2, 1, 0) 277 | away['Home'] = 0 278 | away['Away'] = 1 279 | # rearrange feature columns 280 | away = away[cols] 281 | 
282 | team_feats = pd.concat([home, away]) 283 | team_feats.sort_values(by=['Round', 'Team'], inplace=True) 284 | 285 | return team_feats 286 | -------------------------------------------------------------------------------- /model-validation/validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Training and validation of the final model(s) per round and comparison to the wisdom of the crowd." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "cd .." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import sys\n", 26 | "import os\n", 27 | "import numpy as np\n", 28 | "import pandas as pd\n", 29 | "from tqdm import tqdm\n", 30 | "from scipy.stats import linregress\n", 31 | "\n", 32 | "from matplotlib import pyplot as plt\n", 33 | "import plotly.graph_objs as go\n", 34 | "\n", 35 | "from sklearn.ensemble import AdaBoostClassifier\n", 36 | "from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score\n", 37 | "\n", 38 | "sys.path.append('auxiliary/')\n", 39 | "from data_processing import load_features, shape_data, shape_data_scaler" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "### Choose settings for the final model validation" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "test_season = '2018-2019' # hold-out season for validation\n", 56 | "level = 'match' # match or team level features to use\n", 57 | "min_round_train = 5 # minimum number of first rounds to skip in every season (train set)\n", 58 | "min_round_test = 5 # minimum number of first rounds to skip in every season (test set)\n", 59 | "norm = True # whether to normalise or not the features\n", 60 | "random_state = 10 # random state for the classifier" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### Choose model hyper-parameters and feature sets for the models to validate\n", 68 | "Adjust hyper-parameters and feature sets to reflect the optimal options from analysis in previous steps." 
69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "params = [\n", 78 | " {'features': ['Position_x', 'Offence_x', 'Offence_y', 'Defence_y',\n", 79 | " 'Diff_y', 'Home F4', 'Away F4'],\n", 80 | " 'n_estimators': 115, \n", 81 | " 'learning_rate': 0.7},\n", 82 | " {'features': ['Position_x', 'Position_y', 'Offence_x', 'Offence_y',\n", 83 | " 'Defence_y', 'Diff_y', 'Away F4'],\n", 84 | " 'n_estimators': 141, \n", 85 | " 'learning_rate': 0.7},\n", 86 | " {'features': ['Position_x', 'Position_y', 'Offence_x', 'Offence_y',\n", 87 | " 'Defence_x', 'Defence_y', 'form_x', 'form_y',\n", 88 | " 'Diff_x', 'Diff_y', 'Home F4', 'Away F4'],\n", 89 | " 'n_estimators': 121, \n", 90 | " 'learning_rate': 1.0}\n", 91 | "]" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "### Load Features" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "df = load_features(level)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "### Train and Predict progressively" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# Every round has its own model\n", 124 | "rounds = np.arange(2, 31, dtype=int)\n", 125 | "print('Rounds for validation:', rounds)\n", 126 | "accuracy = np.zeros((rounds.shape[0], len(params)))\n", 127 | "waccuracy = np.zeros((rounds.shape[0], len(params)))\n", 128 | "models_results = pd.DataFrame({'game_round': rounds.repeat(8)})\n", 129 | "for j, param in enumerate(tqdm(params)):\n", 130 | " features = param['features']\n", 131 | " n_estimators = param['n_estimators']\n", 132 | " learning_rate = param['learning_rate']\n", 133 | " model = AdaBoostClassifier(n_estimators=n_estimators, random_state=random_state,\n", 134 | " learning_rate=learning_rate)\n", 135 | "\n", 136 | " y_pred_all = np.array([])\n", 137 | " y_test_all = np.array([])\n", 138 | " for i, game_round in enumerate(rounds):\n", 139 | " train_inds = (df['Season'] != test_season) | ((df['Season'] == test_season) & (df['Round'] < game_round))\n", 140 | " test_inds = ~ train_inds\n", 141 | " X_train, y_train, df_train, _, scaler = shape_data_scaler(df[train_inds], features,\n", 142 | " norm=norm, min_round=1)\n", 143 | " model.fit(X_train, y_train)\n", 144 | "\n", 145 | " X_test, y_test, df_test, _, _ = shape_data_scaler(df[test_inds], features,\n", 146 | " norm=scaler, min_round=1)\n", 147 | "\n", 148 | " y_pred = model.predict(X_test)\n", 149 | " \n", 150 | " accur = accuracy_score(y_test, y_pred)\n", 151 | " w_accur = balanced_accuracy_score(y_test, y_pred)\n", 152 | " \n", 153 | " # store the predictions, actuals of the current round\n", 154 | " y_pred_all = np.concatenate((y_pred_all, y_pred[:8]))\n", 155 | " y_test_all = np.concatenate((y_test_all, y_test[:8]))\n", 156 | "\n", 157 | " accuracy[i, j] = accur\n", 158 | " waccuracy[i, j] = w_accur\n", 159 | " \n", 160 | " if 'Actual' not in models_results.columns:\n", 161 | " models_results['Actual'] = y_test_all.astype(int)\n", 162 | " models_results['Pred_%d' % j] = y_pred_all.astype(int)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "models_results['Pred_comb'] = np.where(models_results[['Pred_0', 'Pred_1',
174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "models_results['Pred_Majority'] = np.zeros(models_results.shape[0], dtype=int)  # majority-class baseline: always predict label 0" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "### Print Scores" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "model_list = [u for u in models_results.columns if u.startswith('Pred')]" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "print('Accuracy scores')\n", 206 | "for col in model_list:\n", 207 | " print('%s:' % col, \n", 208 | " accuracy_score(models_results['Actual'], \n", 209 | " models_results[col]))" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "print('Weighted accuracy scores')\n", 219 | "for col in model_list:\n", 220 | " print('%s:' % col, \n", 221 | " balanced_accuracy_score(models_results['Actual'], \n", 222 | " models_results[col]))" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "print('ROC-AUC scores')\n", 232 | "for col in model_list:\n", 233 | " print('%s:' % col, roc_auc_score(models_results['Actual'], models_results[col]))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "### Plot Accuracy per round" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "uniq_rounds = np.unique(models_results['game_round'].values)\n", 250 | "n_rounds = uniq_rounds.shape[0]\n", 251 | "round_accuracy = np.zeros(n_rounds)\n", 252 | "n_correct = np.zeros(n_rounds)\n", 253 | "for i, u in enumerate(uniq_rounds):\n", 254 | " ii = models_results['game_round'] == u\n", 255 | " n_correct[i] = (models_results.loc[ii, 'Actual'].values == models_results.loc[ii, 'Pred_1'].values).sum()\n", 256 | " round_accuracy[i] = accuracy_score(models_results.loc[ii, 'Actual'].values, models_results.loc[ii, 'Pred_1'].values)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "data = go.Bar(x=uniq_rounds, y=n_correct)\n", 266 | "layout = go.Layout(yaxis={'title': 'Number of Correctly Predicted Games'},\n", 267 | " xaxis={'title': 'Game Round'})\n", 268 | "fig = go.Figure(data, layout)\n", 269 | "fig.show()" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "slope, interc, _, _, _ = linregress(uniq_rounds, round_accuracy)\n", 279 | "y = slope * uniq_rounds + interc\n", 280 | "data = [\n", 281 | " go.Scatter(x=uniq_rounds, y=round_accuracy, mode='markers'),\n", 282 | " go.Scatter(x=uniq_rounds, y=y)\n", 283 | "]\n", 284 | "layout = go.Layout(yaxis={'title': 'Accuracy'}, xaxis={'title': 'Game Round'}, showlegend=False)\n", 285 | "fig = go.Figure(data, layout)\n", 286 | "fig.show()" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "# The Wisdom of the Crowd\n", 294 | "The data for this task is available upon request." 295 | ] 296 | },
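{ "cell_type": "markdown", "metadata": {}, "source": [ "Each `predictions_day_<round>.csv` file is assumed to hold one row per participant, with columns `game_1` to `game_8` carrying that participant's pick for each game of the round; the column-wise mode taken below is therefore the crowd's majority pick per game." ] },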
297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "predict_files_pattern = os.path.expanduser('~/Documents/mia_syn_mia_app/output/2018-2019/predictions_day_%d.csv')" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "woc_results = np.array([])\n", 313 | "for i in rounds:\n", 314 | " try:\n", 315 | " woc_df = pd.read_csv(predict_files_pattern % i)\n", 316 | " xx = woc_df[['game_%d' % u for u in range(1, 9)]].mode().values[0, :].flatten()\n", 317 | " except FileNotFoundError:\n", 318 | " print('File not found: round', i)\n", 319 | " xx = np.full(8, np.nan)\n", 320 | " woc_results = np.concatenate((woc_results, xx))" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# WoC predictions: shift the crowd labels by one to match the models' 0/1 encoding\n", 330 | "models_results['Pred_WoC'] = woc_results - 1" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "if 'Pred_WoC' not in model_list:\n", 340 | " model_list.append('Pred_WoC')" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "### Comparison of results without the missing round(s)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "# exclude the missing round(s) (if any)\n", 357 | "ii = pd.notna(models_results['Pred_WoC'])" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "print('Accuracy Scores')\n", 367 | "for col in model_list:\n", 368 | " print('%s: \\t' % col, \n", 369 | " accuracy_score(models_results.loc[ii, 'Actual'].values,\n", 370 | " models_results.loc[ii, col].values)\n", 371 | " )" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "print('Weighted-Accuracy Scores')\n", 381 | "for col in model_list:\n", 382 | " print('%s: \\t' % col, \n", 383 | " balanced_accuracy_score(models_results.loc[ii, 'Actual'].values, \n", 384 | " models_results.loc[ii, col].values)\n", 385 | " )" 386 | ] 387 | } 388 | ], 389 | "metadata": { 390 | "anaconda-cloud": {}, 391 | "kernelspec": { 392 | "display_name": "Python 3", 393 | "language": "python", 394 | "name": "python3" 395 | }, 396 | "language_info": { 397 | "codemirror_mode": { 398 | "name": "ipython", 399 | "version": 3 400 | }, 401 | "file_extension": ".py", 402 | "mimetype": "text/x-python", 403 | "name": "python", 404 | "nbconvert_exporter": "python", 405 | "pygments_lexer": "ipython3", 406 | "version": "3.7.4" 407 | } 408 | }, 409 | "nbformat": 4, 410 | "nbformat_minor": 4 411 | } 412 | --------------------------------------------------------------------------------