├── .gitignore ├── robot.png ├── tennis_robot.png ├── tennis_robot_2.png ├── examples ├── data │ ├── nb_matches.png │ ├── Best_player_win_percentage.png │ ├── stan_the_man_win_percentage.png │ ├── stanimal_aces_percentage_difference.png │ ├── data_row_example.csv │ ├── data_loading.py │ └── single_row_example.csv ├── results_reading │ ├── win_per_surface.png │ ├── models_performances.png │ ├── precision_percentage_players_ranks.png │ ├── models_comparison.py │ └── best_model.py ├── history_modeling │ ├── 2d_pca_match_representation.png │ ├── 2d_pca_match_representation_test.png │ ├── first_example.py │ ├── history_encoding.py │ ├── pca_representation.py │ ├── pca_match_encoder_train.py │ └── train_test.py └── models │ ├── dl_train_test.py │ ├── prediction.py │ ├── train_test.py │ ├── train_test_eval.py │ ├── deep_history.py │ └── grid_search.py ├── envs ├── minimal_env.yml └── requirements.txt ├── .gitmodules ├── .github ├── dependabot.yml └── workflows │ └── black_action.yml ├── python ├── model │ ├── base_model.py │ ├── lgbm.py │ ├── dumb_models.py │ ├── sk_model.py │ ├── xgboost.py │ └── deep_model.py ├── data │ ├── data_utils.py │ ├── data_encoding.py │ ├── match.py │ ├── data_loader.py │ └── player.py ├── history_modeling │ ├── encoding_model.py │ └── match_representation.py └── evaluation │ └── train_test.py ├── LICENSE.md ├── notes.txt └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | DataBase/ 3 | 4 | \.idea/ 5 | cache/ 6 | results/ 7 | *.pyc -------------------------------------------------------------------------------- /robot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/robot.png -------------------------------------------------------------------------------- /tennis_robot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/tennis_robot.png -------------------------------------------------------------------------------- /tennis_robot_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/tennis_robot_2.png -------------------------------------------------------------------------------- /examples/data/nb_matches.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/data/nb_matches.png -------------------------------------------------------------------------------- /envs/minimal_env.yml: -------------------------------------------------------------------------------- 1 | name: min_tennis 2 | channels: 3 | - defaults 4 | dependencies: 5 | - pandas 6 | - matplotlib 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "submodules/tennis_atp"] 2 | path = submodules/tennis_atp 3 | url = https://github.com/JeffSackmann/tennis_atp.git 4 | -------------------------------------------------------------------------------- /examples/data/Best_player_win_percentage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/data/Best_player_win_percentage.png -------------------------------------------------------------------------------- /examples/data/stan_the_man_win_percentage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/data/stan_the_man_win_percentage.png -------------------------------------------------------------------------------- 
/examples/results_reading/win_per_surface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/results_reading/win_per_surface.png -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "gitsubmodule" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | -------------------------------------------------------------------------------- /examples/results_reading/models_performances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/results_reading/models_performances.png -------------------------------------------------------------------------------- /examples/data/stanimal_aces_percentage_difference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/data/stanimal_aces_percentage_difference.png -------------------------------------------------------------------------------- /envs/requirements.txt: -------------------------------------------------------------------------------- 1 | lightgbm==3.3.5 2 | matplotlib==3.5.1 3 | numpy==1.22.3 4 | pandas==1.5.2 5 | scikit_learn==1.2.2 6 | tensorflow==2.8.0 7 | tqdm==4.65.0 8 | xgboost==1.7.4 -------------------------------------------------------------------------------- /examples/history_modeling/2d_pca_match_representation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/history_modeling/2d_pca_match_representation.png 
class BaseModel:
    """Minimal interface shared by the project's predictors.

    Subclasses override ``fit`` and ``predict``. The ``abstractmethod``
    markers are documentation only here: this class does not use
    ``ABCMeta``, so Python does not prevent instantiation or enforce the
    overrides.
    """

    def __init__(self):
        pass

    @abstractmethod
    def fit(self, X, y=None):
        """Train the model.

        Args:
            X: training features.
            y: training targets. Optional so that the signature agrees with
                the subclasses (which all take ``X, y``) while callers that
                pass only ``X`` keep working.
        """
        pass

    @abstractmethod
    def predict(self, X):
        """Return predictions for ``X``; the base implementation returns None."""
        pass


class DeepBaseModel(BaseModel):
    """Base class whose constructor builds the underlying model immediately."""

    def __init__(self):
        super().__init__()
        # Called without arguments: the hook must therefore be callable
        # with no positional parameter (see the default on ``X`` below).
        self.instantiate_model()

    @abstractmethod
    def instantiate_model(self, X=None):
        """Build the underlying model.

        Args:
            X: optional data the builder may inspect. It defaults to None
                because ``__init__`` invokes this hook with no argument;
                without the default the base hook raised ``TypeError``.
        """
        pass
def get_days_difference(prev_date, curr_date):
    """Approximate the number of days separating two ``YYYYMMDD`` dates.

    Both arguments are converted with ``str`` and sliced positionally
    (characters 0-3 year, 4-5 month, 6-7 day). The result is a rough count:
    every year is weighted 365 days, every month 30 days, and a constant 2
    is added.
    NOTE(review): the reason for the ``+ 2`` offset is not visible in this
    file; it is kept unchanged.

    Args:
        prev_date: earlier date, int or str in ``YYYYMMDD`` form.
        curr_date: later date, int or str in ``YYYYMMDD`` form.

    Returns:
        int: approximate day difference (``curr_date - prev_date``) plus 2.
    """

    def _year_month_day(stamp):
        text = str(stamp)
        return int(text[:4]), int(text[4:6]), int(text[6:8])

    prev_year, prev_month, prev_day = _year_month_day(prev_date)
    curr_year, curr_month, curr_day = _year_month_day(curr_date)

    year_part = (curr_year - prev_year) * 365
    month_part = (curr_month - prev_month) * 30
    day_part = curr_day - prev_day
    return year_part + month_part + day_part + 2


def reverse_score(score):
    """Swap the two sides of every set in a space-separated score string.

    Each space-separated token is split on ``-`` and its pieces are joined
    back in reverse order, e.g. ``"6-4 3-6"`` becomes ``"4-6 6-3"``. Tokens
    without a dash are returned untouched. Any suffix attached to a number
    (such as ``"7-6(5)"``) travels with that number, giving ``"6(5)-7"``.

    Args:
        score: score to reverse; converted with ``str`` first.

    Returns:
        str: the score with both sides of each token swapped.
    """
    flipped_sets = (
        "-".join(reversed(one_set.split("-"))) for one_set in str(score).split(" ")
    )
    return " ".join(flipped_sets)
class XGBoost(BaseModel):
    """Thin wrapper around ``xgb.train`` exposing fit / predict / save."""

    def __init__(self, params, num_rounds=10):
        """Store the training configuration.

        Args:
            params: parameter dict forwarded as-is to ``xgb.train``.
            num_rounds: number of boosting rounds passed to ``xgb.train``.
        """
        self.num_rounds = num_rounds
        self.params = params

    def fit(self, X, y, validation_data=None):
        """Train a booster on ``(X, y)`` and keep it in ``self.model``.

        Args:
            X: training features, wrapped in an ``xgb.DMatrix``.
            y: training labels.
            validation_data: optional indexable whose item 0 holds the
                features and item 1 the labels; when given, both the
                training set ("train") and this set ("eval") are passed as
                ``evals``.

        Returns:
            The trained booster returned by ``xgb.train``.
        """
        dtrain = xgb.DMatrix(X, label=y)

        watchlist = []
        if validation_data is not None:
            deval = xgb.DMatrix(validation_data[0], label=validation_data[1])
            watchlist = [(dtrain, "train"), (deval, "eval")]

        booster = xgb.train(self.params, dtrain, self.num_rounds, evals=watchlist)
        self.model = booster
        return booster

    def predict(self, X):
        """Predict on ``X`` and round the raw booster outputs to 0 decimals."""
        raw_scores = self.model.predict(xgb.DMatrix(X))
        return np.round(raw_scores, 0)

    def save(self, path):
        """Write the trained booster to ``path`` via ``save_model``."""
        self.model.save_model(path)
11 | id: action_black 12 | with: 13 | black_args: "." 14 | - name: Create Pull Request 15 | if: steps.action_black.outputs.is_formatted == 'true' 16 | uses: peter-evans/create-pull-request@v3 17 | with: 18 | token: ${{ secrets.GITHUB_TOKEN }} 19 | title: "Format Python code with psf/black push" 20 | commit-message: ":art: Format Python code with psf/black" 21 | body: | 22 | There appear to be some python formatting errors in ${{ github.sha }}. This pull request 23 | uses the [psf/black](https://github.com/psf/black) formatter to fix these issues. 24 | base: ${{ github.head_ref }} # Creates pull request onto pull request or commit branch 25 | branch: actions/black 26 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 VincentAuriau 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /examples/history_modeling/first_example.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os, sys 3 | 4 | sys.path.append("../../python") 5 | sys.path.append("../../") 6 | 7 | import matplotlib.pyplot as plt 8 | from matplotlib.patches import Rectangle 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from data.data_loader import matches_data_loader 13 | from history_modeling.match_representation import get_match_info, matches_info_norm 14 | 15 | data_df = matches_data_loader( 16 | path_to_data="../../submodules/tennis_atp", 17 | path_to_cache="../../cache", 18 | flush_cache=False, 19 | keep_values_from_year=2023, 20 | get_match_statistics=True, 21 | get_reversed_match_data=True, 22 | ) 23 | 24 | data_df = data_df.loc[data_df.ID_1 == 105173] # Adrian Mannarino 25 | print(f"Adrian Mannarino has played {len(data_df)} matches in 2023 in our database") 26 | 27 | ten_matches_history = pd.concat( 28 | [get_match_info(data_df.iloc[i]) for i in range(10)], axis=0 29 | ) 30 | ten_matches_history.reset_index(inplace=True, drop=True) 31 | match_info = matches_info_norm(ten_matches_history, data_df.iloc[10]["tournament_date"]) 32 | 33 | print(match_info.columns) 34 | plt.figure() 35 | plt.imshow(match_info.values) 36 | plt.show() 37 | -------------------------------------------------------------------------------- /notes.txt: -------------------------------------------------------------------------------- 1 | Features to add: 2 | - Ranking over time: ranking last months 3 | - Ranking should come from ranking files and not match files 4 | - Last x (5 
# Bar chart of the precision of every evaluated model, one bar per result row,
# coloured by model class.

# Colours assigned to model classes in order of first appearance. The palette
# is cycled so that more than ten distinct classes no longer raise IndexError.
PALETTE = [
    "tab:blue",
    "tab:orange",
    "tab:green",
    "tab:red",
    "tab:purple",
    "tab:brown",
    "tab:pink",
    "tab:grey",
    "tab:olive",
    "tab:cyan",
]
# Only rows whose index is below this bound are drawn.
MAX_DISPLAYED_ROWS = 200

df_results = pd.read_csv("../../results/20212022_chall/results.csv", sep=";")

print(df_results.head())
models_color = {
    model: PALETTE[i % len(PALETTE)]
    for i, model in enumerate(df_results.model_class.unique())
}

fig, ax = plt.subplots()
for n_row, row in df_results.iterrows():
    if n_row < MAX_DISPLAYED_ROWS:
        color = models_color[row["model_class"]]
        # One unit-wide bar per row, its height being the precision in percent.
        rect = Rectangle(
            (n_row, 0),
            1,
            row["precision"] * 100,
            edgecolor=color,
            facecolor=color,
            label=row["model_class"],
        )
        ax.add_patch(rect)

ax.autoscale()
# Every bar carries its class label; collapse duplicates so the legend shows
# a single entry per model class.
handles, labels = ax.get_legend_handles_labels()
by_label = dict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys(), loc=1)
plt.ylabel("Precision %")
plt.savefig("models_performances.png")
plt.show()
# Compare two baselines (random guess, best-ranked player wins) against the
# recorded match winners.
data_df = matches_data_loader(
    path_to_data="../../submodules/tennis_atp",
    path_to_cache="../../cache",
    flush_cache=True,
    keep_values_from_year=2021,
    get_match_statistics=False,
)

random_model = RandomModel()
best_player_model = BestRankedPlayerWins()

# RandomModel.predict returns a single draw per call, so it is queried once
# per match.
random_predictions = np.array(
    [random_model.predict(row) for _, row in data_df.iterrows()]
).reshape(-1)

# BestRankedPlayerWins.predict walks its input with ``X.iterrows()``, which
# exists on DataFrame but not on Series: feeding it one row at a time raised
# AttributeError. It is therefore called once on the whole DataFrame.
best_player_predictions = np.array(best_player_model.predict(data_df)).reshape(-1)

ground_truths = data_df["Winner"].to_numpy()

# ``reshape(-1)`` (rather than ``np.squeeze``) keeps the arrays 1-D even when
# only one match is loaded, so ``len`` below stays valid.
print("Among the", len(ground_truths), "matches analyzed, we have found:")

random_percentage = (
    np.sum(ground_truths == random_predictions) / len(random_predictions) * 100
)
print("Random Prediction Percentage:", np.round(random_percentage, 2), "%")
bp_percentage = (
    np.sum(ground_truths == best_player_predictions)
    / len(best_player_predictions)
    * 100
)
print("Best Ranked Player Prediction Percentage:", np.round(bp_percentage, 2), "%")
-------------------------------------------------------------------------------- 1 | import ast 2 | import os, sys 3 | 4 | sys.path.append("../../python") 5 | sys.path.append("../../") 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from data.data_loader import matches_data_loader 12 | from history_modeling.encoding_model import PCAMatchEncoder 13 | 14 | from data.data_encoding import create_encoded_history 15 | 16 | data_df = matches_data_loader( 17 | path_to_data="../../submodules/tennis_atp", 18 | path_to_cache="../../cache", 19 | flush_cache=False, 20 | keep_values_from_year=2022, 21 | get_match_statistics=True, 22 | get_reversed_match_data=True, 23 | include_davis_cup=False, 24 | ) 25 | 26 | print("Data Loaded") 27 | columns = [ 28 | "surface", 29 | "result", 30 | "adv_ranking", 31 | "adv_ranking_points", 32 | "num_won_sets", 33 | "num_lost_sets", 34 | "num_won_games", 35 | "num_lost_games", 36 | "num_tie_break_wons", 37 | "num_tie_break_lost", 38 | ] 39 | model = PCAMatchEncoder(num_pca_features=2, columns=columns) 40 | model.fit(data_df, transform_data=True) 41 | 42 | print("Model Fitted, now predicting") 43 | X_r, match_info = model.predict(data_df, transform_data=True) 44 | 45 | history_df = create_encoded_history(data_df, model, 5) 46 | 47 | cols = ["history_1", "history_2"] 48 | print( 49 | pd.DataFrame( 50 | np.array(history_df["history_1"].values.tolist()) 51 | .reshape((len(history_df), -1)) 52 | .tolist() 53 | ) 54 | ) 55 | print( 56 | np.array(history_df["history_1"].values.tolist()) 57 | .reshape((len(history_df), -1)) 58 | .shape 59 | ) 60 | flatten_data = pd.concat( 61 | [ 62 | pd.DataFrame( 63 | np.array(history_df["history_1"].values.tolist()) 64 | .reshape((len(history_df), -1)) 65 | .tolist() 66 | ).add_prefix(x) 67 | for x in cols 68 | ], 69 | axis=1, 70 | ) 71 | flatten_data.to_csv("flatten_data.csv", sep=";", index=False) 72 | encoded_data = pd.concat([flatten_data, history_df.drop(cols, 
axis=1)], axis=1) 73 | history_df = pd.merge(data_df, encoded_data, on=["id", "ID_1", "ID_2"]) 74 | history_df = history_df.loc[history_df.tournament_year == 2023] 75 | history_df.to_csv("history_df.csv", sep=";", index=False) 76 | -------------------------------------------------------------------------------- /examples/history_modeling/pca_representation.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os, sys 3 | 4 | sys.path.append("../../python") 5 | sys.path.append("../../") 6 | 7 | import matplotlib.pyplot as plt 8 | import pandas as pd 9 | from sklearn.decomposition import PCA 10 | 11 | from data.data_loader import matches_data_loader 12 | from history_modeling.match_representation import get_match_info, matches_info_norm 13 | 14 | data_df = matches_data_loader( 15 | path_to_data="../../submodules/tennis_atp", 16 | path_to_cache="../../cache", 17 | flush_cache=False, 18 | keep_values_from_year=2023, 19 | get_match_statistics=True, 20 | get_reversed_match_data=True, 21 | ) 22 | 23 | ten_matches_history = pd.concat( 24 | [get_match_info(data_df.iloc[i]) for i in range(len(data_df))], axis=0 25 | ) 26 | ten_matches_history.reset_index(inplace=True, drop=True) 27 | match_info = matches_info_norm(ten_matches_history, "20230401") 28 | 29 | match_info = match_info.dropna().reset_index(drop=True) 30 | 31 | X = match_info.values 32 | pca = PCA(n_components=2) 33 | X_r = pca.fit(X).transform(X) 34 | 35 | plt.figure(figsize=(20, 12)) 36 | 37 | plt.subplot(2, 4, 1) 38 | v_i = match_info.loc[match_info.result == 0].index.values 39 | d_i = match_info.loc[match_info.result == 1].index.values 40 | plt.scatter(X_r[v_i, 0], X_r[v_i, 1], label="Victories") 41 | plt.scatter(X_r[d_i, 0], X_r[d_i, 1], label="Defeats") 42 | plt.legend() 43 | plt.title("Result") 44 | 45 | plt.subplot(2, 4, 2) 46 | c_i = match_info.loc[match_info.surface == 0.0].index.values 47 | h_i = match_info.loc[match_info.surface == 2 / 
3].index.values 48 | g_i = match_info.loc[match_info.surface == 1.0].index.values 49 | plt.scatter(X_r[c_i, 0], X_r[c_i, 1], label="Clay") 50 | plt.scatter(X_r[h_i, 0], X_r[h_i, 1], label="Hard") 51 | plt.scatter(X_r[g_i, 0], X_r[g_i, 1], label="Grass") 52 | plt.legend() 53 | plt.title("Surface") 54 | 55 | plt.subplot(2, 4, 3) 56 | plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info.num_played_minutes) 57 | plt.title("played minutes") 58 | 59 | plt.subplot(2, 4, 4) 60 | plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info.adv_ranking) 61 | plt.title("Ranking Adversary") 62 | 63 | plt.subplot(2, 4, 5) 64 | plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info.num_won_sets) 65 | plt.title("Won sets Number") 66 | plt.subplot(2, 4, 6) 67 | plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info.num_lost_sets) 68 | plt.title("Lost set Number") 69 | plt.subplot(2, 4, 7) 70 | plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info.num_won_games) 71 | plt.title("Won games Number") 72 | plt.subplot(2, 4, 8) 73 | plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info.num_lost_games) 74 | plt.title("Lost games Number") 75 | 76 | plt.savefig("2d_pca_match_representation.png") 77 | plt.show() 78 | -------------------------------------------------------------------------------- /examples/history_modeling/pca_match_encoder_train.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os, sys 3 | 4 | sys.path.append("../../python") 5 | sys.path.append("../../") 6 | 7 | import matplotlib.pyplot as plt 8 | import pandas as pd 9 | from sklearn.decomposition import PCA 10 | 11 | from data.data_loader import matches_data_loader 12 | from history_modeling.match_representation import ( 13 | create_timeless_dataset, 14 | create_dataset, 15 | ) 16 | from history_modeling.encoding_model import PCAMatchEncoder 17 | 18 | data_df = matches_data_loader( 19 | path_to_data="../../submodules/tennis_atp", 20 | path_to_cache="../../cache", 21 | flush_cache=False, 22 | 
keep_values_from_year=2020, 23 | get_match_statistics=True, 24 | get_reversed_match_data=True, 25 | ) 26 | 27 | """ 28 | match_info = create_timeless_dataset(data_df) 29 | print(len(match_info)) 30 | match_info = match_info.dropna().reset_index(drop=True) 31 | print(len(match_info)) 32 | 33 | X = match_info.values 34 | pca = PCA(n_components=2) 35 | X_r = pca.fit(X).transform(X) 36 | """ 37 | model = PCAMatchEncoder(num_pca_features=2) 38 | model.fit(data_df, transform_data=True) 39 | X_r, match_info = model.predict(data_df, transform_data=True) 40 | 41 | plt.figure(figsize=(20, 12)) 42 | 43 | plt.subplot(2, 4, 1) 44 | v_i = match_info.loc[match_info.result == 0].index.values 45 | d_i = match_info.loc[match_info.result == 1].index.values 46 | plt.scatter(X_r[v_i, 0], X_r[v_i, 1], label="Victories") 47 | plt.scatter(X_r[d_i, 0], X_r[d_i, 1], label="Defeats") 48 | plt.legend() 49 | plt.title("Result") 50 | 51 | plt.subplot(2, 4, 2) 52 | c_i = match_info.loc[match_info.surface == 0.0].index.values 53 | h_i = match_info.loc[match_info.surface == 2 / 3].index.values 54 | g_i = match_info.loc[match_info.surface == 1.0].index.values 55 | plt.scatter(X_r[c_i, 0], X_r[c_i, 1], label="Clay") 56 | plt.scatter(X_r[h_i, 0], X_r[h_i, 1], label="Hard") 57 | plt.scatter(X_r[g_i, 0], X_r[g_i, 1], label="Grass") 58 | plt.legend() 59 | plt.title("Surface") 60 | 61 | plt.subplot(2, 4, 3) 62 | plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info.num_played_minutes) 63 | plt.title("played minutes") 64 | 65 | plt.subplot(2, 4, 4) 66 | plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info.adv_ranking) 67 | plt.title("Ranking Adversary") 68 | 69 | plt.subplot(2, 4, 5) 70 | plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info.num_won_sets) 71 | plt.title("Won sets Number") 72 | plt.subplot(2, 4, 6) 73 | plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info.num_lost_sets) 74 | plt.title("Lost set Number") 75 | plt.subplot(2, 4, 7) 76 | plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info.num_won_games) 77 | plt.title("Won 
# Grid-search XGBoost on top of PCA-encoded match history.
train_years = [2018, 2019, 2020]
test_years = [2021, 2022]


match_features = ["tournament_surface", "tournament_level"]
player_features = [
    "Ranking",
    "Ranking_Points",
    "Height",
    "Victories_Percentage",
    "Clay_Victories_Percentage",
    "Grass_Victories_Percentage",
    "Carpet_Victories_Percentage",
    "Hard_Victories_Percentage",
    "Aces_Percentage",
]
additional_features = ["diff_rank", "v_perc_versus", "nb_match_versus"]

# Full hyper-parameter grid (3 * 3 * 5 * 3 * 3 = 405 combinations). The order
# matches the former nested loops: eta varies slowest, subsample fastest.
# The single-configuration dict that used to be assigned here was overwritten
# immediately and has been dropped.
xgb_hyperparams = [
    {
        "params": {
            "eta": eta,
            "objective": "binary:logistic",
            "gamma": gamma,
            "max_depth": max_depth,
            "min_child_weight": min_child_weight,
            "subsample": subsample,
        }
    }
    for eta in [0.1, 0.3, 0.6]
    for gamma in [0, 1, 10]
    for max_depth in [2, 4, 6, 8, 10]
    for min_child_weight in [1, 2, 8]
    for subsample in [0.4, 0.8, 1]
]

test_score = train_test_evaluation(
    # Use the variable declared above instead of repeating the literal.
    train_years=train_years,
    test_years=test_years,
    model_class=XGBoost,
    model_params=xgb_hyperparams,
    encoder_models=[
        (
            PCAMatchEncoder,
            {
                "num_pca_features": 2,
                "auto_transform": True,
                "columns": [
                    "surface",
                    "result",
                    "adv_ranking",
                    "adv_ranking_points",
                    "num_won_sets",
                    "num_lost_sets",
                    "num_won_games",
                    "num_lost_games",
                    "num_tie_break_wons",
                    "num_tie_break_lost",
                ],
            },
        )
    ],
    match_features=match_features,
    player_features=player_features,
    encoding_params={},
    additional_features=additional_features,
    save_path="../../results/history_encoding",
    save_all_results=True,
)
"Aces_Percentage_1", 39 | "Doublefaults_Percentage_1", 40 | "First_Serve_Success_Percentage_1", 41 | "Winning_on_1st_Serve_Percentage_1", 42 | "Winning_on_2nd_Serve_Percentage_1", 43 | "Overall_Win_on_Serve_Percentage_1", 44 | "BreakPoint_Face_Percentage_1", 45 | "BreakPoint_Saved_Percentage_1", 46 | "Fatigue_1", 47 | ] 48 | columns_2 = [ 49 | "ID_2", 50 | "Ranking_2", 51 | "Ranking_Points_2", 52 | "Hand_2", 53 | "Height_2", 54 | "Versus_2", 55 | "Victories_Percentage_2", 56 | "Clay_Victories_Percentage_2", 57 | "Grass_Victories_Percentage_2", 58 | "Carpet_Victories_Percentage_2", 59 | "Hard_Victories_Percentage_2", 60 | "Aces_Percentage_2", 61 | "Doublefaults_Percentage_2", 62 | "First_Serve_Success_Percentage_2", 63 | "Winning_on_1st_Serve_Percentage_2", 64 | "Winning_on_2nd_Serve_Percentage_2", 65 | "Overall_Win_on_Serve_Percentage_2", 66 | "BreakPoint_Face_Percentage_2", 67 | "BreakPoint_Saved_Percentage_2", 68 | "Fatigue_2", 69 | ] 70 | 71 | data_df = data_df[columns_m + columns_1 + columns_2] 72 | 73 | print(data_df.head()) 74 | print(data_df.shape) 75 | 76 | data_df = data_df[columns_m + columns_1 + columns_2] 77 | data_df = data_df.dropna(axis=0) 78 | 79 | fdf = encode_data(data_df) 80 | fdf.to_csv("../cache/test.csv") 81 | 82 | fdf = fdf.drop(["ID_1", "Versus_1", "ID_2", "Versus_2"], axis=1) 83 | fdf["diff_ranking"] = fdf["Ranking_2"] - fdf["Ranking_1"] 84 | 85 | y = fdf.Winner 86 | 87 | fdf = fdf[["diff_ranking"]] 88 | X = fdf.values 89 | 90 | print(X) 91 | 92 | model = RandomForestClassifier(n_estimators=1000, max_depth=None) 93 | print("FIT") 94 | print(X.shape, y.shape) 95 | model.fit(X, y) 96 | 97 | y_pred = model.predict(X) 98 | print(len(y), np.sum(y == y_pred)) 99 | print(y_pred) 100 | print(y) 101 | print(np.sum(y_pred)) 102 | 103 | plt.figure() 104 | plt.scatter(X, y) 105 | plt.show() 106 | """ 107 | z = model.predict(np.expand_dims(list(range(-10000, 10001)), axis=1)) 108 | plt.figure() 109 | plt.plot(list(range(-10000, 10001)), z) 110 | 
from abc import abstractmethod

import pandas as pd

# Per-match signals every encoder works on by default. Stored as an immutable
# tuple and copied per instance so that no two encoders ever share one list
# (the previous list defaults were shared between all instances).
_DEFAULT_COLUMNS = (
    "surface",
    "result",
    "num_played_minutes",
    "adv_ranking",
    "adv_ranking_points",
    "num_won_sets",
    "num_lost_sets",
    "num_won_games",
    "num_lost_games",
    "num_tie_break_wons",
    "num_tie_break_lost",
)


def _resolve_columns(columns):
    """Return a fresh list of column names, falling back to the defaults."""
    return list(_DEFAULT_COLUMNS) if columns is None else list(columns)


class _CallableInt(int):
    """Integer that can also be called, returning its own value.

    Compatibility shim: `PCAMatchEncoder.output_shape` used to be a plain
    method, so existing callers may write `encoder.output_shape()`. Returning
    this type from the property keeps that spelling working while making the
    attribute-style access (`encoder.output_shape`) return a number as well.
    """

    def __call__(self):
        return int(self)


class MatchEncoder:
    """Base class for models turning raw match rows into a fixed-size encoding."""

    def __init__(self, num_match_differences):
        self.num_match_differences = num_match_differences

    @property
    @abstractmethod
    def output_shape(self):
        """Number of features produced for one encoded match."""
        pass

    def select_data(self, X, columns=None):
        """Build the normalized per-match representation of `X`.

        :param X: pandas DataFrame of match rows.
        :param columns: optional list of representation columns to keep; when
            None, `create_timeless_dataset` uses its own default.
        :return: DataFrame restricted to rows without missing values, with a
            fresh 0..n-1 index.
        """
        assert isinstance(X, pd.DataFrame)
        # Imported lazily so this module can be loaded without the project
        # package being importable yet.
        from history_modeling.match_representation import create_timeless_dataset

        if columns is not None:
            X_transformed = create_timeless_dataset(X, columns=columns)
        else:
            X_transformed = create_timeless_dataset(X)
        return X_transformed.dropna().reset_index(drop=True)

    @abstractmethod
    def predict(self, match_row):
        pass


class PCAMatchEncoder(MatchEncoder):
    """Encodes matches by projecting their representation with a PCA."""

    def __init__(
        self,
        num_pca_features,
        auto_transform=False,
        columns=None,
    ):
        """
        :param num_pca_features: number of principal components to keep.
        :param auto_transform: when True, `fit`/`predict` always run
            `select_data` on their input first.
        :param columns: representation columns fed to the PCA; defaults to the
            full set of per-match signals.
        """
        self.num_pca_features = num_pca_features
        self.auto_transform = auto_transform
        self.columns = _resolve_columns(columns)

        self.model = self.instantiate_model()

    @property
    def output_shape(self):
        """Number of PCA components; also callable for backward compatibility."""
        return _CallableInt(self.num_pca_features)

    def instantiate_model(self):
        """Create the (unfitted) scikit-learn PCA."""
        # Local import: scikit-learn is only needed once a PCA is really built.
        from sklearn.decomposition import PCA

        return PCA(n_components=self.num_pca_features)

    def fit(self, X, transform_data=False):
        """Fit the PCA on `X`, converting it with `select_data` if requested."""
        if transform_data or self.auto_transform:
            X = self.select_data(X, columns=self.columns)
        self.model.fit(X)

    def predict(self, X, transform_data=False):
        """Project `X` on the fitted components.

        :return: `(projection, selected_data)` when the input went through
            `select_data`, otherwise only the projection.
        """
        if transform_data or self.auto_transform:
            X = self.select_data(X, columns=self.columns)
            return self.model.transform(X), X
        return self.model.transform(X)

    def save_model(self):
        pass


class IdentityEncoder(MatchEncoder):
    """Encoder without a model: the match representation is used as is."""

    def __init__(
        self,
        auto_transform=False,
        columns=None,
    ):
        """
        :param auto_transform: when True, `predict` always runs `select_data`.
        :param columns: representation columns to keep; defaults to the full
            set of per-match signals.
        """
        self.columns = _resolve_columns(columns)
        self.auto_transform = auto_transform

        self.model = self.instantiate_model()

    @property
    def output_shape(self):
        """One output feature per selected column."""
        return len(self.columns)

    def instantiate_model(self):
        return None

    def fit(self, X, transform_data=False):
        # Nothing to learn for the identity encoding.
        pass

    def predict(self, X, transform_data=False):
        """Return the representation of `X`.

        :return: `(selected_data, X)` when `select_data` was applied, else `X`
            untouched.
        """
        # NOTE(review): the second tuple element is the raw input here, while
        # PCAMatchEncoder.predict returns the selected data - confirm callers.
        if transform_data or self.auto_transform:
            X_tr = self.select_data(X, columns=self.columns)
            return X_tr, X
        return X

    def save_model(self):
        pass


class MatchesHistoryEncoder:
    """Base class for encoders working on a whole history of matches."""

    def __init__(self, match_encoder, num_matches, add_days_difference):
        self.match_encoder = match_encoder
        self.num_matches = num_matches
        self.add_days_difference = add_days_difference

    @abstractmethod
    def predict(self, match_row):
        pass
sklearn.ensemble import RandomForestClassifier 10 | from sklearn.ensemble import GradientBoostingClassifier 11 | 12 | from data.data_loader import matches_data_loader 13 | from data.data_loader import encode_data 14 | from evaluation.train_test import train_test_evaluation 15 | 16 | 17 | train_years = [2020, 2021] 18 | test_years = [2022, 2023] 19 | 20 | 21 | model_class = RandomForestClassifier 22 | model_params = {"n_estimators": 2000, "max_depth": None} 23 | match_features = [] 24 | player_features = ["Ranking"] 25 | additional_features = ["diff_rank", "v_perc_versus"] 26 | 27 | test_score = train_test_evaluation( 28 | train_years=train_years, 29 | test_years=test_years, 30 | model_class=model_class, 31 | model_params=model_params, 32 | match_features=match_features, 33 | player_features=player_features, 34 | encoding_params={}, 35 | additional_features=additional_features, 36 | ) 37 | 38 | print("Test Score", test_score) 39 | 40 | 41 | model_class = RandomForestClassifier 42 | model_params = {"n_estimators": 2000, "max_depth": None} 43 | match_features = [] 44 | player_features = ["Ranking"] 45 | additional_features = [] 46 | 47 | test_score = train_test_evaluation( 48 | train_years=train_years, 49 | test_years=test_years, 50 | model_class=model_class, 51 | model_params=model_params, 52 | match_features=match_features, 53 | player_features=player_features, 54 | encoding_params={}, 55 | additional_features=additional_features, 56 | ) 57 | 58 | print("Test Score", test_score) 59 | 60 | 61 | model_class = RandomForestClassifier 62 | model_params = {"n_estimators": 2000, "max_depth": None} 63 | match_features = [] 64 | player_features = [] 65 | additional_features = ["diff_rank"] 66 | 67 | test_score = train_test_evaluation( 68 | train_years=train_years, 69 | test_years=test_years, 70 | model_class=model_class, 71 | model_params=model_params, 72 | match_features=match_features, 73 | player_features=player_features, 74 | encoding_params={}, 75 | 
additional_features=additional_features, 76 | ) 77 | 78 | print("Test Score", test_score) 79 | 80 | 81 | model_class = RandomForestClassifier 82 | model_params = {"n_estimators": 1, "max_depth": 1} 83 | match_features = [] 84 | player_features = [] 85 | additional_features = ["diff_rank"] 86 | 87 | test_score = train_test_evaluation( 88 | train_years=train_years, 89 | test_years=test_years, 90 | model_class=model_class, 91 | model_params=model_params, 92 | match_features=match_features, 93 | player_features=player_features, 94 | encoding_params={}, 95 | additional_features=additional_features, 96 | ) 97 | 98 | print("Test Score", test_score) 99 | 100 | 101 | model_class = GradientBoostingClassifier 102 | model_params = {"n_estimators": 100, "learning_rate": 1.0, "max_depth": 1} 103 | match_features = [] 104 | player_features = [] 105 | additional_features = ["diff_rank"] 106 | 107 | test_score = train_test_evaluation( 108 | train_years=train_years, 109 | test_years=test_years, 110 | model_class=model_class, 111 | model_params=model_params, 112 | match_features=match_features, 113 | player_features=player_features, 114 | encoding_params={}, 115 | additional_features=additional_features, 116 | ) 117 | 118 | print("Test Score", test_score) 119 | 120 | 121 | model_class = GradientBoostingClassifier 122 | model_params = {"n_estimators": 1000, "learning_rate": 0.1, "max_depth": 4} 123 | match_features = [] 124 | player_features = [] 125 | additional_features = ["diff_rank"] 126 | 127 | test_score = train_test_evaluation( 128 | train_years=train_years, 129 | test_years=test_years, 130 | model_class=model_class, 131 | model_params=model_params, 132 | match_features=match_features, 133 | player_features=player_features, 134 | encoding_params={}, 135 | additional_features=additional_features, 136 | ) 137 | 138 | print("Test Score", test_score) 139 | -------------------------------------------------------------------------------- /examples/models/deep_history.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import matplotlib.pyplot as plt 5 | 6 | sys.path.append("../../python") 7 | import time 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from data.data_loader import matches_data_loader 13 | from data.data_encoding import ( 14 | encode_data, 15 | create_additional_features, 16 | clean_missing_data, 17 | create_encoded_history, 18 | complete_missing_data, 19 | ) 20 | from history_modeling.encoding_model import IdentityEncoder 21 | from model.deep_model import ConvolutionalHistoryAndFullyConnected 22 | 23 | 24 | absolute_path = os.path.dirname(os.path.abspath(__file__)) 25 | match_features = ["tournament_surface", "tournament_level", "round"] 26 | player_features = [ 27 | "Ranking", 28 | "Ranking_Points", 29 | "Height", 30 | "Victories_Percentage", 31 | "Clay_Victories_Percentage", 32 | "Grass_Victories_Percentage", 33 | "Carpet_Victories_Percentage", 34 | "Hard_Victories_Percentage", 35 | "Aces_Percentage", 36 | ] 37 | additional_features = ["diff_rank", "v_perc_versus", "nb_match_versus"] 38 | encoding_params = {} 39 | 40 | data_df = matches_data_loader( 41 | path_to_data=os.path.join(absolute_path, "../../submodules/tennis_atp"), 42 | path_to_cache=os.path.join(absolute_path, "../../cache"), 43 | flush_cache=False, 44 | keep_values_from_year=2022, 45 | get_match_statistics=True, 46 | get_reversed_match_data=True, 47 | include_davis_cup=False, 48 | ) 49 | print(f"[+] Data Loaded, Now Encoding Data and create additional Features") 50 | print(data_df.head()) 51 | print(data_df.columns) 52 | 53 | # data_df = pd.concat([data_df.iloc[:1000], data_df.iloc[-1000:]]) 54 | 55 | history_columns = [] 56 | encoder_models = [(IdentityEncoder, {})] 57 | for encoding_model, encoding_model_params in encoder_models: 58 | print(f"[+] Training Encoder Model {encoding_model}") 59 | encoder = encoding_model(**encoding_model_params) 60 | encoder.fit(data_df) 
61 | 62 | print(f"[+] Encoding using encoder {encoding_model}") 63 | encoded_data = create_encoded_history( 64 | data_df, encoder, num_matches=5, completing_value=0 65 | ) 66 | 67 | cols = ["history_1", "history_2"] 68 | 69 | flatten_data = pd.concat( 70 | [ 71 | pd.DataFrame( 72 | np.array(encoded_data[x].values.tolist()).reshape( 73 | (len(encoded_data), -1) 74 | ) 75 | ).add_prefix(x) 76 | for x in cols 77 | ], 78 | axis=1, 79 | ) 80 | encoded_data = pd.concat([flatten_data, encoded_data.drop(cols, axis=1)], axis=1) 81 | enc_columns = encoded_data.columns 82 | enc_columns = list(set(enc_columns) - set(["id", "ID_1", "ID_2"])) 83 | history_columns.extend(enc_columns) 84 | 85 | data_df = pd.merge(data_df, encoded_data, on=["id", "ID_1", "ID_2"]) 86 | 87 | train_data = data_df.loc[data_df.tournament_year.isin([2022])] 88 | test_data = data_df.loc[data_df.tournament_year.isin([2023])] 89 | # train_data = data_df.loc[data_df.tournament_year.isin([2019, 2020, 2021])] 90 | # test_data = data_df.loc[data_df.tournament_year.isin([2022, 2023])] 91 | train_data = create_additional_features(train_data, additional_features) 92 | train_data = encode_data(train_data, **encoding_params) 93 | test_data = create_additional_features(test_data, additional_features) 94 | test_data = encode_data(test_data, **encoding_params) 95 | 96 | p1_features = [feat + "_1" for feat in player_features] 97 | p2_features = [feat + "_2" for feat in player_features] 98 | match_features = match_features.copy() 99 | 100 | train_data_ = train_data[ 101 | match_features + p1_features + p2_features + ["Winner", "tournament_year"] 102 | ] 103 | test_data_ = test_data[ 104 | match_features + p1_features + p2_features + ["Winner", "tournament_year"] 105 | ] 106 | 107 | # train_data_ = clean_missing_data(train_data_) 108 | # test_data_ = clean_missing_data(test_data_) 109 | 110 | print(data_df.head()) 111 | print(data_df.columns) 112 | 113 | model = ConvolutionalHistoryAndFullyConnected( 114 | 
num_history_signals=22, 115 | **{ 116 | "input_shape": 23, 117 | "hidden_units": (22, 44, 22, 11, 4), 118 | "output_shape": 2, 119 | "last_activation": "softmax", 120 | "epochs": 100, 121 | "reduced_lr_epochs": 50, 122 | "loss": "categorical_crossentropy", 123 | }, 124 | ) 125 | # model.instantiate_model() 126 | 127 | print(model.summary()) 128 | 129 | print(data_df.head()) 130 | 131 | hist_cols = [] 132 | for col in data_df.columns: 133 | if "history" in col: 134 | hist_cols.append(col) 135 | 136 | print(len(train_data), len(hist_cols)) 137 | 138 | model.fit( 139 | train_data_.values, 140 | train_data[hist_cols].values.reshape((len(train_data), 5, 22)), 141 | train_data["Winner"].values, 142 | ) 143 | 144 | y_pred = model.predict( 145 | test_data_.values, test_data[hist_cols].values.reshape((len(test_data), 5, 22)) 146 | ) 147 | 148 | 149 | print(np.sum(y_pred == test_data["Winner"])) 150 | 151 | plt.plot(y_pred) 152 | plt.show() 153 | -------------------------------------------------------------------------------- /examples/results_reading/best_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import matplotlib.pyplot as plt 4 | from matplotlib.patches import Rectangle 5 | import numpy as np 6 | import pandas as pd 7 | 8 | df_results = pd.read_csv("../../results/20212022/results.csv", sep=";") 9 | 10 | best_row = df_results.iloc[df_results.precision.argmax()] 11 | print(best_row) 12 | 13 | eval_id = best_row["eval_ID"] 14 | best_results = pd.read_csv( 15 | os.path.join("../../results/20212022", f"{eval_id}.csv"), sep=";" 16 | ) 17 | 18 | fig, ax = plt.subplots() 19 | df_ww = best_results.loc[best_results.Winner == 0].loc[best_results.y_pred == 0] 20 | plt.scatter(df_ww.diff_rank, df_ww.Winner, c="tab:pink", label="Well Predicted") 21 | df_wl = best_results.loc[best_results.Winner == 0].loc[best_results.y_pred == 1] 22 | plt.scatter(df_wl.diff_rank, df_wl.Winner + 0.1, c="tab:blue", label="Predicted 
Wrong") 23 | df_ll = best_results.loc[best_results.Winner == 1].loc[best_results.y_pred == 1] 24 | plt.scatter(df_ll.diff_rank, df_ll.Winner, c="tab:orange", label="Well Wrong") 25 | df_lw = best_results.loc[best_results.Winner == 1].loc[best_results.y_pred == 0] 26 | plt.scatter(df_lw.diff_rank, df_lw.Winner - 0.1, c="tab:red", label="Predicted Wrong") 27 | plt.legend() 28 | 29 | plt.xlabel("Rank Player 0 - Rank Player 1") 30 | plt.ylabel("Winner") 31 | plt.show() 32 | 33 | # Let's evaluate Symmetry 34 | symmetric_same_results = 0 35 | for i in range(int(len(best_results) / 2)): 36 | if best_results.iloc[2 * i]["y_pred"] != best_results.iloc[2 * i + 1]["y_pred"]: 37 | symmetric_same_results += 1 38 | print( 39 | f"{(symmetric_same_results / (len(best_results) / 2))} Results are symmetrically predicted" 40 | ) 41 | 42 | rank_categories = [1, 10, 50, 100, 300, 1000, 9999] 43 | 44 | prediction_percentage = [] 45 | 46 | for cat_1 in range(len(rank_categories) - 1): 47 | lines = [] 48 | nb_matches_lines = [] 49 | for cat_2 in range(len(rank_categories) - 1): 50 | sub_df = best_results.loc[best_results.Ranking_1 >= rank_categories[cat_1]].loc[ 51 | best_results.Ranking_1 < rank_categories[cat_1 + 1] 52 | ] 53 | sub_df = sub_df.loc[sub_df.Ranking_2 >= rank_categories[cat_2]].loc[ 54 | sub_df.Ranking_2 < rank_categories[cat_2 + 1] 55 | ] 56 | sub_df["best_rank"] = sub_df.apply( 57 | lambda row: np.argmin([row["Ranking_1"], row["Ranking_2"]]), axis=1 58 | ) 59 | 60 | if len(sub_df) > 0: 61 | best_player_w_p = np.sum( 62 | sub_df.Winner.values == sub_df.y_pred.values 63 | ) / len(sub_df) 64 | 65 | else: 66 | best_player_w_p = 0 67 | lines.append(best_player_w_p) 68 | nb_matches_lines.append(len(sub_df) / 2) 69 | prediction_percentage.append(lines) 70 | 71 | colors = ["purple", "blue", "cyan", "green", "yellow", "orange", "red"] 72 | fig, ax = plt.subplots() 73 | 74 | for i, val1 in enumerate(prediction_percentage): 75 | for j, val2 in enumerate(val1): 76 | color = 
colors[int(val2 * (len(colors) - 1))] 77 | rect = plt.Rectangle((i, j), 1, 1, fc=color) 78 | ax.add_patch(rect) 79 | plt.text(i + 0.2, j + 0.35, np.round(val2 * 100, 2)) 80 | 81 | for i in range(len(rank_categories)): 82 | plt.plot([i, i], [0, len(rank_categories) - 1], c="k") 83 | plt.plot([0, len(rank_categories) - 1], [i, i], c="k") 84 | 85 | plt.xticks(list(range(len(rank_categories))), labels=rank_categories) 86 | plt.yticks(list(range(len(rank_categories))), labels=rank_categories) 87 | plt.xlabel("Player 1 Rank Category") 88 | plt.ylabel("Player 2 Rank Category") 89 | plt.title("Precision Percentage") 90 | plt.savefig("precision_percentage_players_ranks.png") 91 | plt.show() 92 | 93 | best_ranked_player_wins_results = pd.read_csv( 94 | os.path.join( 95 | "../../results/20212022", 96 | f"{df_results.loc[df_results.model_class=='BestRankedPlayerWins'].eval_ID.values[0]}.csv", 97 | ), 98 | sep=";", 99 | ) 100 | ticks = [] 101 | fig, ax = plt.subplots() 102 | for surface, surface_code in {"Clay": 0, "Carpet": 1, "Hard": 2, "Grass": 3}.items(): 103 | precision_model = best_results.loc[best_results.tournament_surface == surface_code] 104 | precision_brpw = best_ranked_player_wins_results.loc[ 105 | best_ranked_player_wins_results.tournament_surface == surface_code 106 | ] 107 | 108 | if len(precision_model) > 0: 109 | precision_model = len( 110 | precision_model.loc[precision_model.y_pred == precision_model.Winner] 111 | ) / len(precision_model) 112 | prec_brpw = 0 113 | for n_row, row in precision_brpw.iterrows(): 114 | if int(row["y_pred"][1]) == row["Winner"]: 115 | prec_brpw += 1 116 | precision_brpw = prec_brpw / len(precision_brpw) 117 | else: 118 | precision_model = 0 119 | precision_brpw = 0 120 | rect = Rectangle( 121 | (surface_code * 2, 0), 122 | 1, 123 | precision_model, 124 | edgecolor="k", 125 | facecolor="tab:blue", 126 | label="Model - XGBoost", 127 | ) 128 | ax.add_patch(rect) 129 | rect = Rectangle( 130 | (surface_code * 2 + 1, 0), 131 | 1, 132 | 
import numpy as np
import pandas as pd

# Tokens found in a score string that are not set scores.
_NON_SCORE_TOKENS = ("ABD", "RET", "W/O")

# Columns returned by default by `create_timeless_dataset`.
_DEFAULT_COLUMNS = (
    "surface",
    "result",
    "num_played_minutes",
    "adv_ranking",
    "adv_ranking_points",
    "num_won_sets",
    "num_lost_sets",
    "num_won_games",
    "num_lost_games",
    "num_tie_break_wons",
    "num_tie_break_lost",
)


def _parse_set(token):
    """Parse one set token such as "6-4" or "7-6(5)".

    :return: (games of first player, games of second player, tie-break played)
    :raises ValueError: when the token is not a set score.
    """
    games_0, separator, games_1 = token.partition("-")
    if not separator:
        raise ValueError(token)
    had_tie_break = "(" in games_0 or "(" in games_1
    # Drop the "(n)" tie-break points suffix, whichever side carries it.
    games_0 = games_0.split("(")[0]
    games_1 = games_1.split("(")[0]
    return int(games_0), int(games_1), had_tie_break


def get_match_info(row, verbose=0):
    """Summarize one match row as a one-line DataFrame.

    :param row: mapping / Series with the keys "tournament_surface", "Winner",
        "score", "elapsed_minutes", "tournament_date", "Ranking_2" and
        "Ranking_Points_2".
    :param verbose: when equal to 2, unparsable score tokens are printed.
    :return: DataFrame with a single row and the columns surface, result,
        num_played_minutes, date, adv_ranking, adv_ranking_points and the
        won/lost counters for sets, games and tie-breaks.
    :raises KeyError: if one of the expected keys is missing from `row`.
    """
    # add adversary age & hand ?
    score = row["score"]

    num_won_sets = 0
    num_lost_sets = 0
    num_won_games = 0
    num_lost_games = 0
    num_tie_break_wons = 0
    num_tie_break_lost = 0

    # A missing score (NaN) yields no set at all instead of crashing.
    tokens = score.split(" ") if isinstance(score, str) else []
    for token in tokens:
        try:
            games_0, games_1, had_tie_break = _parse_set(token)
        except ValueError:
            if token not in _NON_SCORE_TOKENS and verbose == 2:
                print(token)
            continue

        if games_0 > games_1:
            num_won_sets += 1
        elif games_0 < games_1:
            num_lost_sets += 1

        # The tie-break suffix only says that a tie-break was played; who won
        # it is given by who won the set.
        if had_tie_break:
            if games_0 > games_1:
                num_tie_break_wons += 1
            elif games_0 < games_1:
                num_tie_break_lost += 1

        num_won_games += games_0
        num_lost_games += games_1

    return pd.DataFrame(
        {
            "surface": [row["tournament_surface"]],
            "result": [row["Winner"]],
            "num_played_minutes": [row["elapsed_minutes"]],
            "date": [row["tournament_date"]],
            "adv_ranking": [row["Ranking_2"]],
            "adv_ranking_points": [row["Ranking_Points_2"]],
            "num_won_sets": [num_won_sets],
            "num_lost_sets": [num_lost_sets],
            "num_won_games": [num_won_games],
            "num_lost_games": [num_lost_games],
            "num_tie_break_wons": [num_tie_break_wons],
            "num_tie_break_lost": [num_tie_break_lost],
        }
    )


def matches_info_norm(matches_info, current_date=""):
    """Scale the columns produced by `get_match_info` to comparable ranges.

    :param matches_info: DataFrame with the columns of `get_match_info`.
    :param current_date: reference date handed to `get_days_difference` to
        turn the "date" column into a number of years.
    :return: normalized copy of `matches_info` (the input is left untouched).
    :raises KeyError: if a surface is not Clay, Carpet, Hard or Grass.
    """
    # Imported lazily so this module can be loaded without the project
    # package being importable yet.
    from data.data_utils import get_days_difference

    tournament_surface = {"Clay": 0.0, "Carpet": 1 / 3, "Hard": 2 / 3, "Grass": 1.0}
    # Divisors applied column by column:
    # nb sets won / lost: max 3
    # nb games won / lost: max 100 (from experience - to be validated)
    # nb tiebreaks won / lost: divided by 3, like the number of sets
    # Ranking points max 16,950 from Djokovic's record -> 20,000
    # Num played minutes max 671 from Mahut/Isner's record -> 700
    scales = {
        "num_won_sets": 3,
        "num_lost_sets": 3,
        "num_played_minutes": 700,
        "adv_ranking_points": 20000,
        "num_won_games": 100,
        "num_lost_games": 100,
        "num_tie_break_wons": 3,
        "num_tie_break_lost": 3,
    }

    matches_info = matches_info.copy()
    matches_info["surface"] = matches_info["surface"].apply(
        lambda val: tournament_surface[val]
    )
    # date: number of days since the match, normalized by 365
    matches_info["date"] = matches_info["date"].apply(
        lambda val: get_days_difference(val, current_date) / 365
    )
    # Ranking max 9,999, compared on a log scale
    matches_info["adv_ranking"] = matches_info["adv_ranking"].apply(
        lambda val: np.log(val) / np.log(9999)
    )
    for column, scale in scales.items():
        matches_info[column] = matches_info[column] / scale

    return matches_info


def create_dataset(
    data_df, num_matches_difference=10, nb_kept_differences=10, randomize_indexes=False
):
    """
    Creates the match representation dataset: for every row, the normalized
    representation of the previous matches of the same player (ID_1).

    :param data_df: DataFrame of matches; rows located before a given row are
        taken as that row's past (assumes chronological order - TODO confirm).
    :param num_matches_difference: size of the window of most recent previous
        matches considered for each row.
    :param nb_kept_differences: number of matches of that window actually kept.
    :param randomize_indexes: shuffle the window before truncating it.
    :return: concatenation of the normalized histories; an empty DataFrame
        when no row has any history.
    """
    dataset = []
    for i in range(len(data_df)):
        current_row = data_df.iloc[i]
        # Restrict to the rows strictly before the current one *before*
        # filtering on the player, so that positions refer to the same frame.
        earlier_rows = data_df.iloc[:i]
        sub_data_df = earlier_rows.loc[earlier_rows.ID_1 == current_row.ID_1]
        if len(sub_data_df) == 0:
            # No past match for this player yet: nothing to represent.
            continue
        sub_data_df = sub_data_df.reset_index(drop=True)

        kept_indexes = list(range(-min(len(sub_data_df), num_matches_difference), 0))
        if randomize_indexes:
            kept_indexes = np.random.permutation(kept_indexes)
        kept_indexes = kept_indexes[:nb_kept_differences]
        sub_data_df = sub_data_df.iloc[kept_indexes].reset_index(drop=True)

        raw_matches_info = pd.concat(
            [get_match_info(sub_data_df.iloc[j]) for j in range(len(sub_data_df))],
            axis=0,
        )
        dataset.append(
            matches_info_norm(
                raw_matches_info, current_date=current_row["tournament_date"]
            )
        )

    if not dataset:
        return pd.DataFrame()
    return pd.concat(dataset, axis=0)


def create_timeless_dataset(
    data_df,
    columns=None,
):
    """Normalized representation of every match of `data_df`, without date.

    :param data_df: DataFrame of match rows accepted by `get_match_info`.
    :param columns: representation columns to return; defaults to every
        column except the date.
    :return: DataFrame with one row per match and the requested columns.
    """
    columns = list(_DEFAULT_COLUMNS) if columns is None else list(columns)
    if len(data_df) == 0:
        return pd.DataFrame(columns=columns)

    dataset = pd.concat(
        [get_match_info(data_df.iloc[i]) for i in range(len(data_df))], axis=0
    )
    # The date is normalized against the last row's date, then dropped.
    dataset = matches_info_norm(
        dataset, current_date=data_df["tournament_date"].values[-1]
    )
    dataset = dataset.drop(["date"], axis=1)
    return dataset[columns]
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

from model.base_model import DeepBaseModel


def create_dense_model(
    input_shape=2,
    output_shape=2,
    hidden_units=(4, 8, 4),
    hidden_activations="relu",
    last_activation="softmax",
):
    """Build a fully connected Keras model.

    :param input_shape: shape of one input sample (passed to `Input`).
    :param output_shape: number of units of the last Dense layer.
    :param hidden_units: number of units of each hidden Dense layer, in order.
    :param hidden_activations: activation applied after each hidden layer.
    :param last_activation: activation applied on the output layer.
    :return: an uncompiled `tf.keras.Model`.
    """
    hid_activation = tf.keras.layers.Activation(hidden_activations)
    inputs = tf.keras.layers.Input(shape=input_shape)
    hidden_out = inputs

    for n_cells in hidden_units:
        hidden_out = tf.keras.layers.Dense(n_cells)(hidden_out)
        hidden_out = hid_activation(hidden_out)

    out = tf.keras.layers.Dense(output_shape)(hidden_out)
    out = tf.keras.layers.Activation(last_activation)(out)

    return tf.keras.Model(inputs=inputs, outputs=out)


def _build_optimizer(optimizer, lr):
    """Turn an optimizer name into a Keras optimizer using learning rate `lr`.

    Names are matched case-insensitively; an object that is not a string is
    assumed to be an already built optimizer and is returned unchanged.

    :raises ValueError: if the name is not one of the supported optimizers.
    """
    if not isinstance(optimizer, str):
        return optimizer
    optimizer_classes = {
        "adam": tf.keras.optimizers.Adam,
        "adamax": tf.keras.optimizers.Adamax,
        "rmsprop": tf.keras.optimizers.RMSprop,
        "sgd": tf.keras.optimizers.SGD,
    }
    try:
        optimizer_class = optimizer_classes[optimizer.lower()]
    except KeyError:
        raise ValueError(
            f"Optimizer {optimizer} not understood, must be among ['adam', 'adamax', 'sgd', 'rmsprop']"
        )
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    return optimizer_class(learning_rate=lr)


class SimpleFullyConnected(DeepBaseModel):
    """Fully connected classifier trained on standardized features."""

    def __init__(
        self,
        input_shape=2,
        output_shape=2,
        hidden_units=(4, 8, 4),
        hidden_activations="relu",
        last_activation="softmax",
        epochs=50,
        reduced_lr_epochs=10,
        optimizer="adamax",
        lr=1e-5,
        loss="cross_entropy",
    ):
        """
        :param input_shape: shape of one feature vector.
        :param output_shape: number of output units; 2 triggers one-hot targets
            in `fit` and an argmax in `predict`.
        :param hidden_units: units of each hidden layer.
        :param hidden_activations: activation of the hidden layers.
        :param last_activation: activation of the output layer.
        :param epochs: epochs of the first training phase.
        :param reduced_lr_epochs: epochs of the second phase, run with the
            learning rate divided by 10 (skipped when 0).
        :param optimizer: one of 'adam', 'adamax', 'sgd', 'rmsprop'.
        :param lr: initial learning rate.
        :param loss: loss handed to `Model.compile`.
        """
        # NOTE(review): the default loss name "cross_entropy" does not look
        # like a Keras loss identifier - confirm it is accepted by compile().
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.hidden_units = hidden_units
        self.hidden_activations = hidden_activations
        self.last_activation = last_activation
        self.epochs = epochs
        self.reduced_lr_epochs = reduced_lr_epochs
        self.optimizer = optimizer
        self.lr = lr
        self.loss = loss
        super().__init__()

    def instantiate_model(self):
        """Create the scaler, the network and its optimizer, then compile."""
        self.scaler_x = StandardScaler()
        self.model = create_dense_model(
            input_shape=self.input_shape,
            output_shape=self.output_shape,
            hidden_units=self.hidden_units,
            hidden_activations=self.hidden_activations,
            last_activation=self.last_activation,
        )
        self.optimizer = _build_optimizer(self.optimizer, self.lr)
        self.model.compile(optimizer=self.optimizer, loss=self.loss)

    def fit(self, X, y):
        """Fit the scaler on `X`, then train in two learning-rate phases."""
        self.scaler_x.fit(X)
        if self.output_shape == 2:
            y = tf.one_hot(y.squeeze(), depth=2)
        X_scaled = self.scaler_x.transform(X)
        self.model.fit(X_scaled, y, epochs=self.epochs)
        if self.reduced_lr_epochs > 0:
            self.optimizer.learning_rate.assign(self.lr / 10)
            self.model.fit(X_scaled, y, epochs=self.reduced_lr_epochs)

    def predict(self, X):
        """Predict on `X`; returns class indexes when there are 2 outputs."""
        y_pred = self.model.predict(self.scaler_x.transform(X))
        if self.output_shape == 2:
            y_pred = tf.argmax(y_pred, axis=-1)
        return y_pred


def create_conv_dense_model(
    input_shape=2,
    history_input_shape=(5, 5),
    output_shape=2,
    hidden_units=(4, 8, 4),
    hidden_activations="relu",
    last_activation="softmax",
):
    """Build a model mixing a convolved match history with dense features.

    The history goes through three stacked Conv1D layers, is flattened and
    concatenated with the plain features before the dense layers.

    :param input_shape: shape of one plain feature vector.
    :param history_input_shape: shape of one history sample, passed to `Input`.
    :param output_shape: number of units of the last Dense layer.
    :param hidden_units: units of each hidden Dense layer.
    :param hidden_activations: activation of the hidden layers.
    :param last_activation: activation of the output layer.
    :return: an uncompiled `tf.keras.Model` taking `[history, features]`.
    """
    hid_activation = tf.keras.layers.Activation(hidden_activations)

    history_inputs = tf.keras.layers.Input(shape=history_input_shape)
    print(history_inputs.shape, history_input_shape)
    encoded_history = tf.keras.layers.Conv1D(filters=8, kernel_size=3, padding="same")(
        history_inputs
    )
    # Chain on the previous convolution: it used to be fed `history_inputs`
    # again, which silently discarded the first Conv1D layer.
    encoded_history = tf.keras.layers.Conv1D(filters=4, kernel_size=3)(encoded_history)
    encoded_history = tf.keras.layers.Conv1D(filters=1, kernel_size=3)(encoded_history)
    encoded_history = tf.keras.layers.Flatten()(encoded_history)

    inputs = tf.keras.layers.Input(shape=input_shape)
    hidden_out = tf.keras.layers.Concatenate()([inputs, encoded_history])

    for n_cells in hidden_units:
        hidden_out = tf.keras.layers.Dense(n_cells)(hidden_out)
        hidden_out = hid_activation(hidden_out)

    out = tf.keras.layers.Dense(output_shape)(hidden_out)
    out = tf.keras.layers.Activation(last_activation)(out)

    return tf.keras.Model(inputs=[history_inputs, inputs], outputs=out)


class ConvolutionalHistoryAndFullyConnected(DeepBaseModel):
    """Classifier combining convolved match history and standardized features."""

    def __init__(
        self,
        num_history_signals=2,
        history_length=5,
        input_shape=2,
        output_shape=2,
        hidden_units=(4, 8, 4),
        hidden_activations="relu",
        last_activation="softmax",
        epochs=50,
        reduced_lr_epochs=10,
        optimizer="adamax",
        lr=1e-5,
        loss="cross_entropy",
    ):
        """
        :param num_history_signals: second dimension of the history input.
        :param history_length: first dimension of the history input.
        :param input_shape: shape of one plain feature vector.
        :param output_shape: number of output units; 2 triggers one-hot targets
            in `fit` and an argmax in `predict`.
        :param hidden_units: units of each hidden layer.
        :param hidden_activations: activation of the hidden layers.
        :param last_activation: activation of the output layer.
        :param epochs: epochs of the first training phase.
        :param reduced_lr_epochs: epochs of the second phase, run with the
            learning rate divided by 10 (skipped when 0).
        :param optimizer: one of 'adam', 'adamax', 'sgd', 'rmsprop'.
        :param lr: initial learning rate.
        :param loss: loss handed to `Model.compile`.
        """
        self.num_history_signals = num_history_signals
        self.history_length = history_length
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.hidden_units = hidden_units
        self.hidden_activations = hidden_activations
        self.last_activation = last_activation
        self.epochs = epochs
        self.reduced_lr_epochs = reduced_lr_epochs
        self.optimizer = optimizer
        self.lr = lr
        self.loss = loss
        super().__init__()

    def instantiate_model(self):
        """Create the scaler, the network and its optimizer, then compile."""
        self.scaler_x = StandardScaler()
        self.model = create_conv_dense_model(
            history_input_shape=(self.history_length, self.num_history_signals),
            input_shape=self.input_shape,
            output_shape=self.output_shape,
            hidden_units=self.hidden_units,
            hidden_activations=self.hidden_activations,
            last_activation=self.last_activation,
        )
        self.optimizer = _build_optimizer(self.optimizer, self.lr)
        self.model.compile(optimizer=self.optimizer, loss=self.loss)

    def fit(self, X, X_history, y):
        """Fit the scaler on `X`, then train in two learning-rate phases.

        :param X: plain features, standardized before being fed to the model.
        :param X_history: history input, fed to the model as is.
        :param y: targets; one-hot encoded when the model has 2 outputs.
        """
        self.scaler_x.fit(X)
        if self.output_shape == 2:
            y = tf.one_hot(y.squeeze(), depth=2)

        print("X shape", X.shape)
        print("X history shape", X_history.shape)
        print("y shape", y.shape)
        X_scaled = self.scaler_x.transform(X)
        self.model.fit([X_history, X_scaled], y, epochs=self.epochs)
        if self.reduced_lr_epochs > 0:
            self.optimizer.learning_rate.assign(self.lr / 10)
            self.model.fit(
                [X_history, X_scaled],
                y,
                epochs=self.reduced_lr_epochs,
            )

    def predict(self, X, X_history):
        """Predict on `X`; returns class indexes when there are 2 outputs."""
        y_pred = self.model.predict([X_history, self.scaler_x.transform(X)])
        if self.output_shape == 2:
            y_pred = tf.argmax(y_pred, axis=-1)
        return y_pred

    def summary(self):
        """Print and return the Keras summary of the underlying model."""
        return self.model.summary()
# Seasons used for fitting and seasons held out for the evaluation.
train_years = [2018, 2019, 2020]
test_years = [2021, 2022]


match_features = ["tournament_surface", "tournament_level"]
player_features = [
    "Ranking",
    "Ranking_Points",
    "Height",
    "Victories_Percentage",
    "Clay_Victories_Percentage",
    "Grass_Victories_Percentage",
    "Carpet_Victories_Percentage",
    "Hard_Victories_Percentage",
    "Aces_Percentage",
]
additional_features = ["diff_rank", "v_perc_versus", "nb_match_versus"]


def run_evaluation(model_class, model_params, years):
    """Call train_test_evaluation with the feature configuration shared by every run."""
    return train_test_evaluation(
        train_years=years,
        test_years=test_years,
        model_class=model_class,
        model_params=model_params,
        match_features=match_features,
        player_features=player_features,
        encoding_params={},
        additional_features=additional_features,
        save_path="../../results/20212022_chall",
        save_all_results=False,
    )


def build_lgbm_grid():
    """Return a fresh list of LightGBM parameter sets (num_leaves x min_data_in_leaf)."""
    return [
        {
            "params": {
                "num_leaves": num_leaves,
                "objective": "binary",
                "min_data_in_leaf": min_data_leaf,
            }
        }
        for num_leaves in [10, 100, 1000, 2000]
        for min_data_leaf in [10, 100, 1000]
    ]


def build_xgb_grid():
    """Return a fresh list of XGBoost parameter sets over five hyperparameters."""
    return [
        {
            "params": {
                "eta": eta,
                "objective": "binary:logistic",
                "gamma": gamma,
                "max_depth": max_depth,
                "min_child_weight": min_child_weight,
                "subsample": subsample,
            }
        }
        for eta in [0.1, 0.3, 0.6]
        for gamma in [0, 1, 10]
        for max_depth in [2, 4, 6, 8, 10]
        for min_child_weight in [1, 2, 8]
        for subsample in [0.4, 0.8, 1]
    ]


# Baseline model, no hyperparameters.
test_score = run_evaluation(BestRankedPlayerWins, {}, train_years)

# LightGBM on the short training window.
lgbm_hyperparams = build_lgbm_grid()
test_score = run_evaluation(LightGBM, lgbm_hyperparams, train_years)


# AdaBoost.
ada_hyperparams = [
    {
        "n_estimators": num_est,
        "learning_rate": lr,
    }
    for num_est in [10, 100, 1000, 2000]
    for lr in [0.1, 1.0, 2.0]
]
test_score = run_evaluation(AdaBoostClassifier, ada_hyperparams, train_years)

# Scaled SVC.
svc_hyperparams = [
    {
        "C": C,
        "kernel": kernel,
    }
    for C in [0.1, 1.0, 10.0, 100.0]
    for kernel in ["linear", "rbf"]
]
test_score = run_evaluation(ScalerSVC, svc_hyperparams, train_years)


# Tree ensembles are evaluated one parameter set at a time, printing each score.
for model_class in (RandomForestClassifier, GradientBoostingClassifier):
    for mx_depth in [1, 3, 5]:
        for n_est in [10, 100, 1000, 2000]:
            model_params = {"n_estimators": n_est, "max_depth": mx_depth}
            test_score = run_evaluation(model_class, model_params, train_years)
            print("~~ Current Score ~~", test_score)


# LightGBM again, trained on every season from 1990 to 2020.
lgbm_hyperparams = build_lgbm_grid()
test_score = run_evaluation(LightGBM, lgbm_hyperparams, list(range(1990, 2021)))

# XGBoost on the short training window, then on the long one.
xgb_hyperparams = build_xgb_grid()
test_score = run_evaluation(XGBoost, xgb_hyperparams, train_years)

xgb_hyperparams = build_xgb_grid()
test_score = run_evaluation(XGBoost, xgb_hyperparams, list(range(1990, 2021)))
def clean_missing_data(df):
    """
    Remove the rows of df that contain missing values or a ranking equal to 0.

    :param df: DataFrame with at least the columns Ranking_1 and Ranking_2
    :return: filtered DataFrame; the input is left untouched
    """
    print("Length df before cleaning:", len(df))
    df = df.dropna(axis=0)
    print("after dropna", len(df))
    # Only rankings equal to 0 are filtered out; the equivalent filters on a
    # ranking of 9999 are intentionally left disabled.
    df = df.loc[df.Ranking_1 != 0]
    df = df.loc[df.Ranking_2 != 0]

    return df


def complete_missing_data(df, *args):
    """
    Fill the missing values of selected columns, modifying df in place.

    :param df: DataFrame to complete
    :param args: (column, value) pairs; NaNs of `column` are replaced by `value`
    :return: the same DataFrame object, for chaining
    """
    for column, value in args:
        # Assign the filled column back: `df[column].fillna(..., inplace=True)`
        # acts on a temporary object and is a no-op under pandas Copy-on-Write.
        df[column] = df[column].fillna(value)

    return df
def encode_data(df, mode="integer"):
    """
    Encode the categorical match columns of df.

    Columns are selected by name (case-insensitive substring match, tested in
    this order): "hand", "round", "tournament_level", "tournament_surface".
    Every other column is left unchanged.

    :param df: DataFrame to encode; it is not modified
    :param mode: "integer" (one integer per category), "one_hot" (one list per
        category) or "mixing" (integers for level and round, lists for surface
        and hand)
    :return: encoded copy of df
    :raises ValueError: if mode is not one of the supported values
    :raises KeyError: if a value has no encoding in the selected mode
    """
    # Open ideas kept from the original notes (none is implemented here):
    # - remove: index, Unnamed: 0, Unnamed: 0.1, tournament, Name, ID, Matches
    # - Birth Year => Age
    # - Versus => % of victories against the opponent, last 5 matches
    # - Last Tournament => days since last tournament + result?

    if mode == "integer":
        tournament_level = {"G": 0, "A": 1, "M": 2, "F": 3, "D": 4, "C": 5}
        tournament_surface = {"Clay": 0, "Carpet": 1, "Hard": 2, "Grass": 3}

        round_codes = {
            "F": 0,
            "SF": 1,
            "QF": 2,
            "R16": 3,
            "R32": 4,
            "R64": 5,
            "R128": 6,
            "R256": 7,
            "RR": 8,
            "BR": 9,
            "ER": 10,
            "Q1": 11,
            "Q2": 12,
            "Q3": 13,
        }

        hand = {
            "R": -1,
            "L": 1,
            "A": 0,
            "U": 2,
            "nan": 2,
        }

    elif mode == "one_hot":
        # NOTE(review): level "F" and rounds BR/ER/Q1-Q3 have no one-hot code, so
        # such rows raise KeyError; adding them would change the vector sizes.
        tournament_level = {
            "G": [0, 0, 0, 1, 0],
            "A": [0, 0, 1, 0, 0],
            "M": [0, 1, 0, 0, 0],
            "D": [1, 0, 0, 0, 0],
            "C": [0, 0, 0, 0, 1],
        }

        tournament_surface = {
            "Clay": [1, 0, 0, 0],
            "Carpet": [0, 1, 0, 0],
            "Hard": [0, 0, 1, 0],
            "Grass": [0, 0, 0, 1],
        }

        round_codes = {
            "F": [0, 0, 0, 0, 0, 0, 0, 0, 1],
            "SF": [0, 0, 0, 0, 0, 0, 0, 1, 0],
            "QF": [0, 0, 0, 0, 0, 0, 1, 0, 0],
            "R16": [0, 0, 0, 0, 0, 1, 0, 0, 0],
            "R32": [0, 0, 0, 0, 1, 0, 0, 0, 0],
            "R64": [0, 0, 0, 1, 0, 0, 0, 0, 0],
            "R128": [0, 0, 1, 0, 0, 0, 0, 0, 0],
            "R256": [0, 1, 0, 0, 0, 0, 0, 0, 0],
            "RR": [1, 0, 0, 0, 0, 0, 0, 0, 0],
        }

        hand = {
            "R": [1, 0, 0, 0],
            "L": [0, 1, 0, 0],
            "A": [0, 0, 1, 0],
            "U": [0, 0, 0, 1],
            # Missing hand shares the code of "U", as in the integer mode.
            "nan": [0, 0, 0, 1],
        }

    elif mode == "mixing":
        tournament_level = {"G": 0, "A": 1, "M": 2, "F": 3, "D": 4, "C": 5}
        tournament_surface = {
            "Clay": [1, 0, 0, 0],
            "Carpet": [0, 1, 0, 0],
            "Hard": [0, 0, 1, 0],
            "Grass": [0, 0, 0, 1],
        }

        round_codes = {
            "F": 0,
            "SF": 1,
            "QF": 2,
            "R16": 3,
            "R32": 4,
            "R64": 5,
            "R128": 6,
            "R256": 7,
            "RR": 8,
            "BR": 9,
        }

        hand = {
            "R": [1, 0, 0, 0],
            "L": [0, 1, 0, 0],
            "A": [0, 0, 1, 0],
            "U": [0, 0, 0, 1],
            # Missing hand shares the code of "U", as in the integer mode.
            "nan": [0, 0, 0, 1],
        }

    else:
        raise ValueError(
            f"Encoding mode {mode} not understood, must be among ['integer', 'one_hot', 'mixing']"
        )

    df_copy = df.copy()
    for col in df_copy.columns:
        lowered = col.lower()
        # A function (not the dict itself) is mapped so that an unknown category
        # raises KeyError instead of silently becoming NaN.
        if "hand" in lowered:
            # str() turns a missing value (float NaN) into the "nan" key.
            df_copy[col] = df_copy[col].map(lambda value: hand[str(value)])
        elif "round" in lowered:
            df_copy[col] = df_copy[col].map(lambda value: round_codes[value])
        elif "tournament_level" in lowered:
            df_copy[col] = df_copy[col].map(lambda value: tournament_level[value])
        elif "tournament_surface" in lowered:
            df_copy[col] = df_copy[col].map(lambda value: tournament_surface[value])

    return df_copy
def create_additional_features(df, features):
    """
    Add the requested derived columns to a copy of df.

    Supported feature names (any other name is ignored):
    - "nb_match_versus": number of entries in the Versus_1 list
    - "v_perc_versus": share of entries of Versus_1 whose first element is "V",
      or -1 when the list is empty
    - "diff_rank": Ranking_2 - Ranking_1
    - "diff_rank_points": Ranking_Points_2 - Ranking_Points_1

    :param df: source DataFrame; Versus_1 must hold the string representation
        of a list of sequences (it is parsed with ast.literal_eval)
    :param features: collection of feature names to create
    :return: copy of df with the new columns
    """
    df = df.copy()

    if "nb_match_versus" in features or "v_perc_versus" in features:
        # Parse every Versus_1 cell once and keep only the outcome flag (first
        # element) of each past confrontation.
        outcomes = df["Versus_1"].map(
            lambda versus: [entry[0] for entry in ast.literal_eval(versus)]
        )

        if "nb_match_versus" in features:
            df["nb_match_versus"] = outcomes.map(len)

        if "v_perc_versus" in features:
            # -1 flags players who never met before.
            df["v_perc_versus"] = outcomes.map(
                lambda results: results.count("V") / len(results) if results else -1
            )

    # Plain column arithmetic: same values as the row-wise version, and it also
    # works on an empty frame.
    if "diff_rank" in features:
        df["diff_rank"] = df["Ranking_2"] - df["Ranking_1"]

    if "diff_rank_points" in features:
        df["diff_rank_points"] = df["Ranking_Points_2"] - df["Ranking_Points_1"]

    return df
def _pad_encoded_history(encoded_history, num_matches, completing_value):
    """Prepend rows of completing_value until the history has num_matches rows."""
    missing_rows = num_matches - encoded_history.shape[0]
    if missing_rows <= 0:
        return encoded_history
    padding = np.ones((missing_rows, encoded_history.shape[1])) * completing_value
    return np.concatenate([padding, encoded_history], axis=0)


def _encode_player_history(
    df, row, matches_column, player_id, encoder, num_matches, completing_value
):
    """Encode the last num_matches matches listed in row[matches_column] for player_id.

    Returns (encoded_history, df_history) where df_history is the frame handed
    back by the encoder, or the empty selection when no past match was found.
    """
    try:
        last_matches = ast.literal_eval(row[matches_column])[-num_matches:]
    except Exception:
        # Keep the offending value for inspection, then let the error propagate
        # (the original re-ran the failing statement to the same effect).
        with open("error.txt", "w") as file:
            file.write(str(row.get(matches_column)))
        raise

    # Each entry is indexed as (result, match_id); only the id is needed here.
    match_ids = [match[1] for match in last_matches]

    # Keep the rows of those matches where the player sits in the "_1" slot.
    df_history = df.loc[df.id.isin(match_ids)].loc[df.ID_1 == player_id]

    if len(df_history) > 0:
        encoded_history, df_history = encoder.predict(df_history, transform_data=True)
        encoded_history = _pad_encoded_history(
            encoded_history, num_matches, completing_value
        )
    else:
        encoded_history = (
            np.ones((num_matches, encoder.output_shape)) * completing_value
        )
    return encoded_history, df_history


def create_encoded_history(df, encoder, num_matches, completing_value=0):
    """
    Encode, for every match of df, the recent history of both players.

    :param df: matches DataFrame with columns id, ID_1, ID_2, Matches_1 and
        Matches_2 (string representations of lists parsed with ast.literal_eval)
    :param encoder: object exposing `predict(df, transform_data=True)` returning
        (encoded_array, dataframe) and an `output_shape` attribute
    :param num_matches: number of past matches kept per player
    :param completing_value: value used to pad histories shorter than num_matches
    :return: DataFrame with columns id, ID_1, ID_2, history_1, history_2
    """
    df = df.copy()
    history = {
        "id": [],
        "ID_1": [],
        "ID_2": [],
        "history_1": [],
        "history_2": [],
    }

    for n_row, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        encoded_history_1, _ = _encode_player_history(
            df, row, "Matches_1", row.ID_1, encoder, num_matches, completing_value
        )
        encoded_history_2, df_history = _encode_player_history(
            df, row, "Matches_2", row.ID_2, encoder, num_matches, completing_value
        )

        history["id"].append(row.id)
        history["ID_1"].append(row.ID_1)
        history["ID_2"].append(row.ID_2)

        history["history_1"].append(encoded_history_1)
        history["history_2"].append(encoded_history_2)

        # NOTE(review): debugging dump kept from the original; it overwrites the
        # same three files in the working directory for index labels below 100.
        if n_row < 100 and len(df_history) > 0:
            row.to_csv("row.csv")
            df_history.to_csv("df_history.csv")
            np.save("encoded_history.npy", encoded_history_2)
    return pd.DataFrame(history)
class Match:
    """
    One tennis match between a winner and a loser.

    `winner` and `loser` are player objects exposing `id`, `get_data_df`,
    `get_last_months_rankings` and `update_from_match`. Match-time statistics
    are stored in `match_time_players_data` under the keys "winner" and
    "loser"; the serve statistics keep the "w_" prefix for both players.
    """

    # Column name in the statistics DataFrames -> key in match_time_players_data.
    _STATISTICS_KEYS = {
        "aces_nb": "aces_nb",
        "doublefaults_nb": "df_nb",
        "svpt": "w_svpt",
        "1stIn": "w_1stIn",
        "1stWon": "w_1stWon",
        "2ndWon": "w_2ndWon",
        "SvGms": "w_SvGms",
        "bpSaved": "w_bpSaved",
        "bpFaced": "w_bpFaced",
    }

    def __init__(self, winner, loser, tournament, surface, id_prefix=""):
        self.winner = winner
        self.loser = loser
        self.tournament = tournament
        self.surface = surface
        self.id_prefix = id_prefix

        self.tournament_date = ""
        self.tournament_level = ""
        self.round = ""
        self.data = None
        self.match_time_players_data = {
            "winner": self._empty_player_data(self.winner),
            "loser": self._empty_player_data(self.loser),
        }

        self.sets_number = 0
        self.games_number = 0
        self.score = None
        self.elapsed_minutes = None
        self.best_of = None

    @staticmethod
    def _empty_player_data(player):
        """Return the zero-initialised statistics of one player."""
        return {
            "id": player,
            "age": 0,
            "rank": 0,
            "ranking_points": 0,
            "aces_nb": 0,
            "df_nb": 0,
            "w_svpt": 0,
            "w_1stIn": 0,
            "w_1stWon": 0,
            "w_2ndWon": 0,
            "w_SvGms": 0,
            "w_bpSaved": 0,
            "w_bpFaced": 0,
        }

    def _player_data(self, player_id):
        """Return the winner's statistics if player_id is the winner's id, else the loser's."""
        role = "winner" if player_id == self.winner.id else "loser"
        return self.match_time_players_data[role]

    def get_rankings(self, player_id):
        """Return (rank, ranking_points) of the player at match time."""
        player_data = self._player_data(player_id)
        return (player_data["rank"], player_data["ranking_points"])

    def get_aces_nb(self, player_id):
        """Return the number of aces served by the player."""
        return self._player_data(player_id)["aces_nb"]

    def get_service_points_played(self, player_id):
        """Return the number of service points played by the player."""
        return self._player_data(player_id)["w_svpt"]

    def get_df_nb(self, player_id):
        """Return the number of double faults of the player."""
        return self._player_data(player_id)["df_nb"]

    def get_first_serve_win(self, player_id):
        """Return the number of points won by the player on first serve."""
        return self._player_data(player_id)["w_1stWon"]

    def get_second_serve_win(self, player_id):
        """Return the number of points won by the player on second serve."""
        return self._player_data(player_id)["w_2ndWon"]

    def get_first_services_in(self, player_id):
        """Return the number of first serves the player put in."""
        return self._player_data(player_id)["w_1stIn"]

    def get_breakpoint_faced(self, player_id):
        """Return the number of break points faced by the player."""
        return self._player_data(player_id)["w_bpFaced"]

    def get_breakpoint_saved(self, player_id):
        """Return the number of break points saved by the player."""
        return self._player_data(player_id)["w_bpSaved"]

    def __str__(self):
        # Formatting (instead of "+") also works when winner/loser are player
        # objects rather than strings.
        return f"TOURNAMENT : {self.tournament} W : {self.winner} L : {self.loser}"

    def get_prior_data_and_update_players_stats(self):
        """
        Collect what is known before the match, then update both players with it.

        :return: (match_data, w_data, l_data) DataFrames; the player frames come
            from `get_data_df` with the last 12 months of rankings added
        """
        match_data = pd.DataFrame(
            {
                "id": [self.id],
                "tournament": [self.tournament],
                "tournament_level": [self.tournament_level],
                "tournament_date": [self.tournament_date],
                "tournament_surface": [self.surface],
                "round": [self.round],
                "best_of": [self.best_of],
            }
        )

        w_data = self.winner.get_data_df(opponent=self.loser.id)
        lr, lrp = self.winner.get_last_months_rankings(
            date=self.tournament_date, nb_months=12, day_of_month="last"
        )
        w_data["last_rankings"] = [lr]
        w_data["last_ranking_points"] = [lrp]
        l_data = self.loser.get_data_df(opponent=self.winner.id)
        lr, lrp = self.loser.get_last_months_rankings(
            date=self.tournament_date, nb_months=12, day_of_month="last"
        )
        l_data["last_rankings"] = [lr]
        l_data["last_ranking_points"] = [lrp]

        # Players are updated only after their pre-match data has been read.
        self.winner.update_from_match(self)
        self.loser.update_from_match(self)
        return match_data, w_data, l_data

    def get_match_data_results_statistics(self):
        """
        Return the outcome of the match as three one-row DataFrames.

        :return: (match_statistics, winner_statistics, loser_statistics)
        """
        match_statistics = {
            "score": [self.score],
            "elapsed_minutes": [self.elapsed_minutes],
        }

        winner_data = self.match_time_players_data["winner"]
        loser_data = self.match_time_players_data["loser"]
        winner_statistics = {
            column: [winner_data[key]] for column, key in self._STATISTICS_KEYS.items()
        }
        loser_statistics = {
            column: [loser_data[key]] for column, key in self._STATISTICS_KEYS.items()
        }

        return (
            pd.DataFrame(match_statistics),
            pd.DataFrame(winner_statistics),
            pd.DataFrame(loser_statistics),
        )

    def instantiate_from_data_row(self, data_row):
        """
        Fill the match from one row of the source matches table.

        :param data_row: pandas Series with the tourney_*, round, score, minutes,
            best_of, winner_*/loser_* and w_*/l_* fields read below; its `name`
            is appended to id_prefix to build the match id
        """
        import re

        self.tournament_date = data_row["tourney_date"]
        self.tournament_level = data_row["tourney_level"]
        self.round = data_row["round"]

        # A set is a space-separated token starting with "<games>-<games>"; a
        # tie-break suffix such as "(5)" is ignored and tokens like "RET", "W/O"
        # or "[10-8]" are not counted. The original split on "-" (one set too
        # many) and hit a NameError that dropped the loser's games.
        self.sets_number = 0
        self.games_number = 0
        for set_score in str(data_row["score"]).split(" "):
            parsed = re.match(r"(\d+)-(\d+)", set_score)
            if parsed is None:
                continue
            self.sets_number += 1
            self.games_number += int(parsed.group(1)) + int(parsed.group(2))

        self.score = data_row["score"]
        self.elapsed_minutes = data_row["minutes"]
        self.best_of = data_row["best_of"]

        self.id = self.id_prefix + "_" + str(data_row.name)

        self.match_time_players_data = {
            "winner": {
                "id": data_row["winner_id"],
                "age": data_row["winner_age"],
                "rank": data_row["winner_rank"],
                "ranking_points": data_row["winner_rank_points"],
                "aces_nb": data_row["w_ace"],
                "df_nb": data_row["w_df"],
                "w_svpt": data_row["w_svpt"],
                "w_1stIn": data_row["w_1stIn"],
                "w_1stWon": data_row["w_1stWon"],
                "w_2ndWon": data_row["w_2ndWon"],
                "w_SvGms": data_row["w_SvGms"],
                "w_bpSaved": data_row["w_bpSaved"],
                "w_bpFaced": data_row["w_bpFaced"],
            },
            "loser": {
                "id": data_row["loser_id"],
                "age": data_row["loser_age"],
                "rank": data_row["loser_rank"],
                "ranking_points": data_row["loser_rank_points"],
                "aces_nb": data_row["l_ace"],
                "df_nb": data_row["l_df"],
                "w_svpt": data_row["l_svpt"],
                "w_1stIn": data_row["l_1stIn"],
                "w_1stWon": data_row["l_1stWon"],
                "w_2ndWon": data_row["l_2ndWon"],
                "w_SvGms": data_row["l_SvGms"],
                "w_bpSaved": data_row["l_bpSaved"],
                "w_bpFaced": data_row["l_bpFaced"],
            },
        }
data_row["w_bpFaced"], 228 | }, 229 | "loser": { 230 | "id": data_row["loser_id"], 231 | "age": data_row["loser_age"], 232 | "rank": data_row["loser_rank"], 233 | "ranking_points": data_row["loser_rank_points"], 234 | "aces_nb": data_row["l_ace"], 235 | "df_nb": data_row["l_df"], 236 | "w_svpt": data_row["l_svpt"], 237 | "w_1stIn": data_row["l_1stIn"], 238 | "w_1stWon": data_row["l_1stWon"], 239 | "w_2ndWon": data_row["l_2ndWon"], 240 | "w_SvGms": data_row["l_SvGms"], 241 | "w_bpSaved": data_row["l_bpSaved"], 242 | "w_bpFaced": data_row["l_bpFaced"], 243 | }, 244 | } 245 | -------------------------------------------------------------------------------- /python/evaluation/train_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from data.data_loader import matches_data_loader 8 | from data.data_encoding import ( 9 | encode_data, 10 | create_additional_features, 11 | clean_missing_data, 12 | create_encoded_history, 13 | ) 14 | 15 | absolute_path = os.path.dirname(os.path.abspath(__file__)) 16 | default_columns_match = ["tournament_level", "round", "best_of", "tournament_surface"] 17 | 18 | default_columns_player = [ 19 | "Ranking", 20 | "Ranking_Points", 21 | "Hand", 22 | "Height", 23 | "Versus", 24 | "Victories_Percentage", 25 | "Clay_Victories_Percentage", 26 | "Grass_Victories_Percentage", 27 | "Carpet_Victories_Percentage", 28 | "Hard_Victories_Percentage", 29 | "Aces_Percentage", 30 | "Doublefaults_Percentage", 31 | "First_Serve_Success_Percentage", 32 | "Winning_on_1st_Serve_Percentage", 33 | "Winning_on_2nd_Serve_Percentage", 34 | "Overall_Win_on_Serve_Percentage", 35 | "BreakPoint_Face_Percentage", 36 | "BreakPoint_Saved_Percentage", 37 | "Fatigue", 38 | ] 39 | 40 | 41 | def train_test_evaluation( 42 | train_years, 43 | test_years, 44 | model_class, 45 | model_params, 46 | encoder_models=[], 47 | use_davis_data=False, 48 | 
def _load_previous_results(save_path):
    """Return the results table stored in save_path, or an empty one if it cannot be read."""
    try:
        return pd.read_csv(os.path.join(save_path, "results.csv"), sep=";")
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: pandas' empty-file and
        # parser errors. Other exceptions (e.g. KeyboardInterrupt) propagate.
        print("save file not found")
        os.makedirs(save_path, exist_ok=True)
        return pd.DataFrame()


def _fit_and_score(model_class, params, train_data, test_data, feature_columns):
    """Fit one model on train_data and return (preds, precision, fit_time) on test_data."""
    model = model_class(**params)
    t_fit = time.time()
    model.fit(train_data[feature_columns], train_data["Winner"].values.ravel())
    t_fit = time.time() - t_fit
    print(f"~~ Fit time: {np.round(t_fit, 0)}")

    print(f"[+] Fit ended, now predicting on test set")
    preds = model.predict(test_data[feature_columns])
    # Share of test matches whose predicted winner equals the recorded one.
    precision = np.sum(np.squeeze(preds) == test_data["Winner"].values) / len(preds)
    return preds, precision, t_fit


def _save_evaluation(save_path, summary, test_data, preds, save_all_results):
    """Append one evaluation to results.csv and optionally dump its predictions."""
    df_res = _load_previous_results(save_path)
    df_curr = pd.DataFrame(summary)

    if save_all_results:
        print(f"[+] Saving Results")
        # Hundredths of a second keep ids distinct within a hyperparameter sweep.
        eval_id = int(time.time() * 100)
        df_curr["eval_ID"] = [eval_id]
        # assign() works on a copy, so the shared test set is never modified.
        test_data.assign(y_pred=preds).to_csv(
            os.path.join(save_path, f"{eval_id}.csv"), index=False, sep=";"
        )

    df_res = pd.concat([df_res, df_curr], axis=0)
    df_res.to_csv(os.path.join(save_path, "results.csv"), index=False, sep=";")


def train_test_evaluation(
    train_years,
    test_years,
    model_class,
    model_params,
    encoder_models=[],
    use_davis_data=False,
    history_encoder_years=1,
    match_features=default_columns_match,
    player_features=default_columns_player,
    encoding_params={},
    additional_features=[],
    save_path=None,
    save_all_results=False,
):
    """
    Train a model on some seasons and evaluate it on other seasons.

    :param train_years: years whose matches are used for fitting
    :param test_years: years whose matches are used for evaluation; must not
        overlap train_years
    :param model_class: class instantiated as `model_class(**params)` and
        exposing `fit(X, y)` and `predict(X)`
    :param model_params: dict of parameters, or list of dicts to evaluate in turn
    :param encoder_models: list of (encoder_class, encoder_params) used to
        encode the players' match history into extra feature columns
    :param use_davis_data: forwarded to the loader as `include_davis_cup`
    :param history_encoder_years: number of extra years loaded before the first
        train/test year
    :param match_features: match-level columns fed to the model
    :param player_features: player-level columns; "_1" and "_2" are appended
    :param encoding_params: keyword arguments forwarded to `encode_data`
    :param additional_features: names handed to `create_additional_features`
    :param save_path: directory receiving results.csv (None disables saving)
    :param save_all_results: also save the test set with its predictions
    :return: the precision, or the list of precisions when model_params is a list

    The list/dict default arguments are only read, never mutated.
    """
    # Accept any sequence of years (tuple, range, ...), not only lists.
    train_years = list(train_years)
    test_years = list(test_years)
    assert len(set(train_years).intersection(set(test_years))) == 0
    print(f"[+] Beginning Train/Test Evaluation for model class {model_class}")

    min_year = np.min(train_years + test_years)
    min_year -= history_encoder_years
    print(f"[+] Loading Data from year {min_year}")
    data_df = matches_data_loader(
        path_to_data=os.path.join(absolute_path, "../../submodules/tennis_atp"),
        path_to_cache=os.path.join(absolute_path, "../../cache"),
        flush_cache=False,
        keep_values_from_year=min_year,
        get_match_statistics=False,
        get_reversed_match_data=True,
        include_davis_cup=use_davis_data,
    )
    print(f"[+] Data Loaded, Now Encoding Data and create additional Features")

    # Every encoder is fitted on the training seasons as loaded, i.e. without
    # the columns added by previous encoders.
    train_data = data_df.loc[data_df.tournament_year.isin(train_years)]

    history_columns = []
    for encoding_model, encoding_model_params in encoder_models:
        print(f"[+] Training Encoder Model {encoding_model}")
        encoder = encoding_model(**encoding_model_params)
        encoder.fit(train_data)

        print(f"[+] Encoding using encoder {encoding_model}")
        encoded_data = create_encoded_history(
            data_df, encoder, num_matches=5, completing_value=0
        )

        cols = ["history_1", "history_2"]

        # Flatten each per-match history array into one column per value.
        flatten_data = pd.concat(
            [
                pd.DataFrame(
                    np.array(encoded_data[x].values.tolist()).reshape(
                        (len(encoded_data), -1)
                    )
                ).add_prefix(x)
                for x in cols
            ],
            axis=1,
        )
        encoded_data = pd.concat(
            [flatten_data, encoded_data.drop(cols, axis=1)], axis=1
        )
        enc_columns = encoded_data.columns
        enc_columns = list(set(enc_columns) - set(["id", "ID_1", "ID_2"]))
        history_columns.extend(enc_columns)

        data_df = pd.merge(data_df, encoded_data, on=["id", "ID_1", "ID_2"])

    train_data = data_df.loc[data_df.tournament_year.isin(train_years)]
    test_data = data_df.loc[data_df.tournament_year.isin(test_years)]
    train_data = create_additional_features(train_data, additional_features)
    train_data = encode_data(train_data, **encoding_params)
    test_data = create_additional_features(test_data, additional_features)
    test_data = encode_data(test_data, **encoding_params)

    p1_features = [feat + "_1" for feat in player_features]
    p2_features = [feat + "_2" for feat in player_features]
    # New list: the caller's (or the module default) list is left untouched.
    match_features = list(match_features) + list(additional_features)

    # The encoded history is part of the model inputs; the original selected
    # these columns but never passed them to fit/predict.
    feature_columns = match_features + p1_features + p2_features + history_columns
    kept_columns = feature_columns + ["Winner", "tournament_year"]
    train_data = train_data[kept_columns]
    test_data = test_data[kept_columns]

    print(f"[+] Cleaning Data")
    train_data = clean_missing_data(train_data)
    test_data = clean_missing_data(test_data)
    print(f"Training on {len(train_data)} data and testing on {len(test_data)} data")

    print(f"[+] Data Ready, now beginning modelling")
    is_grid = isinstance(model_params, list)
    precisions = []
    for params_set in model_params if is_grid else [model_params]:
        preds, precision, t_fit = _fit_and_score(
            model_class, params_set, train_data, test_data, feature_columns
        )
        precisions.append(precision)

        if save_path is not None:
            summary = {
                "train_years": [train_years],
                "test_years": [test_years],
                "model_class": [model_class.__name__],
                "model_params": [params_set],
                "encoder_models": [encoder_models],
                "history_encoder_years": [history_encoder_years],
                "match_features": [match_features],
                "player_features": [player_features],
                "encoding_params": [encoding_params],
                "additional_features": [list(additional_features)],
                "precision": [precision],
                "fit_time": [np.round(t_fit, 0)],
            }
            _save_evaluation(save_path, summary, test_data, preds, save_all_results)

    return precisions if is_grid else precisions[0]
if save_path is not None: 219 | try: 220 | df_res = pd.read_csv(os.path.join(save_path, "results.csv"), sep=";") 221 | except: 222 | print("save file not found") 223 | os.makedirs(save_path, exist_ok=True) 224 | df_res = pd.DataFrame() 225 | 226 | df_curr = pd.DataFrame( 227 | { 228 | "train_years": [train_years], 229 | "test_years": [test_years], 230 | "model_class": [model_class.__name__], 231 | "model_params": [model_params], 232 | "encoder_models": [encoder_models], 233 | "history_encoder_years": [history_encoder_years], 234 | "match_features": [match_features], 235 | "player_features": [player_features], 236 | "encoding_params": [encoding_params], 237 | "additional_features": [additional_features.copy()], 238 | "precision": [precision], 239 | "fit_time": [np.round(t_fit, 0)], 240 | } 241 | ) 242 | if save_all_results: 243 | print(f"[+] Saving Results") 244 | eval_id = int(time.time()) 245 | df_curr["eval_ID"] = [eval_id] 246 | test_data["y_pred"] = preds 247 | test_data.to_csv( 248 | os.path.join(save_path, f"{eval_id}.csv"), index=False, sep=";" 249 | ) 250 | 251 | df_res = pd.concat([df_res, df_curr], axis=0) 252 | df_res.to_csv(os.path.join(save_path, "results.csv"), index=False, sep=";") 253 | 254 | return precision 255 | -------------------------------------------------------------------------------- /examples/data/data_row_example.csv: -------------------------------------------------------------------------------- 1 | 
,level_0,index,id,tournament,tournament_level,tournament_date,tournament_surface,round,best_of,match_id,Name_1,ID_1,Ranking_1,Ranking_Points_1,Ranking_History_1,Best_Rank_1,Birth_Year_1,Versus_1,Hand_1,Last_Tournament_Date_1,Height_1,Matches_1,Matches_Clay_1,Matches_Carpet_1,Matches_Grass_1,Matches_Hard_1,Victories_Percentage_1,Clay_Victories_Percentage_1,Carpet_Victories_Percentage_1,Grass_Victories_Percentage_1,Hard_Victories_Percentage_1,Aces_Percentage_1,Doublefaults_Percentage_1,First_Serve_Success_Percentage_1,Winning_on_1st_Serve_Percentage_1,Winning_on_2nd_Serve_Percentage_1,Overall_Win_on_Serve_Percentage_1,BreakPoint_Face_Percentage_1,BreakPoint_Saved_Percentage_1,games_fatigue_1,minutes_fatigue_1,last_rankings_1,last_ranking_points_1,Name_2,ID_2,Ranking_2,Ranking_Points_2,Ranking_History_2,Best_Rank_2,Birth_Year_2,Versus_2,Hand_2,Last_Tournament_Date_2,Height_2,Matches_2,Matches_Clay_2,Matches_Carpet_2,Matches_Grass_2,Matches_Hard_2,Victories_Percentage_2,Clay_Victories_Percentage_2,Carpet_Victories_Percentage_2,Grass_Victories_Percentage_2,Hard_Victories_Percentage_2,Aces_Percentage_2,Doublefaults_Percentage_2,First_Serve_Success_Percentage_2,Winning_on_1st_Serve_Percentage_2,Winning_on_2nd_Serve_Percentage_2,Overall_Win_on_Serve_Percentage_2,BreakPoint_Face_Percentage_2,BreakPoint_Saved_Percentage_2,games_fatigue_2,minutes_fatigue_2,last_rankings_2,last_ranking_points_2,Winner,score,elapsed_minutes,aces_nb_1,doublefaults_nb_1,svpt_1,1stIn_1,1stWon_1,2ndWon_1,SvGms_1,bpSaved_1,bpFaced_1,aces_nb_2,doublefaults_nb_2,svpt_2,1stIn_2,1stWon_2,2ndWon_2,SvGms_2,bpSaved_2,bpFaced_2,tournament_year,Fatigue_1,Fatigue_2 2 | 10,25616,10854,atp_matches_qual_chall_2003_5427,San Benedetto CH,C,20030811,Clay,SF,3,atp_matches_qual_chall_2003_5427,Stan.Wawrinka,104527,284.0,114.0,"{20030616: [387, 68], 20030707: [363, 74], 20030714: [348, 79], 20030721: [303, 99], 20030811: [284, 114]}",284,19850328.0,[],R,20030721,183.0,"[['V', 'atp_matches_qual_chall_2003_3466'], ['D', 
'atp_matches_qual_chall_2003_3481'], ['D', 'atp_matches_2003_4049'], ['V', 'atp_matches_2003_4315'], ['D', 'atp_matches_2003_4328'], ['V', 'atp_matches_2003_4773'], ['D', 'atp_matches_2003_4782'], ['V', 'atp_matches_qual_chall_2003_5408'], ['V', 'atp_matches_qual_chall_2003_5419'], ['V', 'atp_matches_qual_chall_2003_5424']]","['V', 'D', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'V']",[],[],[],60.0,60.0,0.0,0.0,0.0,3.418803418803419,4.273504273504273,64.95726495726495,54.98575498575499,15.669515669515668,70.65527065527066,11.396011396011396,7.6923076923076925,38.09090909090909,,"[303, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 387]","[99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68]",Martin.Vassallo Arguello,103506,125.0,296.0,"{19990201: [817, 13], 20000710: [398, 61], 20000731: [354, 75], 20000807: [377, 70], 20010625: [459, 48], 20010709: [405, 61], 20010813: [391, 68], 20010820: [374, 72], 20010827: [342, 88], 20010917: [291, 117], 20010924: [286, 122], 20011008: [238, 154], 20011015: [237, 157], 20011022: [211, 178], 20011112: [206, 181], 20011126: [198, 186], 20011203: [201, 186], 20011231: [202, 186], 20020318: [175, 220], 20020325: [175, 220], 20020401: [178, 213], 20020408: [173, 219], 20020422: [174, 219], 20020429: [176, 217], 20020506: [151, 265], 20020513: [140, 286], 20020527: [140, 285], 20020610: [135, 304], 20020617: [123, 328], 20020624: [123, 328], 20020701: [123, 328], 20020708: [125, 320], 20020715: [132, 311], 20020722: [129, 312], 20020819: [136, 304], 20020930: [165, 220], 20021007: [158, 232], 20030127: [204, 164], 20030210: [204, 164], 20030217: [203, 168], 20030224: [198, 172], 20030324: [197, 177], 20030421: [195, 177], 20030428: [188, 188], 20030512: [255, 118], 20030526: [204, 167], 20030602: [204, 167], 20030609: [211, 163], 20030616: [230, 137], 20030623: [233, 137], 20030630: [233, 137], 20030707: [218, 157], 20030714: [181, 202], 20030721: [163, 232], 20030728: [157, 247], 20030804: [126, 296], 20030811: [125, 
296]}",123.0,19800210.0,[],R,20030804.0,183.0,"[['V', 'atp_matches_qual_chall_1999_380'], ['D', 'atp_matches_qual_chall_1999_393'], ['V', 'atp_matches_qual_chall_2000_3972'], ['V', 'atp_matches_qual_chall_2000_3988'], ['D', 'atp_matches_qual_chall_2000_3996'], ['D', 'atp_matches_qual_chall_2000_4725'], ['D', 'atp_matches_qual_chall_2000_4758'], ['V', 'atp_matches_qual_chall_2001_3699'], ['V', 'atp_matches_qual_chall_2001_3712'], ['D', 'atp_matches_qual_chall_2001_3719'], ['V', 'atp_matches_qual_chall_2001_4080'], ['V', 'atp_matches_qual_chall_2001_4089'], ['D', 'atp_matches_qual_chall_2001_4093'], ['V', 'atp_matches_qual_chall_2001_5286'], ['D', 'atp_matches_qual_chall_2001_5295'], ['V', 'atp_matches_qual_chall_2001_5433'], ['V', 'atp_matches_qual_chall_2001_5446'], ['D', 'atp_matches_qual_chall_2001_5453'], ['V', 'atp_matches_qual_chall_2001_5805'], ['V', 'atp_matches_qual_chall_2001_5814'], ['V', 'atp_matches_qual_chall_2001_5818'], ['D', 'atp_matches_qual_chall_2001_5820'], ['V', 'atp_matches_qual_chall_2001_6263'], ['D', 'atp_matches_qual_chall_2001_6275'], ['V', 'atp_matches_qual_chall_2001_6452'], ['V', 'atp_matches_qual_chall_2001_6461'], ['V', 'atp_matches_qual_chall_2001_6466'], ['V', 'atp_matches_qual_chall_2001_6468'], ['D', 'atp_matches_qual_chall_2001_6469'], ['D', 'atp_matches_qual_chall_2001_6943'], ['V', 'atp_matches_qual_chall_2001_7080'], ['V', 'atp_matches_qual_chall_2001_7090'], ['V', 'atp_matches_qual_chall_2001_7095'], ['D', 'atp_matches_qual_chall_2001_7097'], ['V', 'atp_matches_qual_chall_2001_7140'], ['D', 'atp_matches_qual_chall_2001_7151'], ['V', 'atp_matches_qual_chall_2001_7658'], ['D', 'atp_matches_qual_chall_2001_7673'], ['D', 'atp_matches_qual_chall_2001_7822'], ['D', 'atp_matches_qual_chall_2001_7886'], ['V', 'atp_matches_qual_chall_2002_70'], ['V', 'atp_matches_qual_chall_2002_82'], ['V', 'atp_matches_qual_chall_2002_88'], ['V', 'atp_matches_qual_chall_2002_91'], ['D', 'atp_matches_qual_chall_2002_92'], ['D', 
'atp_matches_qual_chall_2002_1710'], ['D', 'atp_matches_qual_chall_2002_1773'], ['V', 'atp_matches_qual_chall_2002_1840'], ['D', 'atp_matches_qual_chall_2002_1849'], ['D', 'atp_matches_qual_chall_2002_1983'], ['D', 'atp_matches_qual_chall_2002_2256'], ['V', 'atp_matches_qual_chall_2002_2326'], ['V', 'atp_matches_qual_chall_2002_2334'], ['V', 'atp_matches_qual_chall_2002_2338'], ['V', 'atp_matches_qual_chall_2002_2340'], ['V', 'atp_matches_qual_chall_2002_2341'], ['V', 'atp_matches_qual_chall_2002_2537'], ['V', 'atp_matches_qual_chall_2002_2548'], ['V', 'atp_matches_qual_chall_2002_2554'], ['D', 'atp_matches_qual_chall_2002_2557'], ['D', 'atp_matches_qual_chall_2002_2594'], ['D', 'atp_matches_2002_2922'], ['V', 'atp_matches_qual_chall_2002_3031'], ['V', 'atp_matches_qual_chall_2002_3055'], ['V', 'atp_matches_qual_chall_2002_3103'], ['V', 'atp_matches_qual_chall_2002_3384'], ['V', 'atp_matches_qual_chall_2002_3396'], ['V', 'atp_matches_qual_chall_2002_3402'], ['D', 'atp_matches_qual_chall_2002_3405'], ['D', 'atp_matches_qual_chall_2002_3475'], ['D', 'atp_matches_qual_chall_2002_3812'], ['V', 'atp_matches_qual_chall_2002_3874'], ['D', 'atp_matches_qual_chall_2002_3884'], ['D', 'atp_matches_2002_4022'], ['V', 'atp_matches_qual_chall_2002_4321'], ['D', 'atp_matches_qual_chall_2002_4333'], ['D', 'atp_matches_qual_chall_2002_4407'], ['D', 'atp_matches_qual_chall_2002_5341'], ['V', 'atp_matches_qual_chall_2002_6432'], ['V', 'atp_matches_qual_chall_2002_6447'], ['D', 'atp_matches_qual_chall_2002_6455'], ['D', 'atp_matches_qual_chall_2002_6528'], ['D', 'atp_matches_qual_chall_2003_527'], ['V', 'atp_matches_qual_chall_2003_859'], ['D', 'atp_matches_qual_chall_2003_874'], ['V', 'atp_matches_qual_chall_2003_959'], ['D', 'atp_matches_qual_chall_2003_972'], ['V', 'atp_matches_qual_chall_2003_1047'], ['D', 'atp_matches_qual_chall_2003_1062'], ['V', 'atp_matches_qual_chall_2003_1650'], ['D', 'atp_matches_qual_chall_2003_1660'], ['V', 'atp_matches_qual_chall_2003_2123'], ['V', 
'atp_matches_qual_chall_2003_2137'], ['D', 'atp_matches_qual_chall_2003_2144'], ['D', 'atp_matches_qual_chall_2003_2219'], ['V', 'atp_matches_qual_chall_2003_2620'], ['V', 'atp_matches_qual_chall_2003_2635'], ['V', 'atp_matches_qual_chall_2003_2642'], ['V', 'atp_matches_qual_chall_2003_2646'], ['V', 'atp_matches_qual_chall_2003_2648'], ['V', 'atp_matches_qual_chall_2003_2774'], ['V', 'atp_matches_qual_chall_2003_2787'], ['D', 'atp_matches_qual_chall_2003_2793'], ['D', 'atp_matches_qual_chall_2003_3000'], ['V', 'atp_matches_qual_chall_2003_3174'], ['D', 'atp_matches_qual_chall_2003_3184'], ['D', 'atp_matches_qual_chall_2003_3348'], ['D', 'atp_matches_qual_chall_2003_3467'], ['V', 'atp_matches_qual_chall_2003_3562'], ['V', 'atp_matches_qual_chall_2003_3577'], ['V', 'atp_matches_qual_chall_2003_3585'], ['D', 'atp_matches_qual_chall_2003_3589'], ['V', 'atp_matches_qual_chall_2003_4000'], ['D', 'atp_matches_qual_chall_2003_4009'], ['V', 'atp_matches_qual_chall_2003_4184'], ['V', 'atp_matches_qual_chall_2003_4194'], ['V', 'atp_matches_qual_chall_2003_4199'], ['V', 'atp_matches_qual_chall_2003_4201'], ['V', 'atp_matches_qual_chall_2003_4202'], ['V', 'atp_matches_qual_chall_2003_4491'], ['V', 'atp_matches_qual_chall_2003_4501'], ['V', 'atp_matches_qual_chall_2003_4506'], ['V', 'atp_matches_qual_chall_2003_4509'], ['D', 'atp_matches_qual_chall_2003_4510'], ['V', 'atp_matches_qual_chall_2003_4544'], ['V', 'atp_matches_qual_chall_2003_4559'], ['D', 'atp_matches_qual_chall_2003_4566'], ['V', 'atp_matches_qual_chall_2003_4853'], ['V', 'atp_matches_qual_chall_2003_4869'], ['V', 'atp_matches_qual_chall_2003_4877'], ['V', 'atp_matches_qual_chall_2003_4881'], ['V', 'atp_matches_qual_chall_2003_4883'], ['D', 'atp_matches_qual_chall_2003_5283'], ['V', 'atp_matches_qual_chall_2003_5413'], ['V', 'atp_matches_qual_chall_2003_5421'], ['V', 'atp_matches_qual_chall_2003_5425']]","['V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 
'V', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D', 'D', 'D', 'D', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'D', 'D', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V']","['D', 'V', 'D', 'V', 'D']",['D'],"['D', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D']",61.02941176470589,63.559322033898304,40.0,0.0,50.0,4.824561403508771,5.263157894736842,61.40350877192983,46.49122807017544,18.859649122807017,65.35087719298245,9.649122807017545,5.701754385964912,40.44444444444444,,"[157, 136, 165, 158, 9999, 9999, 204, 198, 197, 188, 204, 233]","[247, 304, 220, 232, 0, 0, 164, 172, 177, 188, 167, 137]",0.0,2-6 7-5 7-5,,,,,,,,,,,,,,,,,,,,2003.0,, 3 | -------------------------------------------------------------------------------- /examples/data/data_loading.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os, sys 3 | 4 | sys.path.append("../../python") 5 | sys.path.append("../../") 6 | 7 | import matplotlib.pyplot as plt 8 | from matplotlib.patches import Rectangle 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from data.data_loader import matches_data_loader 13 | 14 | 15 | data_df = matches_data_loader( 16 | path_to_data="../../submodules/tennis_atp", # Path to tennis_atp submodule, keep as is if repo cloned with subdmodule 17 | path_to_cache="../../cache", # Path to caching directory 18 | flush_cache=False, # Whether or not to flush a potentially existing cache. 
Set to True if you want to create the data from scratch 19 | keep_values_from_year=2002, # Returned data will date back to January 2002 up to today 20 | get_match_statistics=True, # Whether to also retrun match statistics (time, score, etc...) 21 | get_reversed_match_data=True, # Whether to duplicate the mathc row and exchange winner and loser positions 22 | include_davis_cup=True, # Whether or not to include davis cup matches 23 | match_type=[ 24 | "main_atp", 25 | "qualifying_challengers", 26 | ], # Which match to keep. You can look at tennis_atp submodule to see possibilities 27 | ) 28 | 29 | print(data_df.head()) 30 | print(data_df.shape) 31 | 32 | # Creation of first figure 33 | # Win percentages considering the ranks of players 34 | 35 | # Rank categories 36 | categories = [1, 10, 50, 100, 300, 1000, 9999] 37 | 38 | best_ranked_player_win_percentage = [] 39 | categories_number_of_matches = [] 40 | 41 | for cat_1 in range(len(categories) - 1): 42 | lines = [] 43 | nb_matches_lines = [] 44 | for cat_2 in range(len(categories) - 1): 45 | sub_df = data_df.loc[data_df.Ranking_1 >= categories[cat_1]].loc[ 46 | data_df.Ranking_1 < categories[cat_1 + 1] 47 | ] 48 | sub_df = sub_df.loc[sub_df.Ranking_2 >= categories[cat_2]].loc[ 49 | sub_df.Ranking_2 < categories[cat_2 + 1] 50 | ] 51 | sub_df["best_rank"] = sub_df.apply( 52 | lambda row: np.argmin([row["Ranking_1"], row["Ranking_2"]]), axis=1 53 | ) 54 | 55 | if len(sub_df) > 0: 56 | best_player_w_p = np.sum( 57 | sub_df.Winner.values == sub_df.best_rank.values 58 | ) / len(sub_df) 59 | 60 | else: 61 | best_player_w_p = 0 62 | lines.append(best_player_w_p) 63 | nb_matches_lines.append(len(sub_df) / 2) 64 | best_ranked_player_win_percentage.append(lines) 65 | categories_number_of_matches.append(nb_matches_lines) 66 | print( 67 | "Number of matches with player ranked 0:", len(data_df.loc[data_df.Ranking_1 == 0]) 68 | ) 69 | print( 70 | "Number of matches with player ranked > 9999:", 71 | 
len(data_df.loc[data_df.Ranking_1 > 9999]), 72 | ) 73 | 74 | colors = ["purple", "blue", "cyan", "green", "yellow", "orange", "red"] 75 | fig, ax = plt.subplots() 76 | 77 | for i, val1 in enumerate(best_ranked_player_win_percentage): 78 | for j, val2 in enumerate(val1): 79 | color = colors[int(val2 * (len(colors) - 1))] 80 | rect = plt.Rectangle((i, j), 1, 1, fc=color) 81 | ax.add_patch(rect) 82 | plt.text(i + 0.2, j + 0.35, np.round(val2 * 100, 2)) 83 | 84 | for i in range(len(categories)): 85 | plt.plot([i, i], [0, len(categories) - 1], c="k") 86 | plt.plot([0, len(categories) - 1], [i, i], c="k") 87 | 88 | plt.xticks(list(range(len(categories))), labels=categories) 89 | plt.yticks(list(range(len(categories))), labels=categories) 90 | plt.xlabel("Player 1 Rank Category") 91 | plt.ylabel("Player 2 Rank Category") 92 | plt.title("Best player Win percentage per Rank Category") 93 | plt.savefig("Best_player_win_percentage.png") 94 | plt.show() 95 | 96 | # Second figure 97 | # Number of matches considering players ranks 98 | fig, ax = plt.subplots() 99 | 100 | for i, val1 in enumerate(categories_number_of_matches): 101 | for j, val2 in enumerate(val1): 102 | color = colors[ 103 | int( 104 | val2**0.5 105 | / np.max(categories_number_of_matches) ** 0.5 106 | * (len(colors) - 1) 107 | ) 108 | ] 109 | rect = plt.Rectangle((i, j), 1, 1, fc=color) 110 | ax.add_patch(rect) 111 | plt.text(i + 0.2, j + 0.35, int(val2)) 112 | 113 | for i in range(len(categories)): 114 | plt.plot([i, i], [0, len(categories) - 1], c="k") 115 | plt.plot([0, len(categories) - 1], [i, i], c="k") 116 | 117 | plt.xticks(list(range(len(categories))), labels=categories) 118 | plt.yticks(list(range(len(categories))), labels=categories) 119 | plt.xlabel("Player 1 Rank Category") 120 | plt.ylabel("Player 2 Rank Category") 121 | plt.title("Number of matches recorded per Rank Category") 122 | plt.savefig("nb_matches.png") 123 | plt.show() 124 | 125 | #### Stan the man 126 | # Statistics analysis of Stan 
Wawrinka over time 127 | overall_v = [] 128 | last_hundred_v = [] 129 | 130 | overall_clay = [] 131 | overall_carpet = [] 132 | overall_grass = [] 133 | overall_hard = [] 134 | 135 | wins_clay = [] 136 | wins_carpet = [] 137 | wins_grass = [] 138 | wins_hard = [] 139 | 140 | dates = [] 141 | stan_df = data_df.loc[data_df.ID_1 == 104527] 142 | stan_df = stan_df.reset_index() 143 | 144 | stan_df.iloc[100].to_csv("single_row_example.csv") 145 | 146 | for n_row, row in stan_df.iterrows(): 147 | matches = [r[0] for r in ast.literal_eval(str(row["Matches_1"]))] 148 | 149 | if len(matches) > 0: 150 | overall_v.append(matches.count("V") / len(matches) * 100) 151 | last_hundred_v.append(matches[-100:].count("V") / len(matches[-100:]) * 100) 152 | 153 | if str(row["tournament_date"])[:4] not in [d[0] for d in dates]: 154 | dates.append((str(row["tournament_date"])[:4], n_row)) 155 | overall_clay.append(row["Clay_Victories_Percentage_1"]) 156 | overall_grass.append(row["Grass_Victories_Percentage_1"]) 157 | overall_hard.append(row["Hard_Victories_Percentage_1"]) 158 | overall_carpet.append(row["Carpet_Victories_Percentage_1"]) 159 | 160 | wins_clay.append(list(row.Matches_Clay_1).count("V")) 161 | wins_carpet.append(list(row.Matches_Carpet_1).count("V")) 162 | wins_grass.append(list(row.Matches_Grass_1).count("V")) 163 | wins_hard.append(list(row.Matches_Hard_1).count("V")) 164 | 165 | # % Victory over time and surfaces 166 | plt.figure() 167 | plt.plot(overall_v, label="overall") 168 | plt.plot(last_hundred_v, label="last 100 matches") 169 | plt.plot(overall_clay, label="overall clay") 170 | plt.plot(overall_grass, label="overall grass") 171 | plt.plot(overall_hard, label="overall hard") 172 | plt.plot(overall_carpet, label="overall carpet") 173 | plt.legend() 174 | plt.xticks([d[1] for d in dates], [d[0] for d in dates], rotation="vertical") 175 | plt.title("Stanislas Wawrinka win percentage on main ATP tournamnents") 176 | plt.savefig("stan_the_man_win_percentage.png") 177 
| plt.show() 178 | 179 | 180 | fig, ax1 = plt.subplots() 181 | ax1.plot(overall_v, label="overall", c="k") 182 | ax1.plot(last_hundred_v, label="last 100 matches", c="purple") 183 | ax1.plot(overall_clay, label="overall clay", c="orange") 184 | ax1.plot(overall_grass, label="overall grass", c="green") 185 | ax1.plot(overall_hard, label="overall hard", c="blue") 186 | ax1.plot(overall_carpet, label="overall carpet", c="gray") 187 | ax1.set_ylabel("Win %") 188 | plt.legend() 189 | 190 | ax2 = ax1.twinx() 191 | for i, (wcarpet, wgrass, wclay, whard) in enumerate( 192 | zip(wins_carpet, wins_grass, wins_clay, wins_hard) 193 | ): 194 | if i % 2 == 0: 195 | ax2.add_patch( 196 | Rectangle( 197 | (i, 0), 198 | width=2, 199 | height=wcarpet, 200 | edgecolor=None, 201 | facecolor="gray", 202 | alpha=0.2, 203 | ) 204 | ) 205 | ax2.add_patch( 206 | Rectangle( 207 | (i, wcarpet), 208 | width=2, 209 | height=wgrass, 210 | edgecolor=None, 211 | facecolor="green", 212 | alpha=0.2, 213 | ) 214 | ) 215 | ax2.add_patch( 216 | Rectangle( 217 | (i, wcarpet + wgrass), 218 | width=2, 219 | height=wclay, 220 | edgecolor=None, 221 | facecolor="orange", 222 | alpha=0.2, 223 | ) 224 | ) 225 | ax2.add_patch( 226 | Rectangle( 227 | (i, wcarpet + wgrass + wclay), 228 | width=2, 229 | height=whard, 230 | edgecolor=None, 231 | facecolor="blue", 232 | alpha=0.2, 233 | ) 234 | ) 235 | 236 | ax2.set_yticks([0, 100, 200, 300, 400, 500, 600, 700]) 237 | ax2.set_ylabel("Number of victory for each surface") 238 | plt.tight_layout() 239 | ax1.set_xticks([d[1] for d in dates], [d[0] for d in dates], rotation="vertical") 240 | plt.title("Stanislas Wawrinka victories on ATP tournamnents") 241 | plt.savefig("stan_the_man_win_percentage.png") 242 | plt.show() 243 | 244 | 245 | aces = {"diff_aces": [], "winner": []} 246 | 247 | for n_row, row in stan_df.iterrows(): 248 | diff_aces = row["Aces_Percentage_1"] - row["Aces_Percentage_2"] 249 | winner = row["Winner"] 250 | aces["diff_aces"].append(diff_aces) 251 | 
aces["winner"].append(winner) 252 | 253 | aces = pd.DataFrame(aces) 254 | classes = [val * 2.5 for val in range(-6, 4, 1)] 255 | fig, ax = plt.subplots(1) 256 | for min_class, max_class in zip(classes[:-1], classes[1:]): 257 | values = aces.loc[aces.diff_aces < max_class].loc[aces.diff_aces > min_class] 258 | ax.add_patch( 259 | Rectangle( 260 | xy=(min_class, 0), 261 | width=2.5, 262 | height=len(values.loc[values.winner == 0]), 263 | edgecolor="k", 264 | facecolor="blue", 265 | label="Victory", 266 | ) 267 | ) 268 | ax.add_patch( 269 | Rectangle( 270 | xy=(min_class, len(values.loc[values.winner == 0])), 271 | width=2.5, 272 | height=len(values.loc[values.winner == 1]), 273 | edgecolor="k", 274 | facecolor="orange", 275 | label="Defeat", 276 | ) 277 | ) 278 | ax.autoscale_view() 279 | ax.set_xlabel("Career ace percentage difference with adversary") 280 | ax.set_ylabel("Number of matches") 281 | ax.set_title( 282 | "Histogram of career aces percentage difference for Stan Wawrinka, colored by match results", 283 | wrap=True, 284 | ) 285 | handles, labels = plt.gca().get_legend_handles_labels() 286 | by_label = dict(zip(labels, handles)) 287 | plt.legend(by_label.values(), by_label.keys()) 288 | plt.savefig("stanimal_aces_percentage_difference.png") 289 | plt.show() 290 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tennis-Prediction 2 | 3 | 4 | 5 | The goal of this project is to predict the outcome of a tennis match using the data of both players and ML models.\ 6 | The data used comes from [Jeff Sackmann's repository](https://github.com/JeffSackmann). 
7 | 8 | - [Installation](#installation) 9 | - [Usage](#usage) 10 | - [Data Loading](#data-loading) 11 | - [Machine Learning modelling](#machine-learning-modelling) 12 | - [Encoding Matches](#encoding-matches) 13 | - [License](#license) 14 | 15 | ## Installation 16 | 17 | To clone the repository together with the data, you also need to clone the submodules: 18 | 19 | ```bash 20 | git clone --recurse-submodules https://github.com/VincentAuriau/Tennis-Prediction.git 21 | ``` 22 | 23 | ## Usage 24 | 25 | You can find examples in /examples: 26 | 27 | ### Data Loading 28 | Loading players statistics at match time and match outcome: 29 | [Example](examples/data/data_loading.py) 30 | 31 | ```python 32 | from data.data_loader import matches_data_loader 33 | data_df = matches_data_loader(path_to_data="submodules/tennis_atp") 34 | ``` 35 | data_df lets you access information about players (statistics prior to the match) along with statistics of the match. 36 | A basic example statistic: the victory percentage of the best ranked player in a match, depending on players rankings. 37 | 38 | 39 | Number of ATP main matches depending on players rank | Victory % of best ranked player 40 | :-------------------------:|:-------------------------: 41 | ![](examples/data/nb_matches.png) | ![](examples/data/Best_player_win_percentage.png) 42 | 43 | It can be easily used to also compute players statistics over their career, and/or at match time.
Here is a simple example with Stan Wawrinka: 44 | Stan's Victory % in main ATP matches | Stan's career aces % diff with adversary 45 | :-------------------------:|:-------------------------: 46 | ![](examples/data/stan_the_man_win_percentage.png) | ![](examples/data/stanimal_aces_percentage_difference.png) 47 | 48 | Here is an example of a data row: 49 | 50 | | id | tournament | tournament_level | tournament_date | tournament_surface | round | best_of | match_id | Winner | score | 51 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | 52 | | atp_matches_qual_chall_2003_5427 | San Benedetto CH | C | 20030811 | Clay | SF | 3 | atp_matches_qual_chall_2003_5427 | 0 | 2-6 7-5 7-5 | 53 | 54 | Base Match Statistics: 55 | - **id and match_id:** unique identification of the matches 56 | - **tournament:** name of the tournament 57 | - **tournament_level:** Category of the tournament: 'G' (Grand Slams), 'M' (Masters 1000s), 'A' (other tour-level events), 'C' (Challengers), 'S' (Satellites/ITFs), 'F' (Tour finals), 'D' (Davis Cup) 58 | - **tournament_date:** date 59 | - **tournament_surface:** surface: 'Grass', 'Clay', 'Carpet', 'Hard' 60 | - **round:** tournament round of the match: 'F' (final), 'SF' (semi-final), etc. 61 | - **best_of:** maximum number of sets in the match (3 or 5) 62 | - **Winner:** index of the winner: 0 (Player1) or 1 (Player2) 63 | - **score:** final score 64 | 65 | Additional match statistics: 66 | - **elapsed_minutes:** Duration of the match 67 | - **aces_nb_x:** Number of aces of player x 68 | - **doublefaults_nb_x:** Number of doublefaults 69 | - **svpt_x:** Number of serve points 70 | - **1stIn_x:** Number of first serves made 71 | - **1stWon_x:** Number of first-serve points won 72 | - **2ndWon_x:** Number of second-serve points won 73 | - **SvGms_x:** Number of serve games 74 | - **bpSaved_x:** Number of break points saved 75 | - **bpFaced_x:** Number of break points faced 76 | 77 | Example of player statistics: 78 | 79 | | Name_1 | ID_1 | Ranking_1 |
Ranking_Points_1 | Ranking_History_1 | Best_Rank_1 | Birth_Year_1 | Versus_1 | Hand_1 | Last_Tournament_Date_1 | Height_1 | Matches_1 | Matches_Clay_1 | Matches_Carpet_1 | Matches_Grass_1 | Matches_Hard_1 | Victories_Percentage_1 | Clay_Victories_Percentage_1 | Carpet_Victories_Percentage_1 | Grass_Victories_Percentage_1 | Hard_Victories_Percentage_1 | Aces_Percentage_1 | Doublefaults_Percentage_1 | First_Serve_Success_Percentage_1 | Winning_on_1st_Serve_Percentage_1 | Winning_on_2nd_Serve_Percentage_1 | Overall_Win_on_Serve_Percentage_1 | BreakPoint_Face_Percentage_1 | BreakPoint_Saved_Percentage_1 | last_rankings_1 | last_ranking_points_1 | 80 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | 81 | | Stan.Wawrinka | 104527 | 284 | 114 | {20030616: [387, 68], 20030707: [363, 74], 20030714: [348, 79], 20030721: [303, 99], 20030811: [284, 114]} | 284 | 19850328 | [] | R | 20030721 | 183 | [['V', 'atp_matches_qual_chall_2003_3466'], ['D', 'atp_matches_qual_chall_2003_3481'], ['D', 'atp_matches_2003_4049'], ['V', 'atp_matches_2003_4315'], ['D', 'atp_matches_2003_4328'], ['V', 'atp_matches_2003_4773'], ['D', 'atp_matches_2003_4782'], ['V', 'atp_matches_qual_chall_2003_5408'], ['V', 'atp_matches_qual_chall_2003_5419'], ['V', 'atp_matches_qual_chall_2003_5424']] | ['V', 'D', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'V'] | [] | [] | [] | 60 | 60 | 0 | 0 | 0 | 3.41880341880342 | 4.27350427350427 | 64.957264957265 | 54.985754985755 | 15.6695156695157 | 70.6552706552707 | 11.3960113960114 | 7.69230769230769 | [303, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 387] | [99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68] | 82 | 83 | 84 | | Name_2 | ID_2 | Ranking_2 | Ranking_Points_2 | Ranking_History_2 | Best_Rank_2 | Birth_Year_2 | Versus_2 | Hand_2 | Last_Tournament_Date_2 | Height_2
| Matches_2 | Matches_Clay_2 | Matches_Carpet_2 | Matches_Grass_2 | Matches_Hard_2 | Victories_Percentage_2 | Clay_Victories_Percentage_2 | Carpet_Victories_Percentage_2 | Grass_Victories_Percentage_2 | Hard_Victories_Percentage_2 | Aces_Percentage_2 | Doublefaults_Percentage_2 | First_Serve_Success_Percentage_2 | Winning_on_1st_Serve_Percentage_2 | Winning_on_2nd_Serve_Percentage_2 | Overall_Win_on_Serve_Percentage_2 | BreakPoint_Face_Percentage_2 | BreakPoint_Saved_Percentage_2 | last_rankings_2 | last_ranking_points_2 | 85 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | 86 | | Martin.Vassallo Arguello | 103506 | 125 | 296 | {19990201: [817, 13], 20000710: [398, 61], 20000731: [354, 75], 20000807: [377, 70], 20010625: [459, 48], 20010709: [405, 61], 20010813: [391, 68], 20010820: [374, 72], 20010827: [342, 88], 20010917: [291, 117], 20010924: [286, 122], etc...} | 123 | 19800210 | [] | R | 20030804 | 183 | [['V', 'atp_matches_qual_chall_1999_380'], ['D', 'atp_matches_qual_chall_1999_393'], ['V', 'atp_matches_qual_chall_2000_3972'], ['V', 'atp_matches_qual_chall_2000_3988'], ['D', 'atp_matches_qual_chall_2000_3996'], ['D', 'atp_matches_qual_chall_2000_4725'], ['D', 'atp_matches_qual_chall_2000_4758'], ['V', 'atp_matches_qual_chall_2001_3699'], etc...] | ['V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D', 'D', etc...]
| ['D', 'V', 'D', 'V', 'D'] | ['D'] | ['D', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D'] | 61.0294117647059 | 63.5593220338983 | 40 | 0 | 50 | 4.82456140350877 | 5.26315789473684 | 61.4035087719298 | 46.4912280701754 | 18.859649122807 | 65.3508771929825 | 9.64912280701754 | 5.70175438596491 | [157, 136, 165, 158, 9999, 9999, 204, 198, 197, 188, 204, 233] | [247, 304, 220, 232, 0, 0, 164, 172, 177, 188, 167, 137] | 87 | 88 | Player statistics before the match: 89 | - **Name_x:** Name of the player 90 | - **ID_x:** ID of the player 91 | - **Ranking_x:** ATP ranking of the player. For all rankings, 9999 means unranked. 92 | - **Ranking_Points_x:** Number of ATP points 93 | - **Ranking_History_x:** All recorded rankings 94 | - **Best_Rank_x:** Best reached ATP rank 95 | - **Birth_Year_x:** Birth date, formatted as YYYYMMDD 96 | - **Versus_x:** Dictionary containing all match outcomes against other players 97 | - **Hand_x:** Hand used to play: 'R', 'L' or 'U' for unknown 98 | - **Last_Tournament_Date_x:** Date of the last previous tournament attended 99 | - **Height_x:** Height 100 | - **Matches_x:** History of outcomes of previous matches 101 | - **Matches_Clay_x:** History of outcomes of previous matches on clay 102 | - **Matches_Carpet_x:** History of outcomes of previous matches on carpet 103 | - **Matches_Grass_x:** History of outcomes of previous matches on grass 104 | - **Matches_Hard_x:** History of outcomes of previous matches on hard 105 | - **Victories_Percentage_x:** Victory percentage over all player ATP matches 106 | - **Clay_Victories_Percentage_x:** Victory percentage over all player ATP matches on clay 107 | - **Carpet_Victories_Percentage_x:** Victory percentage over all player ATP matches on carpet 108 | - **Grass_Victories_Percentage_x:** Victory percentage over all player ATP matches on grass 109 | - **Hard_Victories_Percentage_x:** Victory percentage over all player ATP matches on hard 110 | - **Aces_Percentage_x:** Aces percentage over all player ATP matches 111 |
- **Doublefaults_Percentage_x:** Double faults percentage over all player ATP matches 112 | - **First_Serve_Success_Percentage_x:** First serve success percentage over all player ATP matches 113 | - **Winning_on_1st_Serve_Percentage_x:** Winning on first serve percentage over all player ATP matches 114 | - **Winning_on_2nd_Serve_Percentage_x:** Winning on second serve percentage over all player ATP matches 115 | - **Overall_Win_on_Serve_Percentage_x:** Overall winning percentage on serve over all player ATP matches 116 | - **BreakPoint_Face_Percentage_x:** Overall break points faced percentage over all player ATP matches 117 | - **BreakPoint_Saved_Percentage_x:** Overall break points saved percentage over all player ATP matches 118 | - **last_rankings_x:** Most recent recorded ATP rankings (12 values in the example rows above) 119 | - **last_ranking_points_x:** Most recent recorded ATP ranking points (12 values in the example rows above) 120 | 121 | ### Machine-Learning modelling 122 | Training/testing on match outcomes: 123 | [Example](examples/models/train_test.py). 124 | 125 | A generic function lets you evaluate your model with a train/test scheme without much work. Your model only needs a scikit-learn-like interface (`fit` and `predict`). 126 | By playing with the years, the columns to use in modelling, and the models & hyperparameters, you can easily create your own best-performing model.
127 | 128 | ```python 129 | from sklearn.ensemble import RandomForestClassifier 130 | from evaluation.train_test import train_test_evaluation 131 | 132 | test_score = train_test_evaluation( 133 | train_years=[2020, 2021], 134 | test_years=[2022, 2023], 135 | model_class=RandomForestClassifier, 136 | model_params={"n_estimators": 2000, "max_depth": None}, 137 | match_features=[], 138 | player_features=["Ranking"], 139 | encoding_params={}, 140 | additional_features=[], 141 | save_path="./results", 142 | save_all_results=False 143 | ) 144 | 145 | print("Test Score", test_score) 146 | ``` 147 | 148 | Models and hyperparameters can easily be compared with the file results.csv saved in save_path. 149 | 150 | Accuracy of different models 151 | :-------------------------: 152 | ![](examples/results_reading/models_performances.png) 153 | 154 | If the argument save_all_results is set to True, the whole csv of test data (with the predictions in a `y_pred` column) is also saved in save_path. This enables a more in-depth analysis of the results. 155 | 156 | Model precision compared with best ranked player wins strategy | Model precision depending on players ranks 157 | :-------------------------:|:-------------------------: 158 | ![](examples/results_reading/win_per_surface.png) | ![](examples/results_reading/precision_percentage_players_ranks.png) 159 | 160 | ### Encoding matches 161 | [Example](examples/history_modeling/first_example.py) 162 | In order to represent the history of a player, one can use MatchEncoders: 163 | 164 | ```python 165 | from history_modeling.encoding_model import PCAMatchEncoder 166 | 167 | model = PCAMatchEncoder(num_pca_features=2) 168 | model.fit(data_df, transform_data=True) 169 | X_r, match_info = model.predict(data_df, transform_data=True) 170 | ``` 171 | 172 | 2D representation of match outcome: 173 | :-------------------------: 174 | ![](examples/history_modeling/2d_pca_match_representation_test.png) 175 | 176 | ## License 177 | 178 | 179 | 180 |   181 |   182 | 183 | The repository is under the MIT License, you
# NOTE: the two comment lines below preserve the end of README.md, which shares
# its physical line with the start of data_loader.py in this collapsed dump.
# can freely use any part as you like.\
# If you find this repository useful, you can cite it and add a star ⭐ !
#
# --------------------------------------------------------------------------------
# /python/data/data_loader.py:
# --------------------------------------------------------------------------------
import os
import pickle
import re
import time
from ast import literal_eval

import numpy as np
import pandas as pd

import data.player as player
import data.match as match
from data.data_utils import reverse_score


# File-name pattern of each kind of matches file; the named group "year" becomes
# the "year" column returned by get_match_files.
_MATCH_FILE_PATTERNS = {
    "main_atp": re.compile(r"atp_matches_(?P<year>\d+)\.csv"),
    "futures": re.compile(r"atp_matches_futures_(?P<year>\d+)\.csv"),
    "qualifying_challengers": re.compile(r"atp_matches_qual_chall_(?P<year>\d+)\.csv"),
}


def create_player_profiles(df):
    """
    Creates the database of players from the df listing them.

    :param df: pandas.DataFrame corresponding to atp_players.csv; the columns
        name_first, name_last, dob, ioc, player_id, hand and height are read
    :return: dict mapping player_id -> player.Player. When an ID appears more than
        once, the first row is kept and a message is printed for the others.
    """
    players_db = {}
    for _, row in df.iterrows():
        player_id = row["player_id"]
        if player_id in players_db:
            print(f"Player ID {row['player_id']} already in database, appears twice ?")
            continue

        players_db[player_id] = player.Player(
            name=(str(row["name_first"]) + "." + str(row["name_last"])),
            birthdate=row["dob"],
            country=row["ioc"],
            nb_id=player_id,
            hand=row["hand"],
            height=row["height"],
        )
    return players_db


def read_matches_file(path_to_file):
    """
    Opens a csv file with matches
    :param path_to_file: path of the csv file, handed to pandas.read_csv
    :return: corresponding df
    """
    df_match = pd.read_csv(path_to_file)
    return df_match


def get_match_files(path_to_data_dir, match_type=["main_atp"]):
    """
    Lists the available csv containing matches
    :param path_to_data_dir: path to directory with all files
    :param match_type: matches we want to retrieve, list of elements among
        ["main_atp", "futures", "qualifying_challengers"]. The default list is
        never mutated here.
    :return: pandas.DataFrame with one row per selected file and the columns
        "filepath", "match_type" and "year" (year kept as a string). The columns
        exist even when no file matches.
    """
    matches_data_file = {"filepath": [], "match_type": [], "year": []}

    # Sorted so that the result does not depend on the file-system listing order.
    for file in sorted(os.listdir(path_to_data_dir)):
        for type_name, pattern in _MATCH_FILE_PATTERNS.items():
            if type_name not in match_type:
                continue
            regex_match = pattern.fullmatch(file)
            if regex_match is None:
                continue
            matches_data_file["filepath"].append(os.path.join(path_to_data_dir, file))
            # NOTE(review): every kind of file has always been labelled "main_tour"
            # (looks like a copy-paste). Kept as is because callers outside this
            # file may rely on the value - confirm before using distinct labels.
            matches_data_file["match_type"].append("main_tour")
            matches_data_file["year"].append(regex_match["year"])
    return pd.DataFrame(matches_data_file)


def _extract_file_id(file_path):
    """Returns the file name of file_path, without its directories (either "/" or
    "\\" separated) and without anything after its first dot."""
    return re.split(r"[\\/]", str(file_path))[-1].split(".")[0]


def _stack_with_suffix(frames, columns, suffix):
    """Stacks frames vertically after renaming each of `columns` to column + suffix."""
    renaming = {col: col + suffix for col in columns}
    return pd.concat([frame.rename(renaming, axis=1) for frame in frames], axis=0)


def load_match_data_from_path(
    players_db, paths_to_matchs_file, get_match_statistics=False
):
    """
    Loads matches files and creates the matches data while updating the players database.

    Matches of all the files are processed in ("tourney_date", "tourney_id",
    "match_num") order. Each match yields two rows: player 1 = winner with
    Winner = 0, then player 1 = loser with Winner = 1.

    :param players_db: dict player_id -> player.Player, as returned by
        create_player_profiles; the matches update the players it holds
    :param paths_to_matchs_file: path or list of paths of matches csv files
    :param get_match_statistics: bool, whether the statistics of the match itself
        are appended to the pre-match data
    :return: pandas.DataFrame with two rows per match (empty if there is no match)
    """
    if not isinstance(paths_to_matchs_file, list):
        paths_to_matchs_file = [paths_to_matchs_file]

    files = []
    for path in paths_to_matchs_file:
        match_df = pd.read_csv(path)
        # Remembered per row: the file name is the prefix of the match id.
        match_df["filepath"] = path
        files.append(match_df)
    if not files:
        return pd.DataFrame()

    match_df = pd.concat(files, axis=0)
    match_df = match_df.sort_values(["tourney_date", "tourney_id", "match_num"])
    match_df = match_df.reset_index(drop=True)

    matches_data = []
    for _, row in match_df.iterrows():
        match_o = match.Match(
            winner=players_db[row["winner_id"]],
            loser=players_db[row["loser_id"]],
            tournament=row["tourney_name"],
            surface=row["surface"],
            id_prefix=_extract_file_id(row["filepath"]),
        )
        match_o.instantiate_from_data_row(row)
        (
            match_data,
            w_data,
            l_data,
        ) = match_o.get_prior_data_and_update_players_stats()

        match_data["match_id"] = match_o.id

        final_df = pd.concat(
            [
                pd.concat([match_data] * 2, axis=0),
                _stack_with_suffix([w_data, l_data], w_data.columns, "_1"),
                _stack_with_suffix([l_data, w_data], w_data.columns, "_2"),
            ],
            axis=1,
        )
        final_df["Winner"] = [0, 1]

        if get_match_statistics:
            (
                match_stats,
                w_mstats,
                l_mstats,
            ) = match_o.get_match_data_results_statistics()

            # The second row is seen from the loser's side: its score is reversed.
            match_stats_2 = match_stats.copy()
            match_stats_2["score"] = match_stats_2.apply(
                lambda stats_row: reverse_score(stats_row["score"]), axis=1
            )

            match_stats_df = pd.concat(
                [
                    pd.concat([match_stats.copy(), match_stats_2], axis=0),
                    _stack_with_suffix([w_mstats, l_mstats], w_mstats.columns, "_1"),
                    _stack_with_suffix([l_mstats, w_mstats], w_mstats.columns, "_2"),
                ],
                axis=1,
            )
            final_df = pd.concat([final_df, match_stats_df], axis=1)
        matches_data.append(final_df)

    if not matches_data:
        return pd.DataFrame()
    return pd.concat(matches_data, axis=0)


def matches_data_loader(
    keep_values_from_year=1990,
    path_to_data="submodules/tennis_atp",
    path_to_cache="/cache",
    flush_cache=True,
    get_match_statistics=False,
    get_reversed_match_data=False,
    include_davis_cup=False,
    match_type=["main_atp", "futures", "qualifying_challengers"],
):
    """
    Main matches data loading function
    :param keep_values_from_year: int [1968; present], min year to keep values from
    :param path_to_data: str, path to tennis_atp submodule
    :param path_to_cache: str, path to local personal cache (created when missing
        and something has to be written in it)
    :param flush_cache: bool, whether cache should be erased and whole function run again
    :param get_match_statistics: bool, return each match statistics along pre match statistics
    :param get_reversed_match_data: bool, should each match be double, with Winner = 0 and Winner = 1
    :param include_davis_cup: bool, when False the rows whose tournament contains
        "Davis" are removed
    :param match_type: kinds of matches files to read, see get_match_files
    :return: pandas.DataFrame with all matches data
    :raises ValueError: when no matches data is available from keep_values_from_year on
    """

    total_elapsed_time = 0
    # Check if data already in cache
    players_db_path = os.path.join(path_to_cache, "players_db")
    players_db_cached = os.path.exists(players_db_path)
    if players_db_cached:
        print("Payers DB exists")

    matches_data_cached = os.path.exists(
        os.path.join(path_to_cache, f"matches_data_{keep_values_from_year}.csv")
    )

    if not players_db_cached or flush_cache:
        df_players = pd.read_csv(
            os.path.join(path_to_data, "atp_players.csv"),
            header=0,
            encoding="ISO-8859-1",
        )
        players_db = create_player_profiles(df_players)
        # The cache directory may not exist yet on a first run.
        os.makedirs(path_to_cache, exist_ok=True)
        with open(players_db_path, "wb") as file:
            pickle.dump(players_db, file, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # SECURITY: unpickling runs arbitrary code; only point path_to_cache at a
        # directory you wrote yourself.
        with open(players_db_path, "rb") as file:
            players_db = pickle.load(file)

    if not matches_data_cached or flush_cache:
        print("data not cached or flush=True")
        data_files = get_match_files(path_to_data, match_type=match_type)
        data_years = data_files.year.astype(
            "uint32"
        )  # to change when handling different type of tournament (qualifiers, main, etc...)
        os.makedirs(path_to_cache, exist_ok=True)

        data_per_year = []
        # Every year is replayed, even before keep_values_from_year, because the
        # players statistics are built up match after match.
        for year in np.sort(np.unique(data_years.values)):
            t_start = time.time()
            print("+---------+---------+")
            print(" Year %i " % year)
            if year >= keep_values_from_year:
                print("Updating players statistics & saving matches data")
            else:
                print("Only updating players statistics")
            print("+---------+---------+")
            filepaths = data_files.loc[
                data_files.year == str(year), "filepath"
            ].values.tolist()
            df_year = load_match_data_from_path(
                players_db, filepaths, get_match_statistics=get_match_statistics
            )
            df_year["tournament_year"] = year
            if year >= keep_values_from_year:
                data_per_year.append(df_year)

                df_year.to_csv(
                    os.path.join(path_to_cache, f"matches_data_{year}.csv"),
                    sep=";",
                    index=False,
                )
            total_elapsed_time += time.time() - t_start
            print(f"Elapsed Time: {np.round(time.time() - t_start, 2)} seconds")
            print(f"Total Elapsed Time: {np.round(total_elapsed_time, 2)} seconds")

        if not data_per_year:
            raise ValueError(
                f"No matches file found in {path_to_data} for years >= {keep_values_from_year}"
            )
        data_matches = pd.concat(data_per_year, axis=0)
        # NOTE(review): the previous index is kept as an "index" column here while
        # the cached branch below drops it, so the columns differ between the two
        # branches - left untouched, confirm which one the callers expect.
        data_matches = data_matches.reset_index()

    else:
        years = []
        file_pattern = re.compile(r"matches_data_(?P<year>\d+)\.csv")
        for file in os.listdir(path_to_cache):
            regex_match = file_pattern.fullmatch(file)
            if regex_match is not None:
                years.append(int(regex_match["year"]))

        data_per_year = []
        for year in sorted(years):
            if year >= keep_values_from_year:
                df_year = pd.read_csv(
                    os.path.join(path_to_cache, f"matches_data_{year}.csv"), sep=";"
                )
                data_per_year.append(df_year)

        if not data_per_year:
            raise ValueError(
                f"No cached matches data in {path_to_cache} for years >= {keep_values_from_year}"
            )
        data_matches = pd.concat(data_per_year, axis=0)
        data_matches.reset_index(drop=True, inplace=True)

    if not include_davis_cup:
        # na=False: a missing tournament name is not a Davis Cup match (and would
        # otherwise make the boolean mask invalid).
        data_matches = data_matches.loc[
            ~data_matches.tournament.str.contains("Davis", na=False)
        ]

    if get_reversed_match_data:
        return data_matches
    else:
        # Rows come by pairs (see load_match_data_from_path): keep the first of each.
        return data_matches.iloc[::2]


def clean_missing_data(df):
    """
    Cleans rows of df with too few statistics to be useful: rows where Ranking_1 or
    Ranking_2 is 9999 or 0 are removed.
    :param df: pandas.DataFrame with the columns Ranking_1 and Ranking_2
    :return: the filtered pandas.DataFrame
    """
    # NOTE(review): the previous version called `df.dropna(axis=0)` without using
    # its result, i.e. rows with missing values were never dropped. That observable
    # behaviour is kept on purpose: dropping every row holding any NaN could
    # silently remove most of the data - confirm the intent before enabling it.
    for column in ("Ranking_1", "Ranking_2"):
        df = df.loc[(df[column] != 9999) & (df[column] != 0)]

    return df


def _versus_history(row):
    """
    Returns the list of past confrontations stored in row["Versus_1"].

    Handles the value being the string representation read back from a csv, and the
    value being a dict opponent_id -> confrontations (then row["ID_2"] is looked up).
    Anything else that is not a list or a tuple (e.g. NaN) gives an empty list.
    """
    versus = row["Versus_1"]
    if isinstance(versus, str):
        try:
            versus = literal_eval(versus)
        except (ValueError, SyntaxError) as error:
            raise ValueError(f"Cannot parse Versus_1 value: {versus!r}") from error
    if isinstance(versus, dict):
        versus = versus.get(row.get("ID_2"), [])
    if not isinstance(versus, (list, tuple)):
        return []
    return list(versus)


def encode_data(df, mode="integer"):
    """
    Encodes the categorical columns of the matches data and adds head-to-head features.

    Every column whose lower-cased name contains "hand", else "round", else
    "tournament_level" is replaced by its code. When the column Versus_1 exists,
    "nb_match_versus" (number of past confrontations) and "v_perc_versus" (fraction
    of them won by player 1, -1 when there is none) are added.

    :param df: pandas.DataFrame of matches data; it is not modified
    :param mode: "integer" (integer codes), "one_hot" (lists of 0/1) or "mixing"
        (integer codes, except hands which are one-hot)
    :return: encoded copy of df
    :raises ValueError: if mode is unknown
    :raises KeyError: if a value has no code in the chosen mode
    """
    # Remove:
    # - index
    # - Unnamed: 0
    # - Unnamed: 0.1
    # - tournament
    # - Name
    # - ID
    # - Birth Year => Age
    # - Versus: % V against 2, last 5 matches
    # - Matches

    # Refac:
    # - Versus
    # Best way to do it ?
    # - Birth Year
    # - Last Tournament => Days since last tournament + result ?

    if mode == "integer":
        # Considered Variables:
        tournament_level = {"G": 0, "A": 1, "M": 2, "F": 3, "D": 4}

        round_codes = {
            "F": 0,
            "SF": 1,
            "QF": 2,
            "R16": 3,
            "R32": 4,
            "R64": 5,
            "R128": 6,
            "R256": 7,
            "RR": 8,
            "BR": 9,
            "ER": 10,
        }

        hand = {
            "R": -1,
            "L": 1,
            "A": 0,
            "U": 2,
            "nan": 2,
        }

    elif mode == "one_hot":
        # Considered Variables:
        # NOTE(review): unlike "integer", this mode has no code for tournament
        # level "F", rounds "BR"/"ER" nor a missing hand ("nan"); such values raise
        # KeyError. Adding them would change the size of the vectors, so it is left
        # to be decided.
        tournament_level = {
            "G": [0, 0, 0, 1],
            "A": [0, 0, 1, 0],
            "M": [0, 1, 0, 0],
            "D": [1, 0, 0, 0],
        }

        round_codes = {
            "F": [0, 0, 0, 0, 0, 0, 0, 0, 1],
            "SF": [0, 0, 0, 0, 0, 0, 0, 1, 0],
            "QF": [0, 0, 0, 0, 0, 0, 1, 0, 0],
            "R16": [0, 0, 0, 0, 0, 1, 0, 0, 0],
            "R32": [0, 0, 0, 0, 1, 0, 0, 0, 0],
            "R64": [0, 0, 0, 1, 0, 0, 0, 0, 0],
            "R128": [0, 0, 1, 0, 0, 0, 0, 0, 0],
            "R256": [0, 1, 0, 0, 0, 0, 0, 0, 0],
            "RR": [1, 0, 0, 0, 0, 0, 0, 0, 0],
        }

        hand = {
            "R": [1, 0, 0, 0],
            "L": [0, 1, 0, 0],
            "A": [0, 0, 1, 0],
            "U": [0, 0, 0, 1],
        }

    elif mode == "mixing":
        # Considered Variables:
        tournament_level = {"G": 0, "A": 1, "M": 2, "F": 3, "D": 4}

        round_codes = {
            "F": 0,
            "SF": 1,
            "QF": 2,
            "R16": 3,
            "R32": 4,
            "R64": 5,
            "R128": 6,
            "R256": 7,
            "RR": 8,
            "BR": 9,
        }

        hand = {
            "R": [1, 0, 0, 0],
            "L": [0, 1, 0, 0],
            "A": [0, 0, 1, 0],
            "U": [0, 0, 0, 1],
        }

    else:
        raise ValueError(
            f"Unknown mode {mode!r}, expected 'integer', 'one_hot' or 'mixing'"
        )

    # Work on a real copy: the caller's DataFrame must not be encoded in place.
    df_copy = df.copy()

    for col in df_copy.columns:
        lowered = col.lower()
        if "hand" in lowered:
            # str(): a missing hand (NaN) is looked up as "nan".
            df_copy[col] = df_copy[col].map(lambda value: hand[str(value)])
        elif "round" in lowered:
            df_copy[col] = df_copy[col].map(lambda value: round_codes[value])
        elif "tournament_level" in lowered:
            df_copy[col] = df_copy[col].map(lambda value: tournament_level[value])

    if "Versus_1" in df_copy.columns:
        histories = [_versus_history(row) for _, row in df_copy.iterrows()]
        df_copy["nb_match_versus"] = [len(history) for history in histories]
        # A confrontation is either the outcome itself ("V"/"D") or a list starting
        # with it; in both cases entry[0] is the outcome.
        df_copy["v_perc_versus"] = [
            sum(1 for entry in history if entry[0] == "V") / len(history)
            if len(history) > 0
            else -1
            for history in histories
        ]

    return df_copy


# --------------------------------------------------------------------------------
# /python/data/player.py:
# --------------------------------------------------------------------------------
import numpy as np
import pandas as pd

from data.data_utils import get_days_difference

# How to update player's ranking ?
class Player:
    """
    Profile of a player, updated match after match.

    Holds the identity of the player (name, birthdate, country, id, hand, height),
    his rankings, his results (overall, per surface and per opponent), his
    cumulated service statistics and fatigue indicators. update_from_match feeds a
    played match into the profile; get_data_df exports the profile as a one-row
    pandas.DataFrame.
    """

    # Surfaces followed individually -> prefix of the matching attributes
    # (matches_<prefix> and <prefix>_victories_percentage).
    _SURFACE_PREFIXES = {
        "Clay": "clay",
        "Grass": "grass",
        "Hard": "hard",
        "Carpet": "carpet",
    }

    def __init__(self, name, birthdate, country, nb_id, hand="", height=0):
        """
        :param name: name of the player
        :param birthdate: birthdate of the player, stored as given
        :param country: country of the player
        :param nb_id: ID of the player
        :param hand: playing hand of the player
        :param height: height of the player
        """
        self.name = name
        self.birthdate = birthdate

        # date -> [ranking, ranking points]
        self.rankings_history = {}
        self.ranking = 9999  # 9999 = no ranking known
        self.ranking_points = 0
        self.ranking_over_time = 0
        self.country = country
        self.id = nb_id
        self.last_tournament_date = ""
        # opponent id -> list of [outcome ("V"/"D"), tournament date, match id]
        self.versus = {}
        self.hand = hand
        self.height = height

        # Outcomes of the five last matches, most recent first
        self.last_matches = ["", "", "", "", ""]
        # self.matches = []
        # list of [outcome, match id]
        self.matches_history = []
        self.victories_percentage = 0

        self.matches_hard = []
        self.hard_victories_percentage = 0
        self.matches_carpet = []
        self.carpet_victories_percentage = 0
        self.matches_clay = []
        self.clay_victories_percentage = 0
        self.matches_grass = []
        self.grass_victories_percentage = 0

        self.aces_percentage = 0

        self.doublefaults_percentage = 0
        self.first_serve_success_percentage = 0
        self.winning_on_1st_serve_percentage = 0
        self.winning_on_2nd_serve_percentage = 0
        self.overall_win_on_serve_percentage = 0

        # One value per match with complete service data.
        # "service_games_played" is fed with match.get_service_points_played(...)
        # by update_from_match.
        self.service_data = {
            "service_games_played": [],
            "1st_serve_success": [],
            "aces_nb": [],
            "doublefaults_nb": [],
            "win_on_1st_serve": [],
            "win_on_2nd_serve": [],
            "breakpoints_faced": [],
            "breakpoints_saved": [],
        }

        self.breakpoint_faced_percentage = 0
        self.breakpoint_saved_percentage = 0

        self.games_fatigue = (
            0  # nb games curr tourney + nb games prev tourney / diff days
        )
        self.minutes_fatigue = (
            0  # nb minutes curr tourney + nb minutes prev tourney / diff days
        )
        self.fatigue_features = {
            "previous tournament": {
                "date": "19000000",
                "num_games": 0,
                "num_matchs": 0,
                "num_minutes": 0,
            },
            "current tournament": {
                "date": "19000000",
                "num_games": 0,
                "num_matchs": 0,
                "num_minutes": 0,
            },
        }

    def __str__(self):
        # The name is padded to 35 characters so that printed players line up.
        return (
            f"ID : {self.id}"
            f" *** Name : {str(self.name).ljust(35)}"
            f" *** Born Year : {self.birthdate}"
            f" *** Country : {self.country}"
            f" *** Hand : {self.hand}"
            f" *** Height : {self.height}"
        )

    def _record_outcome(self, outcome, opponent_id, match_id, tournament_date):
        """
        Registers the outcome ("V" or "D") of a match against opponent_id in
        last_matches, versus and the overall victories percentage.
        """
        # Most recent outcome first, only the five last ones are kept (in place).
        self.last_matches[:] = [outcome] + self.last_matches[:4]
        self.versus.setdefault(opponent_id, []).append(
            [outcome, tournament_date, match_id]
        )
        self._update_victories_percentage(outcome, match_id)

    def _add_victory(self, id_loser, match_id, tournament_date="19000101"):
        """
        Adds a victory: updates last_matches, versus and victories percentage
        :param id_loser: ID of the loser of the match against current player
        :param match_id: ID of the match
        :param tournament_date: date of the tournament of the match
        :return:
        """
        self._record_outcome("V", id_loser, match_id, tournament_date)

    def _add_defeat(self, id_winner, match_id, tournament_date="19000101"):
        """
        Adds a defeat: updates last_matches, versus and victories percentage
        :param id_winner: ID of the winner of the match against current player
        :param match_id: ID of the match
        :param tournament_date: date of the tournament of the match
        :return:
        """
        self._record_outcome("D", id_winner, match_id, tournament_date)

    def _update_victories_percentage(self, match_outcome, match_id):
        """
        Updates Victories Percentage with a V/D of last match
        :param match_outcome: "V" or "D"
        :param match_id: ID of the match
        :return:
        """
        self.matches_history.append([match_outcome, match_id])
        # self.matches.append(match_outcome)
        victories_number = [_[0] for _ in self.matches_history].count("V")
        matches_number = len(self.matches_history)
        self.victories_percentage = (victories_number / matches_number) * 100

    def _update_surfaces_victories_percentage(self, surface, outcome):
        """
        Updates % of victories on a given surface (V/D)
        :param surface: "Clay", "Grass", "Hard" or "Carpet"; anything else is ignored
        :param outcome: "V" or "D"
        :return:
        """
        prefix = self._SURFACE_PREFIXES.get(surface)
        if prefix is None:
            return
        surface_matches = getattr(self, f"matches_{prefix}")
        surface_matches.append(outcome)
        setattr(
            self,
            f"{prefix}_victories_percentage",
            surface_matches.count("V") / len(surface_matches) * 100,
        )

    def _update_fatigue(self, tournament_date, games_number, minutes_number):
        """
        Updates Fatigue arguments: games_fatigue and minutes_fatigue but also
        self.fatigue_features
        :param tournament_date: date of the tournament of the match
        :param games_number: number of games of the match; when it is NaN nothing
            is updated and a message is printed
        :param minutes_number: duration of the match in minutes; NaN counts for 0
        :return:
        """
        # x != x is only true for NaN
        if games_number != games_number or games_number == "nan":
            print("NaN in sets number", games_number)
            return
        minutes_known = minutes_number == minutes_number and minutes_number != "nan"

        current = self.fatigue_features["current tournament"]
        if tournament_date == current["date"]:
            current["num_games"] += games_number
            if minutes_known:
                current["num_minutes"] += minutes_number
            current["num_matchs"] += 1
        else:
            self.fatigue_features["previous tournament"] = current
            self.fatigue_features["current tournament"] = {
                "date": tournament_date,
                "num_games": games_number,
                # An unknown duration counts for 0, as it does for the following
                # matches of a tournament; storing NaN would turn minutes_fatigue
                # into NaN for the whole tournament and the next one.
                "num_minutes": minutes_number if minutes_known else 0,
                "num_matchs": 1,
            }

        previous = self.fatigue_features["previous tournament"]
        current = self.fatigue_features["current tournament"]
        days_difference_tournaments = get_days_difference(
            str(previous["date"]), str(current["date"])
        )
        if days_difference_tournaments == 0:
            # Avoids a division by zero: the previous tournament is then counted
            # without any discount.
            days_difference_tournaments = 1

        # Load of the previous tournament, discounted by the days elapsed since,
        # plus the load of the current tournament.
        self.games_fatigue = (
            previous["num_games"] / days_difference_tournaments
            + current["num_games"]
        )
        self.minutes_fatigue = (
            previous["num_minutes"] / days_difference_tournaments
            + current["num_minutes"]
        )

    def _update_aces_percentage(self, aces_nb):
        """
        Updates Aces Percentage (aces / service points played * 100)
        :param aces_nb: number of aces of the match, ignored when NaN
        :return:
        """

        if aces_nb == aces_nb and aces_nb != "nan":
            self.service_data["aces_nb"].append(aces_nb)
            total_aces_nbr = sum(self.service_data["aces_nb"])
            total_service_points_played = sum(self.service_data["service_games_played"])

            if total_service_points_played != 0:
                self.aces_percentage = (
                    total_aces_nbr / total_service_points_played * 100
                )
            else:
                print("No point played :", aces_nb)

    def _update_doublefaults_percentage(self, df_nb):
        """
        Update doublefaults percentage (double faults / service points played * 100)
        :param df_nb: number of double faults of the match
        :return:
        """
        if df_nb == df_nb and df_nb != "nan":
            self.service_data["doublefaults_nb"].append(df_nb)
            total_df_nbr = sum(self.service_data["doublefaults_nb"])
            total_service_points_played = sum(self.service_data["service_games_played"])

            if total_service_points_played != 0:
                self.doublefaults_percentage = (
                    total_df_nbr / total_service_points_played * 100
                )
            else:
                print("No point played :", total_df_nbr)
                self.doublefaults_percentage = 0

        else:
            print("NaN in Double Faults", df_nb)

    def _update_winning_on_1st_serve_percentage(self, first_serve_win):
        """
        Updates the % of service points won on first serve
        :param first_serve_win: number of points won on first serve in the match
        :return:
        """
        self.service_data["win_on_1st_serve"].append(first_serve_win)

        # Points won on first serve - not the first serves in ("1st_serve_success")
        total_first_serves_win = sum(self.service_data["win_on_1st_serve"])
        total_service_points_played = sum(self.service_data["service_games_played"])

        if total_service_points_played != 0:
            self.winning_on_1st_serve_percentage = (
                total_first_serves_win / total_service_points_played * 100
            )
        else:
            print("No point played :", total_first_serves_win)

    def _update_winning_on_2nd_serve_percentage(self, second_serve_win):
        """
        Updates the % of service points won on second serve
        :param second_serve_win: number of points won on second serve in the match
        :return:
        """
        self.service_data["win_on_2nd_serve"].append(second_serve_win)

        total_second_serves_win = sum(self.service_data["win_on_2nd_serve"])
        total_service_points_played = sum(self.service_data["service_games_played"])

        if total_service_points_played != 0:
            self.winning_on_2nd_serve_percentage = (
                total_second_serves_win / total_service_points_played * 100
            )
        else:
            print("No point played :", total_second_serves_win)

    def _update_first_serve_success_percentage(self, first_services_in):
        """
        Updates the % of first serves in (first serves in / service points played * 100)
        :param first_services_in: number of first serves in during the match
        :return:
        """
        self.service_data["1st_serve_success"].append(first_services_in)

        total_first_serves_in = sum(self.service_data["1st_serve_success"])
        total_service_points_played = sum(self.service_data["service_games_played"])

        if total_service_points_played != 0:
            self.first_serve_success_percentage = (
                total_first_serves_in / total_service_points_played * 100
            )
        else:
            print("No point played :", total_first_serves_in)

    def _update_breakpoints_faced_and_saved(self, breakpoint_faced, breakpoint_saved):
        """
        Updates the breakpoints faced and saved percentages
        :param breakpoint_faced: number of breakpoints faced in the match
        :param breakpoint_saved: number of breakpoints saved in the match
        :return:
        """
        self.service_data["breakpoints_saved"].append(breakpoint_saved)
        self.service_data["breakpoints_faced"].append(breakpoint_faced)

        total_breakpoint_faced = sum(self.service_data["breakpoints_faced"])
        total_games_played = sum(self.service_data["service_games_played"])
        total_breakpoint_saved = sum(self.service_data["breakpoints_saved"])

        if total_games_played != 0:
            self.breakpoint_faced_percentage = (
                total_breakpoint_faced / total_games_played * 100
            )
            # NOTE(review): saved breakpoints are divided by the service points
            # played, not by the breakpoints faced - kept as is, confirm it is the
            # intended feature.
            self.breakpoint_saved_percentage = (
                total_breakpoint_saved / total_games_played * 100
            )
        else:
            print("No point played :", self.service_data["breakpoints_saved"])

    def _update_service_data(
        self,
        service_games_played,
        aces_nb,
        doublefaults_nb,
        first_serve_success,
        winning_on_1st_serve,
        winning_on_2nd_serve,
        breakpoints_faced,
        breakpoints_saved,
    ):
        """
        Updates every service statistic with the data of a match. Nothing is
        updated if any of the values is NaN.
        """
        values = (
            service_games_played,
            aces_nb,
            doublefaults_nb,
            first_serve_success,
            winning_on_1st_serve,
            winning_on_2nd_serve,
            breakpoints_faced,
            breakpoints_saved,
        )
        # Assert data exists (x != x is only true for NaN)
        if any(value != value for value in values):
            return

        # Appended first: it is the denominator of all the percentages below.
        self.service_data["service_games_played"].append(service_games_played)

        self._update_aces_percentage(aces_nb=aces_nb)
        self._update_doublefaults_percentage(df_nb=doublefaults_nb)
        self._update_winning_on_1st_serve_percentage(
            first_serve_win=winning_on_1st_serve
        )
        self._update_winning_on_2nd_serve_percentage(
            second_serve_win=winning_on_2nd_serve
        )
        self.overall_win_on_serve_percentage = (
            self.winning_on_1st_serve_percentage
            + self.winning_on_2nd_serve_percentage
        )
        self._update_first_serve_success_percentage(
            first_services_in=first_serve_success
        )
        self._update_breakpoints_faced_and_saved(
            breakpoint_saved=breakpoints_saved, breakpoint_faced=breakpoints_faced
        )

    @staticmethod
    def _int_or_default(value, default):
        """Returns int(value), or default when value is NaN or cannot be converted."""
        if value != value:  # NaN
            return default
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    def _update_rankings(self, new_ranking, new_ranking_points, date):
        """
        Updates current ranking and ranking points, and stores them in rankings_history
        :param new_ranking: new ranking, 9999 is used if missing or not a number
        :param new_ranking_points: new ranking points, 0 is used if missing or not a number
        :param date: date of the ranking, key of rankings_history
        :return:
        """
        new_ranking_points = self._int_or_default(new_ranking_points, 0)
        new_ranking = self._int_or_default(new_ranking, 9999)

        self.ranking = new_ranking
        self.ranking_points = new_ranking_points

        self.rankings_history[date] = [new_ranking, new_ranking_points]

    def _get_best_ranking(self):
        """
        :return: smallest ranking of rankings_history, -1 if it is empty
        """
        all_ranks = [ranking[0] for ranking in self.rankings_history.values()]
        if len(all_ranks) > 0:
            return np.min(all_ranks)
        else:
            return -1

    def update_from_match(self, match):
        """
        Updates the whole player profile from a match
        :param match: match played by the player; he must be its winner or its loser
        :return:
        """

        # Update Rankings ?
        if match.winner.id == self.id:
            self._add_victory(
                match.loser.id, match_id=match.id, tournament_date=match.tournament_date
            )
            self._update_surfaces_victories_percentage(match.surface, "V")
        else:
            assert match.loser.id == self.id
            self._add_defeat(
                match.winner.id,
                match_id=match.id,
                tournament_date=match.tournament_date,
            )
            self._update_surfaces_victories_percentage(match.surface, "D")
        self._update_fatigue(
            match.tournament_date, match.games_number, match.elapsed_minutes
        )

        self._update_service_data(
            service_games_played=match.get_service_points_played(self.id),
            aces_nb=match.get_aces_nb(self.id),
            doublefaults_nb=match.get_df_nb(self.id),
            first_serve_success=match.get_first_services_in(self.id),
            winning_on_1st_serve=match.get_first_serve_win(self.id),
            winning_on_2nd_serve=match.get_second_serve_win(self.id),
            breakpoints_faced=match.get_breakpoint_faced(self.id),
            breakpoints_saved=match.get_breakpoint_saved(self.id),
        )

        self._update_rankings(*match.get_rankings(self.id), date=match.tournament_date)

    def get_data_df(self, opponent=None):
        """
        Exports the current profile
        :param opponent: ID of an opponent; if given, "Versus" only holds the
            confrontations against him, otherwise it holds the whole versus dict
        :return: pandas.DataFrame with a single row
        """
        data_dict = {
            "Name": [self.name],
            "ID": [self.id],
            "Ranking": [self.ranking],
            "Ranking_Points": [self.ranking_points],
            "Ranking_History": [self.rankings_history.copy()],
            "Best_Rank": [self._get_best_ranking()],
            "Birth_Year": [self.birthdate],
            "Versus": [
                self.versus.copy()
                if opponent is None
                else self.versus.get(opponent, []).copy()
            ],
            "Hand": [self.hand],
            "Last_Tournament_Date": [
                self.fatigue_features["previous tournament"]["date"]
            ],
            "Height": [self.height],
            "Matches": [self.matches_history.copy()],
            "Matches_Clay": [self.matches_clay.copy()],
            "Matches_Carpet": [self.matches_carpet.copy()],
            "Matches_Grass": [self.matches_grass.copy()],
            "Matches_Hard": [self.matches_hard.copy()],
            "Victories_Percentage": [self.victories_percentage],
            "Clay_Victories_Percentage": [self.clay_victories_percentage],
            "Carpet_Victories_Percentage": [self.carpet_victories_percentage],
            "Grass_Victories_Percentage": [self.grass_victories_percentage],
            "Hard_Victories_Percentage": [self.hard_victories_percentage],
            "Aces_Percentage": [self.aces_percentage],
            "Doublefaults_Percentage": [self.doublefaults_percentage],
            "First_Serve_Success_Percentage": [self.first_serve_success_percentage],
            "Winning_on_1st_Serve_Percentage": [self.winning_on_1st_serve_percentage],
            "Winning_on_2nd_Serve_Percentage": [self.winning_on_2nd_serve_percentage],
            "Overall_Win_on_Serve_Percentage": [self.overall_win_on_serve_percentage],
            "BreakPoint_Face_Percentage": [self.breakpoint_faced_percentage],
            "BreakPoint_Saved_Percentage": [self.breakpoint_saved_percentage],
            "games_fatigue": [self.games_fatigue],
            "minutes_fatigue": [self.minutes_fatigue],
        }
        return pd.DataFrame(data_dict)

    def get_last_months_rankings(self, date, nb_months=12, day_of_month="last"):
        """
        Rankings and ranking points of the nb_months months preceding the month of date
        :param date: date formatted as YYYYMMDD (int or str); its own month is not included
        :param nb_months: number of months to return
        :param day_of_month: "last" or "first", which day to use when several
            rankings are known within a month
        :return: (ranks, points), two lists of nb_months values in chronological
            order (oldest month first, month preceding date last); months without
            any known ranking hold 9999 and 0
        """
        assert day_of_month in [
            "last",
            "first",
        ], f"For now you can only use first or last month day for ranking, you chose {day_of_month}"
        pick = max if day_of_month == "last" else min

        date = str(date)
        last_months_ranks = [9999 for _ in range(nb_months)]
        last_months_points = [0 for _ in range(nb_months)]
        date_year = int(date[:4])
        date_month = int(date[4:6])

        # "YYYYMM" -> [(day, key of rankings_history), ...]. Keeping the key itself
        # avoids rebuilding it, whatever its type is.
        rankings_per_month = {}
        for key in self.rankings_history:
            key_str = str(key)
            if len(key_str) < 8 or not key_str[:8].isdigit():
                continue
            rankings_per_month.setdefault(key_str[:6], []).append(
                (int(key_str[6:8]), key)
            )

        for i in range(nb_months):
            # Step back one month
            if date_month == 1:
                date_month = 12
                date_year = date_year - 1
            else:
                date_month = date_month - 1

            days_with_rankings = rankings_per_month.get(f"{date_year}{date_month:02d}")
            if not days_with_rankings:
                continue
            _, key = pick(days_with_rankings, key=lambda day_and_key: day_and_key[0])
            # i = 0 is the most recent month: it goes last, so that the lists are
            # chronological.
            last_months_ranks[-(i + 1)] = self.rankings_history[key][0]
            last_months_points[-(i + 1)] = self.rankings_history[key][1]

        return last_months_ranks, last_months_points


# NOTE: the comment lines below preserve the beginning of the next file of the
# dump, which shares its physical line with the end of player.py.
# --------------------------------------------------------------------------------
# /examples/data/single_row_example.csv:
# --------------------------------------------------------------------------------
# ,100
# index,55550
# id,atp_matches_2005_4281
# tournament,Gstaad
# tournament_level,A
# tournament_date,20050704
# tournament_surface,Clay
# round,R32
# best_of,3
# match_id,atp_matches_2005_4281
# Name_1,Stan.Wawrinka
# ID_1,104527
# Ranking_1,74
# Ranking_Points_1,547
# Ranking_History_1,"{20030616: [387, 68], 20030707: [363, 74], 20030714: [348, 79], 20030721: [303, 99], 20030811: [284, 114], 20030818: [215, 164], 20030825: [175, 219], 20030929: [171, 215], 20031020: [172, 214], 20040105: [171, 216], 20040119: [166, 226], 20040206: [162, 225], 20040209: [164, 225], 20040301: [161, 227], 20040412: [184, 205], 20040419: [179, 210], 20040426: [148, 260], 20040510: [147, 270], 20040524: [150, 266], 20040531: [150, 266], 20040607: [156, 267], 20040614: [152, 272], 20040705: [146, 275], 20040719: [153, 261], 20040809: [159, 256], 20040816: [162, 241], 20040830: [163, 241], 20040906: [163, 241], 20040913: [167, 233], 20040920:
[166, 242], 20041025: [161, 258], 20041101: [159, 262], 20050103: [168, 262], 20050117: [165, 267], 20050124: [165, 267], 20050131: [153, 291], 20050214: [128, 346], 20050221: [120, 377], 20050304: [118, 377], 20050328: [117, 362], 20050418: [113, 397], 20050502: [99, 421], 20050523: [87, 467], 20050613: [73, 551], 20050620: [74, 547]}" 16 | Best_Rank_1,73 17 | Birth_Year_1,19850328.0 18 | Versus_1,"[['D', 20030721, 'atp_matches_2003_4782']]" 19 | Hand_1,R 20 | Last_Tournament_Date_1,20050613 21 | Height_1,183.0 22 | Matches_1,"[['V', 'atp_matches_qual_chall_2003_3466'], ['D', 'atp_matches_qual_chall_2003_3481'], ['D', 'atp_matches_2003_4049'], ['V', 'atp_matches_2003_4315'], ['D', 'atp_matches_2003_4328'], ['V', 'atp_matches_2003_4773'], ['D', 'atp_matches_2003_4782'], ['V', 'atp_matches_qual_chall_2003_5408'], ['V', 'atp_matches_qual_chall_2003_5419'], ['V', 'atp_matches_qual_chall_2003_5424'], ['V', 'atp_matches_qual_chall_2003_5427'], ['V', 'atp_matches_qual_chall_2003_5428'], ['V', 'atp_matches_qual_chall_2003_5558'], ['V', 'atp_matches_qual_chall_2003_5571'], ['V', 'atp_matches_qual_chall_2003_5578'], ['V', 'atp_matches_qual_chall_2003_5581'], ['V', 'atp_matches_qual_chall_2003_5583'], ['V', 'atp_matches_qual_chall_2003_5889'], ['V', 'atp_matches_qual_chall_2003_5903'], ['D', 'atp_matches_qual_chall_2003_5910'], ['D', 'atp_matches_qual_chall_2003_6734'], ['D', 'atp_matches_2003_7265'], ['D', 'atp_matches_2004_5'], ['D', 'atp_matches_qual_chall_2004_423'], ['D', 'atp_matches_2004_630'], ['D', 'atp_matches_qual_chall_2004_752'], ['V', 'atp_matches_qual_chall_2004_1154'], ['V', 'atp_matches_qual_chall_2004_1163'], ['D', 'atp_matches_qual_chall_2004_1167'], ['V', 'atp_matches_qual_chall_2004_2073'], ['D', 'atp_matches_qual_chall_2004_2085'], ['V', 'atp_matches_qual_chall_2004_2287'], ['V', 'atp_matches_qual_chall_2004_2301'], ['V', 'atp_matches_qual_chall_2004_2308'], ['V', 'atp_matches_qual_chall_2004_2312'], ['V', 'atp_matches_qual_chall_2004_2314'], ['D', 
'atp_matches_2004_2416'], ['D', 'atp_matches_qual_chall_2004_2596'], ['D', 'atp_matches_qual_chall_2004_3191'], ['D', 'atp_matches_qual_chall_2004_3236'], ['V', 'atp_matches_qual_chall_2004_3481'], ['D', 'atp_matches_qual_chall_2004_3490'], ['V', 'atp_matches_qual_chall_2004_3600'], ['D', 'atp_matches_qual_chall_2004_3611'], ['D', 'atp_matches_2004_4123'], ['D', 'atp_matches_2004_4825'], ['V', 'atp_matches_qual_chall_2004_5381'], ['V', 'atp_matches_qual_chall_2004_5393'], ['V', 'atp_matches_qual_chall_2004_5399'], ['V', 'atp_matches_qual_chall_2004_5402'], ['D', 'atp_matches_qual_chall_2004_5403'], ['V', 'atp_matches_qual_chall_2004_5629'], ['V', 'atp_matches_qual_chall_2004_5641'], ['V', 'atp_matches_qual_chall_2004_5647'], ['V', 'atp_matches_qual_chall_2004_5650'], ['V', 'atp_matches_qual_chall_2004_5651'], ['D', 'atp_matches_qual_chall_2004_6096'], ['V', 'atp_matches_qual_chall_2004_6148'], ['D', 'atp_matches_qual_chall_2004_6335'], ['D', 'atp_matches_2004_6494'], ['V', 'atp_matches_qual_chall_2004_6580'], ['V', 'atp_matches_qual_chall_2004_6593'], ['D', 'atp_matches_qual_chall_2004_6600'], ['D', 'atp_matches_2004_7371'], ['D', 'atp_matches_qual_chall_2004_7567'], ['D', 'atp_matches_2005_70'], ['D', 'atp_matches_qual_chall_2005_393'], ['V', 'atp_matches_qual_chall_2005_427'], ['V', 'atp_matches_qual_chall_2005_560'], ['V', 'atp_matches_qual_chall_2005_570'], ['V', 'atp_matches_qual_chall_2005_575'], ['D', 'atp_matches_qual_chall_2005_578'], ['V', 'atp_matches_qual_chall_2005_772'], ['V', 'atp_matches_qual_chall_2005_785'], ['V', 'atp_matches_qual_chall_2005_791'], ['V', 'atp_matches_qual_chall_2005_794'], ['D', 'atp_matches_qual_chall_2005_796'], ['V', 'atp_matches_2005_1046'], ['D', 'atp_matches_2005_1061'], ['D', 'atp_matches_qual_chall_2005_1150'], ['D', 'atp_matches_2005_1298'], ['D', 'atp_matches_2005_1299'], ['V', 'atp_matches_qual_chall_2005_1907'], ['V', 'atp_matches_qual_chall_2005_1918'], ['V', 'atp_matches_qual_chall_2005_1924'], ['D', 
'atp_matches_qual_chall_2005_1927'], ['V', 'atp_matches_2005_2334'], ['V', 'atp_matches_2005_2358'], ['V', 'atp_matches_2005_2373'], ['D', 'atp_matches_2005_2381'], ['V', 'atp_matches_2005_2630'], ['D', 'atp_matches_2005_2661'], ['V', 'atp_matches_2005_3180'], ['V', 'atp_matches_2005_3214'], ['D', 'atp_matches_2005_3231'], ['V', 'atp_matches_qual_chall_2005_3262'], ['V', 'atp_matches_qual_chall_2005_3293'], ['V', 'atp_matches_qual_chall_2005_3355'], ['D', 'atp_matches_2005_3757'], ['D', 'atp_matches_2005_3914']]" 23 | Matches_Clay_1,"['V', 'D', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'D', 'V', 'D', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V']" 24 | Matches_Carpet_1,"['D', 'D', 'D']" 25 | Matches_Grass_1,"['D', 'D']" 26 | Matches_Hard_1,"['D', 'D', 'D', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'D', 'D', 'D']" 27 | Victories_Percentage_1,57.99999999999999 28 | Clay_Victories_Percentage_1,66.66666666666666 29 | Carpet_Victories_Percentage_1,0.0 30 | Grass_Victories_Percentage_1,0.0 31 | Hard_Victories_Percentage_1,46.15384615384615 32 | Aces_Percentage_1,7.41825280624695 33 | Doublefaults_Percentage_1,4.978038067349927 34 | First_Serve_Success_Percentage_1,57.10102489019033 35 | Winning_on_1st_Serve_Percentage_1,51.87896534895072 36 | Winning_on_2nd_Serve_Percentage_1,21.57149829184968 37 | Overall_Win_on_Serve_Percentage_1,73.4504636408004 38 | BreakPoint_Face_Percentage_1,9.809663250366032 39 | BreakPoint_Saved_Percentage_1,6.149341142020498 40 | games_fatigue_1,26.666666666666668 41 | minutes_fatigue_1,209.44444444444449 42 | last_rankings_1,"[74, 153, 163, 166, 161, 159, 9999, 153, 120, 117, 113, 87]" 43 | last_ranking_points_1,"[547, 261, 241, 242, 258, 262, 0, 291, 377, 
362, 397, 467]" 44 | Name_2,Filippo.Volandri 45 | ID_2,103835 46 | Ranking_2,32 47 | Ranking_Points_2,1060 48 | Ranking_History_2,"{19990329: [354, 93], 19990719: [360, 94], 19990809: [452, 63], 19990816: [433, 70], 19991025: [274, 134], 20000214: [263, 119], 20000221: [264, 120], 20000228: [247, 132], 20000313: [281, 110], 20000327: [269, 117], 20000515: [294, 105], 20000522: [257, 127], 20000612: [248, 128], 20000626: [253, 129], 20000710: [245, 129], 20000724: [252, 129], 20000731: [242, 140], 20000807: [242, 140], 20000821: [210, 169], 20000828: [208, 174], 20000904: [208, 174], 20000918: [219, 161], 20000925: [219, 161], 20001009: [152, 252], 20001023: [152, 249], 20010115: [163, 239], 20010205: [161, 239], 20010212: [161, 239], 20010219: [152, 250], 20010226: [158, 244], 20010312: [152, 253], 20010406: [159, 247], 20010507: [156, 247], 20010514: [157, 252], 20010528: [163, 236], 20010604: [163, 236], 20010611: [138, 286], 20010625: [138, 286], 20010702: [138, 286], 20010709: [126, 315], 20010730: [125, 319], 20010806: [126, 319], 20010813: [138, 289], 20010820: [137, 289], 20010827: [139, 289], 20010903: [139, 289], 20010921: [141, 277], 20010924: [145, 277], 20011001: [199, 190], 20011008: [200, 190], 20011126: [212, 175], 20020318: [243, 146], 20020325: [243, 146], 20020422: [209, 171], 20020429: [202, 182], 20020506: [181, 216], 20020513: [179, 215], 20020527: [170, 222], 20020603: [170, 222], 20020610: [197, 193], 20020617: [196, 193], 20020624: [184, 198], 20020701: [184, 198], 20020708: [183, 201], 20020722: [190, 186], 20020729: [194, 186], 20020805: [191, 186], 20020812: [189, 193], 20020826: [163, 229], 20020902: [163, 229], 20020909: [165, 229], 20020923: [158, 233], 20020930: [154, 238], 20021007: [153, 242], 20021014: [155, 242], 20021125: [154, 244], 20030113: [158, 244], 20030224: [150, 247], 20030317: [149, 257], 20030324: [149, 257], 20030404: [129, 285], 20030407: [128, 285], 20030414: [131, 285], 20030421: [98, 420], 20030505: [100, 408], 
20030512: [79, 528], 20030526: [79, 535], 20030609: [80, 536], 20030623: [69, 607], 20030630: [69, 607], 20030707: [68, 614], 20030714: [60, 649], 20030721: [53, 704], 20030728: [47, 818], 20030825: [47, 775], 20030919: [48, 768], 20030922: [47, 768], 20030929: [49, 773], 20031006: [49, 773], 20031020: [49, 772], 20040105: [47, 772], 20040119: [46, 772], 20040209: [44, 799], 20040216: [45, 834], 20040223: [43, 844], 20040301: [44, 844], 20040308: [42, 889], 20040419: [51, 704], 20040503: [54, 679], 20040510: [58, 629], 20040517: [61, 614], 20040524: [47, 774], 20040607: [45, 766], 20040621: [52, 701], 20040705: [51, 710], 20040716: [58, 685], 20040719: [65, 630], 20040726: [65, 630], 20040802: [66, 633], 20040816: [59, 685], 20040830: [60, 685], 20040913: [53, 715], 20040924: [46, 775], 20040927: [45, 775], 20041011: [37, 880], 20041025: [39, 890], 20041101: [40, 890], 20050103: [43, 880], 20050110: [42, 880], 20050117: [41, 880], 20050131: [43, 850], 20050207: [40, 900], 20050214: [41, 885], 20050221: [42, 885], 20050307: [40, 885], 20050321: [41, 885], 20050404: [42, 890], 20050411: [37, 940], 20050428: [31, 1060], 20050502: [31, 1060], 20050509: [31, 1020], 20050523: [34, 990], 20050620: [32, 1060], 20050627: [32, 1060]}" 49 | Best_Rank_2,31 50 | Birth_Year_2,19810905.0 51 | Versus_2,"[['V', 20030721, 'atp_matches_2003_4782']]" 52 | Hand_2,R 53 | Last_Tournament_Date_2,20050620 54 | Height_2,183.0 55 | Matches_2,"[['D', 'atp_matches_qual_chall_1999_1308'], ['D', 'atp_matches_qual_chall_1999_3765'], ['V', 'atp_matches_qual_chall_1999_4422'], ['D', 'atp_matches_qual_chall_1999_4431'], ['D', 'atp_matches_qual_chall_1999_4654'], ['V', 'atp_matches_qual_chall_1999_6165'], ['D', 'atp_matches_qual_chall_1999_6174'], ['D', 'atp_matches_qual_chall_2000_917'], ['V', 'atp_matches_qual_chall_2000_1002'], ['V', 'atp_matches_qual_chall_2000_1017'], ['D', 'atp_matches_qual_chall_2000_1024'], ['V', 'atp_matches_qual_chall_2000_1195'], ['D', 'atp_matches_qual_chall_2000_1206'], 
['V', 'atp_matches_qual_chall_2000_1504'], ['D', 'atp_matches_qual_chall_2000_1516'], ['D', 'atp_matches_qual_chall_2000_1667'], ['V', 'atp_matches_qual_chall_2000_2497'], ['V', 'atp_matches_qual_chall_2000_2506'], ['V', 'atp_matches_qual_chall_2000_2510'], ['D', 'atp_matches_qual_chall_2000_2512'], ['D', 'atp_matches_qual_chall_2000_2676'], ['D', 'atp_matches_qual_chall_2000_3244'], ['D', 'atp_matches_qual_chall_2000_3668'], ['D', 'atp_matches_qual_chall_2000_3986'], ['V', 'atp_matches_qual_chall_2000_4308'], ['V', 'atp_matches_qual_chall_2000_4320'], ['D', 'atp_matches_qual_chall_2000_4326'], ['D', 'atp_matches_qual_chall_2000_4633'], ['V', 'atp_matches_qual_chall_2000_4847'], ['V', 'atp_matches_qual_chall_2000_4862'], ['V', 'atp_matches_qual_chall_2000_4869'], ['V', 'atp_matches_qual_chall_2000_4873'], ['D', 'atp_matches_qual_chall_2000_4875'], ['V', 'atp_matches_qual_chall_2000_5257'], ['D', 'atp_matches_qual_chall_2000_5269'], ['D', 'atp_matches_qual_chall_2000_5351'], ['V', 'atp_matches_qual_chall_2000_5711'], ['V', 'atp_matches_qual_chall_2000_5724'], ['V', 'atp_matches_qual_chall_2000_5731'], ['D', 'atp_matches_qual_chall_2000_5734'], ['V', 'atp_matches_qual_chall_2000_5874'], ['V', 'atp_matches_qual_chall_2000_5883'], ['V', 'atp_matches_qual_chall_2000_5888'], ['V', 'atp_matches_qual_chall_2000_5890'], ['V', 'atp_matches_qual_chall_2000_5891'], ['V', 'atp_matches_qual_chall_2000_6085'], ['V', 'atp_matches_qual_chall_2000_6098'], ['D', 'atp_matches_qual_chall_2000_6105'], ['V', 'atp_matches_qual_chall_2000_6332'], ['V', 'atp_matches_qual_chall_2000_6344'], ['D', 'atp_matches_qual_chall_2000_6350'], ['D', 'atp_matches_qual_chall_2000_6729'], ['D', 'atp_matches_qual_chall_2001_421'], ['D', 'atp_matches_qual_chall_2001_615'], ['V', 'atp_matches_qual_chall_2001_830'], ['V', 'atp_matches_qual_chall_2001_846'], ['D', 'atp_matches_qual_chall_2001_854'], ['V', 'atp_matches_qual_chall_2001_896'], ['D', 'atp_matches_qual_chall_2001_910'], ['V', 
'atp_matches_qual_chall_2001_1264'], ['V', 'atp_matches_qual_chall_2001_1272'], ['D', 'atp_matches_qual_chall_2001_1276'], ['D', 'atp_matches_qual_chall_2001_1506'], ['D', 'atp_matches_2001_1801'], ['D', 'atp_matches_2001_2340'], ['V', 'atp_matches_qual_chall_2001_2490'], ['D', 'atp_matches_qual_chall_2001_2503'], ['D', 'atp_matches_qual_chall_2001_2886'], ['V', 'atp_matches_qual_chall_2001_2935'], ['V', 'atp_matches_qual_chall_2001_3001'], ['V', 'atp_matches_qual_chall_2001_3015'], ['V', 'atp_matches_qual_chall_2001_3022'], ['D', 'atp_matches_qual_chall_2001_3025'], ['V', 'atp_matches_qual_chall_2001_3042'], ['V', 'atp_matches_qual_chall_2001_3051'], ['V', 'atp_matches_qual_chall_2001_3055'], ['D', 'atp_matches_qual_chall_2001_3057'], ['D', 'atp_matches_qual_chall_2001_3221'], ['V', 'atp_matches_qual_chall_2001_3733'], ['V', 'atp_matches_qual_chall_2001_3745'], ['D', 'atp_matches_qual_chall_2001_3751'], ['V', 'atp_matches_qual_chall_2001_3823'], ['V', 'atp_matches_qual_chall_2001_3836'], ['D', 'atp_matches_qual_chall_2001_3843'], ['V', 'atp_matches_qual_chall_2001_4050'], ['V', 'atp_matches_qual_chall_2001_4058'], ['D', 'atp_matches_qual_chall_2001_4062'], ['D', 'atp_matches_qual_chall_2001_4769'], ['V', 'atp_matches_qual_chall_2001_4884'], ['D', 'atp_matches_qual_chall_2001_4900'], ['D', 'atp_matches_qual_chall_2001_5222'], ['V', 'atp_matches_qual_chall_2001_5377'], ['D', 'atp_matches_qual_chall_2001_5387'], ['V', 'atp_matches_qual_chall_2001_5764'], ['V', 'atp_matches_qual_chall_2001_5778'], ['D', 'atp_matches_qual_chall_2001_5785'], ['D', 'atp_matches_qual_chall_2001_5834'], ['V', 'atp_matches_2001_6354'], ['V', 'atp_matches_2001_6357'], ['D', 'atp_matches_2001_6381'], ['D', 'atp_matches_qual_chall_2001_6778'], ['D', 'atp_matches_qual_chall_2001_6910'], ['D', 'atp_matches_qual_chall_2001_7787'], ['V', 'atp_matches_qual_chall_2002_1705'], ['V', 'atp_matches_qual_chall_2002_1719'], ['V', 'atp_matches_qual_chall_2002_1726'], ['D', 
'atp_matches_qual_chall_2002_1730'], ['V', 'atp_matches_qual_chall_2002_1773'], ['D', 'atp_matches_qual_chall_2002_1784'], ['V', 'atp_matches_qual_chall_2002_2263'], ['V', 'atp_matches_qual_chall_2002_2272'], ['D', 'atp_matches_qual_chall_2002_2276'], ['V', 'atp_matches_qual_chall_2002_2317'], ['V', 'atp_matches_qual_chall_2002_2330'], ['V', 'atp_matches_qual_chall_2002_2336'], ['V', 'atp_matches_qual_chall_2002_2339'], ['D', 'atp_matches_qual_chall_2002_2341'], ['D', 'atp_matches_2002_2478'], ['V', 'atp_matches_qual_chall_2002_2597'], ['V', 'atp_matches_qual_chall_2002_2609'], ['D', 'atp_matches_qual_chall_2002_2615'], ['V', 'atp_matches_qual_chall_2002_2866'], ['V', 'atp_matches_qual_chall_2002_2881'], ['D', 'atp_matches_qual_chall_2002_2889'], ['D', 'atp_matches_qual_chall_2002_3046'], ['V', 'atp_matches_qual_chall_2002_3085'], ['V', 'atp_matches_qual_chall_2002_3177'], ['D', 'atp_matches_qual_chall_2002_3187'], ['D', 'atp_matches_qual_chall_2002_3320'], ['V', 'atp_matches_qual_chall_2002_3440'], ['D', 'atp_matches_qual_chall_2002_3455'], ['V', 'atp_matches_qual_chall_2002_3832'], ['V', 'atp_matches_qual_chall_2002_3848'], ['D', 'atp_matches_qual_chall_2002_3856'], ['V', 'atp_matches_qual_chall_2002_3929'], ['V', 'atp_matches_qual_chall_2002_3943'], ['V', 'atp_matches_qual_chall_2002_3950'], ['D', 'atp_matches_qual_chall_2002_3953'], ['D', 'atp_matches_qual_chall_2002_4087'], ['D', 'atp_matches_qual_chall_2002_4410'], ['D', 'atp_matches_qual_chall_2002_4863'], ['V', 'atp_matches_qual_chall_2002_4897'], ['V', 'atp_matches_qual_chall_2002_4909'], ['D', 'atp_matches_qual_chall_2002_4915'], ['V', 'atp_matches_qual_chall_2002_5081'], ['V', 'atp_matches_qual_chall_2002_5094'], ['V', 'atp_matches_qual_chall_2002_5101'], ['V', 'atp_matches_qual_chall_2002_5104'], ['D', 'atp_matches_qual_chall_2002_5106'], ['D', 'atp_matches_qual_chall_2002_5616'], ['V', 'atp_matches_qual_chall_2002_5639'], ['V', 'atp_matches_qual_chall_2002_5649'], ['D', 
'atp_matches_qual_chall_2002_5654'], ['D', 'atp_matches_qual_chall_2002_5704'], ['V', 'atp_matches_qual_chall_2002_5786'], ['D', 'atp_matches_qual_chall_2002_5800'], ['D', 'atp_matches_2002_6105'], ['V', 'atp_matches_qual_chall_2002_6441'], ['D', 'atp_matches_qual_chall_2002_6452'], ['D', 'atp_matches_qual_chall_2002_6525'], ['V', 'atp_matches_qual_chall_2002_6636'], ['D', 'atp_matches_qual_chall_2002_6650'], ['D', 'atp_matches_qual_chall_2002_7293'], ['D', 'atp_matches_qual_chall_2003_324'], ['V', 'atp_matches_qual_chall_2003_351'], ['V', 'atp_matches_qual_chall_2003_405'], ['D', 'atp_matches_2003_1184'], ['V', 'atp_matches_qual_chall_2003_1621'], ['V', 'atp_matches_qual_chall_2003_1630'], ['V', 'atp_matches_qual_chall_2003_1634'], ['V', 'atp_matches_qual_chall_2003_1636'], ['V', 'atp_matches_qual_chall_2003_1637'], ['D', 'atp_matches_qual_chall_2003_1644'], ['D', 'atp_matches_2003_1745'], ['D', 'atp_matches_2003_1748'], ['V', 'atp_matches_qual_chall_2003_1788'], ['D', 'atp_matches_qual_chall_2003_1802'], ['V', 'atp_matches_2003_1954'], ['V', 'atp_matches_2003_1979'], ['V', 'atp_matches_2003_1991'], ['D', 'atp_matches_2003_1997'], ['V', 'atp_matches_2003_2066'], ['V', 'atp_matches_2003_2090'], ['D', 'atp_matches_2003_2105'], ['V', 'atp_matches_2003_2415'], ['V', 'atp_matches_2003_2439'], ['V', 'atp_matches_2003_2451'], ['D', 'atp_matches_2003_2457'], ['D', 'atp_matches_2003_2520'], ['D', 'atp_matches_2003_2846'], ['D', 'atp_matches_qual_chall_2003_2941'], ['V', 'atp_matches_qual_chall_2003_2973'], ['V', 'atp_matches_qual_chall_2003_3037'], ['V', 'atp_matches_qual_chall_2003_3356'], ['V', 'atp_matches_qual_chall_2003_3364'], ['V', 'atp_matches_qual_chall_2003_3368'], ['V', 'atp_matches_qual_chall_2003_3370'], ['V', 'atp_matches_qual_chall_2003_3371'], ['D', 'atp_matches_2003_3636'], ['V', 'atp_matches_qual_chall_2003_3877'], ['V', 'atp_matches_qual_chall_2003_3885'], ['V', 'atp_matches_qual_chall_2003_3889'], ['D', 'atp_matches_qual_chall_2003_3891'], ['V', 
'atp_matches_2003_4123'], ['V', 'atp_matches_2003_4132'], ['D', 'atp_matches_2003_4137'], ['V', 'atp_matches_2003_4344'], ['V', 'atp_matches_2003_4360'], ['V', 'atp_matches_2003_4374'], ['D', 'atp_matches_2003_4381'], ['V', 'atp_matches_2003_4772'], ['V', 'atp_matches_2003_4782'], ['V', 'atp_matches_2003_4787'], ['V', 'atp_matches_2003_4789'], ['D', 'atp_matches_2003_4790'], ['D', 'atp_matches_2003_5036'], ['D', 'atp_matches_2003_5655'], ['D', 'atp_matches_2003_6509'], ['V', 'atp_matches_2003_6587'], ['D', 'atp_matches_2003_6597'], ['D', 'atp_matches_2003_6884'], ['D', 'atp_matches_2003_6973'], ['D', 'atp_matches_2003_7273'], ['D', 'atp_matches_2004_34'], ['V', 'atp_matches_2004_232'], ['D', 'atp_matches_2004_273'], ['V', 'atp_matches_2004_842'], ['V', 'atp_matches_2004_852'], ['D', 'atp_matches_2004_857'], ['V', 'atp_matches_2004_928'], ['D', 'atp_matches_2004_941'], ['V', 'atp_matches_2004_1050'], ['D', 'atp_matches_2004_1064'], ['V', 'atp_matches_2004_1174'], ['V', 'atp_matches_2004_1188'], ['D', 'atp_matches_2004_1195'], ['D', 'atp_matches_2004_1269'], ['D', 'atp_matches_2004_2180'], ['V', 'atp_matches_2004_2507'], ['V', 'atp_matches_2004_2532'], ['D', 'atp_matches_2004_2545'], ['D', 'atp_matches_2004_2620'], ['V', 'atp_matches_2004_2747'], ['V', 'atp_matches_2004_2761'], ['V', 'atp_matches_2004_2768'], ['V', 'atp_matches_2004_2772'], ['V', 'atp_matches_2004_2774'], ['D', 'atp_matches_2004_2962'], ['D', 'atp_matches_2004_3507'], ['V', 'atp_matches_2004_3728'], ['D', 'atp_matches_2004_3785'], ['V', 'atp_matches_2004_4172'], ['D', 'atp_matches_2004_4187'], ['V', 'atp_matches_2004_4567'], ['V', 'atp_matches_2004_4569'], ['V', 'atp_matches_2004_4817'], ['V', 'atp_matches_2004_4831'], ['V', 'atp_matches_2004_4838'], ['V', 'atp_matches_2004_4841'], ['D', 'atp_matches_2004_4843'], ['V', 'atp_matches_qual_chall_2004_5093'], ['D', 'atp_matches_qual_chall_2004_5109'], ['V', 'atp_matches_qual_chall_2004_5124'], ['V', 'atp_matches_qual_chall_2004_5140'], ['V', 
'atp_matches_qual_chall_2004_5148'], ['V', 'atp_matches_qual_chall_2004_5152'], ['V', 'atp_matches_qual_chall_2004_5154'], ['D', 'atp_matches_2004_5716'], ['V', 'atp_matches_2004_5994'], ['D', 'atp_matches_2004_6027'], ['V', 'atp_matches_2004_6485'], ['V', 'atp_matches_2004_6499'], ['V', 'atp_matches_2004_6506'], ['D', 'atp_matches_2004_6510'], ['V', 'atp_matches_2004_6661'], ['D', 'atp_matches_2004_6663'], ['V', 'atp_matches_2004_6772'], ['V', 'atp_matches_2004_6786'], ['V', 'atp_matches_2004_6793'], ['V', 'atp_matches_2004_6797'], ['D', 'atp_matches_2004_6799'], ['V', 'atp_matches_2004_7231'], ['D', 'atp_matches_2004_7241'], ['D', 'atp_matches_2004_7364'], ['D', 'atp_matches_2004_7528'], ['D', 'atp_matches_2005_65'], ['D', 'atp_matches_2005_188'], ['D', 'atp_matches_2005_297'], ['V', 'atp_matches_2005_715'], ['V', 'atp_matches_2005_725'], ['V', 'atp_matches_2005_730'], ['D', 'atp_matches_2005_733'], ['D', 'atp_matches_2005_893'], ['D', 'atp_matches_2005_1118'], ['V', 'atp_matches_2005_1274'], ['V', 'atp_matches_2005_1284'], ['D', 'atp_matches_2005_1289'], ['D', 'atp_matches_2005_1432'], ['D', 'atp_matches_2005_1770'], ['V', 'atp_matches_2005_1944'], ['V', 'atp_matches_2005_1952'], ['V', 'atp_matches_2005_1956'], ['D', 'atp_matches_2005_1958'], ['V', 'atp_matches_2005_2141'], ['V', 'atp_matches_2005_2160'], ['V', 'atp_matches_2005_2169'], ['D', 'atp_matches_2005_2174'], ['V', 'atp_matches_2005_2544'], ['D', 'atp_matches_2005_2545'], ['V', 'atp_matches_2005_2650'], ['D', 'atp_matches_2005_2671'], ['V', 'atp_matches_2005_2767'], ['V', 'atp_matches_2005_2792'], ['V', 'atp_matches_2005_2804'], ['D', 'atp_matches_2005_2810'], ['V', 'atp_matches_2005_3181'], ['V', 'atp_matches_2005_3215'], ['D', 'atp_matches_2005_3232'], ['D', 'atp_matches_2005_3931'], ['V', 'atp_matches_qual_chall_2005_4150'], ['V', 'atp_matches_qual_chall_2005_4166'], ['V', 'atp_matches_qual_chall_2005_4174'], ['D', 'atp_matches_qual_chall_2005_4178']]" 56 | Matches_Clay_2,"['D', 'D', 'V', 'D', 'V', 
'D', 'D', 'V', 'V', 'V', 'D', 'D', 'D', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'D', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'D', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'D', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D']" 57 | Matches_Carpet_2,"['D', 'D', 'D', 'D', 'D', 'D', 'D', 'V', 'D', 'D', 'D']" 58 | Matches_Grass_2,"['D', 'D', 'V', 'D', 'D']" 59 | Matches_Hard_2,"['D', 'V', 'D', 'V', 'V', 'D', 'V', 'D', 'D', 'D', 'D', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'V', 'D', 'D', 'D', 'D', 'V', 'D', 'D', 'D', 'V', 'D', 'D', 'D', 'D', 'D', 'D']" 60 | Victories_Percentage_2,56.03715170278638 61 | Clay_Victories_Percentage_2,61.36363636363637 62 | Carpet_Victories_Percentage_2,9.090909090909092 63 | Grass_Victories_Percentage_2,20.0 64 | Hard_Victories_Percentage_2,39.53488372093023 65 | Aces_Percentage_2,1.2510230328539695 66 | Doublefaults_Percentage_2,3.7881445106980007 67 | 
First_Serve_Success_Percentage_2,66.30422074126038 68 | Winning_on_1st_Serve_Percentage_2,65.75470595112826 69 | Winning_on_2nd_Serve_Percentage_2,16.49713550800889 70 | Overall_Win_on_Serve_Percentage_2,82.25184145913715 71 | BreakPoint_Face_Percentage_2,10.312171168011224 72 | BreakPoint_Saved_Percentage_2,5.869285630772828 73 | games_fatigue_2,58.0 74 | minutes_fatigue_2, 75 | last_rankings_2,"[32, 65, 60, 45, 39, 40, 9999, 43, 42, 41, 31, 34]" 76 | last_ranking_points_2,"[1060, 630, 685, 775, 890, 890, 0, 850, 885, 885, 1060, 990]" 77 | Winner,0 78 | score,6-1 2-6 6-4 79 | elapsed_minutes,100.0 80 | aces_nb_1,4.0 81 | doublefaults_nb_1,5.0 82 | svpt_1,68.0 83 | 1stIn_1,36.0 84 | 1stWon_1,27.0 85 | 2ndWon_1,16.0 86 | SvGms_1,13.0 87 | bpSaved_1,5.0 88 | bpFaced_1,9.0 89 | aces_nb_2,0.0 90 | doublefaults_nb_2,6.0 91 | svpt_2,93.0 92 | 1stIn_2,63.0 93 | 1stWon_2,33.0 94 | 2ndWon_2,17.0 95 | SvGms_2,12.0 96 | bpSaved_2,6.0 97 | bpFaced_2,11.0 98 | tournament_year,2005 99 | --------------------------------------------------------------------------------