├── .gitignore
├── robot.png
├── tennis_robot.png
├── tennis_robot_2.png
├── examples
│   ├── data
│   │   ├── nb_matches.png
│   │   ├── Best_player_win_percentage.png
│   │   ├── stan_the_man_win_percentage.png
│   │   ├── stanimal_aces_percentage_difference.png
│   │   ├── data_row_example.csv
│   │   ├── data_loading.py
│   │   └── single_row_example.csv
│   ├── results_reading
│   │   ├── win_per_surface.png
│   │   ├── models_performances.png
│   │   ├── precision_percentage_players_ranks.png
│   │   ├── models_comparison.py
│   │   └── best_model.py
│   ├── history_modeling
│   │   ├── 2d_pca_match_representation.png
│   │   ├── 2d_pca_match_representation_test.png
│   │   ├── first_example.py
│   │   ├── history_encoding.py
│   │   ├── pca_representation.py
│   │   ├── pca_match_encoder_train.py
│   │   └── train_test.py
│   └── models
│       ├── dl_train_test.py
│       ├── prediction.py
│       ├── train_test.py
│       ├── train_test_eval.py
│       ├── deep_history.py
│       └── grid_search.py
├── envs
│   ├── minimal_env.yml
│   └── requirements.txt
├── .gitmodules
├── .github
│   ├── dependabot.yml
│   └── workflows
│       └── black_action.yml
├── python
│   ├── model
│   │   ├── base_model.py
│   │   ├── lgbm.py
│   │   ├── dumb_models.py
│   │   ├── sk_model.py
│   │   ├── xgboost.py
│   │   └── deep_model.py
│   ├── data
│   │   ├── data_utils.py
│   │   ├── data_encoding.py
│   │   ├── match.py
│   │   ├── data_loader.py
│   │   └── player.py
│   ├── history_modeling
│   │   ├── encoding_model.py
│   │   └── match_representation.py
│   └── evaluation
│       └── train_test.py
├── LICENSE.md
├── notes.txt
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | DataBase/
3 |
4 | \.idea/
5 | cache/
6 | results/
7 | *.pyc
--------------------------------------------------------------------------------
/robot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/robot.png
--------------------------------------------------------------------------------
/tennis_robot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/tennis_robot.png
--------------------------------------------------------------------------------
/tennis_robot_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/tennis_robot_2.png
--------------------------------------------------------------------------------
/examples/data/nb_matches.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/data/nb_matches.png
--------------------------------------------------------------------------------
/envs/minimal_env.yml:
--------------------------------------------------------------------------------
1 | name: min_tennis
2 | channels:
3 | - defaults
4 | dependencies:
5 | - pandas
6 | - matplotlib
7 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "submodules/tennis_atp"]
2 | path = submodules/tennis_atp
3 | url = https://github.com/JeffSackmann/tennis_atp.git
4 |
--------------------------------------------------------------------------------
/examples/data/Best_player_win_percentage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/data/Best_player_win_percentage.png
--------------------------------------------------------------------------------
/examples/data/stan_the_man_win_percentage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/data/stan_the_man_win_percentage.png
--------------------------------------------------------------------------------
/examples/results_reading/win_per_surface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/results_reading/win_per_surface.png
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "gitsubmodule"
4 | directory: "/"
5 | schedule:
6 | interval: "weekly"
7 |
--------------------------------------------------------------------------------
/examples/results_reading/models_performances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/results_reading/models_performances.png
--------------------------------------------------------------------------------
/examples/data/stanimal_aces_percentage_difference.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/data/stanimal_aces_percentage_difference.png
--------------------------------------------------------------------------------
/envs/requirements.txt:
--------------------------------------------------------------------------------
1 | lightgbm==3.3.5
2 | matplotlib==3.5.1
3 | numpy==1.22.3
4 | pandas==1.5.2
5 | scikit_learn==1.2.2
6 | tensorflow==2.8.0
7 | tqdm==4.65.0
8 | xgboost==1.7.4
--------------------------------------------------------------------------------
/examples/history_modeling/2d_pca_match_representation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/history_modeling/2d_pca_match_representation.png
--------------------------------------------------------------------------------
/examples/history_modeling/2d_pca_match_representation_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/history_modeling/2d_pca_match_representation_test.png
--------------------------------------------------------------------------------
/examples/results_reading/precision_percentage_players_ranks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/results_reading/precision_percentage_players_ranks.png
--------------------------------------------------------------------------------
/python/model/base_model.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 |
3 |
class BaseModel:
    """Common interface shared by the prediction models of this package.

    Subclasses are expected to override ``fit`` and ``predict``.

    Note: the class does not use ``abc.ABCMeta``, so the ``@abstractmethod``
    markers below are informative only. Python will not prevent instantiating
    this class or a subclass that leaves a method un-overridden.
    """

    def __init__(self):
        pass

    @abstractmethod
    def fit(self, X, y=None):
        """Train the model.

        Args:
            X: training features.
            y: training targets. Optional here so the base signature is
                compatible with every subclass (they all take ``X, y``) while
                still accepting the historical single-argument call.
        """
        pass

    @abstractmethod
    def predict(self, X):
        """Return the predictions of the model for the features ``X``."""
        pass
15 |
16 |
class DeepBaseModel(BaseModel):
    """Base class for models that must build an internal network on creation.

    The constructor calls ``instantiate_model()`` so that subclasses build
    their underlying model as soon as they are created.
    """

    def __init__(self):
        super().__init__()
        # Called without arguments: the hook must therefore be callable with
        # no positional parameter (see the default on ``X`` below).
        self.instantiate_model()

    @abstractmethod
    def instantiate_model(self, X=None):
        """Build the underlying model.

        Args:
            X: optional data a subclass may use to size the model. It defaults
                to ``None`` because ``__init__`` invokes this hook without any
                argument; the previous required ``X`` made that call fail for
                any subclass that followed the declared signature.
        """
        pass
25 |
--------------------------------------------------------------------------------
/python/model/lgbm.py:
--------------------------------------------------------------------------------
1 | import lightgbm as lgb
2 | import numpy as np
3 |
4 | from model.base_model import BaseModel
5 |
6 |
class LightGBM(BaseModel):
    """Thin wrapper around ``lightgbm.train`` exposing fit / predict / save.

    Args:
        params: parameter dictionary forwarded as-is to ``lgb.train``.
        num_rounds: number of boosting rounds forwarded to ``lgb.train``.
    """

    def __init__(self, params, num_rounds=10):
        super().__init__()
        self.params = params
        self.num_rounds = num_rounds
        # Set by ``fit``; initialised here so that the attribute always exists.
        self.model = None

    def _check_is_fitted(self):
        """Raise a clear error when the booster has not been trained yet."""
        if self.model is None:
            # AttributeError is kept on purpose: it is the exception type the
            # previous implementation raised (missing ``model`` attribute).
            raise AttributeError(
                "LightGBM model is not fitted yet: call fit(X, y) first."
            )

    def fit(self, X, y):
        """Train a booster on ``X`` / ``y`` and return it."""
        train_data = lgb.Dataset(X, label=y)
        self.model = lgb.train(self.params, train_data, self.num_rounds)
        return self.model

    def predict(self, X):
        """Return the booster outputs for ``X`` rounded to the nearest integer."""
        self._check_is_fitted()
        return np.round(self.model.predict(X), 0)

    def save(self, path):
        """Save the trained booster to ``path`` using ``save_model``."""
        self._check_is_fitted()
        self.model.save_model(path)
22 |
--------------------------------------------------------------------------------
/python/model/dumb_models.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from model.base_model import BaseModel
4 |
5 |
class BestRankedPlayerWins(BaseModel):
    """Baseline that predicts the player with the smallest ranking value."""

    def fit(self, X, y):
        """Nothing to learn: the rule only relies on the rankings."""
        pass

    def predict(self, X):
        """Predict, for every match, the index of the best ranked player.

        Args:
            X: either a table exposing ``iterrows()`` (e.g. a DataFrame) or a
                single row (e.g. a Series or dict) with the keys
                ``"Ranking_1"`` and ``"Ranking_2"``.

        Returns:
            A list with one single-element list per match, holding ``0`` when
            ``Ranking_1`` is the smallest (ties included) and ``1`` otherwise.
        """
        if hasattr(X, "iterrows"):
            rows = (row for _, row in X.iterrows())
        else:
            # A single match row was given: it has no ``iterrows`` method, so
            # it is treated as a one-row table instead of failing.
            rows = [X]
        return [
            [np.argmin([row["Ranking_1"], row["Ranking_2"]])] for row in rows
        ]
17 |
18 |
class RandomModel(BaseModel):
    """Baseline drawing each prediction uniformly at random in {0, 1}."""

    def fit(self, X, y):
        """Nothing to learn for a random baseline."""
        pass

    def predict(self, X):
        """Draw one random label per match.

        Args:
            X: a 2-D table of matches (one prediction per row) or a single
                match row (one prediction).

        Returns:
            A 1-D integer array of 0/1 values. For a single row the result has
            shape ``(1,)``, as before; for a table with ``n`` rows it now has
            shape ``(n,)`` instead of a single value for the whole table.
        """
        n_predictions = len(X) if np.ndim(X) == 2 else 1
        return np.random.randint(0, 2, n_predictions)
25 |
--------------------------------------------------------------------------------
/python/data/data_utils.py:
--------------------------------------------------------------------------------
def get_days_difference(prev_date, curr_date):
    """Approximate number of days between two ``YYYYMMDD`` dates.

    Both arguments are converted with ``str`` and sliced as year ``[:4]``,
    month ``[4:6]`` and day ``[6:8]``. The result is an approximation: every
    year counts for 365 days, every month for 30 days, and a constant offset
    of 2 is added to the total.
    """

    def _split(date):
        text = str(date)
        return int(text[:4]), int(text[4:6]), int(text[6:8])

    prev_year, prev_month, prev_day = _split(prev_date)
    curr_year, curr_month, curr_day = _split(curr_date)

    elapsed = (curr_year - prev_year) * 365
    elapsed += (curr_month - prev_month) * 30
    elapsed += curr_day - prev_day
    return elapsed + 2
10 |
11 |
def reverse_score(score):
    """Return ``score`` seen from the other player's side.

    The score is converted with ``str`` and split on single spaces; inside
    every chunk the parts separated by ``"-"`` are put in reverse order.
    Chunks without any ``"-"`` are left untouched.
    """
    flipped_sets = (
        "-".join(reversed(one_set.split("-"))) for one_set in str(score).split(" ")
    )
    return " ".join(flipped_sets)
20 |
--------------------------------------------------------------------------------
/python/model/sk_model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from sklearn.preprocessing import StandardScaler
4 | from sklearn.svm import SVC
5 |
6 | from model.base_model import BaseModel
7 |
8 |
class ScalerSVC(BaseModel):
    """Support vector classifier trained on standardised features.

    Args:
        C, kernel, degree, gamma, tol: forwarded unchanged to
            ``sklearn.svm.SVC``.
    """

    def __init__(self, C=1.0, kernel="linear", degree=3, gamma="scale", tol=1e-3):
        super().__init__()
        self.C = C
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.tol = tol

        self.model = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, tol=tol)
        self.scaler_x = StandardScaler()

    def fit(self, X, y):
        """Fit the scaler on ``X`` then the SVC on the scaled features.

        ``y`` is flattened to a 1-D array: the previous ``y.reshape(-1, 1)``
        handed a column vector to ``SVC.fit`` and required ``y`` to be an
        object exposing ``reshape``.
        """
        targets = np.asarray(y).ravel()
        self.model.fit(self.scaler_x.fit_transform(X), targets)

    def predict(self, X):
        """Return the predicted classes as a column vector of shape (n, 1)."""
        return self.model.predict(self.scaler_x.transform(X)).reshape(-1, 1)
26 |
--------------------------------------------------------------------------------
/python/model/xgboost.py:
--------------------------------------------------------------------------------
1 | import xgboost as xgb
2 | import numpy as np
3 |
4 | from model.base_model import BaseModel
5 |
6 |
class XGBoost(BaseModel):
    """Thin wrapper around ``xgboost.train`` exposing fit / predict / save.

    Args:
        params: parameter dictionary forwarded as-is to ``xgb.train``.
        num_rounds: number of boosting rounds forwarded to ``xgb.train``.
    """

    def __init__(self, params, num_rounds=10):
        super().__init__()
        self.params = params
        self.num_rounds = num_rounds
        # Set by ``fit``; initialised here so that the attribute always exists.
        self.model = None

    def _check_is_fitted(self):
        """Raise a clear error when the booster has not been trained yet."""
        if self.model is None:
            # AttributeError is kept on purpose: it is the exception type the
            # previous implementation raised (missing ``model`` attribute).
            raise AttributeError(
                "XGBoost model is not fitted yet: call fit(X, y) first."
            )

    def fit(self, X, y, validation_data=None):
        """Train a booster on ``X`` / ``y`` and return it.

        Args:
            X: training features.
            y: training labels.
            validation_data: optional ``(X_val, y_val)`` pair. When given, the
                training and validation sets are both passed as ``evals``
                under the names ``"train"`` and ``"eval"``.
        """
        train_data = xgb.DMatrix(X, label=y)
        evallist = []
        if validation_data is not None:
            eval_data = xgb.DMatrix(validation_data[0], label=validation_data[1])
            evallist = [(train_data, "train"), (eval_data, "eval")]
        self.model = xgb.train(self.params, train_data, self.num_rounds, evals=evallist)
        return self.model

    def predict(self, X):
        """Return the booster outputs for ``X`` rounded to the nearest integer."""
        self._check_is_fitted()
        return np.round(self.model.predict(xgb.DMatrix(X)), 0)

    def save(self, path):
        """Save the trained booster to ``path`` using ``save_model``."""
        self._check_is_fitted()
        self.model.save_model(path)
30 |
--------------------------------------------------------------------------------
/.github/workflows/black_action.yml:
--------------------------------------------------------------------------------
1 | name: black-action
2 | on: [pull_request]
3 | jobs:
4 | linter_name:
5 | name: runner / black
6 | runs-on: ubuntu-latest
7 | steps:
8 | - uses: actions/checkout@v2
9 | - name: Check files using the black formatter
10 | uses: rickstaa/action-black@v1
11 | id: action_black
12 | with:
13 | black_args: "."
14 | - name: Create Pull Request
15 | if: steps.action_black.outputs.is_formatted == 'true'
16 | uses: peter-evans/create-pull-request@v3
17 | with:
18 | token: ${{ secrets.GITHUB_TOKEN }}
19 | title: "Format Python code with psf/black push"
20 | commit-message: ":art: Format Python code with psf/black"
21 | body: |
22 | There appear to be some python formatting errors in ${{ github.sha }}. This pull request
23 | uses the [psf/black](https://github.com/psf/black) formatter to fix these issues.
24 | base: ${{ github.head_ref }} # Creates pull request onto pull request or commit branch
25 | branch: actions/black
26 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 VincentAuriau
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/examples/history_modeling/first_example.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import os, sys
3 |
4 | sys.path.append("../../python")
5 | sys.path.append("../../")
6 |
7 | import matplotlib.pyplot as plt
8 | from matplotlib.patches import Rectangle
9 | import numpy as np
10 | import pandas as pd
11 |
12 | from data.data_loader import matches_data_loader
13 | from history_modeling.match_representation import get_match_info, matches_info_norm
14 |
# Load the matches kept from 2023 on, with statistics and reversed rows.
data_df = matches_data_loader(
    path_to_data="../../submodules/tennis_atp",
    path_to_cache="../../cache",
    flush_cache=False,
    keep_values_from_year=2023,
    get_match_statistics=True,
    get_reversed_match_data=True,
)

data_df = data_df.loc[data_df.ID_1 == 105173]  # Adrian Mannarino
print(f"Adrian Mannarino has played {len(data_df)} matches in 2023 in our database")

# Number of rows used as history; the row right after them gives the
# reference date used for the normalisation.
history_length = 10
if len(data_df) <= history_length:
    # Previously this situation surfaced as a bare IndexError raised by iloc.
    raise ValueError(
        f"At least {history_length + 1} matches are needed to build the example, "
        f"only {len(data_df)} found."
    )

ten_matches_history = pd.concat(
    [get_match_info(data_df.iloc[i]) for i in range(history_length)], axis=0
)
ten_matches_history.reset_index(inplace=True, drop=True)
match_info = matches_info_norm(
    ten_matches_history, data_df.iloc[history_length]["tournament_date"]
)

# Display the normalised history as an image (one row per match).
print(match_info.columns)
plt.figure()
plt.imshow(match_info.values)
plt.show()
37 |
--------------------------------------------------------------------------------
/notes.txt:
--------------------------------------------------------------------------------
1 | Features to add:
2 | - Ranking over time: ranking last months
3 | - Ranking should come from ranking files and not match files
4 | - Last x (5 currently) matches could be an argument (5, 10, 15, etc...)
5 | - % Victory against players of same rank as adv
6 |
7 |
8 | Improvements:
9 |
10 | - encoding last x matches [v, v, v, d, d] => [1, 1, 1, 0, 0]
11 | - encoding last x matches versus
12 | - add non main atp matches
13 |
14 | Done:
15 | - Using nb of games played instead of nb of sets ?
16 | - use match elapsed minutes to model fatigue
17 | - player.versus should not be given as a whole from match but only versus against adv ?
18 | - cut cache in several files for dynamic loading with different parameters
19 | - Fix columns names
20 | - Remove all persons from versus
21 | - add unique match ID to link to original matches data
22 | - add concatenation of players stats at match time, result & match stats
23 | - age at match time not well handled
24 | # Either get it from match data (easier)
25 | # Or better handling birthdate vs match date
26 | - have a better history of match order & versus matches = {id: [[date, result], [date, result], ...]}
27 | - Remove double data
28 |
29 |
--------------------------------------------------------------------------------
/examples/results_reading/models_comparison.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | from matplotlib.patches import Rectangle
3 | import pandas as pd
4 |
df_results = pd.read_csv("../../results/20212022_chall/results.csv", sep=";")

print(df_results.head())

# One colour per model class. The palette is cycled with a modulo so that
# more than ten model classes no longer raise an IndexError.
palette = [
    "tab:blue",
    "tab:orange",
    "tab:green",
    "tab:red",
    "tab:purple",
    "tab:brown",
    "tab:pink",
    "tab:grey",
    "tab:olive",
    "tab:cyan",
]
models_color = {
    model: palette[i % len(palette)]
    for i, model in enumerate(df_results.model_class.unique())
}

# Draw one bar per result row (index below 200), whose height is the
# precision in percent.
fig, ax = plt.subplots()
for n_row, row in df_results.iterrows():
    if n_row < 200:
        rect = Rectangle(
            (n_row, 0),
            1,
            row["precision"] * 100,
            edgecolor=models_color[row["model_class"]],
            facecolor=models_color[row["model_class"]],
            label=row["model_class"],
        )
        ax.add_patch(rect)

ax.autoscale()
# Every bar carries its model label: keep a single legend entry per label.
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys(), loc=1)
plt.ylabel("Precision %")
plt.savefig("models_performances.png")
plt.show()
42 |
--------------------------------------------------------------------------------
/examples/models/dl_train_test.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 |
3 | sys.path.append("../../python")
4 | sys.path.append("../../../")
5 |
6 | from model.deep_model import SimpleFullyConnected
7 | from evaluation.train_test import train_test_evaluation
8 |
9 |
# Seasons given to train_test_evaluation for training and for testing.
train_years = [2018, 2019, 2020]
test_years = [2021, 2022]

# Feature names forwarded to train_test_evaluation.
match_features = ["tournament_surface", "tournament_level"]
player_features = [
    "Ranking",
    "Ranking_Points",
    "Height",
    "Victories_Percentage",
    "Clay_Victories_Percentage",
    "Grass_Victories_Percentage",
    "Carpet_Victories_Percentage",
    "Hard_Victories_Percentage",
    "Aces_Percentage",
]
additional_features = ["diff_rank", "v_perc_versus"]

# Train and evaluate a SimpleFullyConnected model; results are saved under
# ../../results/test.
test_score = train_test_evaluation(
    train_years=train_years,
    test_years=test_years,
    model_class=SimpleFullyConnected,
    model_params={
        # NOTE(review): 22 presumably has to match the number of encoded
        # input columns produced from the features above - confirm.
        "input_shape": 22,
        "hidden_units": (22, 44, 44, 22, 11, 4),
        "output_shape": 2,
        "last_activation": "softmax",
        "epochs": 100,
        "reduced_lr_epochs": 50,
        "loss": "categorical_crossentropy",
    },
    match_features=match_features,
    player_features=player_features,
    encoding_params={},
    additional_features=additional_features,
    save_path="../../results/test",
    save_all_results=True,
)
46 |
--------------------------------------------------------------------------------
/examples/models/prediction.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 |
3 | sys.path.append("../../python")
4 | sys.path.append("../../../")
5 |
6 | import numpy as np
7 |
8 | from data.data_loader import matches_data_loader
9 | from model.dumb_models import RandomModel, BestRankedPlayerWins
10 |
# Load the matches kept from 2021 on (the cache is rebuilt: flush_cache=True).
data_df = matches_data_loader(
    path_to_data="../../submodules/tennis_atp",
    path_to_cache="../../cache",
    flush_cache=True,
    keep_values_from_year=2021,
    get_match_statistics=False,
)

random_model = RandomModel()
best_player_model = BestRankedPlayerWins()

ground_truths = data_df["Winner"].to_numpy()

# One random draw per match.
random_predictions = np.array(
    [random_model.predict(row) for _, row in data_df.iterrows()]
).reshape(-1)

# BestRankedPlayerWins.predict iterates with ``iterrows``: it must receive the
# whole table, not a single row (a Series has no ``iterrows`` method).
best_player_predictions = np.array(best_player_model.predict(data_df)).reshape(-1)

print("Among the", len(ground_truths), "matches analyzed, we have found:")

random_percentage = (
    np.sum(ground_truths == random_predictions) / len(random_predictions) * 100
)
print("Random Prediction Percentage:", np.round(random_percentage, 2), "%")
bp_percentage = (
    np.sum(ground_truths == best_player_predictions)
    / len(best_player_predictions)
    * 100
)
print("Best Ranked Player Prediction Percentage:", np.round(bp_percentage, 2), "%")
50 |
--------------------------------------------------------------------------------
/examples/history_modeling/history_encoding.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import os, sys
3 |
4 | sys.path.append("../../python")
5 | sys.path.append("../../")
6 |
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | import pandas as pd
10 |
11 | from data.data_loader import matches_data_loader
12 | from history_modeling.encoding_model import PCAMatchEncoder
13 |
14 | from data.data_encoding import create_encoded_history
15 |
# Load the matches kept from 2022 on, Davis Cup excluded.
data_df = matches_data_loader(
    path_to_data="../../submodules/tennis_atp",
    path_to_cache="../../cache",
    flush_cache=False,
    keep_values_from_year=2022,
    get_match_statistics=True,
    get_reversed_match_data=True,
    include_davis_cup=False,
)

print("Data Loaded")
columns = [
    "surface",
    "result",
    "adv_ranking",
    "adv_ranking_points",
    "num_won_sets",
    "num_lost_sets",
    "num_won_games",
    "num_lost_games",
    "num_tie_break_wons",
    "num_tie_break_lost",
]
model = PCAMatchEncoder(num_pca_features=2, columns=columns)
model.fit(data_df, transform_data=True)

print("Model Fitted, now predicting")
X_r, match_info = model.predict(data_df, transform_data=True)

history_df = create_encoded_history(data_df, model, 5)

cols = ["history_1", "history_2"]


def _flatten_history(column_name):
    """Return the given history column as a 2-D array, one row per match."""
    values = np.array(history_df[column_name].values.tolist())
    return values.reshape((len(history_df), -1))


flat_history_1 = _flatten_history("history_1")
print(pd.DataFrame(flat_history_1.tolist()))
print(flat_history_1.shape)

# Each history column is flattened from its OWN values. The previous version
# always read "history_1", so the "history_2*" columns were copies of it.
flatten_data = pd.concat(
    [pd.DataFrame(_flatten_history(x).tolist()).add_prefix(x) for x in cols],
    axis=1,
)
flatten_data.to_csv("flatten_data.csv", sep=";", index=False)
encoded_data = pd.concat([flatten_data, history_df.drop(cols, axis=1)], axis=1)
history_df = pd.merge(data_df, encoded_data, on=["id", "ID_1", "ID_2"])
history_df = history_df.loc[history_df.tournament_year == 2023]
history_df.to_csv("history_df.csv", sep=";", index=False)
76 |
--------------------------------------------------------------------------------
/examples/history_modeling/pca_representation.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import os, sys
3 |
4 | sys.path.append("../../python")
5 | sys.path.append("../../")
6 |
7 | import matplotlib.pyplot as plt
8 | import pandas as pd
9 | from sklearn.decomposition import PCA
10 |
11 | from data.data_loader import matches_data_loader
12 | from history_modeling.match_representation import get_match_info, matches_info_norm
13 |
# Load the matches kept from 2023 on, with statistics and reversed rows.
data_df = matches_data_loader(
    path_to_data="../../submodules/tennis_atp",
    path_to_cache="../../cache",
    flush_cache=False,
    keep_values_from_year=2023,
    get_match_statistics=True,
    get_reversed_match_data=True,
)

# Build the per-match information of every loaded row, then normalise it.
all_matches_info = pd.concat(
    [get_match_info(data_df.iloc[position]) for position in range(len(data_df))],
    axis=0,
)
all_matches_info.reset_index(inplace=True, drop=True)
match_info = matches_info_norm(all_matches_info, "20230401")

match_info = match_info.dropna().reset_index(drop=True)

# Project the normalised matches on their two first principal components.
X = match_info.values
pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)

plt.figure(figsize=(20, 12))

# Panel 1: points split by match result.
plt.subplot(2, 4, 1)
for legend_label, result_value in (("Victories", 0), ("Defeats", 1)):
    rows = match_info.index[match_info.result == result_value].values
    plt.scatter(X_r[rows, 0], X_r[rows, 1], label=legend_label)
plt.legend()
plt.title("Result")

# Panel 2: points split by encoded surface value.
plt.subplot(2, 4, 2)
for legend_label, surface_value in (("Clay", 0.0), ("Hard", 2 / 3), ("Grass", 1.0)):
    rows = match_info.index[match_info.surface == surface_value].values
    plt.scatter(X_r[rows, 0], X_r[rows, 1], label=legend_label)
plt.legend()
plt.title("Surface")

# Panels 3 to 8: every point coloured by one column of match_info.
coloured_panels = (
    (3, "num_played_minutes", "played minutes"),
    (4, "adv_ranking", "Ranking Adversary"),
    (5, "num_won_sets", "Won sets Number"),
    (6, "num_lost_sets", "Lost set Number"),
    (7, "num_won_games", "Won games Number"),
    (8, "num_lost_games", "Lost games Number"),
)
for panel_position, column_name, panel_title in coloured_panels:
    plt.subplot(2, 4, panel_position)
    plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info[column_name])
    plt.title(panel_title)

plt.savefig("2d_pca_match_representation.png")
plt.show()
78 |
--------------------------------------------------------------------------------
/examples/history_modeling/pca_match_encoder_train.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import os, sys
3 |
4 | sys.path.append("../../python")
5 | sys.path.append("../../")
6 |
7 | import matplotlib.pyplot as plt
8 | import pandas as pd
9 | from sklearn.decomposition import PCA
10 |
11 | from data.data_loader import matches_data_loader
12 | from history_modeling.match_representation import (
13 | create_timeless_dataset,
14 | create_dataset,
15 | )
16 | from history_modeling.encoding_model import PCAMatchEncoder
17 |
# Load the matches kept from 2020 on, with statistics and reversed rows.
data_df = matches_data_loader(
    path_to_data="../../submodules/tennis_atp",
    path_to_cache="../../cache",
    flush_cache=False,
    keep_values_from_year=2020,
    get_match_statistics=True,
    get_reversed_match_data=True,
)

"""
match_info = create_timeless_dataset(data_df)
print(len(match_info))
match_info = match_info.dropna().reset_index(drop=True)
print(len(match_info))

X = match_info.values
pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)
"""
# Fit the PCA-based encoder, then encode the very same data.
model = PCAMatchEncoder(num_pca_features=2)
model.fit(data_df, transform_data=True)
X_r, match_info = model.predict(data_df, transform_data=True)

plt.figure(figsize=(20, 12))

# Panel 1: points split by match result.
plt.subplot(2, 4, 1)
for legend_label, result_value in (("Victories", 0), ("Defeats", 1)):
    rows = match_info.index[match_info.result == result_value].values
    plt.scatter(X_r[rows, 0], X_r[rows, 1], label=legend_label)
plt.legend()
plt.title("Result")

# Panel 2: points split by encoded surface value.
plt.subplot(2, 4, 2)
for legend_label, surface_value in (("Clay", 0.0), ("Hard", 2 / 3), ("Grass", 1.0)):
    rows = match_info.index[match_info.surface == surface_value].values
    plt.scatter(X_r[rows, 0], X_r[rows, 1], label=legend_label)
plt.legend()
plt.title("Surface")

# Panels 3 to 8: every point coloured by one column of match_info.
coloured_panels = (
    (3, "num_played_minutes", "played minutes"),
    (4, "adv_ranking", "Ranking Adversary"),
    (5, "num_won_sets", "Won sets Number"),
    (6, "num_lost_sets", "Lost set Number"),
    (7, "num_won_games", "Won games Number"),
    (8, "num_lost_games", "Lost games Number"),
)
for panel_position, column_name, panel_title in coloured_panels:
    plt.subplot(2, 4, panel_position)
    plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info[column_name])
    plt.title(panel_title)

plt.savefig("2d_pca_match_representation.png")
plt.show()
84 |
--------------------------------------------------------------------------------
/examples/history_modeling/train_test.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 |
3 | sys.path.append("../../python")
4 | sys.path.append("../../../")
5 |
6 | import matplotlib.pyplot as plt
7 | import numpy as np
8 | import pandas as pd
9 |
10 | from model.xgboost import XGBoost
11 | from history_modeling.encoding_model import PCAMatchEncoder
12 | from evaluation.train_test import train_test_evaluation
13 |
14 |
# Seasons given to train_test_evaluation for training and for testing.
train_years = [2018, 2019, 2020]
test_years = [2021, 2022]


match_features = ["tournament_surface", "tournament_level"]
player_features = [
    "Ranking",
    "Ranking_Points",
    "Height",
    "Victories_Percentage",
    "Clay_Victories_Percentage",
    "Grass_Victories_Percentage",
    "Carpet_Victories_Percentage",
    "Hard_Victories_Percentage",
    "Aces_Percentage",
]
additional_features = ["diff_rank", "v_perc_versus", "nb_match_versus"]

# Single reference configuration. It used to be assigned to ``xgb_hyperparams``
# and immediately overwritten by the grid below; it now has its own name so
# that it is no longer silently discarded. It is not used by this script.
single_xgb_hyperparams = {
    "params": {
        "eta": 0.3,
        "objective": "binary:logistic",
        "gamma": 10,
        "max_depth": 10,
        "min_child_weight": 8,
        "subsample": 1,
    }
}

# Full grid of configurations, in the same order as the original nested loops
# (eta outermost, subsample innermost).
# NOTE(review): assumes train_test_evaluation accepts a list of parameter
# dictionaries for model_params - confirm against its implementation.
xgb_hyperparams = [
    {
        "params": {
            "eta": eta,
            "objective": "binary:logistic",
            "gamma": gamma,
            "max_depth": max_depth,
            "min_child_weight": min_child_weight,
            "subsample": subsample,
        }
    }
    for eta in [0.1, 0.3, 0.6]
    for gamma in [0, 1, 10]
    for max_depth in [2, 4, 6, 8, 10]
    for min_child_weight in [1, 2, 8]
    for subsample in [0.4, 0.8, 1]
]

test_score = train_test_evaluation(
    # Use the variable defined above instead of a duplicated literal list.
    train_years=train_years,
    test_years=test_years,
    model_class=XGBoost,
    model_params=xgb_hyperparams,
    encoder_models=[
        (
            PCAMatchEncoder,
            {
                "num_pca_features": 2,
                "auto_transform": True,
                "columns": [
                    "surface",
                    "result",
                    "adv_ranking",
                    "adv_ranking_points",
                    "num_won_sets",
                    "num_lost_sets",
                    "num_won_games",
                    "num_lost_games",
                    "num_tie_break_wons",
                    "num_tie_break_lost",
                ],
            },
        )
    ],
    match_features=match_features,
    player_features=player_features,
    encoding_params={},
    additional_features=additional_features,
    save_path="../../results/history_encoding",
    save_all_results=True,
)
94 |
--------------------------------------------------------------------------------
/examples/models/train_test.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 |
3 | sys.path.append("../../python")
4 | sys.path.append("../../../")
5 |
6 | import matplotlib.pyplot as plt
7 | import numpy as np
8 | import pandas as pd
9 | from sklearn.ensemble import RandomForestClassifier
10 |
11 | from data.data_loader import matches_data_loader
12 | from data.data_loader import encode_data
13 |
# Load the matches table (cached under ../../cache when available).
data_df = matches_data_loader(
    path_to_data="../../submodules/tennis_atp",
    path_to_cache="../../cache",
    flush_cache=False,
    keep_values_from_year=2022,
    get_match_statistics=True,
    get_reversed_match_data=True,
)

# Match-level columns, followed by the per-player column stems; the "_1" /
# "_2" suffix selects which player the column describes.
columns_m = ["tournament_level", "round", "best_of", "Winner"]
player_columns = [
    "ID",
    "Ranking",
    "Ranking_Points",
    "Hand",
    "Height",
    "Versus",
    "Victories_Percentage",
    "Clay_Victories_Percentage",
    "Grass_Victories_Percentage",
    "Carpet_Victories_Percentage",
    "Hard_Victories_Percentage",
    "Aces_Percentage",
    "Doublefaults_Percentage",
    "First_Serve_Success_Percentage",
    "Winning_on_1st_Serve_Percentage",
    "Winning_on_2nd_Serve_Percentage",
    "Overall_Win_on_Serve_Percentage",
    "BreakPoint_Face_Percentage",
    "BreakPoint_Saved_Percentage",
    "Fatigue",
]
columns_1 = [f"{name}_1" for name in player_columns]
columns_2 = [f"{name}_2" for name in player_columns]

data_df = data_df[columns_m + columns_1 + columns_2]

print(data_df.head())
print(data_df.shape)

# Rows with any missing value are discarded before encoding.
data_df = data_df.dropna(axis=0)

fdf = encode_data(data_df)
# Written next to the loader cache (same directory as path_to_cache above).
fdf.to_csv("../../cache/test.csv")

fdf = fdf.drop(["ID_1", "Versus_1", "ID_2", "Versus_2"], axis=1)
fdf["diff_ranking"] = fdf["Ranking_2"] - fdf["Ranking_1"]

y = fdf.Winner

# The model below only sees the ranking difference.
fdf = fdf[["diff_ranking"]]
X = fdf.values

print(X)

model = RandomForestClassifier(n_estimators=1000, max_depth=None)
print("FIT")
print(X.shape, y.shape)
model.fit(X, y)

# Predictions are made on the very rows used for fitting, so the count
# printed below is a training-set figure, not a held-out score.
y_pred = model.predict(X)
print(len(y), np.sum(y == y_pred))
print(y_pred)
print(y)
print(np.sum(y_pred))

plt.figure()
plt.scatter(X, y)
plt.show()
# Disabled: plot the model response over a range of ranking differences.
# z = model.predict(np.expand_dims(list(range(-10000, 10001)), axis=1))
# plt.figure()
# plt.plot(list(range(-10000, 10001)), z)
# plt.show()
112 |
--------------------------------------------------------------------------------
/python/history_modeling/encoding_model.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 |
3 | import pandas as pd
4 | from sklearn.decomposition import PCA
5 |
6 | from history_modeling.match_representation import (
7 | create_timeless_dataset,
8 | get_match_info,
9 | )
10 |
11 |
class MatchEncoder:
    """Common base for encoders that turn match rows into feature vectors."""

    def __init__(self, num_match_differences):
        self.num_match_differences = num_match_differences

    @property
    @abstractmethod
    def output_shape(self):
        """Size of the encoder output; subclasses provide the value."""
        return None

    def select_data(self, X, columns=None):
        """Build the timeless match dataset from X and drop incomplete rows.

        :param X: pandas DataFrame of matches (anything else fails the assert)
        :param columns: optional column list forwarded to
            create_timeless_dataset; when None its own default is used
        :return: DataFrame without NaN rows and with a fresh 0..n-1 index
        """
        assert isinstance(X, pd.DataFrame)

        forwarded = {} if columns is None else {"columns": columns}
        timeless = create_timeless_dataset(X, **forwarded)
        return timeless.dropna().reset_index(drop=True)

    @abstractmethod
    def predict(self, match_row):
        """Encode a match; subclasses provide the implementation."""
        return None
34 |
35 |
class PCAMatchEncoder(MatchEncoder):
    """Match encoder that projects match descriptions with a scikit-learn PCA."""

    def __init__(
        self,
        num_pca_features,
        auto_transform=False,
        columns=[
            "surface",
            "result",
            "num_played_minutes",
            "adv_ranking",
            "adv_ranking_points",
            "num_won_sets",
            "num_lost_sets",
            "num_won_games",
            "num_lost_games",
            "num_tie_break_wons",
            "num_tie_break_lost",
        ],
    ):
        """
        :param num_pca_features: number of PCA components to keep
        :param auto_transform: when True, fit/predict always run select_data
            on their input first
        :param columns: match-description columns fed to the PCA
        """
        self.num_pca_features = num_pca_features
        self.auto_transform = auto_transform
        # Copy so that instances never share (and cannot corrupt) the default
        # list object declared in the signature or a list owned by the caller.
        self.columns = list(columns)

        self.model = self.instantiate_model()

    # NOTE(review): this is a plain method here, while MatchEncoder and
    # IdentityEncoder expose output_shape as a property; left unchanged
    # because callers may rely on calling it.
    def output_shape(self):
        """Return the number of PCA components."""
        return self.num_pca_features

    def instantiate_model(self):
        """Create the (unfitted) PCA model."""
        return PCA(n_components=self.num_pca_features)

    def fit(self, X, transform_data=False):
        """Fit the PCA on X.

        :param X: raw matches DataFrame when a transformation is requested,
            otherwise data that can be passed to PCA.fit directly
        :param transform_data: run select_data on X before fitting
        """
        if transform_data or self.auto_transform:
            X = self.select_data(X, columns=self.columns)
        self.model.fit(X)

    def predict(self, X, transform_data=False):
        """Project X with the fitted PCA.

        :return: ``(projected, selected_data)`` when the input was transformed
            with select_data, otherwise only the projected data
        """
        if transform_data or self.auto_transform:
            X = self.select_data(X, columns=self.columns)
            return self.model.transform(X), X
        return self.model.transform(X)

    def save_model(self):
        """Placeholder: persistence is not implemented."""
        pass
82 |
83 |
class IdentityEncoder(MatchEncoder):
    """Encoder without a model: it returns the (optionally selected) data as is."""

    def __init__(
        self,
        auto_transform=False,
        columns=[
            "surface",
            "result",
            "num_played_minutes",
            "adv_ranking",
            "adv_ranking_points",
            "num_won_sets",
            "num_lost_sets",
            "num_won_games",
            "num_lost_games",
            "num_tie_break_wons",
            "num_tie_break_lost",
        ],
    ):
        """
        :param auto_transform: when True, predict always runs select_data
        :param columns: match-description columns kept by select_data
        """
        # Copy so that instances never share (and cannot corrupt) the default
        # list object declared in the signature or a list owned by the caller.
        self.columns = list(columns)
        self.auto_transform = auto_transform

        self.model = self.instantiate_model()

    @property
    def output_shape(self):
        """Number of configured columns."""
        return len(self.columns)

    def instantiate_model(self):
        """There is no underlying model."""
        return None

    def fit(self, X, transform_data=False):
        """Nothing to fit; kept for interface parity with other encoders."""
        pass

    def predict(self, X, transform_data=False):
        """Return X, or its select_data view together with the original X.

        :return: ``(selected_data, X)`` when a transformation is requested,
            otherwise X unchanged
        """
        if transform_data or self.auto_transform:
            X_tr = self.select_data(X, columns=self.columns)
            return X_tr, X
        return X

    def save_model(self):
        """Placeholder: there is nothing to persist."""
        pass
126 |
127 |
class MatchesHistoryEncoder:
    """Holds the settings of an encoder working on a history of matches."""

    def __init__(self, match_encoder, num_matches, add_days_difference):
        (
            self.match_encoder,
            self.num_matches,
            self.add_days_difference,
        ) = (match_encoder, num_matches, add_days_difference)

    @abstractmethod
    def predict(self, match_row):
        """Encode the history attached to a match; subclasses implement it."""
        return None
137 |
--------------------------------------------------------------------------------
/examples/models/train_test_eval.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 |
3 | sys.path.append("../../python")
4 | sys.path.append("../../../")
5 |
6 | import matplotlib.pyplot as plt
7 | import numpy as np
8 | import pandas as pd
9 | from sklearn.ensemble import RandomForestClassifier
10 | from sklearn.ensemble import GradientBoostingClassifier
11 |
12 | from data.data_loader import matches_data_loader
13 | from data.data_loader import encode_data
14 | from evaluation.train_test import train_test_evaluation
15 |
16 |
train_years = [2020, 2021]
test_years = [2022, 2023]


# One entry per experiment, run in order:
# (model_class, model_params, match_features, player_features, additional_features)
experiments = [
    (
        RandomForestClassifier,
        {"n_estimators": 2000, "max_depth": None},
        [],
        ["Ranking"],
        ["diff_rank", "v_perc_versus"],
    ),
    (
        RandomForestClassifier,
        {"n_estimators": 2000, "max_depth": None},
        [],
        ["Ranking"],
        [],
    ),
    (
        RandomForestClassifier,
        {"n_estimators": 2000, "max_depth": None},
        [],
        [],
        ["diff_rank"],
    ),
    (
        RandomForestClassifier,
        {"n_estimators": 1, "max_depth": 1},
        [],
        [],
        ["diff_rank"],
    ),
    (
        GradientBoostingClassifier,
        {"n_estimators": 100, "learning_rate": 1.0, "max_depth": 1},
        [],
        [],
        ["diff_rank"],
    ),
    (
        GradientBoostingClassifier,
        {"n_estimators": 1000, "learning_rate": 0.1, "max_depth": 4},
        [],
        [],
        ["diff_rank"],
    ),
]

for (
    model_class,
    model_params,
    match_features,
    player_features,
    additional_features,
) in experiments:
    test_score = train_test_evaluation(
        train_years=train_years,
        test_years=test_years,
        model_class=model_class,
        model_params=model_params,
        match_features=match_features,
        player_features=player_features,
        encoding_params={},
        additional_features=additional_features,
    )

    print("Test Score", test_score)
139 |
--------------------------------------------------------------------------------
/examples/models/deep_history.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import matplotlib.pyplot as plt
5 |
6 | sys.path.append("../../python")
7 | import time
8 |
9 | import numpy as np
10 | import pandas as pd
11 |
12 | from data.data_loader import matches_data_loader
13 | from data.data_encoding import (
14 | encode_data,
15 | create_additional_features,
16 | clean_missing_data,
17 | create_encoded_history,
18 | complete_missing_data,
19 | )
20 | from history_modeling.encoding_model import IdentityEncoder
21 | from model.deep_model import ConvolutionalHistoryAndFullyConnected
22 |
23 |
absolute_path = os.path.dirname(os.path.abspath(__file__))
match_features = ["tournament_surface", "tournament_level", "round"]
player_features = [
    "Ranking",
    "Ranking_Points",
    "Height",
    "Victories_Percentage",
    "Clay_Victories_Percentage",
    "Grass_Victories_Percentage",
    "Carpet_Victories_Percentage",
    "Hard_Victories_Percentage",
    "Aces_Percentage",
]
additional_features = ["diff_rank", "v_perc_versus", "nb_match_versus"]
encoding_params = {}

# Shape of the history tensor given to the model: number of encoded past
# matches requested from create_encoded_history, and number of values per
# step once the history columns are reshaped.
history_length = 5
num_history_signals = 22

data_df = matches_data_loader(
    path_to_data=os.path.join(absolute_path, "../../submodules/tennis_atp"),
    path_to_cache=os.path.join(absolute_path, "../../cache"),
    flush_cache=False,
    keep_values_from_year=2022,
    get_match_statistics=True,
    get_reversed_match_data=True,
    include_davis_cup=False,
)
print("[+] Data Loaded, Now Encoding Data and create additional Features")
print(data_df.head())
print(data_df.columns)

# data_df = pd.concat([data_df.iloc[:1000], data_df.iloc[-1000:]])

encoder_models = [(IdentityEncoder, {})]
for encoding_model, encoding_model_params in encoder_models:
    print(f"[+] Training Encoder Model {encoding_model}")
    encoder = encoding_model(**encoding_model_params)
    encoder.fit(data_df)

    print(f"[+] Encoding using encoder {encoding_model}")
    encoded_data = create_encoded_history(
        data_df, encoder, num_matches=history_length, completing_value=0
    )

    cols = ["history_1", "history_2"]

    # Each history cell holds a nested array; flatten it into one numeric
    # column per value, prefixed with the name of the column it came from.
    flatten_data = pd.concat(
        [
            pd.DataFrame(
                np.array(encoded_data[x].values.tolist()).reshape(
                    (len(encoded_data), -1)
                )
            ).add_prefix(x)
            for x in cols
        ],
        axis=1,
    )
    encoded_data = pd.concat([flatten_data, encoded_data.drop(cols, axis=1)], axis=1)

    data_df = pd.merge(data_df, encoded_data, on=["id", "ID_1", "ID_2"])

train_data = data_df.loc[data_df.tournament_year.isin([2022])]
test_data = data_df.loc[data_df.tournament_year.isin([2023])]
# train_data = data_df.loc[data_df.tournament_year.isin([2019, 2020, 2021])]
# test_data = data_df.loc[data_df.tournament_year.isin([2022, 2023])]
train_data = create_additional_features(train_data, additional_features)
train_data = encode_data(train_data, **encoding_params)
test_data = create_additional_features(test_data, additional_features)
test_data = encode_data(test_data, **encoding_params)

p1_features = [feat + "_1" for feat in player_features]
p2_features = [feat + "_2" for feat in player_features]

# Model inputs. The target ("Winner") and the split key ("tournament_year")
# are deliberately NOT part of the features: feeding the label to the network
# would leak the answer it is supposed to predict.
feature_columns = match_features + p1_features + p2_features
train_features = train_data[feature_columns]
test_features = test_data[feature_columns]

# train_features = clean_missing_data(train_features)
# test_features = clean_missing_data(test_features)

print(data_df.head())
print(data_df.columns)

model = ConvolutionalHistoryAndFullyConnected(
    num_history_signals=num_history_signals,
    history_length=history_length,
    input_shape=len(feature_columns),
    hidden_units=(22, 44, 22, 11, 4),
    output_shape=2,
    last_activation="softmax",
    epochs=100,
    reduced_lr_epochs=50,
    loss="categorical_crossentropy",
)

print(model.summary())

print(data_df.head())

# Every flattened history column carries "history" in its name.
hist_cols = [col for col in data_df.columns if "history" in col]

print(len(train_data), len(hist_cols))

model.fit(
    train_features.values,
    train_data[hist_cols].values.reshape(
        (len(train_data), history_length, num_history_signals)
    ),
    train_data["Winner"].values,
)

y_pred = model.predict(
    test_features.values,
    test_data[hist_cols].values.reshape(
        (len(test_data), history_length, num_history_signals)
    ),
)


print(np.sum(y_pred == test_data["Winner"]))

plt.plot(y_pred)
plt.show()
153 |
--------------------------------------------------------------------------------
/examples/results_reading/best_model.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import matplotlib.pyplot as plt
4 | from matplotlib.patches import Rectangle
5 | import numpy as np
6 | import pandas as pd
7 |
df_results = pd.read_csv("../../results/20212022/results.csv", sep=";")

# Pick the evaluation run with the highest precision and load its
# per-match predictions.
best_row = df_results.iloc[df_results.precision.argmax()]
print(best_row)

eval_id = best_row["eval_ID"]
best_results = pd.read_csv(
    os.path.join("../../results/20212022", f"{eval_id}.csv"), sep=";"
)

# Scatter of rank difference vs. outcome; wrong predictions are shifted by
# +/- 0.1 on the y axis so they do not overlap the correct ones.
fig, ax = plt.subplots()
df_ww = best_results.loc[(best_results.Winner == 0) & (best_results.y_pred == 0)]
plt.scatter(df_ww.diff_rank, df_ww.Winner, c="tab:pink", label="Well Predicted")
df_wl = best_results.loc[(best_results.Winner == 0) & (best_results.y_pred == 1)]
plt.scatter(df_wl.diff_rank, df_wl.Winner + 0.1, c="tab:blue", label="Predicted Wrong")
df_ll = best_results.loc[(best_results.Winner == 1) & (best_results.y_pred == 1)]
plt.scatter(df_ll.diff_rank, df_ll.Winner, c="tab:orange", label="Well Predicted")
df_lw = best_results.loc[(best_results.Winner == 1) & (best_results.y_pred == 0)]
plt.scatter(df_lw.diff_rank, df_lw.Winner - 0.1, c="tab:red", label="Predicted Wrong")
plt.legend()

plt.xlabel("Rank Player 0 - Rank Player 1")
plt.ylabel("Winner")
plt.show()

# Let's evaluate Symmetry
# Rows are compared two by two (2i, 2i+1); a pair is counted when the two
# predictions differ.
# NOTE(review): presumably consecutive rows are a match and its mirrored
# copy, so differing labels mean a consistent prediction -- confirm.
symmetric_same_results = 0
for i in range(int(len(best_results) / 2)):
    if best_results.iloc[2 * i]["y_pred"] != best_results.iloc[2 * i + 1]["y_pred"]:
        symmetric_same_results += 1
print(
    f"{(symmetric_same_results / (len(best_results) / 2))} Results are symmetrically predicted"
)

rank_categories = [1, 10, 50, 100, 300, 1000, 9999]

# prediction_percentage[a][b]: share of correct predictions for matches where
# player 1 is in rank bucket a and player 2 in rank bucket b (0 when empty).
prediction_percentage = []

for cat_1 in range(len(rank_categories) - 1):
    lines = []
    in_cat_1 = (best_results.Ranking_1 >= rank_categories[cat_1]) & (
        best_results.Ranking_1 < rank_categories[cat_1 + 1]
    )
    for cat_2 in range(len(rank_categories) - 1):
        in_cat_2 = (best_results.Ranking_2 >= rank_categories[cat_2]) & (
            best_results.Ranking_2 < rank_categories[cat_2 + 1]
        )
        sub_df = best_results.loc[in_cat_1 & in_cat_2]

        if len(sub_df) > 0:
            best_player_w_p = np.sum(
                sub_df.Winner.values == sub_df.y_pred.values
            ) / len(sub_df)
        else:
            best_player_w_p = 0
        lines.append(best_player_w_p)
    prediction_percentage.append(lines)

colors = ["purple", "blue", "cyan", "green", "yellow", "orange", "red"]
fig, ax = plt.subplots()

for i, val1 in enumerate(prediction_percentage):
    for j, val2 in enumerate(val1):
        # Map a share in [0, 1] onto the colour scale.
        color = colors[int(val2 * (len(colors) - 1))]
        rect = Rectangle((i, j), 1, 1, fc=color)
        ax.add_patch(rect)
        plt.text(i + 0.2, j + 0.35, np.round(val2 * 100, 2))

# Grid lines between buckets.
for i in range(len(rank_categories)):
    plt.plot([i, i], [0, len(rank_categories) - 1], c="k")
    plt.plot([0, len(rank_categories) - 1], [i, i], c="k")

plt.xticks(list(range(len(rank_categories))), labels=rank_categories)
plt.yticks(list(range(len(rank_categories))), labels=rank_categories)
plt.xlabel("Player 1 Rank Category")
plt.ylabel("Player 2 Rank Category")
plt.title("Precision Percentage")
plt.savefig("precision_percentage_players_ranks.png")
plt.show()

# Per-surface comparison against the "best ranked player wins" baseline run.
best_ranked_player_wins_results = pd.read_csv(
    os.path.join(
        "../../results/20212022",
        f"{df_results.loc[df_results.model_class=='BestRankedPlayerWins'].eval_ID.values[0]}.csv",
    ),
    sep=";",
)
ticks = []
fig, ax = plt.subplots()
for surface, surface_code in {"Clay": 0, "Carpet": 1, "Hard": 2, "Grass": 3}.items():
    model_rows = best_results.loc[best_results.tournament_surface == surface_code]
    brpw_rows = best_ranked_player_wins_results.loc[
        best_ranked_player_wins_results.tournament_surface == surface_code
    ]

    if len(model_rows) > 0:
        precision_model = len(
            model_rows.loc[model_rows.y_pred == model_rows.Winner]
        ) / len(model_rows)
    else:
        precision_model = 0

    # The baseline bar is only computed for surfaces the model was scored on,
    # and only when the baseline itself has rows (avoids a division by zero).
    if len(model_rows) > 0 and len(brpw_rows) > 0:
        prec_brpw = 0
        for n_row, row in brpw_rows.iterrows():
            # NOTE(review): y_pred is read at index 1, presumably because the
            # baseline stores it as text such as "[0]" -- confirm.
            if int(row["y_pred"][1]) == row["Winner"]:
                prec_brpw += 1
        precision_brpw = prec_brpw / len(brpw_rows)
    else:
        precision_brpw = 0

    rect = Rectangle(
        (surface_code * 2, 0),
        1,
        precision_model,
        edgecolor="k",
        facecolor="tab:blue",
        label="Model - XGBoost",
    )
    ax.add_patch(rect)
    rect = Rectangle(
        (surface_code * 2 + 1, 0),
        1,
        precision_brpw,
        edgecolor="k",
        facecolor="tab:pink",
        label="Best Ranked Player Wins",
    )
    ax.add_patch(rect)
    ticks.append(surface)

ax.autoscale()
plt.xticks([1, 3, 5, 7], labels=ticks)
# De-duplicate legend entries (each label is added once per surface).
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys(), loc=4)
plt.title("Win % for each surface")
plt.savefig("win_per_surface.png")
plt.show()
148 |
--------------------------------------------------------------------------------
/python/history_modeling/match_representation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from data.data_utils import get_days_difference
5 |
6 |
def get_match_info(row, verbose=0):
    """Summarise one match row as a single-row DataFrame.

    The score string is split on spaces into sets and each set on "-"; the
    games before the dash are credited to the row's player (won), the games
    after it to the opponent (lost). A set written with a parenthesis, e.g.
    "7-6(5)", is a tie-break set and the tie-break is attributed to whoever
    won that set.

    :param row: mapping / pandas Series with the keys "tournament_surface",
        "Winner", "score", "elapsed_minutes", "tournament_date", "Ranking_2"
        and "Ranking_Points_2"
    :param verbose: when equal to 2, unparsable set tokens other than
        "ABD", "RET" and "W/O" are printed
    :return: DataFrame with one row and the columns surface, result,
        num_played_minutes, date, adv_ranking, adv_ranking_points,
        num_won_sets, num_lost_sets, num_won_games, num_lost_games,
        num_tie_break_wons, num_tie_break_lost
    :raises KeyError: if one of the expected keys is missing from row
    """
    # add adversary age & hand ?
    surface = row["tournament_surface"]
    result = row["Winner"]
    score = row["score"]
    num_played_minutes = row["elapsed_minutes"]
    date = row["tournament_date"]

    adv_ranking = row["Ranking_2"]
    adv_ranking_points = row["Ranking_Points_2"]

    num_won_sets = 0
    num_lost_sets = 0
    num_won_games = 0
    num_lost_games = 0
    num_tie_break_wons = 0
    num_tie_break_lost = 0

    # A missing score (e.g. NaN) simply has no set to parse.
    set_scores = score.split(" ") if isinstance(score, str) else []

    for set_score in set_scores:
        try:
            games_0, games_1 = set_score.split("-")[:2]
            # Drop the tie-break detail "(n)" before converting to integers.
            games_0 = int(games_0.split("(")[0])
            games_1 = int(games_1.split("(")[0])
        except ValueError:
            # Not a "<games>-<games>" token: retirement/walk-over markers or
            # anything else that cannot be parsed is skipped.
            if verbose == 2 and set_score not in ["ABD", "RET", "W/O"]:
                print(set_score)
            continue

        is_tie_break = "(" in set_score

        if games_0 > games_1:
            num_won_sets += 1
            if is_tie_break:
                num_tie_break_wons += 1
        elif games_0 < games_1:
            num_lost_sets += 1
            if is_tie_break:
                num_tie_break_lost += 1

        num_won_games += games_0
        num_lost_games += games_1

    match_df = pd.DataFrame(
        {
            "surface": [surface],
            "result": [result],
            "num_played_minutes": [num_played_minutes],
            "date": [date],
            "adv_ranking": [adv_ranking],
            "adv_ranking_points": [adv_ranking_points],
            "num_won_sets": [num_won_sets],
            "num_lost_sets": [num_lost_sets],
            "num_won_games": [num_won_games],
            "num_lost_games": [num_lost_games],
            "num_tie_break_wons": [num_tie_break_wons],
            "num_tie_break_lost": [num_tie_break_lost],
        }
    )
    return match_df
77 |
78 |
def matches_info_norm(matches_info, current_date=""):
    """Return a normalised copy of a match-description DataFrame.

    Each column is rescaled with a fixed rule (see the table below); the
    input frame itself is left untouched.

    :param matches_info: DataFrame with the columns produced by get_match_info
    :param current_date: reference date handed to get_days_difference when
        turning the "date" column into a fraction of a year
    :return: normalised copy of matches_info
    """
    # Normalize values
    surface_codes = {"Clay": 0.0, "Carpet": 1 / 3, "Hard": 2 / 3, "Grass": 1.0}
    # nb sets won: max 3
    # nb sets lost: max 3
    # nb games won: max 100 (from experience - to be validated)
    # nb games lost: max 100 (from experience - to be validated)
    # nb tiebreaks won: max 100 (from experience - to be validated) -> not number of points but nb of tiebreaks ?
    # nb tiebreaks lost: max 100 (from experience - to be validated)
    # Ranking points max 16,950 from Djokovic's record -> 20,000
    # Ranking max 9,999
    # Num played minutes max 671 from Mahut/Isner's record -> 700
    # date: compute number of days since tournament date -> normalize by 365 -> if > 365 give up ?

    # (column, per-value rule), applied in this order.
    rules = [
        ("surface", lambda val: surface_codes[val]),
        ("num_won_sets", lambda val: val / 3),
        ("num_lost_sets", lambda val: val / 3),
        ("date", lambda val: get_days_difference(val, current_date) / 365),
        ("num_played_minutes", lambda val: val / 700),
        ("adv_ranking", lambda val: np.log(val) / np.log(9999)),
        ("adv_ranking_points", lambda val: val / 20000),
        ("num_won_games", lambda val: val / 100),
        ("num_lost_games", lambda val: val / 100),
        ("num_tie_break_wons", lambda val: val / 3),
        ("num_tie_break_lost", lambda val: val / 3),
    ]

    normalized = matches_info.copy()
    for column, rule in rules:
        normalized[column] = normalized[column].apply(rule)

    return normalized
132 |
133 |
def create_dataset(
    data_df, num_matches_difference=10, nb_kept_differences=10, randomize_indexes=False
):
    """
    Creates the match representation dataset.

    For every row of data_df, the rows located before it that share the same
    ID_1 are collected, restricted to the last num_matches_difference of
    them, described with get_match_info and normalised relative to the
    current row's tournament date.

    NOTE(review): "before" means earlier in data_df's row order; this assumes
    data_df is sorted chronologically -- confirm with the loader.

    :param data_df: matches DataFrame (needs ID_1, tournament_date and the
        columns read by get_match_info)
    :param num_matches_difference: size of the window of most recent previous
        matches considered for each row
    :param nb_kept_differences: maximum number of matches kept from that window
    :param randomize_indexes: shuffle the window before truncating it
    :return: concatenation of the normalised match descriptions; an empty
        DataFrame when no row has any previous match
    """
    dataset = []
    for i in range(len(data_df)):
        current_row = data_df.iloc[i]
        current_player = current_row.ID_1
        # Restrict to earlier rows first, then to the player: slicing after
        # the player filter would mix global and per-player positions and
        # could pull in matches located after the current row.
        earlier_rows = data_df.iloc[:i]
        sub_data_df = earlier_rows.loc[earlier_rows.ID_1 == current_player]
        if len(sub_data_df) == 0:
            continue
        sub_data_df = sub_data_df.reset_index(drop=True)

        # Negative positions select the tail (most recent rows) of the history.
        kept_indexes = list(range(-min(len(sub_data_df), num_matches_difference), 0))
        if randomize_indexes:
            kept_indexes = np.random.permutation(kept_indexes)
        kept_indexes = kept_indexes[:nb_kept_differences]
        sub_data_df = sub_data_df.iloc[kept_indexes]
        sub_data_df = sub_data_df.reset_index(drop=True)

        raw_matches_info = pd.concat(
            [get_match_info(sub_data_df.iloc[j]) for j in range(len(sub_data_df))],
            axis=0,
        )
        normalized_matches_info = matches_info_norm(
            raw_matches_info, current_date=current_row["tournament_date"]
        )
        dataset.append(normalized_matches_info)

    if not dataset:
        # pd.concat refuses an empty list; report "no history" explicitly.
        return pd.DataFrame()
    return pd.concat(dataset, axis=0)
171 |
172 |
def create_timeless_dataset(
    data_df,
    columns=[
        "surface",
        "result",
        "num_played_minutes",
        "adv_ranking",
        "adv_ranking_points",
        "num_won_sets",
        "num_lost_sets",
        "num_won_games",
        "num_lost_games",
        "num_tie_break_wons",
        "num_tie_break_lost",
    ],
):
    """Describe every row of data_df as a normalised match, without the date.

    :param data_df: matches DataFrame (needs tournament_date and the columns
        read by get_match_info)
    :param columns: columns of the normalised description to return
    :return: DataFrame with one row per match restricted to ``columns``
    """
    per_match_frames = [
        get_match_info(data_df.iloc[position]) for position in range(len(data_df))
    ]
    stacked = pd.concat(per_match_frames, axis=0)

    # The date of the last row is the reference used by the normalisation;
    # the resulting "date" column is dropped right after.
    reference_date = data_df["tournament_date"].values[-1]
    normalized = matches_info_norm(stacked, current_date=reference_date)
    without_date = normalized.drop(["date"], axis=1)
    return without_date[columns]
200 |
--------------------------------------------------------------------------------
/python/model/deep_model.py:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import StandardScaler
2 | import tensorflow as tf
3 |
4 | from model.base_model import DeepBaseModel
5 |
6 |
def create_dense_model(
    input_shape=2,
    output_shape=2,
    hidden_units=(4, 8, 4),
    hidden_activations="relu",
    last_activation="softmax",
):
    """Build a Keras model made of a stack of Dense layers.

    :param input_shape: shape passed to the Keras Input layer
    :param output_shape: number of units of the final Dense layer
    :param hidden_units: number of units of each hidden Dense layer, in order
    :param hidden_activations: activation applied after every hidden layer
    :param last_activation: activation applied after the final layer
    :return: uncompiled tf.keras.Model
    """
    # A single Activation layer instance is reused after every hidden layer.
    shared_activation = tf.keras.layers.Activation(hidden_activations)
    model_input = tf.keras.layers.Input(shape=input_shape)

    features = model_input
    for width in hidden_units:
        dense_out = tf.keras.layers.Dense(width)(features)
        features = shared_activation(dense_out)

    final_dense = tf.keras.layers.Dense(output_shape)(features)
    model_output = tf.keras.layers.Activation(last_activation)(final_dense)

    return tf.keras.Model(inputs=model_input, outputs=model_output)
26 |
27 |
class SimpleFullyConnected(DeepBaseModel):
    """Fully connected Keras classifier with standardised inputs."""

    def __init__(
        self,
        input_shape=2,
        output_shape=2,
        hidden_units=[4, 8, 4],
        hidden_activations="relu",
        last_activation="softmax",
        epochs=50,
        reduced_lr_epochs=10,
        optimizer="adamax",
        lr=1e-5,
        loss="cross_entropy",
    ):
        """
        :param input_shape: shape of one input sample
        :param output_shape: number of output units; 2 enables one-hot
            targets in fit and argmax decoding in predict
        :param hidden_units: units of each hidden Dense layer
        :param hidden_activations: activation of the hidden layers
        :param last_activation: activation of the output layer
        :param epochs: epochs of the first training phase
        :param reduced_lr_epochs: epochs of the second phase, run with lr / 10
            (skipped when not positive)
        :param optimizer: one of 'adam', 'adamax', 'sgd', 'rmsprop'
            (case-insensitive)
        :param lr: learning rate of the first training phase
        :param loss: loss passed to Keras ``compile``
        """
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.hidden_units = hidden_units
        self.hidden_activations = hidden_activations
        self.last_activation = last_activation
        self.epochs = epochs
        self.reduced_lr_epochs = reduced_lr_epochs
        self.optimizer = optimizer
        self.lr = lr
        self.loss = loss
        # NOTE(review): DeepBaseModel.__init__ presumably calls
        # instantiate_model -- confirm in model/base_model.py.
        super().__init__()

    def instantiate_model(self):
        """Create the input scaler, the network and compile it.

        :raises ValueError: if the configured optimizer name is not supported
        """
        self.scaler_x = StandardScaler()
        self.model = create_dense_model(
            input_shape=self.input_shape,
            output_shape=self.output_shape,
            hidden_units=self.hidden_units,
            hidden_activations=self.hidden_activations,
            last_activation=self.last_activation,
        )

        optimizer_classes = {
            "adamax": tf.keras.optimizers.Adamax,
            "rmsprop": tf.keras.optimizers.RMSprop,
            "sgd": tf.keras.optimizers.SGD,
            "adam": tf.keras.optimizers.Adam,
        }
        # Names are matched case-insensitively so that both the historical
        # spelling "Adam" and the documented 'adam' are accepted.
        optimizer_key = (
            self.optimizer.lower()
            if isinstance(self.optimizer, str)
            else self.optimizer
        )
        if optimizer_key not in optimizer_classes:
            raise ValueError(
                f"Optimizer {self.optimizer} not understood, must be among ['adam', 'adamax', 'sgd', 'rmsprop']"
            )
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.optimizer = optimizer_classes[optimizer_key](learning_rate=self.lr)

        self.model.compile(optimizer=self.optimizer, loss=self.loss)

    def fit(self, X, y):
        """Fit the scaler on X, then train the network in up to two phases.

        :param X: feature matrix
        :param y: targets; one-hot encoded with depth 2 when output_shape is 2
        """
        self.scaler_x.fit(X)
        if self.output_shape == 2:
            y = tf.one_hot(y.squeeze(), depth=2)
        self.model.fit(self.scaler_x.transform(X), y, epochs=self.epochs)
        if self.reduced_lr_epochs > 0:
            # Second phase with a ten times smaller learning rate.
            self.optimizer.learning_rate.assign(self.lr / 10)
            self.model.fit(self.scaler_x.transform(X), y, epochs=self.reduced_lr_epochs)

    def predict(self, X):
        """Predict on scaled X.

        :return: the argmax over the last axis when output_shape is 2,
            otherwise the raw network output
        """
        y_pred = self.model.predict(self.scaler_x.transform(X))
        if self.output_shape == 2:
            y_pred = tf.argmax(y_pred, axis=-1)
        return y_pred
93 |
94 |
def create_conv_dense_model(
    input_shape=2,
    history_input_shape=(5, 5),
    output_shape=2,
    hidden_units=(4, 8, 4),
    hidden_activations="relu",
    last_activation="softmax",
):
    """Build a two-input Keras model: Conv1D history encoder + Dense stack.

    The history input goes through three chained Conv1D layers and is
    flattened; the result is concatenated with the flat input and fed to the
    Dense layers.

    :param input_shape: shape of the flat (non-history) input
    :param history_input_shape: shape of the history input; the two
        unpadded kernel-size-3 convolutions shorten the first axis by 4, so
        it must be at least 5 long
    :param output_shape: number of units of the final Dense layer
    :param hidden_units: units of each hidden Dense layer, in order
    :param hidden_activations: activation applied after every hidden layer
    :param last_activation: activation applied after the final layer
    :return: uncompiled tf.keras.Model taking ``[history, flat_input]``
    """
    hid_activation = tf.keras.layers.Activation(hidden_activations)

    history_inputs = tf.keras.layers.Input(shape=history_input_shape)
    print(history_inputs.shape, history_input_shape)
    encoded_history = tf.keras.layers.Conv1D(filters=8, kernel_size=3, padding="same")(
        history_inputs
    )
    # Each convolution consumes the output of the previous one (the first
    # layer's output used to be discarded by re-reading history_inputs).
    encoded_history = tf.keras.layers.Conv1D(filters=4, kernel_size=3)(encoded_history)
    encoded_history = tf.keras.layers.Conv1D(filters=1, kernel_size=3)(encoded_history)
    encoded_history = tf.keras.layers.Flatten()(encoded_history)

    inputs = tf.keras.layers.Input(shape=input_shape)
    hidden_out = tf.keras.layers.Concatenate()([inputs, encoded_history])

    for n_cells in hidden_units:
        hidden_out = tf.keras.layers.Dense(n_cells)(hidden_out)
        hidden_out = hid_activation(hidden_out)

    out = tf.keras.layers.Dense(output_shape)(hidden_out)
    out = tf.keras.layers.Activation(last_activation)(out)

    return tf.keras.Model(inputs=[history_inputs, inputs], outputs=out)
125 |
126 |
class ConvolutionalHistoryAndFullyConnected(DeepBaseModel):
    """Model combining a convolutional history encoder with a dense network.

    It wraps the Keras model built by ``create_conv_dense_model`` together
    with a ``StandardScaler`` applied to the non-history features.
    """

    def __init__(
        self,
        num_history_signals=2,
        history_length=5,
        input_shape=2,
        output_shape=2,
        hidden_units=[4, 8, 4],
        hidden_activations="relu",
        last_activation="softmax",
        epochs=50,
        reduced_lr_epochs=10,
        optimizer="adamax",
        lr=1e-5,
        loss="cross_entropy",
    ):
        """Store the hyper-parameters and delegate to the base class.

        :param num_history_signals: size of the last axis of the history input.
        :param history_length: size of the first axis of the history input.
        :param input_shape: shape of the non-history input.
        :param output_shape: number of output units; 2 triggers one-hot targets
            in ``fit`` and an argmax in ``predict``.
        :param hidden_units: number of units of each hidden dense layer.
        :param hidden_activations: activation of the hidden layers.
        :param last_activation: activation of the output layer.
        :param epochs: number of epochs of the first training phase.
        :param reduced_lr_epochs: number of extra epochs run at ``lr / 10``
            (0 disables the second phase).
        :param optimizer: one of 'adam', 'adamax', 'sgd', 'rmsprop'
            (case-insensitive).
        :param lr: learning rate of the first training phase.
        :param loss: loss identifier forwarded to ``model.compile``.
            NOTE(review): confirm that "cross_entropy" is an identifier the
            installed Keras version accepts.
        """
        self.num_history_signals = num_history_signals
        self.history_length = history_length
        self.input_shape = input_shape
        self.output_shape = output_shape
        # Copy so that instances never share (or mutate) the default list.
        self.hidden_units = list(hidden_units)
        self.hidden_activations = hidden_activations
        self.last_activation = last_activation
        self.epochs = epochs
        self.reduced_lr_epochs = reduced_lr_epochs
        self.optimizer = optimizer
        self.lr = lr
        self.loss = loss
        # NOTE(review): the base class is presumably responsible for calling
        # instantiate_model(); it is not visible from this file section.
        super().__init__()

    def instantiate_model(self):
        """Create the scaler, the Keras model and the optimizer, then compile.

        :raises ValueError: if ``self.optimizer`` is not a supported name.
        """
        self.scaler_x = StandardScaler()
        self.model = create_conv_dense_model(
            history_input_shape=(self.history_length, self.num_history_signals),
            input_shape=self.input_shape,
            output_shape=self.output_shape,
            hidden_units=self.hidden_units,
            hidden_activations=self.hidden_activations,
            last_activation=self.last_activation,
        )

        optimizer_classes = {
            "adamax": tf.keras.optimizers.Adamax,
            "rmsprop": tf.keras.optimizers.RMSprop,
            "sgd": tf.keras.optimizers.SGD,
            "adam": tf.keras.optimizers.Adam,
        }
        # Case-insensitive lookup: the error message advertises 'adam' while
        # only the spelling "Adam" used to be accepted.
        optimizer_name = (
            self.optimizer.lower() if isinstance(self.optimizer, str) else None
        )
        if optimizer_name not in optimizer_classes:
            raise ValueError(
                f"Optimizer {self.optimizer} not understood, must be among ['adam', 'adamax', 'sgd', 'rmsprop']"
            )
        # `learning_rate` is the documented keyword; `lr` is a deprecated alias.
        self.optimizer = optimizer_classes[optimizer_name](learning_rate=self.lr)

        self.model.compile(optimizer=self.optimizer, loss=self.loss)

    def fit(self, X, X_history, y):
        """Fit the scaler on ``X`` and train the model in up to two phases.

        :param X: non-history features, scaled before being fed to the model.
        :param X_history: history input, passed to the model unscaled.
        :param y: targets; one-hot encoded on two classes when
            ``self.output_shape == 2``.
        """
        self.scaler_x.fit(X)
        if self.output_shape == 2:
            y = tf.one_hot(y.squeeze(), depth=2)

        print("X shape", X.shape)
        print("X history shape", X_history.shape)
        print("y shape", y.shape)

        # The scaled features are the same for both phases: compute them once.
        X_scaled = self.scaler_x.transform(X)
        self.model.fit([X_history, X_scaled], y, epochs=self.epochs)
        if self.reduced_lr_epochs > 0:
            # Second phase: fine-tune with a ten times smaller learning rate.
            self.optimizer.learning_rate.assign(self.lr / 10)
            self.model.fit(
                [X_history, X_scaled],
                y,
                epochs=self.reduced_lr_epochs,
            )

    def predict(self, X, X_history):
        """Predict from the scaled features and the history input.

        :return: the raw model output, or the argmax over the last axis when
            ``self.output_shape == 2``.
        """
        y_pred = self.model.predict([X_history, self.scaler_x.transform(X)])
        if self.output_shape == 2:
            y_pred = tf.argmax(y_pred, axis=-1)
        return y_pred

    def summary(self):
        """Return the result of the wrapped Keras model's ``summary()``."""
        return self.model.summary()
209 |
--------------------------------------------------------------------------------
/examples/models/grid_search.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 |
3 | sys.path.append("../../python")
4 | sys.path.append("../../../")
5 |
6 | import matplotlib.pyplot as plt
7 | import numpy as np
8 | import pandas as pd
9 | from sklearn.ensemble import (
10 | RandomForestClassifier,
11 | GradientBoostingClassifier,
12 | AdaBoostClassifier,
13 | )
14 | from model.dumb_models import BestRankedPlayerWins
15 | from model.lgbm import LightGBM
16 | from model.sk_model import ScalerSVC
17 | from model.xgboost import XGBoost
18 |
19 | from data.data_loader import matches_data_loader
20 | from data.data_loader import encode_data
21 | from evaluation.train_test import train_test_evaluation
22 |
23 |
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
train_years = [2018, 2019, 2020]
test_years = [2021, 2022]

match_features = ["tournament_surface", "tournament_level"]
player_features = [
    "Ranking",
    "Ranking_Points",
    "Height",
    "Victories_Percentage",
    "Clay_Victories_Percentage",
    "Grass_Victories_Percentage",
    "Carpet_Victories_Percentage",
    "Hard_Victories_Percentage",
    "Aces_Percentage",
]
additional_features = ["diff_rank", "v_perc_versus", "nb_match_versus"]


def _evaluate(model_class, model_params, fit_years):
    """Run one train/test evaluation with the feature set shared by all runs."""
    return train_test_evaluation(
        train_years=fit_years,
        test_years=test_years,
        model_class=model_class,
        model_params=model_params,
        match_features=match_features,
        player_features=player_features,
        encoding_params={},
        additional_features=additional_features,
        save_path="../../results/20212022_chall",
        save_all_results=False,
    )


def _lgbm_grid():
    """Build a fresh list of LightGBM parameter sets (num_leaves x min_data_in_leaf)."""
    return [
        {
            "params": {
                "num_leaves": num_leaves,
                "objective": "binary",
                "min_data_in_leaf": min_data_leaf,
            }
        }
        for num_leaves in [10, 100, 1000, 2000]
        for min_data_leaf in [10, 100, 1000]
    ]


def _xgb_grid():
    """Build a fresh list of XGBoost parameter sets (full cartesian grid)."""
    return [
        {
            "params": {
                "eta": eta,
                "objective": "binary:logistic",
                "gamma": gamma,
                "max_depth": max_depth,
                "min_child_weight": min_child_weight,
                "subsample": subsample,
            }
        }
        for eta in [0.1, 0.3, 0.6]
        for gamma in [0, 1, 10]
        for max_depth in [2, 4, 6, 8, 10]
        for min_child_weight in [1, 2, 8]
        for subsample in [0.4, 0.8, 1]
    ]


# ---------------------------------------------------------------------------
# Baseline: the best ranked player always wins
# ---------------------------------------------------------------------------
test_score = _evaluate(BestRankedPlayerWins, {}, train_years)

# ---------------------------------------------------------------------------
# LightGBM trained on the recent years only
# ---------------------------------------------------------------------------
lgbm_hyperparams = _lgbm_grid()
test_score = _evaluate(LightGBM, lgbm_hyperparams, train_years)

# ---------------------------------------------------------------------------
# AdaBoost
# ---------------------------------------------------------------------------
ada_hyperparams = [
    {
        "n_estimators": num_est,
        "learning_rate": lr,
    }
    for num_est in [10, 100, 1000, 2000]
    for lr in [0.1, 1.0, 2.0]
]
test_score = _evaluate(AdaBoostClassifier, ada_hyperparams, train_years)

# ---------------------------------------------------------------------------
# Support vector classifier
# ---------------------------------------------------------------------------
svc_hyperparams = [
    {
        "C": C,
        "kernel": kernel,
    }
    for C in [0.1, 1.0, 10.0, 100.0]
    for kernel in ["linear", "rbf"]
]
test_score = _evaluate(ScalerSVC, svc_hyperparams, train_years)

# ---------------------------------------------------------------------------
# Random forest, then gradient boosting: one evaluation per parameter set,
# printing the score after each run
# ---------------------------------------------------------------------------
for model_class in (RandomForestClassifier, GradientBoostingClassifier):
    for mx_depth in [1, 3, 5]:
        for n_est in [10, 100, 1000, 2000]:
            model_params = {"n_estimators": n_est, "max_depth": mx_depth}
            test_score = _evaluate(model_class, model_params, train_years)
            print("~~ Current Score ~~", test_score)

# ---------------------------------------------------------------------------
# LightGBM trained on the whole 1990-2020 history
# ---------------------------------------------------------------------------
lgbm_hyperparams = _lgbm_grid()
test_score = _evaluate(LightGBM, lgbm_hyperparams, list(range(1990, 2021)))

# ---------------------------------------------------------------------------
# XGBoost: recent years first, then the whole 1990-2020 history
# ---------------------------------------------------------------------------
xgb_hyperparams = _xgb_grid()
test_score = _evaluate(XGBoost, xgb_hyperparams, train_years)

xgb_hyperparams = _xgb_grid()
test_score = _evaluate(XGBoost, xgb_hyperparams, list(range(1990, 2021)))
256 |
--------------------------------------------------------------------------------
/python/data/data_encoding.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import numpy as np
3 | import pandas as pd
4 | import tqdm
5 |
6 | from history_modeling.match_representation import (
7 | create_timeless_dataset,
8 | get_match_info,
9 | )
10 |
11 |
def clean_missing_data(df):
    """
    Drop the rows that cannot be used: rows holding a missing value and rows
    where either player's ranking equals 0.

    :param df: DataFrame with at least the columns Ranking_1 and Ranking_2
    :return: the filtered DataFrame (the input is left untouched)
    """
    print("Length df before cleaning:", len(df))
    without_nan = df.dropna(axis=0)
    print("after dropna", len(without_nan))

    # A ranking of 0 is presumably a placeholder for "unknown" - TODO confirm.
    # (A similar filter on the value 9999 used to exist and is disabled.)
    ranked_first_player = without_nan.loc[without_nan.Ranking_1 != 0]
    return ranked_first_player.loc[ranked_first_player.Ranking_2 != 0]
27 |
28 |
def complete_missing_data(df, *args):
    """
    Fill the missing values of the given columns, modifying ``df`` in place.

    :param df: DataFrame to complete
    :param args: ``(column, value)`` pairs; missing entries of ``column`` are
        replaced by ``value``
    :return: the same DataFrame object, for chaining
    """
    for column, value in args:
        # Assign the filled column back: `df[column].fillna(..., inplace=True)`
        # acts on an intermediate object and does not update `df` when pandas
        # Copy-on-Write is enabled.
        df[column] = df[column].fillna(value)

    return df
34 |
35 |
def _categorical_encodings(mode):
    """Return the lookup tables (hand, round, level, surface) of an encoding mode."""
    if mode == "integer":
        return {
            "tournament_level": {"G": 0, "A": 1, "M": 2, "F": 3, "D": 4, "C": 5},
            "tournament_surface": {"Clay": 0, "Carpet": 1, "Hard": 2, "Grass": 3},
            "round": {
                "F": 0,
                "SF": 1,
                "QF": 2,
                "R16": 3,
                "R32": 4,
                "R64": 5,
                "R128": 6,
                "R256": 7,
                "RR": 8,
                "BR": 9,
                "ER": 10,
                "Q1": 11,
                "Q2": 12,
                "Q3": 13,
            },
            # "nan" is what str() yields for a missing hand; it shares the
            # code of "U".
            "hand": {
                "R": -1,
                "L": 1,
                "A": 0,
                "U": 2,
                "nan": 2,
            },
        }

    if mode == "one_hot":
        # NOTE(review): unlike "integer", this mode has no entry for level
        # "F", rounds "BR"/"ER"/"Q1"/"Q2"/"Q3" or a missing hand ("nan");
        # such values raise KeyError. Left unchanged because adding them would
        # alter the encoding width.
        return {
            "tournament_level": {
                "G": [0, 0, 0, 1, 0],
                "A": [0, 0, 1, 0, 0],
                "M": [0, 1, 0, 0, 0],
                "D": [1, 0, 0, 0, 0],
                "C": [0, 0, 0, 0, 1],
            },
            "tournament_surface": {
                "Clay": [1, 0, 0, 0],
                "Carpet": [0, 1, 0, 0],
                "Hard": [0, 0, 1, 0],
                "Grass": [0, 0, 0, 1],
            },
            "round": {
                "F": [0, 0, 0, 0, 0, 0, 0, 0, 1],
                "SF": [0, 0, 0, 0, 0, 0, 0, 1, 0],
                "QF": [0, 0, 0, 0, 0, 0, 1, 0, 0],
                "R16": [0, 0, 0, 0, 0, 1, 0, 0, 0],
                "R32": [0, 0, 0, 0, 1, 0, 0, 0, 0],
                "R64": [0, 0, 0, 1, 0, 0, 0, 0, 0],
                "R128": [0, 0, 1, 0, 0, 0, 0, 0, 0],
                "R256": [0, 1, 0, 0, 0, 0, 0, 0, 0],
                "RR": [1, 0, 0, 0, 0, 0, 0, 0, 0],
            },
            "hand": {
                "R": [1, 0, 0, 0],
                "L": [0, 1, 0, 0],
                "A": [0, 0, 1, 0],
                "U": [0, 0, 0, 1],
            },
        }

    if mode == "mixing":
        # Integer codes for level and round, one-hot vectors for surface and
        # hand. NOTE(review): rounds "ER"/"Q1"/"Q2"/"Q3" and a missing hand
        # ("nan") have no entry in this mode.
        return {
            "tournament_level": {"G": 0, "A": 1, "M": 2, "F": 3, "D": 4, "C": 5},
            "tournament_surface": {
                "Clay": [1, 0, 0, 0],
                "Carpet": [0, 1, 0, 0],
                "Hard": [0, 0, 1, 0],
                "Grass": [0, 0, 0, 1],
            },
            "round": {
                "F": 0,
                "SF": 1,
                "QF": 2,
                "R16": 3,
                "R32": 4,
                "R64": 5,
                "R128": 6,
                "R256": 7,
                "RR": 8,
                "BR": 9,
            },
            "hand": {
                "R": [1, 0, 0, 0],
                "L": [0, 1, 0, 0],
                "A": [0, 0, 1, 0],
                "U": [0, 0, 0, 1],
            },
        }

    raise ValueError(
        f"Encoding mode {mode} not understood, must be among ['integer', 'one_hot', 'mixing']"
    )


def encode_data(df, mode="integer"):
    """
    Encode the categorical columns of a matches DataFrame.

    Columns are selected by (case-insensitive) substring of their name, in
    this order of precedence: "hand", "round", "tournament_level",
    "tournament_surface". Every other column is left untouched.

    :param df: DataFrame to encode; it is not modified
    :param mode: "integer" (integer codes), "one_hot" (lists of 0/1) or
        "mixing" (integers for level/round, lists for surface/hand)
    :return: an encoded copy of df
    :raises ValueError: if mode is not one of the supported modes
    :raises KeyError: if a value has no entry in the table of the chosen mode
    """
    # Notes kept from the original implementation.
    # Remove:
    # - index
    # - Unnamed: 0
    # - Unnamed: 0.1
    # - tournament
    # - Name
    # - ID
    # - Birth Year => Age
    # - Versus: % V against 2, last 5 matches
    # - Matches

    # Refac:
    # - Versus
    # Best way to do it ?
    # - Birth Year
    # - Last Tournament => Days since last tournament + result ?

    # Validate the mode up front instead of failing later on an unbound name.
    encodings = _categorical_encodings(mode)

    df_copy = df.copy()
    for col in df_copy.columns:
        lowered = col.lower()
        if "hand" in lowered:
            # Hands are looked up by their string form so that a missing
            # value (NaN) becomes the key "nan".
            table, to_key = encodings["hand"], str
        elif "round" in lowered:
            table, to_key = encodings["round"], None
        elif "tournament_level" in lowered:
            table, to_key = encodings["tournament_level"], None
        elif "tournament_surface" in lowered:
            table, to_key = encodings["tournament_surface"], None
        else:
            continue

        # Per-value lookup on the column alone (no row-wise DataFrame.apply);
        # an unknown value still raises KeyError.
        if to_key is None:
            encoded = [table[value] for value in df_copy[col]]
        else:
            encoded = [table[to_key(value)] for value in df_copy[col]]
        # Wrapping in a Series keeps list-valued (one-hot) entries as one
        # object per row.
        df_copy[col] = pd.Series(encoded, index=df_copy.index)

    return df_copy
166 |
167 |
def create_additional_features(df, features):
    """
    Add the requested derived columns to a copy of df.

    Supported feature names:
    - "nb_match_versus": number of entries in the parsed Versus_1 value
    - "v_perc_versus": share of those entries whose first element is "V",
      or -1 when there is no entry
    - "diff_rank": Ranking_2 - Ranking_1
    - "diff_rank_points": Ranking_Points_2 - Ranking_Points_1

    :param df: matches DataFrame; it is not modified
    :param features: collection of feature names to create (unknown names are
        ignored)
    :return: a copy of df with the new columns
    """
    df = df.copy()

    if "nb_match_versus" in features or "v_perc_versus" in features:
        # Versus_1 holds the literal representation of a sequence whose items
        # start with the result marker. Parse each cell once and share the
        # result between both head-to-head features.
        versus_results = [
            [entry[0] for entry in ast.literal_eval(raw_versus)]
            for raw_versus in df["Versus_1"]
        ]

        if "nb_match_versus" in features:
            df["nb_match_versus"] = [len(results) for results in versus_results]

        if "v_perc_versus" in features:
            # -1 flags "no previous meeting" so that it cannot be mistaken
            # for a 0% victory rate.
            df["v_perc_versus"] = [
                results.count("V") / len(results) if len(results) > 0 else -1
                for results in versus_results
            ]

    # Plain column arithmetic: same values as a row-wise apply, and it also
    # works on an empty frame.
    if "diff_rank" in features:
        df["diff_rank"] = df["Ranking_2"] - df["Ranking_1"]

    if "diff_rank_points" in features:
        df["diff_rank_points"] = df["Ranking_Points_2"] - df["Ranking_Points_1"]

    return df
196 |
197 |
def _encode_player_history(
    df, encoder, raw_matches, player_id, num_matches, completing_value
):
    """Encode the last ``num_matches`` matches of one player, padded at the front.

    Returns ``(encoded_history, df_history)`` where ``df_history`` is the
    frame returned by the encoder, or the empty selection when no match of the
    history is present in ``df``.
    """
    try:
        recent_matches = ast.literal_eval(raw_matches)[-num_matches:]
    except Exception:
        # Keep the offending value on disk for inspection, then let the
        # original exception propagate.
        with open("error.txt", "w") as file:
            file.write(str(raw_matches))
        raise

    # The second element of each history entry is the match id.
    match_ids = [match[1] for match in recent_matches]
    # Keep only the rows of those matches seen from this player's side.
    df_history = df.loc[df.id.isin(match_ids) & (df.ID_1 == player_id)]

    if len(df_history) == 0:
        # Nothing to encode: the history is made of the completing value only.
        padding = np.ones((num_matches, encoder.output_shape)) * completing_value
        return padding, df_history

    encoded_history, df_history = encoder.predict(df_history, transform_data=True)

    missing_rows = num_matches - encoded_history.shape[0]
    if missing_rows > 0:
        # Pad before the encoded matches so that the real ones stay last.
        padding = np.ones((missing_rows, encoded_history.shape[1])) * completing_value
        encoded_history = np.concatenate([padding, encoded_history], axis=0)

    return encoded_history, df_history


def create_encoded_history(
    df, encoder, num_matches, completing_value=0, dump_debug_samples=True
):
    """
    Build, for every row of df, the encoded recent history of both players.

    For each row, the last ``num_matches`` entries of ``Matches_1`` and
    ``Matches_2`` (literal representations of sequences whose items hold a
    match id at position 1) are looked up in df, encoded with
    ``encoder.predict(..., transform_data=True)`` and padded at the front with
    ``completing_value`` up to ``num_matches`` rows.

    :param df: matches DataFrame with columns id, ID_1, ID_2, Matches_1 and
        Matches_2; it is not modified
    :param encoder: object exposing ``predict(df, transform_data=True)``
        returning ``(encoded_array, transformed_df)`` and an ``output_shape``
        attribute used as the width of an all-padding history
    :param num_matches: number of past matches kept per player
    :param completing_value: value used to pad missing matches
    :param dump_debug_samples: when True (historical behaviour), rows whose
        index label is below 100 overwrite row.csv, df_history.csv and
        encoded_history.npy in the working directory with player 2's data;
        pass False to disable these writes
    :return: DataFrame with columns id, ID_1, ID_2, history_1, history_2
    """
    df = df.copy()
    history = {
        "id": [],
        "ID_1": [],
        "ID_2": [],
        "history_1": [],
        "history_2": [],
    }

    for n_row, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        encoded_history_1, _ = _encode_player_history(
            df, encoder, row["Matches_1"], row.ID_1, num_matches, completing_value
        )
        encoded_history_2, df_history = _encode_player_history(
            df, encoder, row["Matches_2"], row.ID_2, num_matches, completing_value
        )

        history["id"].append(row.id)
        history["ID_1"].append(row.ID_1)
        history["ID_2"].append(row.ID_2)

        history["history_1"].append(encoded_history_1)
        history["history_2"].append(encoded_history_2)

        # Debugging aid inherited from the original code; see the docstring.
        if dump_debug_samples and n_row < 100 and len(df_history) > 0:
            row.to_csv("row.csv")
            df_history.to_csv("df_history.csv")
            np.save("encoded_history.npy", encoded_history_2)
    return pd.DataFrame(history)
288 |
--------------------------------------------------------------------------------
/python/data/match.py:
--------------------------------------------------------------------------------
1 | import ast
2 |
3 | import pandas as pd
4 | import numpy as np
5 |
6 |
class Match:
    """A tennis match between two players, with its match-time statistics.

    ``winner`` and ``loser`` are player objects. This class reads their ``id``
    attribute and calls their ``get_data_df``, ``get_last_months_rankings``
    and ``update_from_match`` methods.
    """

    # (output column, key in match_time_players_data) pairs, in output order.
    _STATISTICS_COLUMNS = (
        ("aces_nb", "aces_nb"),
        ("doublefaults_nb", "df_nb"),
        ("svpt", "w_svpt"),
        ("1stIn", "w_1stIn"),
        ("1stWon", "w_1stWon"),
        ("2ndWon", "w_2ndWon"),
        ("SvGms", "w_SvGms"),
        ("bpSaved", "w_bpSaved"),
        ("bpFaced", "w_bpFaced"),
    )

    def __init__(self, winner, loser, tournament, surface, id_prefix=""):
        """
        :param winner: player object of the winner
        :param loser: player object of the loser
        :param tournament: tournament name
        :param surface: tournament surface
        :param id_prefix: prefix of the match id built in
            ``instantiate_from_data_row``
        """
        self.winner = winner
        self.loser = loser
        self.tournament = tournament
        self.surface = surface
        self.id_prefix = id_prefix

        self.tournament_date = ""
        self.tournament_level = ""
        self.round = ""
        self.data = None
        # Placeholder statistics, replaced by instantiate_from_data_row().
        # The statistic keys carry a "w_" prefix for both players.
        self.match_time_players_data = {
            "winner": self._empty_player_data(self.winner),
            "loser": self._empty_player_data(self.loser),
        }

        self.sets_number = 0
        self.games_number = 0
        self.score = None
        self.elapsed_minutes = None
        self.best_of = None

    @staticmethod
    def _empty_player_data(player):
        """Return the zero-filled match-time record of a player."""
        return {
            "id": player,
            "age": 0,
            "rank": 0,
            "ranking_points": 0,
            "aces_nb": 0,
            "df_nb": 0,
            "w_svpt": 0,
            "w_1stIn": 0,
            "w_1stWon": 0,
            "w_2ndWon": 0,
            "w_SvGms": 0,
            "w_bpSaved": 0,
            "w_bpFaced": 0,
        }

    @staticmethod
    def _player_data_from_row(data_row, role, stat_prefix):
        """Read the match-time record of one player ("winner"/"w" or "loser"/"l")."""
        return {
            "id": data_row[f"{role}_id"],
            "age": data_row[f"{role}_age"],
            "rank": data_row[f"{role}_rank"],
            "ranking_points": data_row[f"{role}_rank_points"],
            "aces_nb": data_row[f"{stat_prefix}_ace"],
            "df_nb": data_row[f"{stat_prefix}_df"],
            "w_svpt": data_row[f"{stat_prefix}_svpt"],
            "w_1stIn": data_row[f"{stat_prefix}_1stIn"],
            "w_1stWon": data_row[f"{stat_prefix}_1stWon"],
            "w_2ndWon": data_row[f"{stat_prefix}_2ndWon"],
            "w_SvGms": data_row[f"{stat_prefix}_SvGms"],
            "w_bpSaved": data_row[f"{stat_prefix}_bpSaved"],
            "w_bpFaced": data_row[f"{stat_prefix}_bpFaced"],
        }

    def _player_stat(self, player_id, key):
        """Return statistic ``key`` of the winner if ``player_id`` is the winner's id, else of the loser."""
        role = "winner" if player_id == self.winner.id else "loser"
        return self.match_time_players_data[role][key]

    def get_rankings(self, player_id):
        """Return the ``(rank, ranking_points)`` tuple of the player at match time."""
        return (
            self._player_stat(player_id, "rank"),
            self._player_stat(player_id, "ranking_points"),
        )

    def get_aces_nb(self, player_id):
        """Return the number of aces of the player."""
        return self._player_stat(player_id, "aces_nb")

    def get_service_points_played(self, player_id):
        """Return the ``svpt`` statistic of the player."""
        return self._player_stat(player_id, "w_svpt")

    def get_df_nb(self, player_id):
        """Return the number of double faults of the player."""
        return self._player_stat(player_id, "df_nb")

    def get_first_serve_win(self, player_id):
        """Return the ``1stWon`` statistic of the player."""
        return self._player_stat(player_id, "w_1stWon")

    def get_second_serve_win(self, player_id):
        """Return the ``2ndWon`` statistic of the player."""
        return self._player_stat(player_id, "w_2ndWon")

    def get_first_services_in(self, player_id):
        """Return the ``1stIn`` statistic of the player."""
        return self._player_stat(player_id, "w_1stIn")

    def get_breakpoint_faced(self, player_id):
        """Return the ``bpFaced`` statistic of the player."""
        return self._player_stat(player_id, "w_bpFaced")

    def get_breakpoint_saved(self, player_id):
        """Return the ``bpSaved`` statistic of the player."""
        return self._player_stat(player_id, "w_bpSaved")

    def __str__(self):
        # f-string formatting also works when winner/loser are player objects
        # rather than plain strings (string concatenation raised TypeError).
        return f"TOURNAMENT : {self.tournament} W : {self.winner} L : {self.loser}"

    def get_prior_data_and_update_players_stats(self):
        """Collect the pre-match data of both players, then update them.

        The players' data are read BEFORE ``update_from_match`` is called on
        them, so the returned frames do not include this match.
        ``instantiate_from_data_row`` must have been called first (it sets
        ``self.id``).

        :return: tuple ``(match_data, winner_data, loser_data)``
        """
        match_data = pd.DataFrame(
            {
                "id": [self.id],
                "tournament": [self.tournament],
                "tournament_level": [self.tournament_level],
                "tournament_date": [self.tournament_date],
                "tournament_surface": [self.surface],
                "round": [self.round],
                "best_of": [self.best_of],
            }
        )

        w_data = self.winner.get_data_df(opponent=self.loser.id)
        lr, lrp = self.winner.get_last_months_rankings(
            date=self.tournament_date, nb_months=12, day_of_month="last"
        )
        w_data["last_rankings"] = [lr]
        w_data["last_ranking_points"] = [lrp]

        l_data = self.loser.get_data_df(opponent=self.winner.id)
        lr, lrp = self.loser.get_last_months_rankings(
            date=self.tournament_date, nb_months=12, day_of_month="last"
        )
        l_data["last_rankings"] = [lr]
        l_data["last_ranking_points"] = [lrp]

        # Only now let the players record this match.
        self.winner.update_from_match(self)
        self.loser.update_from_match(self)
        return match_data, w_data, l_data

    def _player_statistics(self, role):
        """Build the single-row statistics dict of "winner" or "loser"."""
        player_data = self.match_time_players_data[role]
        return {
            column: [player_data[key]] for column, key in self._STATISTICS_COLUMNS
        }

    def get_match_data_results_statistics(self):
        """Return the result statistics as three single-row DataFrames.

        :return: tuple ``(match_statistics, winner_statistics, loser_statistics)``
        """
        match_statistics = {
            "score": [self.score],
            "elapsed_minutes": [self.elapsed_minutes],
        }

        return (
            pd.DataFrame(match_statistics),
            pd.DataFrame(self._player_statistics("winner")),
            pd.DataFrame(self._player_statistics("loser")),
        )

    def instantiate_from_data_row(self, data_row):
        """Fill the match attributes from one row of the matches dataset.

        :param data_row: row (e.g. a pandas Series) indexed by the dataset
            column names; its ``name`` attribute is used to build the match id
        """
        self.tournament_date = data_row["tourney_date"]
        self.tournament_level = data_row["tourney_level"]
        self.round = data_row["round"]
        # NOTE(review): splitting on "-" yields one more piece than there are
        # "a-b" sets (e.g. 3 for "6-4 6-3"). Kept as is because downstream
        # code may rely on the current values - confirm the intent.
        self.sets_number = len(str(data_row["score"]).split("-"))

        # Total number of games played: sum both sides of every "a-b" token.
        # The previous code read only the first digit of each side and hit a
        # NameError (swallowed by a bare except) before adding the second
        # player's games.
        self.games_number = 0
        for set_score in str(data_row["score"]).split(" "):
            games_0, separator, games_1 = set_score.partition("-")
            # Drop a tie-break suffix such as "(5)" in "7-6(5)".
            games_1 = games_1.split("(")[0]
            # Tokens that are not plain set scores ("RET", "W/O", "nan", ...)
            # are ignored.
            if separator and games_0.isdigit() and games_1.isdigit():
                self.games_number += int(games_0) + int(games_1)

        self.score = data_row["score"]
        self.elapsed_minutes = data_row["minutes"]
        self.best_of = data_row["best_of"]

        self.id = self.id_prefix + "_" + str(data_row.name)

        self.match_time_players_data = {
            "winner": self._player_data_from_row(data_row, "winner", "w"),
            "loser": self._player_data_from_row(data_row, "loser", "l"),
        }
245 |
--------------------------------------------------------------------------------
/python/evaluation/train_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | import numpy as np
5 | import pandas as pd
6 |
7 | from data.data_loader import matches_data_loader
8 | from data.data_encoding import (
9 | encode_data,
10 | create_additional_features,
11 | clean_missing_data,
12 | create_encoded_history,
13 | )
14 |
# Directory containing this module; data and cache paths are built from it.
absolute_path = os.path.dirname(os.path.abspath(__file__))

# Match-level columns used when the caller does not choose its own.
default_columns_match = [
    "tournament_level",
    "round",
    "best_of",
    "tournament_surface",
]

# Player-level columns used by default, grouped by theme. The evaluation code
# suffixes each name with "_1" / "_2" to address the two players.
default_columns_player = (
    # identity, ranking and head-to-head
    ["Ranking", "Ranking_Points", "Hand", "Height", "Versus"]
    # victory rates, overall and per surface
    + [
        "Victories_Percentage",
        "Clay_Victories_Percentage",
        "Grass_Victories_Percentage",
        "Carpet_Victories_Percentage",
        "Hard_Victories_Percentage",
    ]
    # service statistics
    + [
        "Aces_Percentage",
        "Doublefaults_Percentage",
        "First_Serve_Success_Percentage",
        "Winning_on_1st_Serve_Percentage",
        "Winning_on_2nd_Serve_Percentage",
        "Overall_Win_on_Serve_Percentage",
    ]
    # break points and physical state
    + ["BreakPoint_Face_Percentage", "BreakPoint_Saved_Percentage", "Fatigue"]
)
39 |
40 |
41 | def train_test_evaluation(
42 | train_years,
43 | test_years,
44 | model_class,
45 | model_params,
46 | encoder_models=[],
47 | use_davis_data=False,
48 | history_encoder_years=1,
49 | match_features=default_columns_match,
50 | player_features=default_columns_player,
51 | encoding_params={},
52 | additional_features=[],
53 | save_path=None,
54 | save_all_results=False,
55 | ):
56 | global absolute_path
57 | assert len(set(train_years).intersection(set(test_years))) == 0
58 | print(f"[+] Beginning Train/Test Evaluation for model class {model_class}")
59 |
60 | min_year = np.min(train_years + test_years)
61 | min_year -= history_encoder_years
62 | print(f"[+] Loading Data from year {min_year}")
63 | data_df = matches_data_loader(
64 | path_to_data=os.path.join(absolute_path, "../../submodules/tennis_atp"),
65 | path_to_cache=os.path.join(absolute_path, "../../cache"),
66 | flush_cache=False,
67 | keep_values_from_year=min_year,
68 | get_match_statistics=False,
69 | get_reversed_match_data=True,
70 | include_davis_cup=use_davis_data,
71 | )
72 | print(f"[+] Data Loaded, Now Encoding Data and create additional Features")
73 |
74 | historic_data = data_df.loc[data_df.tournament_year < min(train_years)]
75 | train_data = data_df.loc[data_df.tournament_year.isin(train_years)]
76 | test_data = data_df.loc[data_df.tournament_year.isin(test_years)]
77 |
78 | history_columns = []
79 | for encoding_model, encoding_model_params in encoder_models:
80 | print(f"[+] Training Encoder Model {encoding_model}")
81 | encoder = encoding_model(**encoding_model_params)
82 | encoder.fit(train_data)
83 |
84 | print(f"[+] Encoding using encoder {encoding_model}")
85 | encoded_data = create_encoded_history(
86 | data_df, encoder, num_matches=5, completing_value=0
87 | )
88 |
89 | cols = ["history_1", "history_2"]
90 |
91 | flatten_data = pd.concat(
92 | [
93 | pd.DataFrame(
94 | np.array(encoded_data[x].values.tolist()).reshape(
95 | (len(encoded_data), -1)
96 | )
97 | ).add_prefix(x)
98 | for x in cols
99 | ],
100 | axis=1,
101 | )
102 | encoded_data = pd.concat(
103 | [flatten_data, encoded_data.drop(cols, axis=1)], axis=1
104 | )
105 | enc_columns = encoded_data.columns
106 | enc_columns = list(set(enc_columns) - set(["id", "ID_1", "ID_2"]))
107 | history_columns.extend(enc_columns)
108 |
109 | data_df = pd.merge(data_df, encoded_data, on=["id", "ID_1", "ID_2"])
110 |
111 | # train_data = pd.merge(train_data, encoded_data, on=["id", "ID_1", "ID_2"])
112 | # test_data = pd.merge(test_data, encoded_data, on=["id", "ID_1", "ID_2"])
113 |
114 | train_data = data_df.loc[data_df.tournament_year.isin(train_years)]
115 | test_data = data_df.loc[data_df.tournament_year.isin(test_years)]
116 | train_data = create_additional_features(train_data, additional_features)
117 | train_data = encode_data(train_data, **encoding_params)
118 | test_data = create_additional_features(test_data, additional_features)
119 | test_data = encode_data(test_data, **encoding_params)
120 |
121 | p1_features = [feat + "_1" for feat in player_features]
122 | p2_features = [feat + "_2" for feat in player_features]
123 | match_features = match_features.copy()
124 | match_features.extend(additional_features.copy())
125 |
126 | train_data = train_data[
127 | match_features
128 | + p1_features
129 | + p2_features
130 | + history_columns
131 | + ["Winner", "tournament_year"]
132 | ]
133 | test_data = test_data[
134 | match_features
135 | + p1_features
136 | + p2_features
137 | + history_columns
138 | + ["Winner", "tournament_year"]
139 | ]
140 |
141 | print(f"[+] Cleaning Data")
142 | train_data = clean_missing_data(train_data)
143 | test_data = clean_missing_data(test_data)
144 | print(f"Training on {len(train_data)} data and testing on {len(test_data)} data")
145 |
146 | print(f"[+] Data Ready, now beginning modelling")
147 | if isinstance(model_params, list):
148 | precisions = []
149 | for params_set in model_params:
150 | model = model_class(**params_set)
151 | t_fit = time.time()
152 | model.fit(
153 | train_data[match_features + p1_features + p2_features],
154 | train_data["Winner"].values.ravel(),
155 | )
156 | t_fit = time.time() - t_fit
157 | print(f"~~ Fit time: {np.round(t_fit, 0)}")
158 |
159 | preds = model.predict(test_data[match_features + p1_features + p2_features])
160 | precision = np.sum(np.squeeze(preds) == test_data["Winner"].values) / len(
161 | preds
162 | )
163 | precisions.append(precision)
164 |
165 | if save_path is not None:
166 | try:
167 | df_res = pd.read_csv(
168 | os.path.join(save_path, "results.csv"), sep=";"
169 | )
170 | except:
171 | print("save file not found")
172 | os.makedirs(save_path, exist_ok=True)
173 | df_res = pd.DataFrame()
174 |
175 | df_curr = pd.DataFrame(
176 | {
177 | "train_years": [train_years],
178 | "test_years": [test_years],
179 | "model_class": [model_class.__name__],
180 | "model_params": [params_set],
181 | "match_features": [match_features],
182 | "player_features": [player_features],
183 | "encoding_params": [encoding_params],
184 | "additional_features": [additional_features.copy()],
185 | "precision": [precision],
186 | "fit_time": [np.round(t_fit, 0)],
187 | }
188 | )
189 |
190 | if save_all_results:
191 | eval_id = int(time.time() * 100)
192 | df_curr["eval_ID"] = [eval_id]
193 | test_data["y_pred"] = preds
194 | test_data.to_csv(
195 | os.path.join(save_path, f"{eval_id}.csv"), index=False, sep=";"
196 | )
197 |
198 | df_res = pd.concat([df_res, df_curr], axis=0)
199 | df_res.to_csv(
200 | os.path.join(save_path, "results.csv"), index=False, sep=";"
201 | )
202 |
203 | return precisions
204 |
205 | else:
206 | model = model_class(**model_params)
207 | t_fit = time.time()
208 | model.fit(
209 | train_data[match_features + p1_features + p2_features],
210 | train_data["Winner"].values.ravel(),
211 | )
212 | t_fit = time.time() - t_fit
213 | print(f"~~ Fit time: {np.round(t_fit, 0)}")
214 |
215 | print(f"[+] Fit ended, now predicting on test set")
216 | preds = model.predict(test_data[match_features + p1_features + p2_features])
217 | precision = np.sum(np.squeeze(preds) == test_data["Winner"].values) / len(preds)
218 | if save_path is not None:
219 | try:
220 | df_res = pd.read_csv(os.path.join(save_path, "results.csv"), sep=";")
221 | except:
222 | print("save file not found")
223 | os.makedirs(save_path, exist_ok=True)
224 | df_res = pd.DataFrame()
225 |
226 | df_curr = pd.DataFrame(
227 | {
228 | "train_years": [train_years],
229 | "test_years": [test_years],
230 | "model_class": [model_class.__name__],
231 | "model_params": [model_params],
232 | "encoder_models": [encoder_models],
233 | "history_encoder_years": [history_encoder_years],
234 | "match_features": [match_features],
235 | "player_features": [player_features],
236 | "encoding_params": [encoding_params],
237 | "additional_features": [additional_features.copy()],
238 | "precision": [precision],
239 | "fit_time": [np.round(t_fit, 0)],
240 | }
241 | )
242 | if save_all_results:
243 | print(f"[+] Saving Results")
244 | eval_id = int(time.time())
245 | df_curr["eval_ID"] = [eval_id]
246 | test_data["y_pred"] = preds
247 | test_data.to_csv(
248 | os.path.join(save_path, f"{eval_id}.csv"), index=False, sep=";"
249 | )
250 |
251 | df_res = pd.concat([df_res, df_curr], axis=0)
252 | df_res.to_csv(os.path.join(save_path, "results.csv"), index=False, sep=";")
253 |
254 | return precision
255 |
--------------------------------------------------------------------------------
/examples/data/data_row_example.csv:
--------------------------------------------------------------------------------
1 | ,level_0,index,id,tournament,tournament_level,tournament_date,tournament_surface,round,best_of,match_id,Name_1,ID_1,Ranking_1,Ranking_Points_1,Ranking_History_1,Best_Rank_1,Birth_Year_1,Versus_1,Hand_1,Last_Tournament_Date_1,Height_1,Matches_1,Matches_Clay_1,Matches_Carpet_1,Matches_Grass_1,Matches_Hard_1,Victories_Percentage_1,Clay_Victories_Percentage_1,Carpet_Victories_Percentage_1,Grass_Victories_Percentage_1,Hard_Victories_Percentage_1,Aces_Percentage_1,Doublefaults_Percentage_1,First_Serve_Success_Percentage_1,Winning_on_1st_Serve_Percentage_1,Winning_on_2nd_Serve_Percentage_1,Overall_Win_on_Serve_Percentage_1,BreakPoint_Face_Percentage_1,BreakPoint_Saved_Percentage_1,games_fatigue_1,minutes_fatigue_1,last_rankings_1,last_ranking_points_1,Name_2,ID_2,Ranking_2,Ranking_Points_2,Ranking_History_2,Best_Rank_2,Birth_Year_2,Versus_2,Hand_2,Last_Tournament_Date_2,Height_2,Matches_2,Matches_Clay_2,Matches_Carpet_2,Matches_Grass_2,Matches_Hard_2,Victories_Percentage_2,Clay_Victories_Percentage_2,Carpet_Victories_Percentage_2,Grass_Victories_Percentage_2,Hard_Victories_Percentage_2,Aces_Percentage_2,Doublefaults_Percentage_2,First_Serve_Success_Percentage_2,Winning_on_1st_Serve_Percentage_2,Winning_on_2nd_Serve_Percentage_2,Overall_Win_on_Serve_Percentage_2,BreakPoint_Face_Percentage_2,BreakPoint_Saved_Percentage_2,games_fatigue_2,minutes_fatigue_2,last_rankings_2,last_ranking_points_2,Winner,score,elapsed_minutes,aces_nb_1,doublefaults_nb_1,svpt_1,1stIn_1,1stWon_1,2ndWon_1,SvGms_1,bpSaved_1,bpFaced_1,aces_nb_2,doublefaults_nb_2,svpt_2,1stIn_2,1stWon_2,2ndWon_2,SvGms_2,bpSaved_2,bpFaced_2,tournament_year,Fatigue_1,Fatigue_2
2 | 10,25616,10854,atp_matches_qual_chall_2003_5427,San Benedetto CH,C,20030811,Clay,SF,3,atp_matches_qual_chall_2003_5427,Stan.Wawrinka,104527,284.0,114.0,"{20030616: [387, 68], 20030707: [363, 74], 20030714: [348, 79], 20030721: [303, 99], 20030811: [284, 114]}",284,19850328.0,[],R,20030721,183.0,"[['V', 'atp_matches_qual_chall_2003_3466'], ['D', 'atp_matches_qual_chall_2003_3481'], ['D', 'atp_matches_2003_4049'], ['V', 'atp_matches_2003_4315'], ['D', 'atp_matches_2003_4328'], ['V', 'atp_matches_2003_4773'], ['D', 'atp_matches_2003_4782'], ['V', 'atp_matches_qual_chall_2003_5408'], ['V', 'atp_matches_qual_chall_2003_5419'], ['V', 'atp_matches_qual_chall_2003_5424']]","['V', 'D', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'V']",[],[],[],60.0,60.0,0.0,0.0,0.0,3.418803418803419,4.273504273504273,64.95726495726495,54.98575498575499,15.669515669515668,70.65527065527066,11.396011396011396,7.6923076923076925,38.09090909090909,,"[303, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 387]","[99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68]",Martin.Vassallo Arguello,103506,125.0,296.0,"{19990201: [817, 13], 20000710: [398, 61], 20000731: [354, 75], 20000807: [377, 70], 20010625: [459, 48], 20010709: [405, 61], 20010813: [391, 68], 20010820: [374, 72], 20010827: [342, 88], 20010917: [291, 117], 20010924: [286, 122], 20011008: [238, 154], 20011015: [237, 157], 20011022: [211, 178], 20011112: [206, 181], 20011126: [198, 186], 20011203: [201, 186], 20011231: [202, 186], 20020318: [175, 220], 20020325: [175, 220], 20020401: [178, 213], 20020408: [173, 219], 20020422: [174, 219], 20020429: [176, 217], 20020506: [151, 265], 20020513: [140, 286], 20020527: [140, 285], 20020610: [135, 304], 20020617: [123, 328], 20020624: [123, 328], 20020701: [123, 328], 20020708: [125, 320], 20020715: [132, 311], 20020722: [129, 312], 20020819: [136, 304], 20020930: [165, 220], 20021007: [158, 232], 20030127: [204, 164], 20030210: [204, 164], 20030217: [203, 168], 20030224: [198, 172], 20030324: [197, 
177], 20030421: [195, 177], 20030428: [188, 188], 20030512: [255, 118], 20030526: [204, 167], 20030602: [204, 167], 20030609: [211, 163], 20030616: [230, 137], 20030623: [233, 137], 20030630: [233, 137], 20030707: [218, 157], 20030714: [181, 202], 20030721: [163, 232], 20030728: [157, 247], 20030804: [126, 296], 20030811: [125, 296]}",123.0,19800210.0,[],R,20030804.0,183.0,"[['V', 'atp_matches_qual_chall_1999_380'], ['D', 'atp_matches_qual_chall_1999_393'], ['V', 'atp_matches_qual_chall_2000_3972'], ['V', 'atp_matches_qual_chall_2000_3988'], ['D', 'atp_matches_qual_chall_2000_3996'], ['D', 'atp_matches_qual_chall_2000_4725'], ['D', 'atp_matches_qual_chall_2000_4758'], ['V', 'atp_matches_qual_chall_2001_3699'], ['V', 'atp_matches_qual_chall_2001_3712'], ['D', 'atp_matches_qual_chall_2001_3719'], ['V', 'atp_matches_qual_chall_2001_4080'], ['V', 'atp_matches_qual_chall_2001_4089'], ['D', 'atp_matches_qual_chall_2001_4093'], ['V', 'atp_matches_qual_chall_2001_5286'], ['D', 'atp_matches_qual_chall_2001_5295'], ['V', 'atp_matches_qual_chall_2001_5433'], ['V', 'atp_matches_qual_chall_2001_5446'], ['D', 'atp_matches_qual_chall_2001_5453'], ['V', 'atp_matches_qual_chall_2001_5805'], ['V', 'atp_matches_qual_chall_2001_5814'], ['V', 'atp_matches_qual_chall_2001_5818'], ['D', 'atp_matches_qual_chall_2001_5820'], ['V', 'atp_matches_qual_chall_2001_6263'], ['D', 'atp_matches_qual_chall_2001_6275'], ['V', 'atp_matches_qual_chall_2001_6452'], ['V', 'atp_matches_qual_chall_2001_6461'], ['V', 'atp_matches_qual_chall_2001_6466'], ['V', 'atp_matches_qual_chall_2001_6468'], ['D', 'atp_matches_qual_chall_2001_6469'], ['D', 'atp_matches_qual_chall_2001_6943'], ['V', 'atp_matches_qual_chall_2001_7080'], ['V', 'atp_matches_qual_chall_2001_7090'], ['V', 'atp_matches_qual_chall_2001_7095'], ['D', 'atp_matches_qual_chall_2001_7097'], ['V', 'atp_matches_qual_chall_2001_7140'], ['D', 'atp_matches_qual_chall_2001_7151'], ['V', 'atp_matches_qual_chall_2001_7658'], ['D', 
'atp_matches_qual_chall_2001_7673'], ['D', 'atp_matches_qual_chall_2001_7822'], ['D', 'atp_matches_qual_chall_2001_7886'], ['V', 'atp_matches_qual_chall_2002_70'], ['V', 'atp_matches_qual_chall_2002_82'], ['V', 'atp_matches_qual_chall_2002_88'], ['V', 'atp_matches_qual_chall_2002_91'], ['D', 'atp_matches_qual_chall_2002_92'], ['D', 'atp_matches_qual_chall_2002_1710'], ['D', 'atp_matches_qual_chall_2002_1773'], ['V', 'atp_matches_qual_chall_2002_1840'], ['D', 'atp_matches_qual_chall_2002_1849'], ['D', 'atp_matches_qual_chall_2002_1983'], ['D', 'atp_matches_qual_chall_2002_2256'], ['V', 'atp_matches_qual_chall_2002_2326'], ['V', 'atp_matches_qual_chall_2002_2334'], ['V', 'atp_matches_qual_chall_2002_2338'], ['V', 'atp_matches_qual_chall_2002_2340'], ['V', 'atp_matches_qual_chall_2002_2341'], ['V', 'atp_matches_qual_chall_2002_2537'], ['V', 'atp_matches_qual_chall_2002_2548'], ['V', 'atp_matches_qual_chall_2002_2554'], ['D', 'atp_matches_qual_chall_2002_2557'], ['D', 'atp_matches_qual_chall_2002_2594'], ['D', 'atp_matches_2002_2922'], ['V', 'atp_matches_qual_chall_2002_3031'], ['V', 'atp_matches_qual_chall_2002_3055'], ['V', 'atp_matches_qual_chall_2002_3103'], ['V', 'atp_matches_qual_chall_2002_3384'], ['V', 'atp_matches_qual_chall_2002_3396'], ['V', 'atp_matches_qual_chall_2002_3402'], ['D', 'atp_matches_qual_chall_2002_3405'], ['D', 'atp_matches_qual_chall_2002_3475'], ['D', 'atp_matches_qual_chall_2002_3812'], ['V', 'atp_matches_qual_chall_2002_3874'], ['D', 'atp_matches_qual_chall_2002_3884'], ['D', 'atp_matches_2002_4022'], ['V', 'atp_matches_qual_chall_2002_4321'], ['D', 'atp_matches_qual_chall_2002_4333'], ['D', 'atp_matches_qual_chall_2002_4407'], ['D', 'atp_matches_qual_chall_2002_5341'], ['V', 'atp_matches_qual_chall_2002_6432'], ['V', 'atp_matches_qual_chall_2002_6447'], ['D', 'atp_matches_qual_chall_2002_6455'], ['D', 'atp_matches_qual_chall_2002_6528'], ['D', 'atp_matches_qual_chall_2003_527'], ['V', 'atp_matches_qual_chall_2003_859'], ['D', 
'atp_matches_qual_chall_2003_874'], ['V', 'atp_matches_qual_chall_2003_959'], ['D', 'atp_matches_qual_chall_2003_972'], ['V', 'atp_matches_qual_chall_2003_1047'], ['D', 'atp_matches_qual_chall_2003_1062'], ['V', 'atp_matches_qual_chall_2003_1650'], ['D', 'atp_matches_qual_chall_2003_1660'], ['V', 'atp_matches_qual_chall_2003_2123'], ['V', 'atp_matches_qual_chall_2003_2137'], ['D', 'atp_matches_qual_chall_2003_2144'], ['D', 'atp_matches_qual_chall_2003_2219'], ['V', 'atp_matches_qual_chall_2003_2620'], ['V', 'atp_matches_qual_chall_2003_2635'], ['V', 'atp_matches_qual_chall_2003_2642'], ['V', 'atp_matches_qual_chall_2003_2646'], ['V', 'atp_matches_qual_chall_2003_2648'], ['V', 'atp_matches_qual_chall_2003_2774'], ['V', 'atp_matches_qual_chall_2003_2787'], ['D', 'atp_matches_qual_chall_2003_2793'], ['D', 'atp_matches_qual_chall_2003_3000'], ['V', 'atp_matches_qual_chall_2003_3174'], ['D', 'atp_matches_qual_chall_2003_3184'], ['D', 'atp_matches_qual_chall_2003_3348'], ['D', 'atp_matches_qual_chall_2003_3467'], ['V', 'atp_matches_qual_chall_2003_3562'], ['V', 'atp_matches_qual_chall_2003_3577'], ['V', 'atp_matches_qual_chall_2003_3585'], ['D', 'atp_matches_qual_chall_2003_3589'], ['V', 'atp_matches_qual_chall_2003_4000'], ['D', 'atp_matches_qual_chall_2003_4009'], ['V', 'atp_matches_qual_chall_2003_4184'], ['V', 'atp_matches_qual_chall_2003_4194'], ['V', 'atp_matches_qual_chall_2003_4199'], ['V', 'atp_matches_qual_chall_2003_4201'], ['V', 'atp_matches_qual_chall_2003_4202'], ['V', 'atp_matches_qual_chall_2003_4491'], ['V', 'atp_matches_qual_chall_2003_4501'], ['V', 'atp_matches_qual_chall_2003_4506'], ['V', 'atp_matches_qual_chall_2003_4509'], ['D', 'atp_matches_qual_chall_2003_4510'], ['V', 'atp_matches_qual_chall_2003_4544'], ['V', 'atp_matches_qual_chall_2003_4559'], ['D', 'atp_matches_qual_chall_2003_4566'], ['V', 'atp_matches_qual_chall_2003_4853'], ['V', 'atp_matches_qual_chall_2003_4869'], ['V', 'atp_matches_qual_chall_2003_4877'], ['V', 
'atp_matches_qual_chall_2003_4881'], ['V', 'atp_matches_qual_chall_2003_4883'], ['D', 'atp_matches_qual_chall_2003_5283'], ['V', 'atp_matches_qual_chall_2003_5413'], ['V', 'atp_matches_qual_chall_2003_5421'], ['V', 'atp_matches_qual_chall_2003_5425']]","['V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D', 'D', 'D', 'D', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'D', 'D', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V']","['D', 'V', 'D', 'V', 'D']",['D'],"['D', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D']",61.02941176470589,63.559322033898304,40.0,0.0,50.0,4.824561403508771,5.263157894736842,61.40350877192983,46.49122807017544,18.859649122807017,65.35087719298245,9.649122807017545,5.701754385964912,40.44444444444444,,"[157, 136, 165, 158, 9999, 9999, 204, 198, 197, 188, 204, 233]","[247, 304, 220, 232, 0, 0, 164, 172, 177, 188, 167, 137]",0.0,2-6 7-5 7-5,,,,,,,,,,,,,,,,,,,,2003.0,,
3 |
--------------------------------------------------------------------------------
/examples/data/data_loading.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import os, sys
3 |
4 | sys.path.append("../../python")
5 | sys.path.append("../../")
6 |
7 | import matplotlib.pyplot as plt
8 | from matplotlib.patches import Rectangle
9 | import numpy as np
10 | import pandas as pd
11 |
12 | from data.data_loader import matches_data_loader
13 |
14 |
# Build the matches table that every figure below is computed from.
data_df = matches_data_loader(
    # Path to tennis_atp submodule, keep as is if the repo was cloned with its submodules
    path_to_data="../../submodules/tennis_atp",
    # Path to caching directory
    path_to_cache="../../cache",
    # Whether or not to flush a potentially existing cache.
    # Set to True if you want to create the data from scratch
    flush_cache=False,
    # Returned data will date back to January 2002 up to today
    keep_values_from_year=2002,
    # Whether to also return match statistics (time, score, etc...)
    get_match_statistics=True,
    # Whether to duplicate the match row and exchange winner and loser positions
    get_reversed_match_data=True,
    # Whether or not to include davis cup matches
    include_davis_cup=True,
    # Which matches to keep. You can look at tennis_atp submodule to see possibilities
    match_type=["main_atp", "qualifying_challengers"],
)

# Preview: first rows, then the (rows, columns) shape on the following line
print(data_df.head(), data_df.shape, sep="\n")
31 |
# Creation of first figure
# Win percentages considering the ranks of players

# Rank categories: two consecutive values delimit one half-open rank bucket [low, high)
categories = [1, 10, 50, 100, 300, 1000, 9999]

# Both grids are indexed [player 1 bucket][player 2 bucket]
best_ranked_player_win_percentage = []
categories_number_of_matches = []

for cat_1 in range(len(categories) - 1):
    lines = []
    nb_matches_lines = []
    for cat_2 in range(len(categories) - 1):
        # A single boolean mask built on data_df itself: no chained .loc calls and
        # no reliance on index alignment between a slice and a full-frame mask.
        in_buckets = (
            (data_df.Ranking_1 >= categories[cat_1])
            & (data_df.Ranking_1 < categories[cat_1 + 1])
            & (data_df.Ranking_2 >= categories[cat_2])
            & (data_df.Ranking_2 < categories[cat_2 + 1])
        )
        sub_df = data_df.loc[in_buckets]

        if len(sub_df) > 0:
            # Index of the best ranked player: 0 when player 1 has the lower (or an
            # equal) ranking value, 1 otherwise. This matches
            # np.argmin([Ranking_1, Ranking_2]) row by row, but is computed without
            # DataFrame.apply, which misbehaves on an empty bucket, and without
            # writing a new column into a slice of data_df.
            best_rank = (sub_df.Ranking_2 < sub_df.Ranking_1).astype(int)
            best_player_w_p = np.sum(
                sub_df.Winner.values == best_rank.values
            ) / len(sub_df)
        else:
            # No match recorded for this pair of buckets
            best_player_w_p = 0
        lines.append(best_player_w_p)
        # NOTE(review): presumably halved because get_reversed_match_data=True stores
        # every match twice (once per player order) - confirm against the loader.
        nb_matches_lines.append(len(sub_df) / 2)
    best_ranked_player_win_percentage.append(lines)
    categories_number_of_matches.append(nb_matches_lines)

# Sanity checks on rankings falling outside of every bucket
print(
    "Number of matches with player ranked 0:", len(data_df.loc[data_df.Ranking_1 == 0])
)
print(
    "Number of matches with player ranked > 9999:",
    len(data_df.loc[data_df.Ranking_1 > 9999]),
)
73 |
# Colour scale shared by the grid figures, from lowest ("purple") to highest ("red")
colors = ["purple", "blue", "cyan", "green", "yellow", "orange", "red"]
fig, ax = plt.subplots()

# One unit square per (player 1 category, player 2 category) cell, coloured and
# annotated with the win ratio of the best ranked player.
top_color = len(colors) - 1
for x_cell, column_ratios in enumerate(best_ranked_player_win_percentage):
    for y_cell, win_ratio in enumerate(column_ratios):
        cell = Rectangle(
            (x_cell, y_cell), 1, 1, fc=colors[int(win_ratio * top_color)]
        )
        ax.add_patch(cell)
        plt.text(x_cell + 0.2, y_cell + 0.35, np.round(win_ratio * 100, 2))

# Black grid lines on every category boundary, in both directions
tick_positions = list(range(len(categories)))
grid_span = [0, len(categories) - 1]
for position in tick_positions:
    plt.plot([position, position], grid_span, c="k")
    plt.plot(grid_span, [position, position], c="k")

# Ticks sit on the boundaries and are labelled with the category limits
plt.xticks(tick_positions, labels=categories)
plt.yticks(tick_positions, labels=categories)
plt.xlabel("Player 1 Rank Category")
plt.ylabel("Player 2 Rank Category")
plt.title("Best player Win percentage per Rank Category")
plt.savefig("Best_player_win_percentage.png")
plt.show()
95 |
# Second figure
# Number of matches considering players ranks
fig, ax = plt.subplots()

# Colours follow the square root of the count, relative to the busiest cell,
# so that sparsely populated cells remain distinguishable.
busiest_cell_root = np.max(categories_number_of_matches) ** 0.5
for x_cell, column_counts in enumerate(categories_number_of_matches):
    for y_cell, nb_matches in enumerate(column_counts):
        shade = int(nb_matches**0.5 / busiest_cell_root * (len(colors) - 1))
        cell = Rectangle((x_cell, y_cell), 1, 1, fc=colors[shade])
        ax.add_patch(cell)
        plt.text(x_cell + 0.2, y_cell + 0.35, int(nb_matches))

# Black grid lines on every category boundary, in both directions
tick_positions = list(range(len(categories)))
grid_span = [0, len(categories) - 1]
for position in tick_positions:
    plt.plot([position, position], grid_span, c="k")
    plt.plot(grid_span, [position, position], c="k")

# Ticks sit on the boundaries and are labelled with the category limits
plt.xticks(tick_positions, labels=categories)
plt.yticks(tick_positions, labels=categories)
plt.xlabel("Player 1 Rank Category")
plt.ylabel("Player 2 Rank Category")
plt.title("Number of matches recorded per Rank Category")
plt.savefig("nb_matches.png")
plt.show()
124 |
#### Stan the man
# Statistics analysis of Stan Wawrinka over time
# The lists below are filled by the loop over Stan's matches further down and are
# plotted afterwards.
overall_v = []  # win % over the whole match history known before each match
last_hundred_v = []  # win % over the (at most) 100 most recent entries of that history

# Win % per surface, read directly from the pre-computed columns of each row
overall_clay = []
overall_carpet = []
overall_grass = []
overall_hard = []

# Number of "V" entries found in the per-surface history columns of each row
wins_clay = []
wins_carpet = []
wins_grass = []
wins_hard = []

# (year as a 4-character string, row position) of the first row seen for each year;
# used later as x tick positions and labels
dates = []
# Rows in which player 1 has ID 104527 (Stan.Wawrinka in data_row_example.csv)
stan_df = data_df.loc[data_df.ID_1 == 104527]
# Fresh 0-based index so that n_row below is the position of the row in stan_df
stan_df = stan_df.reset_index()

# Dump the row at position 100 as a standalone example file
stan_df.iloc[100].to_csv("single_row_example.csv")

# NOTE(review): indentation was lost in the copy of the script this was reviewed
# from. The nesting below (only the two win-percentage appends are guarded by the
# non-empty check, everything else runs for every row) is a reconstruction -
# confirm against the original file.
for n_row, row in stan_df.iterrows():
    # Matches_1 holds [outcome, match_id] pairs, possibly stored as their string
    # representation; keep only the outcome letters
    matches = [r[0] for r in ast.literal_eval(str(row["Matches_1"]))]

    # Guard against a division by zero when there is no previous match
    if len(matches) > 0:
        overall_v.append(matches.count("V") / len(matches) * 100)
        last_hundred_v.append(matches[-100:].count("V") / len(matches[-100:]) * 100)

    # Remember the first row of every new year (first 4 characters of the date)
    if str(row["tournament_date"])[:4] not in [d[0] for d in dates]:
        dates.append((str(row["tournament_date"])[:4], n_row))
    overall_clay.append(row["Clay_Victories_Percentage_1"])
    overall_grass.append(row["Grass_Victories_Percentage_1"])
    overall_hard.append(row["Hard_Victories_Percentage_1"])
    overall_carpet.append(row["Carpet_Victories_Percentage_1"])

    # list(...) works on an actual list of outcome letters as well as on its string
    # representation (the count is then a count of "V" characters)
    wins_clay.append(list(row.Matches_Clay_1).count("V"))
    wins_carpet.append(list(row.Matches_Carpet_1).count("V"))
    wins_grass.append(list(row.Matches_Grass_1).count("V"))
    wins_hard.append(list(row.Matches_Hard_1).count("V"))
164 |
# % Victory over time and surfaces
plt.figure()
# Curves are drawn in this order so that the default colour cycle is assigned
# to them in the same sequence as before.
win_percentage_curves = [
    (overall_v, "overall"),
    (last_hundred_v, "last 100 matches"),
    (overall_clay, "overall clay"),
    (overall_grass, "overall grass"),
    (overall_hard, "overall hard"),
    (overall_carpet, "overall carpet"),
]
for curve_values, curve_label in win_percentage_curves:
    plt.plot(curve_values, label=curve_label)
plt.legend()

# One tick per year, placed at the row position of the first match of that year
year_positions = [year_entry[1] for year_entry in dates]
year_labels = [year_entry[0] for year_entry in dates]
plt.xticks(year_positions, year_labels, rotation="vertical")
plt.title("Stanislas Wawrinka win percentage on main ATP tournamnents")
plt.savefig("stan_the_man_win_percentage.png")
plt.show()
178 |
179 |
# Win percentages (left axis) on top of stacked victory counts per surface (right axis)
fig, ax1 = plt.subplots()
colored_curves = [
    (overall_v, "overall", "k"),
    (last_hundred_v, "last 100 matches", "purple"),
    (overall_clay, "overall clay", "orange"),
    (overall_grass, "overall grass", "green"),
    (overall_hard, "overall hard", "blue"),
    (overall_carpet, "overall carpet", "gray"),
]
for curve_values, curve_label, curve_color in colored_curves:
    ax1.plot(curve_values, label=curve_label, c=curve_color)
ax1.set_ylabel("Win %")
plt.legend()

ax2 = ax1.twinx()
# Stacking order from bottom to top: carpet, grass, clay, hard
stack_colors = ("gray", "green", "orange", "blue")
for i, wins_by_surface in enumerate(
    zip(wins_carpet, wins_grass, wins_clay, wins_hard)
):
    # Bars are 2 units wide, so only every other row is drawn
    if i % 2 != 0:
        continue
    stack_bottom = 0
    for nb_wins, stack_color in zip(wins_by_surface, stack_colors):
        ax2.add_patch(
            Rectangle(
                (i, stack_bottom),
                width=2,
                height=nb_wins,
                edgecolor=None,
                facecolor=stack_color,
                alpha=0.2,
            )
        )
        # The next surface starts where this one ends
        stack_bottom += nb_wins

ax2.set_yticks(list(range(0, 701, 100)))
ax2.set_ylabel("Number of victory for each surface")
plt.tight_layout()
# One tick per year, placed at the row position of the first match of that year
year_positions = [year_entry[1] for year_entry in dates]
year_labels = [year_entry[0] for year_entry in dates]
ax1.set_xticks(year_positions, year_labels, rotation="vertical")
plt.title("Stanislas Wawrinka victories on ATP tournamnents")
# NOTE(review): same file name as the win-percentage figure saved earlier in this
# script, so that image gets overwritten here - confirm this is intended.
plt.savefig("stan_the_man_win_percentage.png")
plt.show()
243 |
244 |
# Histogram of the difference between Stan's career aces percentage and his
# opponent's, split by match outcome.
# Built column-wise instead of row by row: same values, one pass over stan_df.
aces = pd.DataFrame(
    {
        "diff_aces": stan_df["Aces_Percentage_1"] - stan_df["Aces_Percentage_2"],
        "winner": stan_df["Winner"],
    }
)

# Bin edges every 2.5 points, from -15 to 7.5
bin_width = 2.5
classes = [val * bin_width for val in range(-6, 4, 1)]
fig, ax = plt.subplots(1)
for min_class, max_class in zip(classes[:-1], classes[1:]):
    # Half-open bin [min_class, max_class): a difference lying exactly on an edge
    # (for instance 0 when both players have the same percentage) is counted once
    # instead of being dropped by two strict comparisons.
    values = aces.loc[(aces.diff_aces >= min_class) & (aces.diff_aces < max_class)]
    # stan_df only holds rows where Stan is player 1, so Winner == 0 is a victory
    nb_victories = len(values.loc[values.winner == 0])
    nb_defeats = len(values.loc[values.winner == 1])
    ax.add_patch(
        Rectangle(
            xy=(min_class, 0),
            width=bin_width,
            height=nb_victories,
            edgecolor="k",
            facecolor="blue",
            label="Victory",
        )
    )
    # Defeats are stacked on top of the victories of the same bin
    ax.add_patch(
        Rectangle(
            xy=(min_class, nb_victories),
            width=bin_width,
            height=nb_defeats,
            edgecolor="k",
            facecolor="orange",
            label="Defeat",
        )
    )
# Patches do not update the axis limits by themselves
ax.autoscale_view()
ax.set_xlabel("Career ace percentage difference with adversary")
ax.set_ylabel("Number of matches")
ax.set_title(
    "Histogram of career aces percentage difference for Stan Wawrinka, colored by match results",
    wrap=True,
)
# Every bin adds the same two labels: keep a single legend entry per label
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys())
plt.savefig("stanimal_aces_percentage_difference.png")
plt.show()
290 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tennis-Prediction
2 |
3 |
4 |
5 | The goal of this project is to predict the outcome of a tennis match using the data of both players and ML models.\
6 | The data used comes from [Jeff Sackmann's repository](https://github.com/JeffSackmann).
7 |
8 | - [Installation](#installation)
9 | - [Usage](#usage)
10 | - [Data Loading](#data-loading)
11 | - [Machine Learning modelling](#ml-modelling)
12 | - [Encoding Matches](#encoding-matches)
13 | - [License](#license)
14 |
15 | ## Installation
16 |
17 | To clone the repository together with its data, you also need to clone the submodules:
18 |
19 | ```bash
20 | git clone --recurse-submodules https://github.com/VincentAuriau/Tennis-Prediction.git
21 | ```
22 |
23 | ## Usage
24 |
25 | You can find examples in /examples:
26 |
27 | ### Data Loading
28 | Loading players statistics at match time and match outcome:
29 | [Example](examples/data/data_loading.py)
30 |
31 | ```python
32 | from data.data_loader import matches_data_loader
33 | data_df = matches_data_loader(path_to_data="submodules/tennis_atp")
34 | ```
35 | `data_df` lets you access information about the players (statistics prior to the match) along with statistics of the match.
36 | A basic example statistic: the victory percentage of the best ranked player in a match, depending on players rankings.
37 |
38 |
39 | Number of ATP main matches depending on players rank | Victory % of best ranked player
40 | :-------------------------:|:-------------------------:
41 |  | 
42 |
43 | It can also easily be used to compute players' statistics over their career and/or at match time. Here is a simple example with Stan Wawrinka:
44 | Stan's Victory % in main ATP matches | Stan's career aces % diff with adversary
45 | :-------------------------:|:-------------------------:
46 |  | 
47 |
48 | Here is an example of a data row:
49 |
50 | | id | tournament | tournament_level | tournament_date | tournament_surface | round | best_of | match_id | Winner | Score |
51 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
52 | | atp_matches_qual_chall_2003_5427 | San Benedetto CH | C | 20030811 | Clay | SF | 3 | atp_matches_qual_chall_2003_5427 | 0 | 2-6 7-5 7-5 |
53 |
54 | Base Match Statistics:
55 | - **id and match_id:** unique identification of the matches
56 | - **tournament:** name of the tournament
57 | - **tournament_level:** Category of the tournament: 'G' (Grand Slams), 'M' (Masters 1000s), 'A' (other tour-level events), 'C' (Challengers), 'S' (Satellites/ITFs), 'F' (Tour finals), 'D' (Davis Cup)
58 | - **tournament_date:** date
59 | - **tournament_surface:** surface 'Grass', 'Clay', 'Carpet', 'Hard'
60 | - **round:** tournament round of the match: 'F' (final), 'SF' (semi-final), etc.
61 | - **best_of:** maximum number of sets in the match (3 or 5)
62 | - **Winner:** index of the winner: 0 (Player1) or 1 (Player2)
63 | - **Score:** final score
64 |
65 | Additional match statistics:
66 | - **elapsed_minutes:** Duration of the match
67 | - **aces_nb_x:** Number of aces of player x
68 | - **doublefaults_nb_x:** Number of doublefaults
69 | - **svpt_x:** Number of serve points
70 | - **1stIn_x:** Number of first serves made
71 | - **1stWon_x:** Number of first-serve points won
72 | - **2ndWon_x:** Number of second-serve points won
73 | - **SvGms_x:** Number of serve games
74 | - **bpSaved_x:** Number of break points saved
75 | - **bpFaced_x:** Number of break points faced
76 |
77 | Example of player statistics (for the two players of the match above):
78 |
79 | | Name_1 | ID_1 | Ranking_1 | Ranking_Points_1 | Ranking_History_1 | Best_Rank_1 | Birth_Year_1 | Versus_1 | Hand_1 | Last_Tournament_Date_1 | Height_1 | Matches_1 | Matches_Clay_1 | Matches_Carpet_1 | Matches_Grass_1 | Matches_Hard_1 | Victories_Percentage_1 | Clay_Victories_Percentage_1 | Carpet_Victories_Percentage_1 | Grass_Victories_Percentage_1 | Hard_Victories_Percentage_1 | Aces_Percentage_1 | Doublefaults_Percentage_1 | First_Serve_Success_Percentage_1 | Winning_on_1st_Serve_Percentage_1 | Winning_on_2nd_Serve_Percentage_1 | Overall_Win_on_Serve_Percentage_1 | BreakPoint_Face_Percentage_1 | BreakPoint_Saved_Percentage_1 | last_rankings_1 | last_ranking_points_1 |
80 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
81 | | Stan.Wawrinka | 104527 | 284 | 114 | {20030616: [387, 68], 20030707: [363, 74], 20030714: [348, 79], 20030721: [303, 99], 20030811: [284, 114]} | 284 | 19850328 | [] | R | 20030721 | 183 | [['V', 'atp_matches_qual_chall_2003_3466'], ['D', 'atp_matches_qual_chall_2003_3481'], ['D', 'atp_matches_2003_4049'], ['V', 'atp_matches_2003_4315'], ['D', 'atp_matches_2003_4328'], ['V', 'atp_matches_2003_4773'], ['D', 'atp_matches_2003_4782'], ['V', 'atp_matches_qual_chall_2003_5408'], ['V', 'atp_matches_qual_chall_2003_5419'], ['V', 'atp_matches_qual_chall_2003_5424']] | ['V', 'D', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'V'] | [] | [] | [] | 60 | 60 | 0 | 0 | 0 | 3.41880341880342 | 4.27350427350427 | 64.957264957265 | 54.985754985755 | 15.6695156695157 | 70.6552706552707 | 11.3960113960114 | 7.69230769230769 | [303, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 387] | [99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68] |
82 |
83 |
84 | | Name_2 | ID_2 | Ranking_2 | Ranking_Points_2 | Ranking_History_2 | Best_Rank_2 | Birth_Year_2 | Versus_2 | Hand_2 | Last_Tournament_Date_2 | Height_2 | Matches_2 | Matches_Clay_2 | Matches_Carpet_2 | Matches_Grass_2 | Matches_Hard_2 | Victories_Percentage_2 | Clay_Victories_Percentage_2 | Carpet_Victories_Percentage_2 | Grass_Victories_Percentage_2 | Hard_Victories_Percentage_2 | Aces_Percentage_2 | Doublefaults_Percentage_2 | First_Serve_Success_Percentage_2 | Winning_on_1st_Serve_Percentage_2 | Winning_on_2nd_Serve_Percentage_2 | Overall_Win_on_Serve_Percentage_2 | BreakPoint_Face_Percentage_2 | BreakPoint_Saved_Percentage_2 | last_rankings_2 | last_ranking_points_2 |
85 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
86 | | Martin.Vassallo Arguello | 103506 | 125 | 296 | {19990201: [817, 13], 20000710: [398, 61], 20000731: [354, 75], 20000807: [377, 70], 20010625: [459, 48], 20010709: [405, 61], 20010813: [391, 68], 20010820: [374, 72], 20010827: [342, 88], 20010917: [291, 117], 20010924: [286, 122], etc...} | 123 | 19800210 | [] | R | 20030804 | 183 | [['V', 'atp_matches_qual_chall_1999_380'], ['D', 'atp_matches_qual_chall_1999_393'], ['V', 'atp_matches_qual_chall_2000_3972'], ['V', 'atp_matches_qual_chall_2000_3988'], ['D', 'atp_matches_qual_chall_2000_3996'], ['D', 'atp_matches_qual_chall_2000_4725'], ['D', 'atp_matches_qual_chall_2000_4758'], ['V', 'atp_matches_qual_chall_2001_3699'], etc...] | ['V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D', 'D', etc...] | ['D', 'V', 'D', 'V', 'D'] | ['D'] | ['D', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D'] | 61.0294117647059 | 63.5593220338983 | 40 | 0 | 50 | 4.82456140350877 | 5.26315789473684 | 61.4035087719298 | 46.4912280701754 | 18.859649122807 | 65.3508771929825 | 9.64912280701754 | 5.70175438596491 | [157, 136, 165, 158, 9999, 9999, 204, 198, 197, 188, 204, 233] | [247, 304, 220, 232, 0, 0, 164, 172, 177, 188, 167, 137] |
87 |
88 | Player statistics before the match:
89 | - **Name_x**: Name of the player
90 | - **ID_x:** ID of the player
91 | - **Ranking_x:** ATP ranking of the player. For all rankings, 9999 means unranked.
92 | - **Ranking_Points_x:** Number of ATP points
93 | - **Ranking_History_x:** All recorded rankings
94 | - **Best_Rank_x:** Best reached ATP rank
95 | - **Birth_Year_x:** Birth year
96 | - **Versus_x:** Dictionary containing all match outcomes against other players
97 | - **Hand_x**: Hand used to play 'R', 'L' or 'U' for unknown
98 | - **Last_Tournament_Date_x:** Date of the last previous tournament attended
99 | - **Height_x:** Height
100 | - **Matches_x:** History of outcomes of previous matches
101 | - **Matchs_Clay_x:** History of outcomes of previous matches on clay
102 | - **Matches_Carpet_x:** History of outcomes of previous matches on carpet
103 | - **Matches_Grass_x:** History of outcomes of previous matches on grass
104 | - **Matches_Hard_x:** History of outcomes of previous matches on hard
105 | - **Victories_Percentage_x:** Victory percentage over all player ATP matches
106 | - **Clay_Victories_Percentage_x:** Victory percentage over all player ATP matches on clay
107 | - **Carpet_Victories_Percentage_x:** Victory percentage over all player ATP matches on carpet
108 | - **Grass_Victories_Percentage_x:** Victory percentage over all player ATP matches on grass
109 | - **Hard_Victories_Percentage_x:** Victory percentage over all player ATP matches on hard
110 | - **Aces_Percentage_x:** Aces percentage over all player ATP matches
111 | - **Doublefaults_Percentage_x:** DoubleFaults percentage over all player ATP matches
112 | - **First_Save_Success_Percentage_x:** First serve success percentage over all player ATP matches
113 | - **Winning_on_1st_Serve_Percentage_x:** Winning on first serve percentage over all player ATP matches
114 | - **Winning_on_2nd_Serve_Percentage_x:** Winning on second serve percentage over all player ATP matches
115 | - **Overall_Win_on_Serve_Percentage_x:** Overall winning percentage on serve over all player ATP matches
116 | - **BreakPoint_Face_Percentage_x:** Overall breakpoint face percentage over all player ATP matches
117 | - **BreakPoint_Saved_Percentage_x:** Overall breakpoint saved percentage over all player ATP matches
118 | - **last_rankings_x:** Five previous recorded ATP rankings
119 | - **last_ranking_points_x:** Five previous ATP ranking points recorded
120 |
121 | ### Machine-Learning modelling
122 | Train/Testing on matches outcome
123 | [[Example]](examples/models/train_test.py).
124 |
125 | A generic function lets you evaluate your model with a train/test scheme without much work. Your model only needs a scikit-learn like signature.
126 | By playing with the years, the columns used in modelling, and the models & hyperparameters, you can easily create your own best-performing model.
127 |
128 | ```python
129 | from sklearn.ensemble import RandomForestClassifier
130 | from evaluation.train_test import train_test_evaluation
131 |
132 | test_score = train_test_evaluation(
133 | train_years=[2020, 2021],
134 | test_years=[2022, 2023],
135 | model_class=RandomForestClassifier,
136 | model_params={"n_estimators": 2000, "max_depth": None},
137 | match_features=[],
138 | player_features=["Ranking"],
139 | encoding_params={},
140 | additional_features=[],
141 | save_path="./results",
142 | save_all_results=False
143 | )
144 |
145 | print("Test Score", test_score)
146 | ```
147 |
148 | Models and hyperparameters can easily be compared using the file results.csv saved in save_path.
149 |
150 | Accuracy of different models
151 | :-------------------------:
152 | 
153 |
154 | If the argument save_all_results is set to True, the whole csv of test data is saved. This allows a more in-depth analysis of the results.
155 |
156 | Model precision compared with best ranked player wins strategy | Model precision depending on players' ranks
157 | :-------------------------:|:-------------------------:
158 |  | 
159 |
160 | ### Encoding matches
161 | [Example](examples/history_modeling/first_example.py)
162 | In order to represent history of a player, one can use MatchEncoders:
163 |
164 | ```python
165 | from history_modeling.encoding_model import PCAMatchEncoder
166 |
167 | model = PCAMatchEncoder(num_pca_features=2)
168 | model.fit(data_df, transform_data=True)
169 | X_r, match_info = model.predict(data_df, transform_data=True)
170 | ```
171 |
172 | 2D representation of match outcome:
173 | :-------------------------:
174 | 
175 |
176 | ## License
177 |
178 |
179 |
180 |
181 |
182 |
183 | The repository is under the MIT License, you can freely use any part as you like.\
184 | If you find this repository useful, you can cite it and add a star ⭐ !
185 |
186 |
--------------------------------------------------------------------------------
/python/data/data_loader.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 | import re
4 | import time
5 | from ast import literal_eval
6 |
7 | import numpy as np
8 | import pandas as pd
9 |
10 | import data.player as player
11 | import data.match as match
12 | from data.data_utils import reverse_score
13 |
14 |
def create_player_profiles(df):
    """
    Builds the players database from the listing of players.

    When a player_id appears on several rows, the first row wins and a
    message is printed for every later occurrence.

    :param df: pandas.DataFrame corresponding to atp_players.csv; the columns
        name_first, name_last, dob, ioc, player_id, hand and height are read
    :return: dict mapping each player_id to its player.Player object
    """
    players_db = {}
    for _, record in df.iterrows():
        player_id = record["player_id"]
        # Names are stored as "<first>.<last>"
        full_name = str(record["name_first"]) + "." + str(record["name_last"])
        profile = player.Player(
            name=full_name,
            birthdate=record["dob"],
            country=record["ioc"],
            nb_id=player_id,
            hand=record["hand"],
            height=record["height"],
        )

        if player_id not in players_db:
            players_db[player_id] = profile
        else:
            print(f"Player ID {record['player_id']} already in database, appears twice ?")
    return players_db
37 |
38 |
def read_matches_file(path_to_file):
    """
    Reads a csv file of matches with pandas default settings.

    :param path_to_file: path to the csv file
    :return: pandas.DataFrame with the content of the file
    """
    return pd.read_csv(path_to_file)
47 |
48 |
def get_match_files(path_to_data_dir, match_type=["main_atp"]):
    """
    Lists the available csv files containing matches.

    :param path_to_data_dir: path to directory with all files
    :param match_type: matches we want to retrieve, list of elements among
        ["main_atp", "futures", "qualifying_challengers"]
    :return: pandas.DataFrame with one row per matching file and the columns
        "filepath", "match_type" and "year" (year is kept as a string);
        the frame is empty, but still has these columns, when nothing matches
    """
    # (key expected in match_type, file name pattern, label stored in "match_type")
    # Raw strings + escaped dot + fullmatch: only exact file names are accepted.
    file_kinds = (
        ("main_atp", r"atp_matches_(?P<year>\d+)\.csv", "main_tour"),
        ("futures", r"atp_matches_futures_(?P<year>\d+)\.csv", "futures"),
        (
            "qualifying_challengers",
            r"atp_matches_qual_chall_(?P<year>\d+)\.csv",
            "qualifying_challengers",
        ),
    )

    rows = []
    # os.listdir order is arbitrary: sort to get a reproducible result
    for file in sorted(os.listdir(path_to_data_dir)):
        for requested_key, pattern, label in file_kinds:
            if requested_key not in match_type:
                continue
            regex_match = re.fullmatch(pattern, file)
            if regex_match is not None:
                rows.append(
                    {
                        "filepath": os.path.join(path_to_data_dir, file),
                        "match_type": label,
                        "year": regex_match["year"],
                    }
                )
    return pd.DataFrame(rows, columns=["filepath", "match_type", "year"])
100 |
101 |
def load_match_data_from_path(
    players_db, paths_to_matchs_file, get_match_statistics=False
):
    """
    Loads matches files and creates the matches data while updating the players database.

    Matches of all files are merged and processed in the order given by
    (tourney_date, tourney_id, match_num). Each match produces two rows:
    the first one has the winner in the "_1" columns and the loser in the "_2"
    columns with Winner = 0, the second one swaps both players with Winner = 1.

    :param players_db: mapping from the ids found in winner_id / loser_id to
        the player objects handed to match.Match
    :param paths_to_matchs_file: path, or list of paths, to matches csv files
    :param get_match_statistics: bool, whether to append the statistics of the
        match itself (with the score reversed on the second row)
    :return: pandas.DataFrame with two rows per match
    """

    def extract_file_id(file_path):
        # Keep the last path component whatever the separator ("/" or "\"),
        # then drop the extension.
        file_name = re.split(r"[\\/]", file_path)[-1]
        return file_name.split(".")[0]

    def stack_both_orders(winner_df, loser_df):
        # Returns (player 1 columns, player 2 columns), each one made of two
        # rows: winner/loser for "_1" and loser/winner for "_2".
        to_1 = {col: col + "_1" for col in winner_df.columns}
        to_2 = {col: col + "_2" for col in winner_df.columns}
        side_1 = pd.concat(
            [winner_df.rename(to_1, axis=1), loser_df.rename(to_1, axis=1)],
            axis=0,
        )
        side_2 = pd.concat(
            [loser_df.rename(to_2, axis=1), winner_df.rename(to_2, axis=1)],
            axis=0,
        )
        return side_1, side_2

    if not isinstance(paths_to_matchs_file, list):
        paths_to_matchs_file = [paths_to_matchs_file]

    files = []
    for path in paths_to_matchs_file:
        match_df = pd.read_csv(path)
        # Remember the origin of each row: it is used to build the match id prefix
        match_df["filepath"] = path
        files.append(match_df)
    match_df = pd.concat(files, axis=0)
    match_df = match_df.sort_values(["tourney_date", "tourney_id", "match_num"])
    match_df = match_df.reset_index(drop=True)

    matches_data = []
    for n_row, row in match_df.iterrows():
        match_o = match.Match(
            winner=players_db[row["winner_id"]],
            loser=players_db[row["loser_id"]],
            tournament=row["tourney_name"],
            surface=row["surface"],
            id_prefix=extract_file_id(row["filepath"]),
        )
        match_o.instantiate_from_data_row(row)
        (
            match_data,
            w_data,
            l_data,
        ) = match_o.get_prior_data_and_update_players_stats()

        match_data["match_id"] = match_o.id

        concat_1, concat_2 = stack_both_orders(w_data, l_data)
        final_df = pd.concat(
            [pd.concat([match_data] * 2, axis=0), concat_1, concat_2], axis=1
        )
        final_df["Winner"] = [0, 1]

        if get_match_statistics:
            (
                match_stats,
                w_mstats,
                l_mstats,
            ) = match_o.get_match_data_results_statistics()
            ms_concat_1, ms_concat_2 = stack_both_orders(w_mstats, l_mstats)

            match_stats_1 = match_stats.copy()
            match_stats_2 = match_stats.copy()
            # Second row is seen from the loser's side: reverse the score
            match_stats_2["score"] = match_stats_2.apply(
                lambda row: reverse_score(row["score"]), axis=1
            )

            match_stats_df = pd.concat(
                [
                    pd.concat([match_stats_1, match_stats_2], axis=0),
                    ms_concat_1,
                    ms_concat_2,
                ],
                axis=1,
            )
            final_df = pd.concat([final_df, match_stats_df], axis=1)
        matches_data.append(final_df)

    matches_data = pd.concat(matches_data, axis=0)
    return matches_data
224 |
225 |
def matches_data_loader(
    keep_values_from_year=1990,
    path_to_data="submodules/tennis_atp",
    path_to_cache="/cache",
    flush_cache=True,
    get_match_statistics=False,
    get_reversed_match_data=False,
    include_davis_cup=False,
    match_type=["main_atp", "futures", "qualifying_challengers"],
):
    """
    Main matches data loading function

    :param keep_values_from_year: int [1968; present], min year to keep values from
    :param path_to_data: str, path to tennis_atp submodule
    :param path_to_cache: str, path to local personal cache (created if missing
        when something has to be written in it)
    :param flush_cache: bool, whether cache should be erased and whole function run again
    :param get_match_statistics: bool, return each match statistics along pre match statistics
    :param get_reversed_match_data: bool, should each match be double, with Winner = 0 and Winner = 1
    :param include_davis_cup: bool, when False rows whose tournament contains "Davis" are dropped
    :param match_type: list of kinds of matches files to read, forwarded to get_match_files
    :return: pandas.DataFrame with all matches data
    """

    total_elapsed_time = 0
    players_db_path = os.path.join(path_to_cache, "players_db")

    # Check if data already in cache
    players_db_cached = os.path.exists(players_db_path)
    if players_db_cached:
        print("Payers DB exists")

    matches_data_cached = os.path.exists(
        os.path.join(path_to_cache, f"matches_data_{keep_values_from_year}.csv")
    )

    if not players_db_cached or flush_cache:
        df_players = pd.read_csv(
            os.path.join(path_to_data, "atp_players.csv"),
            header=0,
            encoding="ISO-8859-1",
        )
        players_db = create_player_profiles(df_players)
        # The cache directory may not exist yet on a first run
        os.makedirs(path_to_cache, exist_ok=True)
        with open(players_db_path, "wb") as file:
            pickle.dump(players_db, file, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # NOTE: unpickling executes arbitrary code; only point path_to_cache
        # at a directory whose content you produced yourself.
        with open(players_db_path, "rb") as file:
            players_db = pickle.load(file)

    if not matches_data_cached or flush_cache:
        print("data not cached or flush=True")
        data_files = get_match_files(path_to_data, match_type=match_type)
        data_years = data_files.year.astype(
            "uint32"
        )  # to change when handling different type of tournament (qualifiers, main, etc...)
        os.makedirs(path_to_cache, exist_ok=True)

        data_per_year = []
        for year in np.sort(np.unique(data_years.values)):
            t_start = time.time()
            print("+---------+---------+")
            print("  Year %i  " % year)
            if year >= keep_values_from_year:
                print("Updating players statistics & saving matches data")
            else:
                print("Only updating players statistics")
            print("+---------+---------+")
            filepaths = data_files.loc[data_files.year == str(year)][
                "filepath"
            ].values.tolist()
            # Every year is processed, even the ones that are not returned,
            # because players statistics are accumulated match after match.
            df_year = load_match_data_from_path(
                players_db, filepaths, get_match_statistics=get_match_statistics
            )
            df_year["tournament_year"] = year
            if year >= keep_values_from_year:
                data_per_year.append(df_year)

            df_year.to_csv(
                os.path.join(path_to_cache, f"matches_data_{year}.csv"),
                sep=";",
                index=False,
            )
            total_elapsed_time += time.time() - t_start
            print(f"Elapsed Time: {np.round(time.time() - t_start, 2)} seconds")
            print(f"Total Elapsed Time: {np.round(total_elapsed_time, 2)} seconds")

        data_matches = pd.concat(data_per_year, axis=0)
        # NOTE(review): unlike the cached branch below, the former index is kept
        # as an "index" column here; left untouched in case callers rely on it.
        data_matches = data_matches.reset_index()

    else:
        years = []
        file_pattern = r"matches_data_(?P<year>\d+)\.csv"
        for file in os.listdir(path_to_cache):
            regex_match = re.fullmatch(file_pattern, file)
            if regex_match is not None:
                years.append(int(regex_match["year"]))

        data_per_year = []
        for year in np.sort(years):
            if year >= keep_values_from_year:
                df_year = pd.read_csv(
                    os.path.join(path_to_cache, f"matches_data_{year}.csv"), sep=";"
                )
                data_per_year.append(df_year)

        data_matches = pd.concat(data_per_year, axis=0)
        data_matches.reset_index(drop=True, inplace=True)

    if not include_davis_cup:
        # na=False: a missing tournament name is not a Davis Cup match
        is_davis_cup = data_matches.tournament.str.contains("Davis", na=False)
        data_matches = data_matches.loc[~is_davis_cup]

    if get_reversed_match_data:
        return data_matches
    else:
        # Rows come by pairs (Winner = 0 then Winner = 1): keep the first of each
        return data_matches.iloc[::2]
341 |
def clean_missing_data(df):
    """
    Cleans rows of df with missing data or too few statistics to be useful.

    Removes the rows containing at least one missing value, then the rows
    where one of the two players has a ranking of 9999 or 0. The input
    DataFrame is not modified.

    :param df: pandas.DataFrame with at least the columns Ranking_1 and Ranking_2
    :return: filtered pandas.DataFrame
    """
    # dropna returns a new DataFrame: its result has to be kept
    df = df.dropna(axis=0)

    unusable_rankings = [9999, 0]
    keep = ~df.Ranking_1.isin(unusable_rankings) & ~df.Ranking_2.isin(
        unusable_rankings
    )
    return df.loc[keep]
356 |
357 |
def encode_data(df, mode="integer"):
    """
    Encodes the categorical columns of matches data and adds head-to-head features.

    Every column whose lowercase name contains "hand", "round" or
    "tournament_level" (checked in that order) is replaced by its encoding.
    When a "Versus_1" column exists, "nb_match_versus" (its length) and
    "v_perc_versus" (share of "V" elements, -1 when empty) are added.

    :param df: pandas.DataFrame of matches data, left unmodified
    :param mode: str, "integer", "one_hot" or "mixing" (integer codes for
        tournament level and round, one-hot vectors for the hand)
    :return: encoded copy of df
    :raises ValueError: if mode is not one of the supported values
    :raises KeyError: if a value has no encoding in the selected mode
    """
    # Ideas kept from the original implementation:
    # Remove: index, Unnamed: 0, Unnamed: 0.1, tournament, Name, ID,
    #         Birth Year => Age, Versus: % V against 2, last 5 matches, Matches
    # Refac:  Versus (best way to do it ?), Birth Year,
    #         Last Tournament => Days since last tournament + result ?

    if mode == "integer":
        tournament_level_codes = {"G": 0, "A": 1, "M": 2, "F": 3, "D": 4}
        round_codes = {
            "F": 0,
            "SF": 1,
            "QF": 2,
            "R16": 3,
            "R32": 4,
            "R64": 5,
            "R128": 6,
            "R256": 7,
            "RR": 8,
            "BR": 9,
            "ER": 10,
        }
        hand_codes = {"R": -1, "L": 1, "A": 0, "U": 2, "nan": 2}

    elif mode == "one_hot":
        tournament_level_codes = {
            "G": [0, 0, 0, 1],
            "A": [0, 0, 1, 0],
            "M": [0, 1, 0, 0],
            "D": [1, 0, 0, 0],
        }
        round_codes = {
            "F": [0, 0, 0, 0, 0, 0, 0, 0, 1],
            "SF": [0, 0, 0, 0, 0, 0, 0, 1, 0],
            "QF": [0, 0, 0, 0, 0, 0, 1, 0, 0],
            "R16": [0, 0, 0, 0, 0, 1, 0, 0, 0],
            "R32": [0, 0, 0, 0, 1, 0, 0, 0, 0],
            "R64": [0, 0, 0, 1, 0, 0, 0, 0, 0],
            "R128": [0, 0, 1, 0, 0, 0, 0, 0, 0],
            "R256": [0, 1, 0, 0, 0, 0, 0, 0, 0],
            "RR": [1, 0, 0, 0, 0, 0, 0, 0, 0],
        }
        # A missing hand is encoded like "U", as the integer mode does
        hand_codes = {
            "R": [1, 0, 0, 0],
            "L": [0, 1, 0, 0],
            "A": [0, 0, 1, 0],
            "U": [0, 0, 0, 1],
            "nan": [0, 0, 0, 1],
        }

    elif mode == "mixing":
        tournament_level_codes = {"G": 0, "A": 1, "M": 2, "F": 3, "D": 4}
        round_codes = {
            "F": 0,
            "SF": 1,
            "QF": 2,
            "R16": 3,
            "R32": 4,
            "R64": 5,
            "R128": 6,
            "R256": 7,
            "RR": 8,
            "BR": 9,
        }
        hand_codes = {
            "R": [1, 0, 0, 0],
            "L": [0, 1, 0, 0],
            "A": [0, 0, 1, 0],
            "U": [0, 0, 0, 1],
            "nan": [0, 0, 0, 1],
        }

    else:
        raise ValueError(
            f"Unknown mode {mode!r}, expected 'integer', 'one_hot' or 'mixing'"
        )

    # Work on a copy so that the caller's DataFrame is not modified
    df_copy = df.copy()

    for col in df_copy.columns:
        lowered = col.lower()
        if "hand" in lowered:
            # str() turns a missing value (NaN) into the "nan" key
            df_copy[col] = df_copy[col].map(lambda value: hand_codes[str(value)])
        elif "round" in lowered:
            df_copy[col] = df_copy[col].map(lambda value: round_codes[value])
        elif "tournament_level" in lowered:
            df_copy[col] = df_copy[col].map(
                lambda value: tournament_level_codes[value]
            )

    if "Versus_1" in df_copy.columns:
        # NOTE(review): Versus_1 is used as it comes; if it was read back from
        # a csv it is a string and both features are computed on its
        # characters - confirm the expected type with the callers.
        df_copy["nb_match_versus"] = df_copy["Versus_1"].map(len)
        df_copy["v_perc_versus"] = df_copy["Versus_1"].map(
            lambda versus: versus.count("V") / len(versus) if len(versus) > 0 else -1
        )

    return df_copy
--------------------------------------------------------------------------------
/python/data/player.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from data.data_utils import get_days_difference
5 |
6 | # How to update player's ranking ?
7 |
8 |
9 | class Player:
10 | def __init__(self, name, birthdate, country, nb_id, hand="", height=0):
11 | self.name = name
12 | self.birthdate = birthdate
13 |
14 | self.rankings_history = {}
15 | self.ranking = 9999
16 | self.ranking_points = 0
17 | self.ranking_over_time = 0
18 | self.country = country
19 | self.id = nb_id
20 | self.last_tournament_date = ""
21 | self.versus = {}
22 | self.hand = hand
23 | self.height = height
24 |
25 | self.last_matches = ["", "", "", "", ""]
26 | # self.matches = []
27 | self.matches_history = []
28 | self.victories_percentage = 0
29 |
30 | self.matches_hard = []
31 | self.hard_victories_percentage = 0
32 | self.matches_carpet = []
33 | self.carpet_victories_percentage = 0
34 | self.matches_clay = []
35 | self.clay_victories_percentage = 0
36 | self.matches_grass = []
37 | self.grass_victories_percentage = 0
38 |
39 | self.aces_percentage = 0
40 |
41 | self.doublefaults_percentage = 0
42 | self.first_serve_success_percentage = 0
43 | self.winning_on_1st_serve_percentage = 0
44 | self.winning_on_2nd_serve_percentage = 0
45 | self.overall_win_on_serve_percentage = 0
46 |
47 | self.service_data = {
48 | "service_games_played": [],
49 | "1st_serve_success": [],
50 | "aces_nb": [],
51 | "doublefaults_nb": [],
52 | "win_on_1st_serve": [],
53 | "win_on_2nd_serve": [],
54 | "breakpoints_faced": [],
55 | "breakpoints_saved": [],
56 | }
57 |
58 | self.breakpoint_faced_percentage = 0
59 | self.breakpoint_saved_percentage = 0
60 |
61 | self.games_fatigue = (
62 | 0 # nb games curr tourney + nb games prev tourney / diff days
63 | )
64 | self.minutes_fatigue = (
65 | 0 # nb minutes curr tourney + nb minutes prev tourney / diff days
66 | )
67 | self.fatigue_features = {
68 | "previous tournament": {
69 | "date": "19000000",
70 | "num_games": 0,
71 | "num_matchs": 0,
72 | "num_minutes": 0,
73 | },
74 | "current tournament": {
75 | "date": "19000000",
76 | "num_games": 0,
77 | "num_matchs": 0,
78 | "num_minutes": 0,
79 | },
80 | }
81 |
82 | def __str__(self):
83 | return (
84 | "ID : "
85 | + str(self.id)
86 | + " *** Name : "
87 | + self.name
88 | + " " * (35 - len(self.name))
89 | + " *** Born Year : "
90 | + str(self.birthdate)
91 | + " *** Country : "
92 | + str(self.country)
93 | + " *** Hand : "
94 | + str(self.hand)
95 | + " *** Height : "
96 | + str(self.height)
97 | )
98 |
99 | def _add_victory(self, id_loser, match_id, tournament_date="19000101"):
100 | """
101 | Update last_matches argument with a victories and updates versus argument using id_loser
102 | :param id_loser: ID of los of match against current player
103 | :return:
104 | """
105 | self.last_matches[4] = self.last_matches[3]
106 | self.last_matches[3] = self.last_matches[2]
107 | self.last_matches[2] = self.last_matches[1]
108 | self.last_matches[1] = self.last_matches[0]
109 | self.last_matches[0] = "V"
110 | if id_loser in self.versus.keys():
111 | self.versus[id_loser].append(["V", tournament_date, match_id])
112 | else:
113 | self.versus[id_loser] = [["V", tournament_date, match_id]]
114 | self._update_victories_percentage("V", match_id)
115 |
116 | def _add_defeat(self, id_winner, match_id, tournament_date="19000101"):
117 | """
118 | Add a Defeat
119 | :param id_winner:
120 | :return:
121 | """
122 | self.last_matches[4] = self.last_matches[3]
123 | self.last_matches[3] = self.last_matches[2]
124 | self.last_matches[2] = self.last_matches[1]
125 | self.last_matches[1] = self.last_matches[0]
126 | self.last_matches[0] = "D"
127 | if id_winner in self.versus.keys():
128 | self.versus[id_winner].append(["D", tournament_date, match_id])
129 | else:
130 | self.versus[id_winner] = [["D", tournament_date, match_id]]
131 | self._update_victories_percentage("D", match_id)
132 |
133 | def _update_victories_percentage(self, match_outcome, match_id):
134 | """
135 | Updates Victories Percentage with a V/D of last match
136 | :param match_outcome:
137 | :return:
138 | """
139 | self.matches_history.append([match_outcome, match_id])
140 | # self.matches.append(match_outcome)
141 | victories_number = [_[0] for _ in self.matches_history].count("V")
142 | matches_number = len(self.matches_history)
143 | self.victories_percentage = (victories_number / matches_number) * 100
144 |
145 | def _update_surfaces_victories_percentage(self, surface, outcome):
146 | """
147 | Updates % of victories on a given surface (V/D)
148 | :param surface:
149 | :param outcome:
150 | :return:
151 | """
152 | if surface == "Clay":
153 | self.matches_clay.append(outcome)
154 | self.clay_victories_percentage = (
155 | self.matches_clay.count("V") / len(self.matches_clay) * 100
156 | )
157 |
158 | elif surface == "Grass":
159 | self.matches_grass.append(outcome)
160 | self.grass_victories_percentage = (
161 | self.matches_grass.count("V") / len(self.matches_grass) * 100
162 | )
163 |
164 | elif surface == "Hard":
165 | self.matches_hard.append(outcome)
166 | self.hard_victories_percentage = (
167 | self.matches_hard.count("V") / len(self.matches_hard) * 100
168 | )
169 |
170 | elif surface == "Carpet":
171 | self.matches_carpet.append(outcome)
172 | self.carpet_victories_percentage = (
173 | self.matches_carpet.count("V") / len(self.matches_carpet) * 100
174 | )
175 |
176 | def _update_fatigue(self, tournament_date, games_number, minutes_number):
177 | """
178 | Updates Fatigue arguments: self.fatigue but also self.fatigue_features
179 | :param tournament_date:
180 | :param sets_number:
181 | :return:
182 | """
183 | if games_number == games_number and games_number != "nan":
184 | if tournament_date == self.fatigue_features["current tournament"]["date"]:
185 | self.fatigue_features["current tournament"]["num_games"] += games_number
186 | if minutes_number == minutes_number and minutes_number != "nan":
187 | self.fatigue_features["current tournament"][
188 | "num_minutes"
189 | ] += minutes_number
190 | self.fatigue_features["current tournament"]["num_matchs"] += 1
191 | else:
192 | self.fatigue_features["previous tournament"] = self.fatigue_features[
193 | "current tournament"
194 | ]
195 | self.fatigue_features["current tournament"] = {
196 | "date": tournament_date,
197 | "num_games": games_number,
198 | "num_minutes": minutes_number,
199 | "num_matchs": 1,
200 | }
201 |
202 | previous_tournament_date = str(
203 | self.fatigue_features["previous tournament"]["date"]
204 | )
205 | current_tournament_date = str(
206 | self.fatigue_features["current tournament"]["date"]
207 | )
208 |
209 | days_difference_tournaments = get_days_difference(
210 | previous_tournament_date, current_tournament_date
211 | )
212 | if days_difference_tournaments == 0:
213 | print(previous_tournament_date, current_tournament_date)
214 | print(tournament_date)
215 |
216 | self.games_fatigue = (
217 | self.fatigue_features["previous tournament"]["num_games"]
218 | / days_difference_tournaments
219 | + self.fatigue_features["current tournament"]["num_games"]
220 | )
221 | self.minutes_fatigue = (
222 | self.fatigue_features["previous tournament"]["num_minutes"]
223 | / days_difference_tournaments
224 | + self.fatigue_features["current tournament"]["num_minutes"]
225 | )
226 | else:
227 | print("NaN in sets number", games_number)
228 |
229 | def _update_aces_percentage(self, aces_nb):
230 | """
231 | Upates Aces Percentage
232 | :param aces_nb:
233 | :return:
234 | """
235 |
236 | if aces_nb == aces_nb and aces_nb != "nan":
237 | self.service_data["aces_nb"].append(aces_nb)
238 | total_aces_nbr = sum(self.service_data["aces_nb"])
239 | total_service_points_played = sum(self.service_data["service_games_played"])
240 |
241 | if total_service_points_played != 0:
242 | self.aces_percentage = (
243 | total_aces_nbr / total_service_points_played * 100
244 | )
245 | else:
246 | print("No point played :", aces_nb)
247 |
248 | def _update_doublefaults_percentage(self, df_nb):
249 | """
250 | Update doublefaults percentage
251 | :param df_nb:
252 | :return:
253 | """
254 | if df_nb == df_nb and df_nb != "nan":
255 | self.service_data["doublefaults_nb"].append(df_nb)
256 | total_df_nbr = sum(self.service_data["doublefaults_nb"])
257 | total_service_points_played = sum(self.service_data["service_games_played"])
258 |
259 | if total_service_points_played != 0:
260 | self.doublefaults_percentage = (
261 | total_df_nbr / total_service_points_played * 100
262 | )
263 | else:
264 | print("No point played :", total_df_nbr)
265 | self.doublefaults_percentage = 0
266 |
267 | else:
268 | print("NaN in Double Faults", df_nb)
269 |
270 | def _update_winning_on_1st_serve_percentage(self, first_serve_win):
271 | """
272 |
273 | :param first_serve_win:
274 | :return:
275 | """
276 | self.service_data["win_on_1st_serve"].append(first_serve_win)
277 |
278 | total_first_serves_win = sum(self.service_data["1st_serve_success"])
279 | total_service_points_played = sum(self.service_data["service_games_played"])
280 |
281 | if total_service_points_played != 0:
282 | self.winning_on_1st_serve_percentage = (
283 | total_first_serves_win / total_service_points_played * 100
284 | )
285 | else:
286 | print("No point played :", total_first_serves_win)
287 |
288 | def _update_winning_on_2nd_serve_percentage(self, second_serve_win):
289 | """
290 |
291 | :param second_serve_win:
292 | :return:
293 | """
294 | self.service_data["win_on_2nd_serve"].append(second_serve_win)
295 |
296 | total_second_serves_win = sum(self.service_data["win_on_2nd_serve"])
297 | total_service_points_played = sum(self.service_data["service_games_played"])
298 |
299 | if total_service_points_played != 0:
300 | self.winning_on_2nd_serve_percentage = (
301 | total_second_serves_win / total_service_points_played * 100
302 | )
303 | else:
304 | print("No point played :", total_second_serves_win)
305 |
306 | def _update_first_serve_success_percentage(self, first_services_in):
307 | """
308 |
309 | :param first_services_in:
310 | :return:
311 | """
312 | self.service_data["1st_serve_success"].append(first_services_in)
313 |
314 | total_first_serves_in = sum(self.service_data["1st_serve_success"])
315 | total_service_points_played = sum(self.service_data["service_games_played"])
316 |
317 | if total_service_points_played != 0:
318 | self.first_serve_success_percentage = (
319 | total_first_serves_in / total_service_points_played * 100
320 | )
321 | else:
322 | print("No point played :", total_first_serves_in)
323 |
324 | def _update_breakpoints_faced_and_saved(self, breakpoint_faced, breakpoint_saved):
325 | """
326 |
327 | :param breakpoint_faced:
328 | :param breakpoint_saved:
329 | :return:
330 | """
331 | self.service_data["breakpoints_saved"].append(breakpoint_saved)
332 | self.service_data["breakpoints_faced"].append(breakpoint_faced)
333 |
334 | total_breakpoint_faced = sum(self.service_data["breakpoints_faced"])
335 | total_games_played = sum(self.service_data["service_games_played"])
336 | total_breakpoint_saved = sum(self.service_data["breakpoints_saved"])
337 |
338 | if total_games_played != 0:
339 | self.breakpoint_faced_percentage = (
340 | total_breakpoint_faced / total_games_played * 100
341 | )
342 | self.breakpoint_saved_percentage = (
343 | total_breakpoint_saved / total_games_played * 100
344 | )
345 | else:
346 | print("No point played :", self.service_data["breakpoints_saved"])
347 |
348 | def _update_service_data(
349 | self,
350 | service_games_played,
351 | aces_nb,
352 | doublefaults_nb,
353 | first_serve_success,
354 | winning_on_1st_serve,
355 | winning_on_2nd_serve,
356 | breakpoints_faced,
357 | breakpoints_saved,
358 | ):
359 | # Assert data exists
360 | if (
361 | service_games_played == service_games_played
362 | and aces_nb == aces_nb
363 | and doublefaults_nb == doublefaults_nb
364 | and first_serve_success == first_serve_success
365 | and winning_on_1st_serve == winning_on_1st_serve
366 | and winning_on_2nd_serve == winning_on_2nd_serve
367 | and breakpoints_faced == breakpoints_faced
368 | and breakpoints_saved == breakpoints_saved
369 | ):
370 | self.service_data["service_games_played"].append(service_games_played)
371 |
372 | self._update_aces_percentage(aces_nb=aces_nb)
373 | self._update_doublefaults_percentage(df_nb=doublefaults_nb)
374 | self._update_winning_on_1st_serve_percentage(
375 | first_serve_win=winning_on_1st_serve
376 | )
377 | self._update_winning_on_2nd_serve_percentage(
378 | second_serve_win=winning_on_2nd_serve
379 | )
380 | self.overall_win_on_serve_percentage = (
381 | self.winning_on_1st_serve_percentage
382 | + self.winning_on_2nd_serve_percentage
383 | )
384 | self._update_first_serve_success_percentage(
385 | first_services_in=first_serve_success
386 | )
387 | self._update_breakpoints_faced_and_saved(
388 | breakpoint_saved=breakpoints_saved, breakpoint_faced=breakpoints_faced
389 | )
390 |
391 | else:
392 | # Future argument ;)
393 | verbose = 1
394 | if verbose > 2:
395 | print("Service data not complete...")
396 |
397 | def _update_rankings(self, new_ranking, new_ranking_points, date):
398 | if new_ranking_points != new_ranking_points:
399 | if new_ranking_points == new_ranking_points:
400 | print("No ranking points", new_ranking, new_ranking_points)
401 | new_ranking_points = 0
402 | else:
403 | try:
404 | new_ranking_points = int(new_ranking_points)
405 | except:
406 | new_ranking_points = 0
407 |
408 | if new_ranking != new_ranking:
409 | new_ranking = 9999
410 | else:
411 | try:
412 | new_ranking = int(new_ranking)
413 | except:
414 | new_ranking = 9999
415 |
416 | self.ranking = new_ranking
417 | self.ranking_points = new_ranking_points
418 |
419 | self.rankings_history[date] = [
420 | int(new_ranking),
421 | int(new_ranking_points),
422 | ]
423 |
424 | def _get_best_ranking(self):
425 | all_ranks = [
426 | self.rankings_history[date][0] for date in self.rankings_history.keys()
427 | ]
428 | if len(all_ranks) > 0:
429 | return np.min(all_ranks)
430 | else:
431 | return -1
432 |
def update_from_match(self, match):
    """
    Refresh the whole player profile with the content of one match.

    The result (victory or defeat) is registered first, then fatigue, service
    statistics and rankings are updated, in that order.

    :param match: match in which the player is either match.winner or
        match.loser (an AssertionError is raised otherwise)
    :return: None
    """
    player_id = self.id

    # Update Rankings ?
    if match.winner.id == player_id:
        outcome = "V"
        self._add_victory(
            match.loser.id, match_id=match.id, tournament_date=match.tournament_date
        )
    else:
        assert match.loser.id == player_id
        outcome = "D"
        self._add_defeat(
            match.winner.id, match_id=match.id, tournament_date=match.tournament_date
        )
    self._update_surfaces_victories_percentage(match.surface, outcome)

    self._update_fatigue(
        match.tournament_date, match.games_number, match.elapsed_minutes
    )

    # Note: the "service_games_played" slot is fed with the service points played
    service_stats = {
        "service_games_played": match.get_service_points_played(player_id),
        "aces_nb": match.get_aces_nb(player_id),
        "doublefaults_nb": match.get_df_nb(player_id),
        "first_serve_success": match.get_first_services_in(player_id),
        "winning_on_1st_serve": match.get_first_serve_win(player_id),
        "winning_on_2nd_serve": match.get_second_serve_win(player_id),
        "breakpoints_faced": match.get_breakpoint_faced(player_id),
        "breakpoints_saved": match.get_breakpoint_saved(player_id),
    }
    self._update_service_data(**service_stats)

    self._update_rankings(*match.get_rankings(player_id), date=match.tournament_date)
470 |
def get_data_df(self, opponent=None):
    """
    Export the current state of the player as a one-row DataFrame.

    Mutable attributes (rankings history, versus, match lists) are copied
    with .copy() before being placed in the frame.

    :param opponent: when given, the "Versus" column only holds
        self.versus[opponent] (an empty list if that key is absent);
        otherwise it holds a copy of the whole self.versus
    :return: pandas DataFrame with a single row, one column per feature
    """
    if opponent is None:
        versus = self.versus.copy()
    else:
        versus = self.versus.get(opponent, []).copy()

    snapshot = {
        "Name": self.name,
        "ID": self.id,
        "Ranking": self.ranking,
        "Ranking_Points": self.ranking_points,
        "Ranking_History": self.rankings_history.copy(),
        "Best_Rank": self._get_best_ranking(),
        "Birth_Year": self.birthdate,
        "Versus": versus,
        "Hand": self.hand,
        "Last_Tournament_Date": self.fatigue_features["previous tournament"]["date"],
        "Height": self.height,
        "Matches": self.matches_history.copy(),
        "Matches_Clay": self.matches_clay.copy(),
        "Matches_Carpet": self.matches_carpet.copy(),
        "Matches_Grass": self.matches_grass.copy(),
        "Matches_Hard": self.matches_hard.copy(),
        "Victories_Percentage": self.victories_percentage,
        "Clay_Victories_Percentage": self.clay_victories_percentage,
        "Carpet_Victories_Percentage": self.carpet_victories_percentage,
        "Grass_Victories_Percentage": self.grass_victories_percentage,
        "Hard_Victories_Percentage": self.hard_victories_percentage,
        "Aces_Percentage": self.aces_percentage,
        "Doublefaults_Percentage": self.doublefaults_percentage,
        "First_Serve_Success_Percentage": self.first_serve_success_percentage,
        "Winning_on_1st_Serve_Percentage": self.winning_on_1st_serve_percentage,
        "Winning_on_2nd_Serve_Percentage": self.winning_on_2nd_serve_percentage,
        "Overall_Win_on_Serve_Percentage": self.overall_win_on_serve_percentage,
        "BreakPoint_Face_Percentage": self.breakpoint_faced_percentage,
        "BreakPoint_Saved_Percentage": self.breakpoint_saved_percentage,
        "games_fatigue": self.games_fatigue,
        "minutes_fatigue": self.minutes_fatigue,
    }

    # Each value is wrapped in a one-element list so that it fills exactly one row
    return pd.DataFrame({column: [value] for column, value in snapshot.items()})
512 |
def get_last_months_rankings(self, date, nb_months=12, day_of_month="last"):
    """
    Collect one ranking and one ranking-points value for each of the
    ``nb_months`` calendar months preceding ``date``.

    For every month, the entries of self.rankings_history whose key starts
    with that month's YYYYMM are considered and the one with the highest
    ("last") or lowest ("first") day is kept. Months without any entry keep
    the defaults 9999 (rank) and 0 (points).

    Layout of the returned lists (kept as is, existing data relies on it):
    index 0 holds the month right before ``date``; indices 1..nb_months-1
    hold the remaining months from the oldest to the most recent.

    :param date: reference date in YYYYMMDD form (int or str); its own month
        is not included
    :param nb_months: number of months to look back
    :param day_of_month: "last" or "first", which entry of a month to keep
    :return: tuple (ranks, points), two lists of length nb_months
    :raises AssertionError: if day_of_month is neither "last" nor "first"
    """
    assert day_of_month in [
        "last",
        "first",
    ], f"For now you can only use first or last month day for ranking, you chose {day_of_month}"
    pick_day = max if day_of_month == "last" else min

    date = str(date)
    date_year = int(date[:4])
    date_month = int(date[4:6])

    last_months_ranks = [9999 for _ in range(nb_months)]
    last_months_points = [0 for _ in range(nb_months)]

    for i in range(nb_months):
        # Step back one calendar month, wrapping over the year boundary
        if date_month == 1:
            date_month = 12
            date_year = date_year - 1
        else:
            date_month = date_month - 1
        month_prefix = f"{date_year}{date_month:02d}"

        # Prefix match only: a substring test would also accept unrelated
        # keys such as 20200531 when looking for "200531"-like months.
        # The real key is remembered so that the lookup can never miss.
        key_per_day = {
            int(str(key)[6:8]): key
            for key in self.rankings_history
            if str(key).startswith(month_prefix)
        }
        if key_per_day:
            entry = self.rankings_history[key_per_day[pick_day(key_per_day)]]
            # -0 == 0, hence the layout described in the docstring
            last_months_ranks[-i] = entry[0]
            last_months_points[-i] = entry[1]

    return last_months_ranks, last_months_points
562 |
--------------------------------------------------------------------------------
/examples/data/single_row_example.csv:
--------------------------------------------------------------------------------
1 | ,100
2 | index,55550
3 | id,atp_matches_2005_4281
4 | tournament,Gstaad
5 | tournament_level,A
6 | tournament_date,20050704
7 | tournament_surface,Clay
8 | round,R32
9 | best_of,3
10 | match_id,atp_matches_2005_4281
11 | Name_1,Stan.Wawrinka
12 | ID_1,104527
13 | Ranking_1,74
14 | Ranking_Points_1,547
15 | Ranking_History_1,"{20030616: [387, 68], 20030707: [363, 74], 20030714: [348, 79], 20030721: [303, 99], 20030811: [284, 114], 20030818: [215, 164], 20030825: [175, 219], 20030929: [171, 215], 20031020: [172, 214], 20040105: [171, 216], 20040119: [166, 226], 20040206: [162, 225], 20040209: [164, 225], 20040301: [161, 227], 20040412: [184, 205], 20040419: [179, 210], 20040426: [148, 260], 20040510: [147, 270], 20040524: [150, 266], 20040531: [150, 266], 20040607: [156, 267], 20040614: [152, 272], 20040705: [146, 275], 20040719: [153, 261], 20040809: [159, 256], 20040816: [162, 241], 20040830: [163, 241], 20040906: [163, 241], 20040913: [167, 233], 20040920: [166, 242], 20041025: [161, 258], 20041101: [159, 262], 20050103: [168, 262], 20050117: [165, 267], 20050124: [165, 267], 20050131: [153, 291], 20050214: [128, 346], 20050221: [120, 377], 20050304: [118, 377], 20050328: [117, 362], 20050418: [113, 397], 20050502: [99, 421], 20050523: [87, 467], 20050613: [73, 551], 20050620: [74, 547]}"
16 | Best_Rank_1,73
17 | Birth_Year_1,19850328.0
18 | Versus_1,"[['D', 20030721, 'atp_matches_2003_4782']]"
19 | Hand_1,R
20 | Last_Tournament_Date_1,20050613
21 | Height_1,183.0
22 | Matches_1,"[['V', 'atp_matches_qual_chall_2003_3466'], ['D', 'atp_matches_qual_chall_2003_3481'], ['D', 'atp_matches_2003_4049'], ['V', 'atp_matches_2003_4315'], ['D', 'atp_matches_2003_4328'], ['V', 'atp_matches_2003_4773'], ['D', 'atp_matches_2003_4782'], ['V', 'atp_matches_qual_chall_2003_5408'], ['V', 'atp_matches_qual_chall_2003_5419'], ['V', 'atp_matches_qual_chall_2003_5424'], ['V', 'atp_matches_qual_chall_2003_5427'], ['V', 'atp_matches_qual_chall_2003_5428'], ['V', 'atp_matches_qual_chall_2003_5558'], ['V', 'atp_matches_qual_chall_2003_5571'], ['V', 'atp_matches_qual_chall_2003_5578'], ['V', 'atp_matches_qual_chall_2003_5581'], ['V', 'atp_matches_qual_chall_2003_5583'], ['V', 'atp_matches_qual_chall_2003_5889'], ['V', 'atp_matches_qual_chall_2003_5903'], ['D', 'atp_matches_qual_chall_2003_5910'], ['D', 'atp_matches_qual_chall_2003_6734'], ['D', 'atp_matches_2003_7265'], ['D', 'atp_matches_2004_5'], ['D', 'atp_matches_qual_chall_2004_423'], ['D', 'atp_matches_2004_630'], ['D', 'atp_matches_qual_chall_2004_752'], ['V', 'atp_matches_qual_chall_2004_1154'], ['V', 'atp_matches_qual_chall_2004_1163'], ['D', 'atp_matches_qual_chall_2004_1167'], ['V', 'atp_matches_qual_chall_2004_2073'], ['D', 'atp_matches_qual_chall_2004_2085'], ['V', 'atp_matches_qual_chall_2004_2287'], ['V', 'atp_matches_qual_chall_2004_2301'], ['V', 'atp_matches_qual_chall_2004_2308'], ['V', 'atp_matches_qual_chall_2004_2312'], ['V', 'atp_matches_qual_chall_2004_2314'], ['D', 'atp_matches_2004_2416'], ['D', 'atp_matches_qual_chall_2004_2596'], ['D', 'atp_matches_qual_chall_2004_3191'], ['D', 'atp_matches_qual_chall_2004_3236'], ['V', 'atp_matches_qual_chall_2004_3481'], ['D', 'atp_matches_qual_chall_2004_3490'], ['V', 'atp_matches_qual_chall_2004_3600'], ['D', 'atp_matches_qual_chall_2004_3611'], ['D', 'atp_matches_2004_4123'], ['D', 'atp_matches_2004_4825'], ['V', 'atp_matches_qual_chall_2004_5381'], ['V', 'atp_matches_qual_chall_2004_5393'], ['V', 'atp_matches_qual_chall_2004_5399'], 
['V', 'atp_matches_qual_chall_2004_5402'], ['D', 'atp_matches_qual_chall_2004_5403'], ['V', 'atp_matches_qual_chall_2004_5629'], ['V', 'atp_matches_qual_chall_2004_5641'], ['V', 'atp_matches_qual_chall_2004_5647'], ['V', 'atp_matches_qual_chall_2004_5650'], ['V', 'atp_matches_qual_chall_2004_5651'], ['D', 'atp_matches_qual_chall_2004_6096'], ['V', 'atp_matches_qual_chall_2004_6148'], ['D', 'atp_matches_qual_chall_2004_6335'], ['D', 'atp_matches_2004_6494'], ['V', 'atp_matches_qual_chall_2004_6580'], ['V', 'atp_matches_qual_chall_2004_6593'], ['D', 'atp_matches_qual_chall_2004_6600'], ['D', 'atp_matches_2004_7371'], ['D', 'atp_matches_qual_chall_2004_7567'], ['D', 'atp_matches_2005_70'], ['D', 'atp_matches_qual_chall_2005_393'], ['V', 'atp_matches_qual_chall_2005_427'], ['V', 'atp_matches_qual_chall_2005_560'], ['V', 'atp_matches_qual_chall_2005_570'], ['V', 'atp_matches_qual_chall_2005_575'], ['D', 'atp_matches_qual_chall_2005_578'], ['V', 'atp_matches_qual_chall_2005_772'], ['V', 'atp_matches_qual_chall_2005_785'], ['V', 'atp_matches_qual_chall_2005_791'], ['V', 'atp_matches_qual_chall_2005_794'], ['D', 'atp_matches_qual_chall_2005_796'], ['V', 'atp_matches_2005_1046'], ['D', 'atp_matches_2005_1061'], ['D', 'atp_matches_qual_chall_2005_1150'], ['D', 'atp_matches_2005_1298'], ['D', 'atp_matches_2005_1299'], ['V', 'atp_matches_qual_chall_2005_1907'], ['V', 'atp_matches_qual_chall_2005_1918'], ['V', 'atp_matches_qual_chall_2005_1924'], ['D', 'atp_matches_qual_chall_2005_1927'], ['V', 'atp_matches_2005_2334'], ['V', 'atp_matches_2005_2358'], ['V', 'atp_matches_2005_2373'], ['D', 'atp_matches_2005_2381'], ['V', 'atp_matches_2005_2630'], ['D', 'atp_matches_2005_2661'], ['V', 'atp_matches_2005_3180'], ['V', 'atp_matches_2005_3214'], ['D', 'atp_matches_2005_3231'], ['V', 'atp_matches_qual_chall_2005_3262'], ['V', 'atp_matches_qual_chall_2005_3293'], ['V', 'atp_matches_qual_chall_2005_3355'], ['D', 'atp_matches_2005_3757'], ['D', 'atp_matches_2005_3914']]"
23 | Matches_Clay_1,"['V', 'D', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'D', 'V', 'D', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V']"
24 | Matches_Carpet_1,"['D', 'D', 'D']"
25 | Matches_Grass_1,"['D', 'D']"
26 | Matches_Hard_1,"['D', 'D', 'D', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'D', 'D', 'D']"
27 | Victories_Percentage_1,57.99999999999999
28 | Clay_Victories_Percentage_1,66.66666666666666
29 | Carpet_Victories_Percentage_1,0.0
30 | Grass_Victories_Percentage_1,0.0
31 | Hard_Victories_Percentage_1,46.15384615384615
32 | Aces_Percentage_1,7.41825280624695
33 | Doublefaults_Percentage_1,4.978038067349927
34 | First_Serve_Success_Percentage_1,57.10102489019033
35 | Winning_on_1st_Serve_Percentage_1,51.87896534895072
36 | Winning_on_2nd_Serve_Percentage_1,21.57149829184968
37 | Overall_Win_on_Serve_Percentage_1,73.4504636408004
38 | BreakPoint_Face_Percentage_1,9.809663250366032
39 | BreakPoint_Saved_Percentage_1,6.149341142020498
40 | games_fatigue_1,26.666666666666668
41 | minutes_fatigue_1,209.44444444444449
42 | last_rankings_1,"[74, 153, 163, 166, 161, 159, 9999, 153, 120, 117, 113, 87]"
43 | last_ranking_points_1,"[547, 261, 241, 242, 258, 262, 0, 291, 377, 362, 397, 467]"
44 | Name_2,Filippo.Volandri
45 | ID_2,103835
46 | Ranking_2,32
47 | Ranking_Points_2,1060
48 | Ranking_History_2,"{19990329: [354, 93], 19990719: [360, 94], 19990809: [452, 63], 19990816: [433, 70], 19991025: [274, 134], 20000214: [263, 119], 20000221: [264, 120], 20000228: [247, 132], 20000313: [281, 110], 20000327: [269, 117], 20000515: [294, 105], 20000522: [257, 127], 20000612: [248, 128], 20000626: [253, 129], 20000710: [245, 129], 20000724: [252, 129], 20000731: [242, 140], 20000807: [242, 140], 20000821: [210, 169], 20000828: [208, 174], 20000904: [208, 174], 20000918: [219, 161], 20000925: [219, 161], 20001009: [152, 252], 20001023: [152, 249], 20010115: [163, 239], 20010205: [161, 239], 20010212: [161, 239], 20010219: [152, 250], 20010226: [158, 244], 20010312: [152, 253], 20010406: [159, 247], 20010507: [156, 247], 20010514: [157, 252], 20010528: [163, 236], 20010604: [163, 236], 20010611: [138, 286], 20010625: [138, 286], 20010702: [138, 286], 20010709: [126, 315], 20010730: [125, 319], 20010806: [126, 319], 20010813: [138, 289], 20010820: [137, 289], 20010827: [139, 289], 20010903: [139, 289], 20010921: [141, 277], 20010924: [145, 277], 20011001: [199, 190], 20011008: [200, 190], 20011126: [212, 175], 20020318: [243, 146], 20020325: [243, 146], 20020422: [209, 171], 20020429: [202, 182], 20020506: [181, 216], 20020513: [179, 215], 20020527: [170, 222], 20020603: [170, 222], 20020610: [197, 193], 20020617: [196, 193], 20020624: [184, 198], 20020701: [184, 198], 20020708: [183, 201], 20020722: [190, 186], 20020729: [194, 186], 20020805: [191, 186], 20020812: [189, 193], 20020826: [163, 229], 20020902: [163, 229], 20020909: [165, 229], 20020923: [158, 233], 20020930: [154, 238], 20021007: [153, 242], 20021014: [155, 242], 20021125: [154, 244], 20030113: [158, 244], 20030224: [150, 247], 20030317: [149, 257], 20030324: [149, 257], 20030404: [129, 285], 20030407: [128, 285], 20030414: [131, 285], 20030421: [98, 420], 20030505: [100, 408], 20030512: [79, 528], 20030526: [79, 535], 20030609: [80, 536], 20030623: [69, 607], 20030630: [69, 607], 
20030707: [68, 614], 20030714: [60, 649], 20030721: [53, 704], 20030728: [47, 818], 20030825: [47, 775], 20030919: [48, 768], 20030922: [47, 768], 20030929: [49, 773], 20031006: [49, 773], 20031020: [49, 772], 20040105: [47, 772], 20040119: [46, 772], 20040209: [44, 799], 20040216: [45, 834], 20040223: [43, 844], 20040301: [44, 844], 20040308: [42, 889], 20040419: [51, 704], 20040503: [54, 679], 20040510: [58, 629], 20040517: [61, 614], 20040524: [47, 774], 20040607: [45, 766], 20040621: [52, 701], 20040705: [51, 710], 20040716: [58, 685], 20040719: [65, 630], 20040726: [65, 630], 20040802: [66, 633], 20040816: [59, 685], 20040830: [60, 685], 20040913: [53, 715], 20040924: [46, 775], 20040927: [45, 775], 20041011: [37, 880], 20041025: [39, 890], 20041101: [40, 890], 20050103: [43, 880], 20050110: [42, 880], 20050117: [41, 880], 20050131: [43, 850], 20050207: [40, 900], 20050214: [41, 885], 20050221: [42, 885], 20050307: [40, 885], 20050321: [41, 885], 20050404: [42, 890], 20050411: [37, 940], 20050428: [31, 1060], 20050502: [31, 1060], 20050509: [31, 1020], 20050523: [34, 990], 20050620: [32, 1060], 20050627: [32, 1060]}"
49 | Best_Rank_2,31
50 | Birth_Year_2,19810905.0
51 | Versus_2,"[['V', 20030721, 'atp_matches_2003_4782']]"
52 | Hand_2,R
53 | Last_Tournament_Date_2,20050620
54 | Height_2,183.0
55 | Matches_2,"[['D', 'atp_matches_qual_chall_1999_1308'], ['D', 'atp_matches_qual_chall_1999_3765'], ['V', 'atp_matches_qual_chall_1999_4422'], ['D', 'atp_matches_qual_chall_1999_4431'], ['D', 'atp_matches_qual_chall_1999_4654'], ['V', 'atp_matches_qual_chall_1999_6165'], ['D', 'atp_matches_qual_chall_1999_6174'], ['D', 'atp_matches_qual_chall_2000_917'], ['V', 'atp_matches_qual_chall_2000_1002'], ['V', 'atp_matches_qual_chall_2000_1017'], ['D', 'atp_matches_qual_chall_2000_1024'], ['V', 'atp_matches_qual_chall_2000_1195'], ['D', 'atp_matches_qual_chall_2000_1206'], ['V', 'atp_matches_qual_chall_2000_1504'], ['D', 'atp_matches_qual_chall_2000_1516'], ['D', 'atp_matches_qual_chall_2000_1667'], ['V', 'atp_matches_qual_chall_2000_2497'], ['V', 'atp_matches_qual_chall_2000_2506'], ['V', 'atp_matches_qual_chall_2000_2510'], ['D', 'atp_matches_qual_chall_2000_2512'], ['D', 'atp_matches_qual_chall_2000_2676'], ['D', 'atp_matches_qual_chall_2000_3244'], ['D', 'atp_matches_qual_chall_2000_3668'], ['D', 'atp_matches_qual_chall_2000_3986'], ['V', 'atp_matches_qual_chall_2000_4308'], ['V', 'atp_matches_qual_chall_2000_4320'], ['D', 'atp_matches_qual_chall_2000_4326'], ['D', 'atp_matches_qual_chall_2000_4633'], ['V', 'atp_matches_qual_chall_2000_4847'], ['V', 'atp_matches_qual_chall_2000_4862'], ['V', 'atp_matches_qual_chall_2000_4869'], ['V', 'atp_matches_qual_chall_2000_4873'], ['D', 'atp_matches_qual_chall_2000_4875'], ['V', 'atp_matches_qual_chall_2000_5257'], ['D', 'atp_matches_qual_chall_2000_5269'], ['D', 'atp_matches_qual_chall_2000_5351'], ['V', 'atp_matches_qual_chall_2000_5711'], ['V', 'atp_matches_qual_chall_2000_5724'], ['V', 'atp_matches_qual_chall_2000_5731'], ['D', 'atp_matches_qual_chall_2000_5734'], ['V', 'atp_matches_qual_chall_2000_5874'], ['V', 'atp_matches_qual_chall_2000_5883'], ['V', 'atp_matches_qual_chall_2000_5888'], ['V', 'atp_matches_qual_chall_2000_5890'], ['V', 'atp_matches_qual_chall_2000_5891'], ['V', 'atp_matches_qual_chall_2000_6085'], ['V', 
'atp_matches_qual_chall_2000_6098'], ['D', 'atp_matches_qual_chall_2000_6105'], ['V', 'atp_matches_qual_chall_2000_6332'], ['V', 'atp_matches_qual_chall_2000_6344'], ['D', 'atp_matches_qual_chall_2000_6350'], ['D', 'atp_matches_qual_chall_2000_6729'], ['D', 'atp_matches_qual_chall_2001_421'], ['D', 'atp_matches_qual_chall_2001_615'], ['V', 'atp_matches_qual_chall_2001_830'], ['V', 'atp_matches_qual_chall_2001_846'], ['D', 'atp_matches_qual_chall_2001_854'], ['V', 'atp_matches_qual_chall_2001_896'], ['D', 'atp_matches_qual_chall_2001_910'], ['V', 'atp_matches_qual_chall_2001_1264'], ['V', 'atp_matches_qual_chall_2001_1272'], ['D', 'atp_matches_qual_chall_2001_1276'], ['D', 'atp_matches_qual_chall_2001_1506'], ['D', 'atp_matches_2001_1801'], ['D', 'atp_matches_2001_2340'], ['V', 'atp_matches_qual_chall_2001_2490'], ['D', 'atp_matches_qual_chall_2001_2503'], ['D', 'atp_matches_qual_chall_2001_2886'], ['V', 'atp_matches_qual_chall_2001_2935'], ['V', 'atp_matches_qual_chall_2001_3001'], ['V', 'atp_matches_qual_chall_2001_3015'], ['V', 'atp_matches_qual_chall_2001_3022'], ['D', 'atp_matches_qual_chall_2001_3025'], ['V', 'atp_matches_qual_chall_2001_3042'], ['V', 'atp_matches_qual_chall_2001_3051'], ['V', 'atp_matches_qual_chall_2001_3055'], ['D', 'atp_matches_qual_chall_2001_3057'], ['D', 'atp_matches_qual_chall_2001_3221'], ['V', 'atp_matches_qual_chall_2001_3733'], ['V', 'atp_matches_qual_chall_2001_3745'], ['D', 'atp_matches_qual_chall_2001_3751'], ['V', 'atp_matches_qual_chall_2001_3823'], ['V', 'atp_matches_qual_chall_2001_3836'], ['D', 'atp_matches_qual_chall_2001_3843'], ['V', 'atp_matches_qual_chall_2001_4050'], ['V', 'atp_matches_qual_chall_2001_4058'], ['D', 'atp_matches_qual_chall_2001_4062'], ['D', 'atp_matches_qual_chall_2001_4769'], ['V', 'atp_matches_qual_chall_2001_4884'], ['D', 'atp_matches_qual_chall_2001_4900'], ['D', 'atp_matches_qual_chall_2001_5222'], ['V', 'atp_matches_qual_chall_2001_5377'], ['D', 'atp_matches_qual_chall_2001_5387'], ['V', 
'atp_matches_qual_chall_2001_5764'], ['V', 'atp_matches_qual_chall_2001_5778'], ['D', 'atp_matches_qual_chall_2001_5785'], ['D', 'atp_matches_qual_chall_2001_5834'], ['V', 'atp_matches_2001_6354'], ['V', 'atp_matches_2001_6357'], ['D', 'atp_matches_2001_6381'], ['D', 'atp_matches_qual_chall_2001_6778'], ['D', 'atp_matches_qual_chall_2001_6910'], ['D', 'atp_matches_qual_chall_2001_7787'], ['V', 'atp_matches_qual_chall_2002_1705'], ['V', 'atp_matches_qual_chall_2002_1719'], ['V', 'atp_matches_qual_chall_2002_1726'], ['D', 'atp_matches_qual_chall_2002_1730'], ['V', 'atp_matches_qual_chall_2002_1773'], ['D', 'atp_matches_qual_chall_2002_1784'], ['V', 'atp_matches_qual_chall_2002_2263'], ['V', 'atp_matches_qual_chall_2002_2272'], ['D', 'atp_matches_qual_chall_2002_2276'], ['V', 'atp_matches_qual_chall_2002_2317'], ['V', 'atp_matches_qual_chall_2002_2330'], ['V', 'atp_matches_qual_chall_2002_2336'], ['V', 'atp_matches_qual_chall_2002_2339'], ['D', 'atp_matches_qual_chall_2002_2341'], ['D', 'atp_matches_2002_2478'], ['V', 'atp_matches_qual_chall_2002_2597'], ['V', 'atp_matches_qual_chall_2002_2609'], ['D', 'atp_matches_qual_chall_2002_2615'], ['V', 'atp_matches_qual_chall_2002_2866'], ['V', 'atp_matches_qual_chall_2002_2881'], ['D', 'atp_matches_qual_chall_2002_2889'], ['D', 'atp_matches_qual_chall_2002_3046'], ['V', 'atp_matches_qual_chall_2002_3085'], ['V', 'atp_matches_qual_chall_2002_3177'], ['D', 'atp_matches_qual_chall_2002_3187'], ['D', 'atp_matches_qual_chall_2002_3320'], ['V', 'atp_matches_qual_chall_2002_3440'], ['D', 'atp_matches_qual_chall_2002_3455'], ['V', 'atp_matches_qual_chall_2002_3832'], ['V', 'atp_matches_qual_chall_2002_3848'], ['D', 'atp_matches_qual_chall_2002_3856'], ['V', 'atp_matches_qual_chall_2002_3929'], ['V', 'atp_matches_qual_chall_2002_3943'], ['V', 'atp_matches_qual_chall_2002_3950'], ['D', 'atp_matches_qual_chall_2002_3953'], ['D', 'atp_matches_qual_chall_2002_4087'], ['D', 'atp_matches_qual_chall_2002_4410'], ['D', 
'atp_matches_qual_chall_2002_4863'], ['V', 'atp_matches_qual_chall_2002_4897'], ['V', 'atp_matches_qual_chall_2002_4909'], ['D', 'atp_matches_qual_chall_2002_4915'], ['V', 'atp_matches_qual_chall_2002_5081'], ['V', 'atp_matches_qual_chall_2002_5094'], ['V', 'atp_matches_qual_chall_2002_5101'], ['V', 'atp_matches_qual_chall_2002_5104'], ['D', 'atp_matches_qual_chall_2002_5106'], ['D', 'atp_matches_qual_chall_2002_5616'], ['V', 'atp_matches_qual_chall_2002_5639'], ['V', 'atp_matches_qual_chall_2002_5649'], ['D', 'atp_matches_qual_chall_2002_5654'], ['D', 'atp_matches_qual_chall_2002_5704'], ['V', 'atp_matches_qual_chall_2002_5786'], ['D', 'atp_matches_qual_chall_2002_5800'], ['D', 'atp_matches_2002_6105'], ['V', 'atp_matches_qual_chall_2002_6441'], ['D', 'atp_matches_qual_chall_2002_6452'], ['D', 'atp_matches_qual_chall_2002_6525'], ['V', 'atp_matches_qual_chall_2002_6636'], ['D', 'atp_matches_qual_chall_2002_6650'], ['D', 'atp_matches_qual_chall_2002_7293'], ['D', 'atp_matches_qual_chall_2003_324'], ['V', 'atp_matches_qual_chall_2003_351'], ['V', 'atp_matches_qual_chall_2003_405'], ['D', 'atp_matches_2003_1184'], ['V', 'atp_matches_qual_chall_2003_1621'], ['V', 'atp_matches_qual_chall_2003_1630'], ['V', 'atp_matches_qual_chall_2003_1634'], ['V', 'atp_matches_qual_chall_2003_1636'], ['V', 'atp_matches_qual_chall_2003_1637'], ['D', 'atp_matches_qual_chall_2003_1644'], ['D', 'atp_matches_2003_1745'], ['D', 'atp_matches_2003_1748'], ['V', 'atp_matches_qual_chall_2003_1788'], ['D', 'atp_matches_qual_chall_2003_1802'], ['V', 'atp_matches_2003_1954'], ['V', 'atp_matches_2003_1979'], ['V', 'atp_matches_2003_1991'], ['D', 'atp_matches_2003_1997'], ['V', 'atp_matches_2003_2066'], ['V', 'atp_matches_2003_2090'], ['D', 'atp_matches_2003_2105'], ['V', 'atp_matches_2003_2415'], ['V', 'atp_matches_2003_2439'], ['V', 'atp_matches_2003_2451'], ['D', 'atp_matches_2003_2457'], ['D', 'atp_matches_2003_2520'], ['D', 'atp_matches_2003_2846'], ['D', 'atp_matches_qual_chall_2003_2941'], 
['V', 'atp_matches_qual_chall_2003_2973'], ['V', 'atp_matches_qual_chall_2003_3037'], ['V', 'atp_matches_qual_chall_2003_3356'], ['V', 'atp_matches_qual_chall_2003_3364'], ['V', 'atp_matches_qual_chall_2003_3368'], ['V', 'atp_matches_qual_chall_2003_3370'], ['V', 'atp_matches_qual_chall_2003_3371'], ['D', 'atp_matches_2003_3636'], ['V', 'atp_matches_qual_chall_2003_3877'], ['V', 'atp_matches_qual_chall_2003_3885'], ['V', 'atp_matches_qual_chall_2003_3889'], ['D', 'atp_matches_qual_chall_2003_3891'], ['V', 'atp_matches_2003_4123'], ['V', 'atp_matches_2003_4132'], ['D', 'atp_matches_2003_4137'], ['V', 'atp_matches_2003_4344'], ['V', 'atp_matches_2003_4360'], ['V', 'atp_matches_2003_4374'], ['D', 'atp_matches_2003_4381'], ['V', 'atp_matches_2003_4772'], ['V', 'atp_matches_2003_4782'], ['V', 'atp_matches_2003_4787'], ['V', 'atp_matches_2003_4789'], ['D', 'atp_matches_2003_4790'], ['D', 'atp_matches_2003_5036'], ['D', 'atp_matches_2003_5655'], ['D', 'atp_matches_2003_6509'], ['V', 'atp_matches_2003_6587'], ['D', 'atp_matches_2003_6597'], ['D', 'atp_matches_2003_6884'], ['D', 'atp_matches_2003_6973'], ['D', 'atp_matches_2003_7273'], ['D', 'atp_matches_2004_34'], ['V', 'atp_matches_2004_232'], ['D', 'atp_matches_2004_273'], ['V', 'atp_matches_2004_842'], ['V', 'atp_matches_2004_852'], ['D', 'atp_matches_2004_857'], ['V', 'atp_matches_2004_928'], ['D', 'atp_matches_2004_941'], ['V', 'atp_matches_2004_1050'], ['D', 'atp_matches_2004_1064'], ['V', 'atp_matches_2004_1174'], ['V', 'atp_matches_2004_1188'], ['D', 'atp_matches_2004_1195'], ['D', 'atp_matches_2004_1269'], ['D', 'atp_matches_2004_2180'], ['V', 'atp_matches_2004_2507'], ['V', 'atp_matches_2004_2532'], ['D', 'atp_matches_2004_2545'], ['D', 'atp_matches_2004_2620'], ['V', 'atp_matches_2004_2747'], ['V', 'atp_matches_2004_2761'], ['V', 'atp_matches_2004_2768'], ['V', 'atp_matches_2004_2772'], ['V', 'atp_matches_2004_2774'], ['D', 'atp_matches_2004_2962'], ['D', 'atp_matches_2004_3507'], ['V', 'atp_matches_2004_3728'], 
['D', 'atp_matches_2004_3785'], ['V', 'atp_matches_2004_4172'], ['D', 'atp_matches_2004_4187'], ['V', 'atp_matches_2004_4567'], ['V', 'atp_matches_2004_4569'], ['V', 'atp_matches_2004_4817'], ['V', 'atp_matches_2004_4831'], ['V', 'atp_matches_2004_4838'], ['V', 'atp_matches_2004_4841'], ['D', 'atp_matches_2004_4843'], ['V', 'atp_matches_qual_chall_2004_5093'], ['D', 'atp_matches_qual_chall_2004_5109'], ['V', 'atp_matches_qual_chall_2004_5124'], ['V', 'atp_matches_qual_chall_2004_5140'], ['V', 'atp_matches_qual_chall_2004_5148'], ['V', 'atp_matches_qual_chall_2004_5152'], ['V', 'atp_matches_qual_chall_2004_5154'], ['D', 'atp_matches_2004_5716'], ['V', 'atp_matches_2004_5994'], ['D', 'atp_matches_2004_6027'], ['V', 'atp_matches_2004_6485'], ['V', 'atp_matches_2004_6499'], ['V', 'atp_matches_2004_6506'], ['D', 'atp_matches_2004_6510'], ['V', 'atp_matches_2004_6661'], ['D', 'atp_matches_2004_6663'], ['V', 'atp_matches_2004_6772'], ['V', 'atp_matches_2004_6786'], ['V', 'atp_matches_2004_6793'], ['V', 'atp_matches_2004_6797'], ['D', 'atp_matches_2004_6799'], ['V', 'atp_matches_2004_7231'], ['D', 'atp_matches_2004_7241'], ['D', 'atp_matches_2004_7364'], ['D', 'atp_matches_2004_7528'], ['D', 'atp_matches_2005_65'], ['D', 'atp_matches_2005_188'], ['D', 'atp_matches_2005_297'], ['V', 'atp_matches_2005_715'], ['V', 'atp_matches_2005_725'], ['V', 'atp_matches_2005_730'], ['D', 'atp_matches_2005_733'], ['D', 'atp_matches_2005_893'], ['D', 'atp_matches_2005_1118'], ['V', 'atp_matches_2005_1274'], ['V', 'atp_matches_2005_1284'], ['D', 'atp_matches_2005_1289'], ['D', 'atp_matches_2005_1432'], ['D', 'atp_matches_2005_1770'], ['V', 'atp_matches_2005_1944'], ['V', 'atp_matches_2005_1952'], ['V', 'atp_matches_2005_1956'], ['D', 'atp_matches_2005_1958'], ['V', 'atp_matches_2005_2141'], ['V', 'atp_matches_2005_2160'], ['V', 'atp_matches_2005_2169'], ['D', 'atp_matches_2005_2174'], ['V', 'atp_matches_2005_2544'], ['D', 'atp_matches_2005_2545'], ['V', 'atp_matches_2005_2650'], ['D', 
'atp_matches_2005_2671'], ['V', 'atp_matches_2005_2767'], ['V', 'atp_matches_2005_2792'], ['V', 'atp_matches_2005_2804'], ['D', 'atp_matches_2005_2810'], ['V', 'atp_matches_2005_3181'], ['V', 'atp_matches_2005_3215'], ['D', 'atp_matches_2005_3232'], ['D', 'atp_matches_2005_3931'], ['V', 'atp_matches_qual_chall_2005_4150'], ['V', 'atp_matches_qual_chall_2005_4166'], ['V', 'atp_matches_qual_chall_2005_4174'], ['D', 'atp_matches_qual_chall_2005_4178']]"
56 | Matches_Clay_2,"['D', 'D', 'V', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'D', 'D', 'D', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'D', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'D', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'D', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D']"
57 | Matches_Carpet_2,"['D', 'D', 'D', 'D', 'D', 'D', 'D', 'V', 'D', 'D', 'D']"
58 | Matches_Grass_2,"['D', 'D', 'V', 'D', 'D']"
59 | Matches_Hard_2,"['D', 'V', 'D', 'V', 'V', 'D', 'V', 'D', 'D', 'D', 'D', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'V', 'D', 'D', 'D', 'D', 'V', 'D', 'D', 'D', 'V', 'D', 'D', 'D', 'D', 'D', 'D']"
60 | Victories_Percentage_2,56.03715170278638
61 | Clay_Victories_Percentage_2,61.36363636363637
62 | Carpet_Victories_Percentage_2,9.090909090909092
63 | Grass_Victories_Percentage_2,20.0
64 | Hard_Victories_Percentage_2,39.53488372093023
65 | Aces_Percentage_2,1.2510230328539695
66 | Doublefaults_Percentage_2,3.7881445106980007
67 | First_Serve_Success_Percentage_2,66.30422074126038
68 | Winning_on_1st_Serve_Percentage_2,65.75470595112826
69 | Winning_on_2nd_Serve_Percentage_2,16.49713550800889
70 | Overall_Win_on_Serve_Percentage_2,82.25184145913715
71 | BreakPoint_Face_Percentage_2,10.312171168011224
72 | BreakPoint_Saved_Percentage_2,5.869285630772828
73 | games_fatigue_2,58.0
74 | minutes_fatigue_2,
75 | last_rankings_2,"[32, 65, 60, 45, 39, 40, 9999, 43, 42, 41, 31, 34]"
76 | last_ranking_points_2,"[1060, 630, 685, 775, 890, 890, 0, 850, 885, 885, 1060, 990]"
77 | Winner,0
78 | score,6-1 2-6 6-4
79 | elapsed_minutes,100.0
80 | aces_nb_1,4.0
81 | doublefaults_nb_1,5.0
82 | svpt_1,68.0
83 | 1stIn_1,36.0
84 | 1stWon_1,27.0
85 | 2ndWon_1,16.0
86 | SvGms_1,13.0
87 | bpSaved_1,5.0
88 | bpFaced_1,9.0
89 | aces_nb_2,0.0
90 | doublefaults_nb_2,6.0
91 | svpt_2,93.0
92 | 1stIn_2,63.0
93 | 1stWon_2,33.0
94 | 2ndWon_2,17.0
95 | SvGms_2,12.0
96 | bpSaved_2,6.0
97 | bpFaced_2,11.0
98 | tournament_year,2005
99 |
--------------------------------------------------------------------------------