├── .gitignore
├── robot.png
├── tennis_robot.png
├── tennis_robot_2.png
├── examples
    ├── data
    │   ├── nb_matches.png
    │   ├── Best_player_win_percentage.png
    │   ├── stan_the_man_win_percentage.png
    │   ├── stanimal_aces_percentage_difference.png
    │   ├── data_row_example.csv
    │   ├── data_loading.py
    │   └── single_row_example.csv
    ├── results_reading
    │   ├── win_per_surface.png
    │   ├── models_performances.png
    │   ├── precision_percentage_players_ranks.png
    │   ├── models_comparison.py
    │   └── best_model.py
    ├── history_modeling
    │   ├── 2d_pca_match_representation.png
    │   ├── 2d_pca_match_representation_test.png
    │   ├── first_example.py
    │   ├── history_encoding.py
    │   ├── pca_representation.py
    │   ├── pca_match_encoder_train.py
    │   └── train_test.py
    └── models
    │   ├── dl_train_test.py
    │   ├── prediction.py
    │   ├── train_test.py
    │   ├── train_test_eval.py
    │   ├── deep_history.py
    │   └── grid_search.py
├── envs
    ├── minimal_env.yml
    └── requirements.txt
├── .gitmodules
├── .github
    ├── dependabot.yml
    └── workflows
    │   └── black_action.yml
├── python
    ├── model
    │   ├── base_model.py
    │   ├── lgbm.py
    │   ├── dumb_models.py
    │   ├── sk_model.py
    │   ├── xgboost.py
    │   └── deep_model.py
    ├── data
    │   ├── data_utils.py
    │   ├── data_encoding.py
    │   ├── match.py
    │   ├── data_loader.py
    │   └── player.py
    ├── history_modeling
    │   ├── encoding_model.py
    │   └── match_representation.py
    └── evaluation
    │   └── train_test.py
├── LICENSE.md
├── notes.txt
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | DataBase/
3 | 
4 | \.idea/
5 | cache/
6 | results/
7 | *.pyc


--------------------------------------------------------------------------------
/robot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/robot.png


--------------------------------------------------------------------------------
/tennis_robot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/tennis_robot.png


--------------------------------------------------------------------------------
/tennis_robot_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/tennis_robot_2.png


--------------------------------------------------------------------------------
/examples/data/nb_matches.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/data/nb_matches.png


--------------------------------------------------------------------------------
/envs/minimal_env.yml:
--------------------------------------------------------------------------------
1 | name: min_tennis
2 | channels:
3 |   - defaults
4 | dependencies:
5 |   - pandas
6 |   - matplotlib
7 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "submodules/tennis_atp"]
2 | 	path = submodules/tennis_atp
3 | 	url = https://github.com/JeffSackmann/tennis_atp.git
4 | 


--------------------------------------------------------------------------------
/examples/data/Best_player_win_percentage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/data/Best_player_win_percentage.png


--------------------------------------------------------------------------------
/examples/data/stan_the_man_win_percentage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/data/stan_the_man_win_percentage.png


--------------------------------------------------------------------------------
/examples/results_reading/win_per_surface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/results_reading/win_per_surface.png


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "gitsubmodule"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "weekly"
7 | 


--------------------------------------------------------------------------------
/examples/results_reading/models_performances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/results_reading/models_performances.png


--------------------------------------------------------------------------------
/examples/data/stanimal_aces_percentage_difference.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/data/stanimal_aces_percentage_difference.png


--------------------------------------------------------------------------------
/envs/requirements.txt:
--------------------------------------------------------------------------------
1 | lightgbm==3.3.5
2 | matplotlib==3.5.1
3 | numpy==1.22.3
4 | pandas==1.5.2
5 | scikit_learn==1.2.2
6 | tensorflow==2.8.0
7 | tqdm==4.65.0
8 | xgboost==1.7.4


--------------------------------------------------------------------------------
/examples/history_modeling/2d_pca_match_representation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/history_modeling/2d_pca_match_representation.png


--------------------------------------------------------------------------------
/examples/history_modeling/2d_pca_match_representation_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/history_modeling/2d_pca_match_representation_test.png


--------------------------------------------------------------------------------
/examples/results_reading/precision_percentage_players_ranks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VincentAuriau/Tennis-Prediction/HEAD/examples/results_reading/precision_percentage_players_ranks.png


--------------------------------------------------------------------------------
/python/model/base_model.py:
--------------------------------------------------------------------------------
 1 | from abc import abstractmethod
 2 | 
 3 | 
 4 | class BaseModel:
 5 |     def __init__(self):
 6 |         pass
 7 | 
 8 |     @abstractmethod
 9 |     def fit(self, X):
10 |         pass
11 | 
12 |     @abstractmethod
13 |     def predict(self, X):
14 |         pass
15 | 
16 | 
17 | class DeepBaseModel(BaseModel):
18 |     def __init__(self):
19 |         super().__init__()
20 |         self.instantiate_model()
21 | 
22 |     @abstractmethod
23 |     def instantiate_model(self, X):
24 |         pass
25 | 


--------------------------------------------------------------------------------
/python/model/lgbm.py:
--------------------------------------------------------------------------------
 1 | import lightgbm as lgb
 2 | import numpy as np
 3 | 
 4 | from model.base_model import BaseModel
 5 | 
 6 | 
 7 | class LightGBM(BaseModel):
 8 |     def __init__(self, params, num_rounds=10):
 9 |         self.params = params
10 |         self.num_rounds = num_rounds
11 | 
12 |     def fit(self, X, y):
13 |         train_data = lgb.Dataset(X, label=y)
14 |         self.model = lgb.train(self.params, train_data, self.num_rounds)
15 |         return self.model
16 | 
17 |     def predict(self, X):
18 |         return np.round(self.model.predict(X), 0)
19 | 
20 |     def save(self, path):
21 |         self.model.save_model(path)
22 | 


--------------------------------------------------------------------------------
/python/model/dumb_models.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | from model.base_model import BaseModel
 4 | 
 5 | 
 6 | class BestRankedPlayerWins(BaseModel):
 7 |     def fit(self, X, y):
 8 |         pass
 9 | 
10 |     def predict(self, X):
11 |         y_pred = []
12 |         for n_row, row in X.iterrows():
13 |             rank_1 = row["Ranking_1"]
14 |             rank_2 = row["Ranking_2"]
15 |             y_pred.append([np.argmin([rank_1, rank_2])])
16 |         return y_pred
17 | 
18 | 
class RandomModel(BaseModel):
    """Baseline drawing each prediction uniformly from {0, 1}."""

    def fit(self, X, y):
        """Nothing to learn."""
        pass

    def predict(self, X):
        """Return random 0/1 predictions matching the number of matches in ``X``.

        A single match (input with fewer than two dimensions) yields an
        array of shape ``(1,)``, exactly as before.  A 2-D batch yields one
        draw per row with shape ``(n_rows, 1)``; previously a batch still
        produced a single value, which was then compared against every
        target.
        """
        if np.ndim(X) < 2:
            return np.random.randint(0, 2, 1)
        return np.random.randint(0, 2, (len(X), 1))
25 | 


--------------------------------------------------------------------------------
/python/data/data_utils.py:
--------------------------------------------------------------------------------
 1 | def get_days_difference(prev_date, curr_date):
 2 |     prev_date, curr_date = str(prev_date), str(curr_date)
 3 |     days_difference = (
 4 |         (int(curr_date[:4]) - int(prev_date[:4])) * 365
 5 |         + (int(curr_date[4:6]) - int(prev_date[4:6])) * 30
 6 |         + int(curr_date[6:8])
 7 |         - int(prev_date[6:8])
 8 |     ) + 2
 9 |     return days_difference
10 | 
11 | 
def reverse_score(score):
    """Rewrite a match score from the other player's point of view.

    ``score`` is converted with ``str`` and split on spaces; each
    ``games-games`` token has its two sides swapped.

    * A tie-break suffix stays attached to the second number, so
      ``"7-6(5)"`` becomes ``"6-7(5)"`` (it used to become ``"6(5)-7"``).
    * A bracketed token keeps its brackets, so ``"[10-8]"`` becomes
      ``"[8-10]"`` (it used to become ``"8]-[10"``).
    * Tokens without exactly one ``-`` (e.g. ``"RET"``) are handled as
      before: their ``-``-separated parts are simply reversed.
    """
    reversed_sets = []
    for set_score in str(score).split(" "):
        bracketed = set_score.startswith("[") and set_score.endswith("]")
        core = set_score[1:-1] if bracketed else set_score
        games = core.split("-")
        if len(games) == 2:
            # Detach any "(n)" suffix from the second number and re-attach
            # it to whichever number ends up second after the swap.
            second, paren, tiebreak = games[1].partition("(")
            games = [second, games[0] + paren + tiebreak]
        else:
            games.reverse()
        flipped = "-".join(games)
        reversed_sets.append(f"[{flipped}]" if bracketed else flipped)
    return " ".join(reversed_sets)
20 | 


--------------------------------------------------------------------------------
/python/model/sk_model.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | from sklearn.preprocessing import StandardScaler
 4 | from sklearn.svm import SVC
 5 | 
 6 | from model.base_model import BaseModel
 7 | 
 8 | 
 9 | class ScalerSVC(BaseModel):
10 |     def __init__(self, C=1.0, kernel="linear", degree=3, gamma="scale", tol=1e-3):
11 |         self.C = C
12 |         self.kernel = kernel
13 |         self.degree = degree
14 |         self.gamma = gamma
15 |         self.tol = tol
16 | 
17 |         self.model = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, tol=tol)
18 |         self.scaler_x = StandardScaler()
19 | 
20 |     def fit(self, X, y):
21 |         self.scaler_x.fit(X)
22 |         self.model.fit(self.scaler_x.transform(X), y.reshape(-1, 1))
23 | 
24 |     def predict(self, X):
25 |         return self.model.predict(self.scaler_x.transform(X)).reshape(-1, 1)
26 | 


--------------------------------------------------------------------------------
/python/model/xgboost.py:
--------------------------------------------------------------------------------
 1 | import xgboost as xgb
 2 | import numpy as np
 3 | 
 4 | from model.base_model import BaseModel
 5 | 
 6 | 
 7 | class XGBoost(BaseModel):
 8 |     def __init__(self, params, num_rounds=10):
 9 |         self.params = params
10 |         self.num_rounds = num_rounds
11 | 
12 |     def fit(self, X, y, validation_data=None):
13 |         train_data = xgb.DMatrix(X, label=y)
14 |         if validation_data is not None:
15 |             evallist = [
16 |                 (train_data, "train"),
17 |                 (xgb.DMatrix(validation_data[0], label=validation_data[1]), "eval"),
18 |             ]
19 |         else:
20 |             evallist = []
21 |         self.model = xgb.train(self.params, train_data, self.num_rounds, evals=evallist)
22 |         return self.model
23 | 
24 |     def predict(self, X):
25 |         X = xgb.DMatrix(X)
26 |         return np.round(self.model.predict(X), 0)
27 | 
28 |     def save(self, path):
29 |         self.model.save_model(path)
30 | 


--------------------------------------------------------------------------------
/.github/workflows/black_action.yml:
--------------------------------------------------------------------------------
 1 | name: black-action
 2 | on: [pull_request]
 3 | jobs:
 4 |   linter_name:
 5 |     name: runner / black
 6 |     runs-on: ubuntu-latest
 7 |     steps:
 8 |       - uses: actions/checkout@v2
 9 |       - name: Check files using the black formatter
10 |         uses: rickstaa/action-black@v1
11 |         id: action_black
12 |         with:
13 |           black_args: "."
14 |       - name: Create Pull Request
15 |         if: steps.action_black.outputs.is_formatted == 'true'
16 |         uses: peter-evans/create-pull-request@v3
17 |         with:
18 |           token: ${{ secrets.GITHUB_TOKEN }}
19 |           title: "Format Python code with psf/black push"
20 |           commit-message: ":art: Format Python code with psf/black"
21 |           body: |
22 |             There appear to be some python formatting errors in ${{ github.sha }}. This pull request
23 |             uses the [psf/black](https://github.com/psf/black) formatter to fix these issues.
24 |           base: ${{ github.head_ref }} # Creates pull request onto pull request or commit branch
25 |           branch: actions/black
26 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 VincentAuriau
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/examples/history_modeling/first_example.py:
--------------------------------------------------------------------------------
 1 | import ast
 2 | import os, sys
 3 | 
 4 | sys.path.append("../../python")
 5 | sys.path.append("../../")
 6 | 
 7 | import matplotlib.pyplot as plt
 8 | from matplotlib.patches import Rectangle
 9 | import numpy as np
10 | import pandas as pd
11 | 
12 | from data.data_loader import matches_data_loader
13 | from history_modeling.match_representation import get_match_info, matches_info_norm
14 | 
# Load the 2023 matches (with statistics and reversed rows) from the cache.
data_df = matches_data_loader(
    path_to_data="../../submodules/tennis_atp",
    path_to_cache="../../cache",
    flush_cache=False,
    keep_values_from_year=2023,
    get_match_statistics=True,
    get_reversed_match_data=True,
)

data_df = data_df.loc[data_df.ID_1 == 105173]  # Adrian Mannarino
print(f"Adrian Mannarino has played {len(data_df)} matches in 2023 in our database")

# The first ``history_length`` matches form the history; the following match
# supplies the reference date, so one extra row is required.
history_length = 10
if len(data_df) <= history_length:
    raise SystemExit(
        f"Need at least {history_length + 1} matches to build the example, "
        f"found {len(data_df)}."
    )

ten_matches_history = pd.concat(
    [get_match_info(data_df.iloc[i]) for i in range(history_length)], axis=0
)
ten_matches_history.reset_index(inplace=True, drop=True)
match_info = matches_info_norm(
    ten_matches_history, data_df.iloc[history_length]["tournament_date"]
)

# Show the normalised history as an image (rows x columns of match_info).
print(match_info.columns)
plt.figure()
plt.imshow(match_info.values)
plt.show()
37 | 


--------------------------------------------------------------------------------
/notes.txt:
--------------------------------------------------------------------------------
 1 | Features to add:
 2 |     - Ranking over time: ranking last months
 3 |     - Ranking should come from ranking files and not match files
 4 |     - Last x (5 currently) matches could be an argument (5, 10, 15, etc...)
 5 |     - Victory % against players ranked similarly to the opponent (adv)
 6 | 
 7 | 
 8 | Improvements:
 9 | 
10 |     - encoding last x matches [v, v, v, d, d] => [1, 1, 1, 0, 0]
11 |     - encoding last x matches versus
12 |     - add non main atp matches
13 | 
14 | Done:
15 |     - Using nb of games played instead of nb of sets ?
16 |     - use match elapsed minutes to model fatigue
17 |     - player.versus should not be given as a whole from match but only versus against adv ?
18 |     - cut cache in several files for dynamic loading with different parameters
19 |     - Fix columns names
20 |     - Remove all persons from versus
21 |     - add unique match ID to link to original matches data
22 |     - add concatenation of players stats at match time, result & match stats
23 |     - age at match time not well handled
24 |         # Either get it from match data (easier)
25 |         # Or better handling birthdate vs match date
26 |     - keep a better history of match order & versus matches = {id: [[date, result], [date, result], ...]}
27 |     - Remove double data
28 | 
29 | 


--------------------------------------------------------------------------------
/examples/results_reading/models_comparison.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | from matplotlib.patches import Rectangle
 3 | import pandas as pd
 4 | 
 5 | df_results = pd.read_csv("../../results/20212022_chall/results.csv", sep=";")
 6 | 
 7 | print(df_results.head())
 8 | models_color = {}
 9 | for i, model in enumerate(df_results.model_class.unique()):
10 |     models_color[model] = [
11 |         "tab:blue",
12 |         "tab:orange",
13 |         "tab:green",
14 |         "tab:red",
15 |         "tab:purple",
16 |         "tab:brown",
17 |         "tab:pink",
18 |         "tab:grey",
19 |         "tab:olive",
20 |         "tab:cyan",
21 |     ][i]
22 | fig, ax = plt.subplots()
23 | for n_row, row in df_results.iterrows():
24 |     if n_row < 200:
25 |         rect = Rectangle(
26 |             (n_row, 0),
27 |             1,
28 |             row["precision"] * 100,
29 |             edgecolor=models_color[row["model_class"]],
30 |             facecolor=models_color[row["model_class"]],
31 |             label=row["model_class"],
32 |         )
33 |         ax.add_patch(rect)
34 | 
35 | ax.autoscale()
36 | handles, labels = plt.gca().get_legend_handles_labels()
37 | by_label = dict(zip(labels, handles))
38 | plt.legend(by_label.values(), by_label.keys(), loc=1)
39 | plt.ylabel("Precision %")
40 | plt.savefig("models_performances.png")
41 | plt.show()
42 | 


--------------------------------------------------------------------------------
/examples/models/dl_train_test.py:
--------------------------------------------------------------------------------
 1 | import os, sys
 2 | 
 3 | sys.path.append("../../python")
 4 | sys.path.append("../../../")
 5 | 
 6 | from model.deep_model import SimpleFullyConnected
 7 | from evaluation.train_test import train_test_evaluation
 8 | 
 9 | 
# Seasons used for fitting and for evaluation.
train_years = [2018, 2019, 2020]
test_years = [2021, 2022]

# Feature selection handed to the evaluation helper.
match_features = ["tournament_surface", "tournament_level"]
player_features = (
    ["Ranking", "Ranking_Points", "Height", "Victories_Percentage"]
    + [
        f"{surface}_Victories_Percentage"
        for surface in ("Clay", "Grass", "Carpet", "Hard")
    ]
    + ["Aces_Percentage"]
)
additional_features = ["diff_rank", "v_perc_versus"]

# Keyword arguments forwarded to the model class through ``model_params``.
network_params = dict(
    input_shape=22,
    hidden_units=(22, 44, 44, 22, 11, 4),
    output_shape=2,
    last_activation="softmax",
    epochs=100,
    reduced_lr_epochs=50,
    loss="categorical_crossentropy",
)

test_score = train_test_evaluation(
    train_years=train_years,
    test_years=test_years,
    model_class=SimpleFullyConnected,
    model_params=network_params,
    match_features=match_features,
    player_features=player_features,
    encoding_params={},
    additional_features=additional_features,
    save_path="../../results/test",
    save_all_results=True,
)
46 | 


--------------------------------------------------------------------------------
/examples/models/prediction.py:
--------------------------------------------------------------------------------
 1 | import os, sys
 2 | 
 3 | sys.path.append("../../python")
 4 | sys.path.append("../../../")
 5 | 
 6 | import numpy as np
 7 | 
 8 | from data.data_loader import matches_data_loader
 9 | from model.dumb_models import RandomModel, BestRankedPlayerWins
10 | 
# Load matches from 2021 onwards, rebuilding the cache.
data_df = matches_data_loader(
    path_to_data="../../submodules/tennis_atp",
    path_to_cache="../../cache",
    flush_cache=True,
    keep_values_from_year=2021,
    get_match_statistics=False,
)

random_model = RandomModel()
best_player_model = BestRankedPlayerWins()

ground_truths = data_df["Winner"].to_numpy()

# One independent random draw per match.
random_predictions = np.array(
    [random_model.predict(row) for _, row in data_df.iterrows()]
).reshape(-1)

# BestRankedPlayerWins.predict walks its input with ``iterrows``, so it must
# receive the DataFrame itself; passing a single row raised AttributeError.
# ``reshape(-1)`` (instead of ``squeeze``) keeps a 1-D array even when only
# one match is present, so ``len`` below stays valid.
best_player_predictions = np.array(best_player_model.predict(data_df)).reshape(-1)

print("Among the", len(ground_truths), "matches analyzed, we have found:")

random_percentage = (
    np.sum(ground_truths == random_predictions) / len(random_predictions) * 100
)
print("Random Prediction Percentage:", np.round(random_percentage, 2), "%")
bp_percentage = (
    np.sum(ground_truths == best_player_predictions)
    / len(best_player_predictions)
    * 100
)
print("Best Ranked Player Prediction Percentage:", np.round(bp_percentage, 2), "%")
50 | 


--------------------------------------------------------------------------------
/examples/history_modeling/history_encoding.py:
--------------------------------------------------------------------------------
 1 | import ast
 2 | import os, sys
 3 | 
 4 | sys.path.append("../../python")
 5 | sys.path.append("../../")
 6 | 
 7 | import matplotlib.pyplot as plt
 8 | import numpy as np
 9 | import pandas as pd
10 | 
11 | from data.data_loader import matches_data_loader
12 | from history_modeling.encoding_model import PCAMatchEncoder
13 | 
14 | from data.data_encoding import create_encoded_history
15 | 
# Load matches from 2022 onwards, Davis Cup excluded.
data_df = matches_data_loader(
    path_to_data="../../submodules/tennis_atp",
    path_to_cache="../../cache",
    flush_cache=False,
    keep_values_from_year=2022,
    get_match_statistics=True,
    get_reversed_match_data=True,
    include_davis_cup=False,
)

print("Data Loaded")
columns = [
    "surface",
    "result",
    "adv_ranking",
    "adv_ranking_points",
    "num_won_sets",
    "num_lost_sets",
    "num_won_games",
    "num_lost_games",
    "num_tie_break_wons",
    "num_tie_break_lost",
]
model = PCAMatchEncoder(num_pca_features=2, columns=columns)
model.fit(data_df, transform_data=True)

print("Model Fitted, now predicting")
X_r, match_info = model.predict(data_df, transform_data=True)

history_df = create_encoded_history(data_df, model, 5)

cols = ["history_1", "history_2"]


def _flatten_history(frame, column):
    """Reshape the nested values of ``frame[column]`` to one flat row per match."""
    flat = np.array(frame[column].values.tolist()).reshape((len(frame), -1))
    return pd.DataFrame(flat.tolist())


print(_flatten_history(history_df, "history_1"))
print(
    np.array(history_df["history_1"].values.tolist())
    .reshape((len(history_df), -1))
    .shape
)
# Flatten each history column under its own prefix.  The comprehension used
# to read "history_1" on every iteration, so the "history_2" features were
# duplicates of the "history_1" ones.
flatten_data = pd.concat(
    [_flatten_history(history_df, x).add_prefix(x) for x in cols],
    axis=1,
)
flatten_data.to_csv("flatten_data.csv", sep=";", index=False)
encoded_data = pd.concat([flatten_data, history_df.drop(cols, axis=1)], axis=1)
history_df = pd.merge(data_df, encoded_data, on=["id", "ID_1", "ID_2"])
history_df = history_df.loc[history_df.tournament_year == 2023]
history_df.to_csv("history_df.csv", sep=";", index=False)
76 | 


--------------------------------------------------------------------------------
/examples/history_modeling/pca_representation.py:
--------------------------------------------------------------------------------
 1 | import ast
 2 | import os, sys
 3 | 
 4 | sys.path.append("../../python")
 5 | sys.path.append("../../")
 6 | 
 7 | import matplotlib.pyplot as plt
 8 | import pandas as pd
 9 | from sklearn.decomposition import PCA
10 | 
11 | from data.data_loader import matches_data_loader
12 | from history_modeling.match_representation import get_match_info, matches_info_norm
13 | 
# Load the 2023 matches with statistics and reversed rows.
data_df = matches_data_loader(
    path_to_data="../../submodules/tennis_atp",
    path_to_cache="../../cache",
    flush_cache=False,
    keep_values_from_year=2023,
    get_match_statistics=True,
    get_reversed_match_data=True,
)

# Build one info row per match, normalise, and drop incomplete rows.
per_match_frames = [get_match_info(data_df.iloc[i]) for i in range(len(data_df))]
matches_history = pd.concat(per_match_frames, axis=0)
matches_history.reset_index(inplace=True, drop=True)
match_info = matches_info_norm(matches_history, "20230401")

match_info = match_info.dropna().reset_index(drop=True)

# Project every match onto the first two principal components.
X = match_info.values
pca = PCA(n_components=2)
pca.fit(X)
X_r = pca.transform(X)

plt.figure(figsize=(20, 12))

# Panels 1-2: one scatter per category value, with a legend.
categorical_panels = [
    ("Result", "result", [(0, "Victories"), (1, "Defeats")]),
    ("Surface", "surface", [(0.0, "Clay"), (2 / 3, "Hard"), (1.0, "Grass")]),
]
for position, (title, column, groups) in enumerate(categorical_panels, start=1):
    plt.subplot(2, 4, position)
    for value, label in groups:
        selected = match_info.loc[match_info[column] == value].index.values
        plt.scatter(X_r[selected, 0], X_r[selected, 1], label=label)
    plt.legend()
    plt.title(title)

# Panels 3-8: all matches, coloured by one column each.
colored_panels = [
    ("num_played_minutes", "played minutes"),
    ("adv_ranking", "Ranking Adversary"),
    ("num_won_sets", "Won sets Number"),
    ("num_lost_sets", "Lost set Number"),
    ("num_won_games", "Won games Number"),
    ("num_lost_games", "Lost games Number"),
]
for position, (column, title) in enumerate(colored_panels, start=3):
    plt.subplot(2, 4, position)
    plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info[column])
    plt.title(title)

plt.savefig("2d_pca_match_representation.png")
plt.show()
78 | 


--------------------------------------------------------------------------------
/examples/history_modeling/pca_match_encoder_train.py:
--------------------------------------------------------------------------------
 1 | import ast
 2 | import os, sys
 3 | 
 4 | sys.path.append("../../python")
 5 | sys.path.append("../../")
 6 | 
 7 | import matplotlib.pyplot as plt
 8 | import pandas as pd
 9 | from sklearn.decomposition import PCA
10 | 
11 | from data.data_loader import matches_data_loader
12 | from history_modeling.match_representation import (
13 |     create_timeless_dataset,
14 |     create_dataset,
15 | )
16 | from history_modeling.encoding_model import PCAMatchEncoder
17 | 
# Load matches from 2020 onwards with statistics and reversed rows.
data_df = matches_data_loader(
    path_to_data="../../submodules/tennis_atp",
    path_to_cache="../../cache",
    flush_cache=False,
    keep_values_from_year=2020,
    get_match_statistics=True,
    get_reversed_match_data=True,
)

"""
match_info = create_timeless_dataset(data_df)
print(len(match_info))
match_info = match_info.dropna().reset_index(drop=True)
print(len(match_info))

X = match_info.values
pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)
"""
# Fit the encoder, then encode the same data to get the 2-D representation.
model = PCAMatchEncoder(num_pca_features=2)
model.fit(data_df, transform_data=True)
X_r, match_info = model.predict(data_df, transform_data=True)

plt.figure(figsize=(20, 12))

# Panels 1-2: one scatter per category value, with a legend.
categorical_panels = [
    ("Result", "result", [(0, "Victories"), (1, "Defeats")]),
    ("Surface", "surface", [(0.0, "Clay"), (2 / 3, "Hard"), (1.0, "Grass")]),
]
for position, (title, column, groups) in enumerate(categorical_panels, start=1):
    plt.subplot(2, 4, position)
    for value, label in groups:
        selected = match_info.loc[match_info[column] == value].index.values
        plt.scatter(X_r[selected, 0], X_r[selected, 1], label=label)
    plt.legend()
    plt.title(title)

# Panels 3-8: all matches, coloured by one column each.
colored_panels = [
    ("num_played_minutes", "played minutes"),
    ("adv_ranking", "Ranking Adversary"),
    ("num_won_sets", "Won sets Number"),
    ("num_lost_sets", "Lost set Number"),
    ("num_won_games", "Won games Number"),
    ("num_lost_games", "Lost games Number"),
]
for position, (column, title) in enumerate(colored_panels, start=3):
    plt.subplot(2, 4, position)
    plt.scatter(X_r[:, 0], X_r[:, 1], c=match_info[column])
    plt.title(title)

plt.savefig("2d_pca_match_representation.png")
plt.show()
84 | 


--------------------------------------------------------------------------------
/examples/history_modeling/train_test.py:
--------------------------------------------------------------------------------
 1 | import os, sys
 2 | 
 3 | sys.path.append("../../python")
 4 | sys.path.append("../../../")
 5 | 
 6 | import matplotlib.pyplot as plt
 7 | import numpy as np
 8 | import pandas as pd
 9 | 
10 | from model.xgboost import XGBoost
11 | from history_modeling.encoding_model import PCAMatchEncoder
12 | from evaluation.train_test import train_test_evaluation
13 | 
14 | 
15 | train_years = [2018, 2019, 2020]
16 | test_years = [2021, 2022]
17 | 
18 | 
19 | match_features = ["tournament_surface", "tournament_level"]
20 | player_features = [
21 |     "Ranking",
22 |     "Ranking_Points",
23 |     "Height",
24 |     "Victories_Percentage",
25 |     "Clay_Victories_Percentage",
26 |     "Grass_Victories_Percentage",
27 |     "Carpet_Victories_Percentage",
28 |     "Hard_Victories_Percentage",
29 |     "Aces_Percentage",
30 | ]
31 | additional_features = ["diff_rank", "v_perc_versus", "nb_match_versus"]
32 | xgb_hyperparams = {
33 |     "params": {
34 |         "eta": 0.3,
35 |         "objective": "binary:logistic",
36 |         "gamma": 10,
37 |         "max_depth": 10,
38 |         "min_child_weight": 8,
39 |         "subsample": 1,
40 |     }
41 | }
42 | 
43 | xgb_hyperparams = []
44 | for eta in [0.1, 0.3, 0.6]:
45 |     for gamma in [0, 1, 10]:
46 |         for max_depth in [2, 4, 6, 8, 10]:
47 |             for min_child_weight in [1, 2, 8]:
48 |                 for subsample in [0.4, 0.8, 1]:
49 |                     xgb_hyperparams.append(
50 |                         {
51 |                             "params": {
52 |                                 "eta": eta,
53 |                                 "objective": "binary:logistic",
54 |                                 "gamma": gamma,
55 |                                 "max_depth": max_depth,
56 |                                 "min_child_weight": min_child_weight,
57 |                                 "subsample": subsample,
58 |                             }
59 |                         }
60 |                     )
61 | test_score = train_test_evaluation(
62 |     train_years=[2018, 2019, 2020],
63 |     test_years=test_years,
64 |     model_class=XGBoost,
65 |     model_params=xgb_hyperparams,
66 |     encoder_models=[
67 |         (
68 |             PCAMatchEncoder,
69 |             {
70 |                 "num_pca_features": 2,
71 |                 "auto_transform": True,
72 |                 "columns": [
73 |                     "surface",
74 |                     "result",
75 |                     "adv_ranking",
76 |                     "adv_ranking_points",
77 |                     "num_won_sets",
78 |                     "num_lost_sets",
79 |                     "num_won_games",
80 |                     "num_lost_games",
81 |                     "num_tie_break_wons",
82 |                     "num_tie_break_lost",
83 |                 ],
84 |             },
85 |         )
86 |     ],
87 |     match_features=match_features,
88 |     player_features=player_features,
89 |     encoding_params={},
90 |     additional_features=additional_features,
91 |     save_path="../../results/history_encoding",
92 |     save_all_results=True,
93 | )
94 | 


--------------------------------------------------------------------------------
/examples/models/train_test.py:
--------------------------------------------------------------------------------
  1 | import os, sys
  2 | 
  3 | sys.path.append("../../python")
  4 | sys.path.append("../../../")
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import numpy as np
  8 | import pandas as pd
  9 | from sklearn.ensemble import RandomForestClassifier
 10 | 
 11 | from data.data_loader import matches_data_loader
 12 | from data.data_loader import encode_data
 13 | 
 14 | data_df = matches_data_loader(
 15 |     path_to_data="../../submodules/tennis_atp",
 16 |     path_to_cache="../../cache",
 17 |     flush_cache=False,
 18 |     keep_values_from_year=2022,
 19 |     get_match_statistics=True,
 20 |     get_reversed_match_data=True,
 21 | )
 22 | 
 23 | forgotten_columns = ["Versus_1", "Best_Rank_1", "Last_Tournament_Date"]
 24 | 
 25 | columns_m = ["tournament_level", "round", "best_of", "Winner"]
 26 | columns_1 = [
 27 |     "ID_1",
 28 |     "Ranking_1",
 29 |     "Ranking_Points_1",
 30 |     "Hand_1",
 31 |     "Height_1",
 32 |     "Versus_1",
 33 |     "Victories_Percentage_1",
 34 |     "Clay_Victories_Percentage_1",
 35 |     "Grass_Victories_Percentage_1",
 36 |     "Carpet_Victories_Percentage_1",
 37 |     "Hard_Victories_Percentage_1",
 38 |     "Aces_Percentage_1",
 39 |     "Doublefaults_Percentage_1",
 40 |     "First_Serve_Success_Percentage_1",
 41 |     "Winning_on_1st_Serve_Percentage_1",
 42 |     "Winning_on_2nd_Serve_Percentage_1",
 43 |     "Overall_Win_on_Serve_Percentage_1",
 44 |     "BreakPoint_Face_Percentage_1",
 45 |     "BreakPoint_Saved_Percentage_1",
 46 |     "Fatigue_1",
 47 | ]
 48 | columns_2 = [
 49 |     "ID_2",
 50 |     "Ranking_2",
 51 |     "Ranking_Points_2",
 52 |     "Hand_2",
 53 |     "Height_2",
 54 |     "Versus_2",
 55 |     "Victories_Percentage_2",
 56 |     "Clay_Victories_Percentage_2",
 57 |     "Grass_Victories_Percentage_2",
 58 |     "Carpet_Victories_Percentage_2",
 59 |     "Hard_Victories_Percentage_2",
 60 |     "Aces_Percentage_2",
 61 |     "Doublefaults_Percentage_2",
 62 |     "First_Serve_Success_Percentage_2",
 63 |     "Winning_on_1st_Serve_Percentage_2",
 64 |     "Winning_on_2nd_Serve_Percentage_2",
 65 |     "Overall_Win_on_Serve_Percentage_2",
 66 |     "BreakPoint_Face_Percentage_2",
 67 |     "BreakPoint_Saved_Percentage_2",
 68 |     "Fatigue_2",
 69 | ]
 70 | 
 71 | data_df = data_df[columns_m + columns_1 + columns_2]
 72 | 
 73 | print(data_df.head())
 74 | print(data_df.shape)
 75 | 
 76 | data_df = data_df[columns_m + columns_1 + columns_2]
 77 | data_df = data_df.dropna(axis=0)
 78 | 
 79 | fdf = encode_data(data_df)
 80 | fdf.to_csv("../cache/test.csv")
 81 | 
 82 | fdf = fdf.drop(["ID_1", "Versus_1", "ID_2", "Versus_2"], axis=1)
 83 | fdf["diff_ranking"] = fdf["Ranking_2"] - fdf["Ranking_1"]
 84 | 
 85 | y = fdf.Winner
 86 | 
 87 | fdf = fdf[["diff_ranking"]]
 88 | X = fdf.values
 89 | 
 90 | print(X)
 91 | 
 92 | model = RandomForestClassifier(n_estimators=1000, max_depth=None)
 93 | print("FIT")
 94 | print(X.shape, y.shape)
 95 | model.fit(X, y)
 96 | 
 97 | y_pred = model.predict(X)
 98 | print(len(y), np.sum(y == y_pred))
 99 | print(y_pred)
100 | print(y)
101 | print(np.sum(y_pred))
102 | 
103 | plt.figure()
104 | plt.scatter(X, y)
105 | plt.show()
106 | """
107 | z = model.predict(np.expand_dims(list(range(-10000, 10001)), axis=1))
108 | plt.figure()
109 | plt.plot(list(range(-10000, 10001)), z)
110 | plt.show()
111 | """
112 | 


--------------------------------------------------------------------------------
/python/history_modeling/encoding_model.py:
--------------------------------------------------------------------------------
  1 | from abc import abstractmethod
  2 | 
  3 | import pandas as pd
  4 | from sklearn.decomposition import PCA
  5 | 
  6 | from history_modeling.match_representation import (
  7 |     create_timeless_dataset,
  8 |     get_match_info,
  9 | )
 10 | 
 11 | 
 12 | class MatchEncoder:
 13 |     def __init__(self, num_match_differences):
 14 |         self.num_match_differences = num_match_differences
 15 | 
 16 |     @property
 17 |     @abstractmethod
 18 |     def output_shape(self):
 19 |         pass
 20 | 
 21 |     def select_data(self, X, columns=None):
 22 |         assert isinstance(X, pd.DataFrame)
 23 | 
 24 |         if columns is not None:
 25 |             X_transformed = create_timeless_dataset(X, columns=columns)
 26 |         else:
 27 |             X_transformed = create_timeless_dataset(X)
 28 |         X_transformed = X_transformed.dropna().reset_index(drop=True)
 29 |         return X_transformed
 30 | 
 31 |     @abstractmethod
 32 |     def predict(self, match_row):
 33 |         pass
 34 | 
 35 | 
 36 | class PCAMatchEncoder(MatchEncoder):
 37 |     def __init__(
 38 |         self,
 39 |         num_pca_features,
 40 |         auto_transform=False,
 41 |         columns=[
 42 |             "surface",
 43 |             "result",
 44 |             "num_played_minutes",
 45 |             "adv_ranking",
 46 |             "adv_ranking_points",
 47 |             "num_won_sets",
 48 |             "num_lost_sets",
 49 |             "num_won_games",
 50 |             "num_lost_games",
 51 |             "num_tie_break_wons",
 52 |             "num_tie_break_lost",
 53 |         ],
 54 |     ):
 55 |         self.num_pca_features = num_pca_features
 56 |         self.auto_transform = auto_transform
 57 |         self.columns = columns
 58 | 
 59 |         self.model = self.instantiate_model()
 60 | 
 61 |     def output_shape(self):
 62 |         return self.num_pca_features
 63 | 
 64 |     def instantiate_model(self):
 65 |         model = PCA(n_components=self.num_pca_features)
 66 |         return model
 67 | 
 68 |     def fit(self, X, transform_data=False):
 69 |         if transform_data or self.auto_transform:
 70 |             X = self.select_data(X, columns=self.columns)
 71 |         self.model.fit(X)
 72 | 
 73 |     def predict(self, X, transform_data=False):
 74 |         if transform_data or self.auto_transform:
 75 |             X = self.select_data(X, columns=self.columns)
 76 |             return self.model.transform(X), X
 77 |         else:
 78 |             return self.model.transform(X)
 79 | 
 80 |     def save_model(self):
 81 |         pass
 82 | 
 83 | 
 84 | class IdentityEncoder(MatchEncoder):
 85 |     def __init__(
 86 |         self,
 87 |         auto_transform=False,
 88 |         columns=[
 89 |             "surface",
 90 |             "result",
 91 |             "num_played_minutes",
 92 |             "adv_ranking",
 93 |             "adv_ranking_points",
 94 |             "num_won_sets",
 95 |             "num_lost_sets",
 96 |             "num_won_games",
 97 |             "num_lost_games",
 98 |             "num_tie_break_wons",
 99 |             "num_tie_break_lost",
100 |         ],
101 |     ):
102 |         self.columns = columns
103 |         self.auto_transform = auto_transform
104 | 
105 |         self.model = self.instantiate_model()
106 | 
107 |     @property
108 |     def output_shape(self):
109 |         return len(self.columns)
110 | 
111 |     def instantiate_model(self):
112 |         return None
113 | 
114 |     def fit(self, X, transform_data=False):
115 |         pass
116 | 
117 |     def predict(self, X, transform_data=False):
118 |         if transform_data or self.auto_transform:
119 |             X_tr = self.select_data(X, columns=self.columns)
120 |             return X_tr, X
121 |         else:
122 |             return X
123 | 
124 |     def save_model(self):
125 |         pass
126 | 
127 | 
class MatchesHistoryEncoder:
    """Base holder for the settings of a multi-match history encoder."""

    def __init__(self, match_encoder, num_matches, add_days_difference):
        # Stored as given; no validation is performed here.
        self.add_days_difference = add_days_difference
        self.num_matches = num_matches
        self.match_encoder = match_encoder

    @abstractmethod
    def predict(self, match_row):
        """Encode the history attached to ``match_row`` (subclass-defined)."""
        return None
137 | 


--------------------------------------------------------------------------------
/examples/models/train_test_eval.py:
--------------------------------------------------------------------------------
  1 | import os, sys
  2 | 
  3 | sys.path.append("../../python")
  4 | sys.path.append("../../../")
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import numpy as np
  8 | import pandas as pd
  9 | from sklearn.ensemble import RandomForestClassifier
 10 | from sklearn.ensemble import GradientBoostingClassifier
 11 | 
 12 | from data.data_loader import matches_data_loader
 13 | from data.data_loader import encode_data
 14 | from evaluation.train_test import train_test_evaluation
 15 | 
 16 | 
 17 | train_years = [2020, 2021]
 18 | test_years = [2022, 2023]
 19 | 
 20 | 
 21 | model_class = RandomForestClassifier
 22 | model_params = {"n_estimators": 2000, "max_depth": None}
 23 | match_features = []
 24 | player_features = ["Ranking"]
 25 | additional_features = ["diff_rank", "v_perc_versus"]
 26 | 
 27 | test_score = train_test_evaluation(
 28 |     train_years=train_years,
 29 |     test_years=test_years,
 30 |     model_class=model_class,
 31 |     model_params=model_params,
 32 |     match_features=match_features,
 33 |     player_features=player_features,
 34 |     encoding_params={},
 35 |     additional_features=additional_features,
 36 | )
 37 | 
 38 | print("Test Score", test_score)
 39 | 
 40 | 
 41 | model_class = RandomForestClassifier
 42 | model_params = {"n_estimators": 2000, "max_depth": None}
 43 | match_features = []
 44 | player_features = ["Ranking"]
 45 | additional_features = []
 46 | 
 47 | test_score = train_test_evaluation(
 48 |     train_years=train_years,
 49 |     test_years=test_years,
 50 |     model_class=model_class,
 51 |     model_params=model_params,
 52 |     match_features=match_features,
 53 |     player_features=player_features,
 54 |     encoding_params={},
 55 |     additional_features=additional_features,
 56 | )
 57 | 
 58 | print("Test Score", test_score)
 59 | 
 60 | 
 61 | model_class = RandomForestClassifier
 62 | model_params = {"n_estimators": 2000, "max_depth": None}
 63 | match_features = []
 64 | player_features = []
 65 | additional_features = ["diff_rank"]
 66 | 
 67 | test_score = train_test_evaluation(
 68 |     train_years=train_years,
 69 |     test_years=test_years,
 70 |     model_class=model_class,
 71 |     model_params=model_params,
 72 |     match_features=match_features,
 73 |     player_features=player_features,
 74 |     encoding_params={},
 75 |     additional_features=additional_features,
 76 | )
 77 | 
 78 | print("Test Score", test_score)
 79 | 
 80 | 
 81 | model_class = RandomForestClassifier
 82 | model_params = {"n_estimators": 1, "max_depth": 1}
 83 | match_features = []
 84 | player_features = []
 85 | additional_features = ["diff_rank"]
 86 | 
 87 | test_score = train_test_evaluation(
 88 |     train_years=train_years,
 89 |     test_years=test_years,
 90 |     model_class=model_class,
 91 |     model_params=model_params,
 92 |     match_features=match_features,
 93 |     player_features=player_features,
 94 |     encoding_params={},
 95 |     additional_features=additional_features,
 96 | )
 97 | 
 98 | print("Test Score", test_score)
 99 | 
100 | 
101 | model_class = GradientBoostingClassifier
102 | model_params = {"n_estimators": 100, "learning_rate": 1.0, "max_depth": 1}
103 | match_features = []
104 | player_features = []
105 | additional_features = ["diff_rank"]
106 | 
107 | test_score = train_test_evaluation(
108 |     train_years=train_years,
109 |     test_years=test_years,
110 |     model_class=model_class,
111 |     model_params=model_params,
112 |     match_features=match_features,
113 |     player_features=player_features,
114 |     encoding_params={},
115 |     additional_features=additional_features,
116 | )
117 | 
118 | print("Test Score", test_score)
119 | 
120 | 
121 | model_class = GradientBoostingClassifier
122 | model_params = {"n_estimators": 1000, "learning_rate": 0.1, "max_depth": 4}
123 | match_features = []
124 | player_features = []
125 | additional_features = ["diff_rank"]
126 | 
127 | test_score = train_test_evaluation(
128 |     train_years=train_years,
129 |     test_years=test_years,
130 |     model_class=model_class,
131 |     model_params=model_params,
132 |     match_features=match_features,
133 |     player_features=player_features,
134 |     encoding_params={},
135 |     additional_features=additional_features,
136 | )
137 | 
138 | print("Test Score", test_score)
139 | 


--------------------------------------------------------------------------------
/examples/models/deep_history.py:
--------------------------------------------------------------------------------
# Example script: train ConvolutionalHistoryAndFullyConnected on 2022 matches
# and predict 2023 matches, feeding it both per-match features and the encoded
# history of each player's previous matches.
import os
import sys

import matplotlib.pyplot as plt

sys.path.append("../../python")
import time

import numpy as np
import pandas as pd

from data.data_loader import matches_data_loader
from data.data_encoding import (
    encode_data,
    create_additional_features,
    clean_missing_data,
    create_encoded_history,
    complete_missing_data,
)
from history_modeling.encoding_model import IdentityEncoder
from model.deep_model import ConvolutionalHistoryAndFullyConnected


# Resolve data/cache locations relative to this file rather than the CWD.
absolute_path = os.path.dirname(os.path.abspath(__file__))
match_features = ["tournament_surface", "tournament_level", "round"]
player_features = [
    "Ranking",
    "Ranking_Points",
    "Height",
    "Victories_Percentage",
    "Clay_Victories_Percentage",
    "Grass_Victories_Percentage",
    "Carpet_Victories_Percentage",
    "Hard_Victories_Percentage",
    "Aces_Percentage",
]
additional_features = ["diff_rank", "v_perc_versus", "nb_match_versus"]
encoding_params = {}

data_df = matches_data_loader(
    path_to_data=os.path.join(absolute_path, "../../submodules/tennis_atp"),
    path_to_cache=os.path.join(absolute_path, "../../cache"),
    flush_cache=False,
    keep_values_from_year=2022,
    get_match_statistics=True,
    get_reversed_match_data=True,
    include_davis_cup=False,
)
print(f"[+] Data Loaded, Now Encoding Data and create additional Features")
print(data_df.head())
print(data_df.columns)

# data_df = pd.concat([data_df.iloc[:1000], data_df.iloc[-1000:]])

# For each (encoder class, kwargs) pair: fit the encoder, encode the match
# history, flatten it into plain columns and merge it back into data_df.
history_columns = []
encoder_models = [(IdentityEncoder, {})]
for encoding_model, encoding_model_params in encoder_models:
    print(f"[+] Training Encoder Model {encoding_model}")
    encoder = encoding_model(**encoding_model_params)
    encoder.fit(data_df)

    print(f"[+] Encoding using encoder {encoding_model}")
    encoded_data = create_encoded_history(
        data_df, encoder, num_matches=5, completing_value=0
    )

    cols = ["history_1", "history_2"]

    # Each cell of "history_1"/"history_2" holds a nested sequence; reshape it
    # to one flat row per match and name the resulting columns
    # "history_1<k>" / "history_2<k>".
    flatten_data = pd.concat(
        [
            pd.DataFrame(
                np.array(encoded_data[x].values.tolist()).reshape(
                    (len(encoded_data), -1)
                )
            ).add_prefix(x)
            for x in cols
        ],
        axis=1,
    )
    encoded_data = pd.concat([flatten_data, encoded_data.drop(cols, axis=1)], axis=1)
    enc_columns = encoded_data.columns
    # Everything except the merge keys is a history feature.
    enc_columns = list(set(enc_columns) - set(["id", "ID_1", "ID_2"]))
    history_columns.extend(enc_columns)

    data_df = pd.merge(data_df, encoded_data, on=["id", "ID_1", "ID_2"])

# Split by season: fit on 2022, evaluate on 2023.
train_data = data_df.loc[data_df.tournament_year.isin([2022])]
test_data = data_df.loc[data_df.tournament_year.isin([2023])]
# train_data = data_df.loc[data_df.tournament_year.isin([2019, 2020, 2021])]
# test_data = data_df.loc[data_df.tournament_year.isin([2022, 2023])]
train_data = create_additional_features(train_data, additional_features)
train_data = encode_data(train_data, **encoding_params)
test_data = create_additional_features(test_data, additional_features)
test_data = encode_data(test_data, **encoding_params)

p1_features = [feat + "_1" for feat in player_features]
p2_features = [feat + "_2" for feat in player_features]
match_features = match_features.copy()

# NOTE(review): these frames keep "Winner" and "tournament_year" and are later
# passed whole as the first input of model.fit / model.predict, so the label
# appears to be part of the model input - confirm whether that is intended.
# NOTE(review): the columns created from `additional_features` are not
# selected here, so they do not reach the model.
train_data_ = train_data[
    match_features + p1_features + p2_features + ["Winner", "tournament_year"]
]
test_data_ = test_data[
    match_features + p1_features + p2_features + ["Winner", "tournament_year"]
]

# train_data_ = clean_missing_data(train_data_)
# test_data_ = clean_missing_data(test_data_)

print(data_df.head())
print(data_df.columns)

# 23 = 3 match features + 2 * 9 player features + "Winner" + "tournament_year"
# (assumes encode_data keeps one column per feature - TODO confirm).
model = ConvolutionalHistoryAndFullyConnected(
    num_history_signals=22,
    **{
        "input_shape": 23,
        "hidden_units": (22, 44, 22, 11, 4),
        "output_shape": 2,
        "last_activation": "softmax",
        "epochs": 100,
        "reduced_lr_epochs": 50,
        "loss": "categorical_crossentropy",
    },
)
# model.instantiate_model()

print(model.summary())

print(data_df.head())

# Collect the flattened history columns by name.
hist_cols = []
for col in data_df.columns:
    if "history" in col:
        hist_cols.append(col)

print(len(train_data), len(hist_cols))

# History columns are reshaped to (n_matches, 5, 22): 5 matches the
# num_matches passed to create_encoded_history above and 22 matches
# num_history_signals; both are hard-coded here.
model.fit(
    train_data_.values,
    train_data[hist_cols].values.reshape((len(train_data), 5, 22)),
    train_data["Winner"].values,
)

y_pred = model.predict(
    test_data_.values, test_data[hist_cols].values.reshape((len(test_data), 5, 22))
)


# Number of test matches whose prediction equals the actual winner.
print(np.sum(y_pred == test_data["Winner"]))

plt.plot(y_pred)
plt.show()
153 | 


--------------------------------------------------------------------------------
/examples/results_reading/best_model.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import matplotlib.pyplot as plt
  4 | from matplotlib.patches import Rectangle
  5 | import numpy as np
  6 | import pandas as pd
  7 | 
  8 | df_results = pd.read_csv("../../results/20212022/results.csv", sep=";")
  9 | 
 10 | best_row = df_results.iloc[df_results.precision.argmax()]
 11 | print(best_row)
 12 | 
 13 | eval_id = best_row["eval_ID"]
 14 | best_results = pd.read_csv(
 15 |     os.path.join("../../results/20212022", f"{eval_id}.csv"), sep=";"
 16 | )
 17 | 
 18 | fig, ax = plt.subplots()
 19 | df_ww = best_results.loc[best_results.Winner == 0].loc[best_results.y_pred == 0]
 20 | plt.scatter(df_ww.diff_rank, df_ww.Winner, c="tab:pink", label="Well Predicted")
 21 | df_wl = best_results.loc[best_results.Winner == 0].loc[best_results.y_pred == 1]
 22 | plt.scatter(df_wl.diff_rank, df_wl.Winner + 0.1, c="tab:blue", label="Predicted Wrong")
 23 | df_ll = best_results.loc[best_results.Winner == 1].loc[best_results.y_pred == 1]
 24 | plt.scatter(df_ll.diff_rank, df_ll.Winner, c="tab:orange", label="Well Wrong")
 25 | df_lw = best_results.loc[best_results.Winner == 1].loc[best_results.y_pred == 0]
 26 | plt.scatter(df_lw.diff_rank, df_lw.Winner - 0.1, c="tab:red", label="Predicted Wrong")
 27 | plt.legend()
 28 | 
 29 | plt.xlabel("Rank Player 0 - Rank Player 1")
 30 | plt.ylabel("Winner")
 31 | plt.show()
 32 | 
 33 | # Let's evaluate Symmetry
 34 | symmetric_same_results = 0
 35 | for i in range(int(len(best_results) / 2)):
 36 |     if best_results.iloc[2 * i]["y_pred"] != best_results.iloc[2 * i + 1]["y_pred"]:
 37 |         symmetric_same_results += 1
 38 | print(
 39 |     f"{(symmetric_same_results / (len(best_results) / 2))} Results are symmetrically predicted"
 40 | )
 41 | 
 42 | rank_categories = [1, 10, 50, 100, 300, 1000, 9999]
 43 | 
 44 | prediction_percentage = []
 45 | 
 46 | for cat_1 in range(len(rank_categories) - 1):
 47 |     lines = []
 48 |     nb_matches_lines = []
 49 |     for cat_2 in range(len(rank_categories) - 1):
 50 |         sub_df = best_results.loc[best_results.Ranking_1 >= rank_categories[cat_1]].loc[
 51 |             best_results.Ranking_1 < rank_categories[cat_1 + 1]
 52 |         ]
 53 |         sub_df = sub_df.loc[sub_df.Ranking_2 >= rank_categories[cat_2]].loc[
 54 |             sub_df.Ranking_2 < rank_categories[cat_2 + 1]
 55 |         ]
 56 |         sub_df["best_rank"] = sub_df.apply(
 57 |             lambda row: np.argmin([row["Ranking_1"], row["Ranking_2"]]), axis=1
 58 |         )
 59 | 
 60 |         if len(sub_df) > 0:
 61 |             best_player_w_p = np.sum(
 62 |                 sub_df.Winner.values == sub_df.y_pred.values
 63 |             ) / len(sub_df)
 64 | 
 65 |         else:
 66 |             best_player_w_p = 0
 67 |         lines.append(best_player_w_p)
 68 |         nb_matches_lines.append(len(sub_df) / 2)
 69 |     prediction_percentage.append(lines)
 70 | 
 71 | colors = ["purple", "blue", "cyan", "green", "yellow", "orange", "red"]
 72 | fig, ax = plt.subplots()
 73 | 
 74 | for i, val1 in enumerate(prediction_percentage):
 75 |     for j, val2 in enumerate(val1):
 76 |         color = colors[int(val2 * (len(colors) - 1))]
 77 |         rect = plt.Rectangle((i, j), 1, 1, fc=color)
 78 |         ax.add_patch(rect)
 79 |         plt.text(i + 0.2, j + 0.35, np.round(val2 * 100, 2))
 80 | 
 81 | for i in range(len(rank_categories)):
 82 |     plt.plot([i, i], [0, len(rank_categories) - 1], c="k")
 83 |     plt.plot([0, len(rank_categories) - 1], [i, i], c="k")
 84 | 
 85 | plt.xticks(list(range(len(rank_categories))), labels=rank_categories)
 86 | plt.yticks(list(range(len(rank_categories))), labels=rank_categories)
 87 | plt.xlabel("Player 1 Rank Category")
 88 | plt.ylabel("Player 2 Rank Category")
 89 | plt.title("Precision Percentage")
 90 | plt.savefig("precision_percentage_players_ranks.png")
 91 | plt.show()
 92 | 
 93 | best_ranked_player_wins_results = pd.read_csv(
 94 |     os.path.join(
 95 |         "../../results/20212022",
 96 |         f"{df_results.loc[df_results.model_class=='BestRankedPlayerWins'].eval_ID.values[0]}.csv",
 97 |     ),
 98 |     sep=";",
 99 | )
100 | ticks = []
101 | fig, ax = plt.subplots()
102 | for surface, surface_code in {"Clay": 0, "Carpet": 1, "Hard": 2, "Grass": 3}.items():
103 |     precision_model = best_results.loc[best_results.tournament_surface == surface_code]
104 |     precision_brpw = best_ranked_player_wins_results.loc[
105 |         best_ranked_player_wins_results.tournament_surface == surface_code
106 |     ]
107 | 
108 |     if len(precision_model) > 0:
109 |         precision_model = len(
110 |             precision_model.loc[precision_model.y_pred == precision_model.Winner]
111 |         ) / len(precision_model)
112 |         prec_brpw = 0
113 |         for n_row, row in precision_brpw.iterrows():
114 |             if int(row["y_pred"][1]) == row["Winner"]:
115 |                 prec_brpw += 1
116 |         precision_brpw = prec_brpw / len(precision_brpw)
117 |     else:
118 |         precision_model = 0
119 |         precision_brpw = 0
120 |     rect = Rectangle(
121 |         (surface_code * 2, 0),
122 |         1,
123 |         precision_model,
124 |         edgecolor="k",
125 |         facecolor="tab:blue",
126 |         label="Model - XGBoost",
127 |     )
128 |     ax.add_patch(rect)
129 |     rect = Rectangle(
130 |         (surface_code * 2 + 1, 0),
131 |         1,
132 |         precision_brpw,
133 |         edgecolor="k",
134 |         facecolor="tab:pink",
135 |         label="Best Ranked Player Wins",
136 |     )
137 |     ax.add_patch(rect)
138 |     ticks.append(surface)
139 | 
140 | ax.autoscale()
141 | plt.xticks([1, 3, 5, 7], labels=ticks)
142 | handles, labels = plt.gca().get_legend_handles_labels()
143 | by_label = dict(zip(labels, handles))
144 | plt.legend(by_label.values(), by_label.keys(), loc=4)
145 | plt.title("Win % for each surface")
146 | plt.savefig("win_per_surface.png")
147 | plt.show()
148 | 


--------------------------------------------------------------------------------
/python/history_modeling/match_representation.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | 
  4 | from data.data_utils import get_days_difference
  5 | 
  6 | 
  7 | def get_match_info(row, verbose=0):
  8 |     # add adversary age & hand ?
  9 |     surface = row["tournament_surface"]
 10 |     result = row["Winner"]
 11 |     try:
 12 |         score = row["score"]
 13 |     except:
 14 |         print(row)
 15 |         print(row.index)
 16 |         print(row.values)
 17 |     num_played_minutes = row["elapsed_minutes"]
 18 |     date = row["tournament_date"]
 19 | 
 20 |     adv_ranking = row["Ranking_2"]
 21 |     adv_ranking_points = row["Ranking_Points_2"]
 22 | 
 23 |     num_won_sets = 0
 24 |     num_lost_sets = 0
 25 |     num_won_games = 0
 26 |     num_lost_games = 0
 27 |     num_tie_break_wons = 0
 28 |     num_tie_break_lost = 0
 29 | 
 30 |     for set in row["score"].split(" "):
 31 |         try:
 32 |             games_0 = set.split("-")[0]
 33 |             games_1 = set.split("-")[1]
 34 | 
 35 |             if "(" in games_0:
 36 |                 games_0 = games_0.split("(")[0]
 37 |                 num_tie_break_lost += 1
 38 | 
 39 |             elif "(" in games_1:
 40 |                 games_1 = games_1.split("(")[0]
 41 |                 num_tie_break_wons += 1
 42 | 
 43 |             games_0 = int(games_0)
 44 |             games_1 = int(games_1)
 45 | 
 46 |             if games_0 > games_1:
 47 |                 num_won_sets += 1
 48 |             elif games_0 < games_1:
 49 |                 num_lost_sets += 1
 50 | 
 51 |             num_won_games += games_0
 52 |             num_lost_games += games_1
 53 |         except:
 54 |             if set not in ["ABD", "RET", "W/O"]:
 55 |                 if verbose == 2:
 56 |                     print(set)
 57 |                 else:
 58 |                     pass
 59 | 
 60 |     match_df = pd.DataFrame(
 61 |         {
 62 |             "surface": [surface],
 63 |             "result": [result],
 64 |             "num_played_minutes": [num_played_minutes],
 65 |             "date": [date],
 66 |             "adv_ranking": [adv_ranking],
 67 |             "adv_ranking_points": [adv_ranking_points],
 68 |             "num_won_sets": [num_won_sets],
 69 |             "num_lost_sets": [num_lost_sets],
 70 |             "num_won_games": [num_won_games],
 71 |             "num_lost_games": [num_lost_games],
 72 |             "num_tie_break_wons": [num_tie_break_wons],
 73 |             "num_tie_break_lost": [num_tie_break_lost],
 74 |         }
 75 |     )
 76 |     return match_df
 77 | 
 78 | 
 79 | def matches_info_norm(matches_info, current_date=""):
 80 |     # Normalize values
 81 |     tournament_surface = {"Clay": 0.0, "Carpet": 1 / 3, "Hard": 2 / 3, "Grass": 1.0}
 82 |     # nb sets won: max 3
 83 |     # nb sets lost: max 3
 84 |     # nb games won: max 100 (from experience - to be validated)
 85 |     # nb games lost: max 100 (from experience - to be validated)
 86 |     # nb tiebreaks won: max 100 (from experience - to be validated) -> not number of points but nb of tiebreaks ?
 87 |     # nb tiebreaks lost: max 100 (from experience - to be validated)
 88 |     # Ranking points max 16,950 from Djokovic's record -> 20,000
 89 |     # Ranking max 9,999
 90 |     # Num played minutes max 671 from Mahut/Isner's record -> 700
 91 |     # date: compute number of days since tournament date -> normalize by 365 -> if > 365 give up ?
 92 | 
 93 |     matches_info = matches_info.copy()
 94 |     matches_info["surface"] = matches_info["surface"].apply(
 95 |         lambda val: tournament_surface[val]
 96 |     )
 97 |     matches_info["num_won_sets"] = matches_info["num_won_sets"].apply(
 98 |         lambda val: val / 3
 99 |     )
100 |     matches_info["num_lost_sets"] = matches_info["num_lost_sets"].apply(
101 |         lambda val: val / 3
102 |     )
103 | 
104 |     matches_info["date"] = matches_info["date"].apply(
105 |         lambda val: get_days_difference(val, current_date) / 365
106 |     )
107 |     matches_info["num_played_minutes"] = matches_info["num_played_minutes"].apply(
108 |         lambda val: val / 700
109 |     )
110 | 
111 |     matches_info["adv_ranking"] = matches_info["adv_ranking"].apply(
112 |         lambda val: np.log(val) / np.log(9999)
113 |     )
114 |     matches_info["adv_ranking_points"] = matches_info["adv_ranking_points"].apply(
115 |         lambda val: val / 20000
116 |     )
117 | 
118 |     matches_info["num_won_games"] = matches_info["num_won_games"].apply(
119 |         lambda val: val / 100
120 |     )
121 |     matches_info["num_lost_games"] = matches_info["num_lost_games"].apply(
122 |         lambda val: val / 100
123 |     )
124 |     matches_info["num_tie_break_wons"] = matches_info["num_tie_break_wons"].apply(
125 |         lambda val: val / 3
126 |     )
127 |     matches_info["num_tie_break_lost"] = matches_info["num_tie_break_lost"].apply(
128 |         lambda val: val / 3
129 |     )
130 | 
131 |     return matches_info
132 | 
133 | 
def create_dataset(
    data_df, num_matches_difference=10, nb_kept_differences=10, randomize_indexes=False
):
    """
    Creates the match representation dataset

    For every row of data_df, the previous matches of the row's first player
    (ID_1) are turned into match information with get_match_info and normalized
    with matches_info_norm, using the row's tournament_date as reference date.

    :param data_df: matches DataFrame with at least the ID_1 and tournament_date
        columns; assumed to be sorted chronologically (TODO confirm with callers)
    :param num_matches_difference: size of the window of most recent previous
        matches considered for each row
    :param nb_kept_differences: maximum number of matches kept from that window
    :param randomize_indexes: if True, the window is shuffled before it is
        truncated to nb_kept_differences matches
    :return: the concatenation of all the normalized match information, or an
        empty DataFrame when no row has any usable history
    """
    dataset = []
    for i in range(len(data_df)):
        current_row = data_df.iloc[i]
        current_player = current_row.ID_1

        # Slice first, filter second: `i` is a position in data_df, so it must
        # be applied before the per-player filter. Doing it the other way round
        # slices the filtered frame with an unrelated position, which lets the
        # current match and later ones leak into the history (and for i == 0
        # the former `iloc[: i - 1]` kept every match but the last one).
        previous_rows = data_df.iloc[:i]
        sub_data_df = previous_rows.loc[previous_rows.ID_1 == current_player]
        if len(sub_data_df) == 0:
            continue
        sub_data_df = sub_data_df.reset_index(drop=True)

        # Negative positions select the most recent matches of the player
        kept_indexes = list(range(-min(len(sub_data_df), num_matches_difference), 0))
        if randomize_indexes:
            kept_indexes = np.random.permutation(kept_indexes)
        kept_indexes = kept_indexes[:nb_kept_differences]
        sub_data_df = sub_data_df.iloc[kept_indexes].reset_index(drop=True)
        if len(sub_data_df) == 0:
            # nb_kept_differences <= 0 leaves nothing to encode
            continue

        raw_matches_info = pd.concat(
            [get_match_info(sub_data_df.iloc[j]) for j in range(len(sub_data_df))],
            axis=0,
        )
        normalized_matches_info = matches_info_norm(
            raw_matches_info, current_date=current_row["tournament_date"]
        )
        dataset.append(normalized_matches_info)

    if not dataset:
        # pd.concat refuses an empty list; hand back an empty frame instead
        return pd.DataFrame()
    return pd.concat(dataset, axis=0)
171 | 
172 | 
def create_timeless_dataset(
    data_df,
    columns=[
        "surface",
        "result",
        "num_played_minutes",
        "adv_ranking",
        "adv_ranking_points",
        "num_won_sets",
        "num_lost_sets",
        "num_won_games",
        "num_lost_games",
        "num_tie_break_wons",
        "num_tie_break_lost",
    ],
):
    """
    Creates a match representation dataset without any date information
    :param data_df: matches DataFrame, one match per row, with a tournament_date column
    :param columns: names and order of the columns of the returned DataFrame
    :return: normalized match information restricted to columns
    """
    # One single-match frame per row, stacked on top of each other
    per_match_frames = [
        get_match_info(data_df.iloc[position]) for position in range(len(data_df))
    ]
    stacked = pd.concat(per_match_frames, axis=0)

    # The date of the last row is the reference for the date normalization;
    # the resulting "date" column is dropped right after
    reference_date = data_df["tournament_date"].values[-1]
    normalized = matches_info_norm(stacked, current_date=reference_date)
    without_date = normalized.drop(["date"], axis=1)
    return without_date[columns]
200 | 


--------------------------------------------------------------------------------
/python/model/deep_model.py:
--------------------------------------------------------------------------------
  1 | from sklearn.preprocessing import StandardScaler
  2 | import tensorflow as tf
  3 | 
  4 | from model.base_model import DeepBaseModel
  5 | 
  6 | 
  7 | def create_dense_model(
  8 |     input_shape=2,
  9 |     output_shape=2,
 10 |     hidden_units=(4, 8, 4),
 11 |     hidden_activations="relu",
 12 |     last_activation="softmax",
 13 | ):
 14 |     hid_activation = tf.keras.layers.Activation(hidden_activations)
 15 |     inputs = tf.keras.layers.Input(shape=input_shape)
 16 |     hidden_out = inputs
 17 | 
 18 |     for n_cells in hidden_units:
 19 |         hidden_out = tf.keras.layers.Dense(n_cells)(hidden_out)
 20 |         hidden_out = hid_activation(hidden_out)
 21 | 
 22 |     out = tf.keras.layers.Dense(output_shape)(hidden_out)
 23 |     out = tf.keras.layers.Activation(last_activation)(out)
 24 | 
 25 |     return tf.keras.Model(inputs=inputs, outputs=out)
 26 | 
 27 | 
 28 | class SimpleFullyConnected(DeepBaseModel):
 29 |     def __init__(
 30 |         self,
 31 |         input_shape=2,
 32 |         output_shape=2,
 33 |         hidden_units=[4, 8, 4],
 34 |         hidden_activations="relu",
 35 |         last_activation="softmax",
 36 |         epochs=50,
 37 |         reduced_lr_epochs=10,
 38 |         optimizer="adamax",
 39 |         lr=1e-5,
 40 |         loss="cross_entropy",
 41 |     ):
 42 |         self.input_shape = input_shape
 43 |         self.output_shape = output_shape
 44 |         self.hidden_units = hidden_units
 45 |         self.hidden_activations = hidden_activations
 46 |         self.last_activation = last_activation
 47 |         self.epochs = epochs
 48 |         self.reduced_lr_epochs = reduced_lr_epochs
 49 |         self.optimizer = optimizer
 50 |         self.lr = lr
 51 |         self.loss = loss
 52 |         super().__init__()
 53 | 
 54 |     def instantiate_model(self):
 55 |         self.scaler_x = StandardScaler()
 56 |         self.model = create_dense_model(
 57 |             input_shape=self.input_shape,
 58 |             output_shape=self.output_shape,
 59 |             hidden_units=self.hidden_units,
 60 |             hidden_activations=self.hidden_activations,
 61 |             last_activation=self.last_activation,
 62 |         )
 63 | 
 64 |         if self.optimizer == "adamax":
 65 |             self.optimizer = tf.keras.optimizers.Adamax(lr=self.lr)
 66 |         elif self.optimizer == "rmsprop":
 67 |             self.optimizer = tf.keras.optimizers.RMSprop(lr=self.lr)
 68 |         elif self.optimizer == "sgd":
 69 |             self.optimizer = tf.keras.optimizers.SGD(lr=self.lr)
 70 |         elif self.optimizer == "Adam":
 71 |             self.optimizer = tf.keras.optimizers.Adam(lr=self.lr)
 72 |         else:
 73 |             raise ValueError(
 74 |                 f"Optimizer {self.optimizer} not understood, must be among ['adam', 'adamax', 'sgd', 'rmsprop']"
 75 |             )
 76 | 
 77 |         self.model.compile(optimizer=self.optimizer, loss=self.loss)
 78 | 
 79 |     def fit(self, X, y):
 80 |         self.scaler_x.fit(X)
 81 |         if self.output_shape == 2:
 82 |             y = tf.one_hot(y.squeeze(), depth=2)
 83 |         self.model.fit(self.scaler_x.transform(X), y, epochs=self.epochs)
 84 |         if self.reduced_lr_epochs > 0:
 85 |             self.optimizer.lr.assign(self.lr / 10)
 86 |             self.model.fit(self.scaler_x.transform(X), y, epochs=self.reduced_lr_epochs)
 87 | 
 88 |     def predict(self, X):
 89 |         y_pred = self.model.predict(self.scaler_x.transform(X))
 90 |         if self.output_shape == 2:
 91 |             y_pred = tf.argmax(y_pred, axis=-1)
 92 |         return y_pred
 93 | 
 94 | 
 95 | def create_conv_dense_model(
 96 |     input_shape=2,
 97 |     history_input_shape=(5, 5),
 98 |     output_shape=2,
 99 |     hidden_units=(4, 8, 4),
100 |     hidden_activations="relu",
101 |     last_activation="softmax",
102 | ):
103 |     hid_activation = tf.keras.layers.Activation(hidden_activations)
104 | 
105 |     history_inputs = tf.keras.layers.Input(shape=history_input_shape)
106 |     print(history_inputs.shape, history_input_shape)
107 |     encoded_history = tf.keras.layers.Conv1D(filters=8, kernel_size=3, padding="same")(
108 |         history_inputs
109 |     )
110 |     encoded_history = tf.keras.layers.Conv1D(filters=4, kernel_size=3)(history_inputs)
111 |     encoded_history = tf.keras.layers.Conv1D(filters=1, kernel_size=3)(encoded_history)
112 |     encoded_history = tf.keras.layers.Flatten()(encoded_history)
113 | 
114 |     inputs = tf.keras.layers.Input(shape=input_shape)
115 |     hidden_out = tf.keras.layers.Concatenate()([inputs, encoded_history])
116 | 
117 |     for n_cells in hidden_units:
118 |         hidden_out = tf.keras.layers.Dense(n_cells)(hidden_out)
119 |         hidden_out = hid_activation(hidden_out)
120 | 
121 |     out = tf.keras.layers.Dense(output_shape)(hidden_out)
122 |     out = tf.keras.layers.Activation(last_activation)(out)
123 | 
124 |     return tf.keras.Model(inputs=[history_inputs, inputs], outputs=out)
125 | 
126 | 
class ConvolutionalHistoryAndFullyConnected(DeepBaseModel):
    """
    Network with two inputs: a match history encoded by 1D convolutions and a
    vector of standardized features.

    Training runs `epochs` epochs at learning rate `lr`, then
    `reduced_lr_epochs` more epochs at `lr / 10`.
    """

    def __init__(
        self,
        num_history_signals=2,
        history_length=5,
        input_shape=2,
        output_shape=2,
        hidden_units=[4, 8, 4],
        hidden_activations="relu",
        last_activation="softmax",
        epochs=50,
        reduced_lr_epochs=10,
        optimizer="adamax",
        lr=1e-5,
        loss="cross_entropy",
    ):
        """
        :param num_history_signals: size of the last axis of the history input
        :param history_length: size of the first axis of the history input
        :param input_shape: shape of the plain feature input
        :param output_shape: number of output units; 2 enables the one-hot
            encoding of the targets in fit and the argmax in predict
        :param hidden_units: number of units of each hidden layer (never mutated here)
        :param hidden_activations: activation of the hidden layers
        :param last_activation: activation of the output layer
        :param epochs: number of epochs of the first training phase
        :param reduced_lr_epochs: number of epochs of the second phase, run at lr / 10
        :param optimizer: optimizer name among 'adam', 'adamax', 'sgd', 'rmsprop'
            (case-insensitive)
        :param lr: learning rate of the first training phase
        :param loss: loss handed to the Keras compile call
        """
        self.num_history_signals = num_history_signals
        self.history_length = history_length
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.hidden_units = hidden_units
        self.hidden_activations = hidden_activations
        self.last_activation = last_activation
        self.epochs = epochs
        self.reduced_lr_epochs = reduced_lr_epochs
        self.optimizer = optimizer
        self.lr = lr
        self.loss = loss
        # NOTE(review): DeepBaseModel.__init__ presumably calls instantiate_model,
        # hence the attributes are set before it - confirm in base_model.py
        super().__init__()

    def _build_optimizer(self):
        """Returns a fresh Keras optimizer for the configured optimizer name."""
        optimizer_classes = {
            "adam": tf.keras.optimizers.Adam,
            "adamax": tf.keras.optimizers.Adamax,
            "rmsprop": tf.keras.optimizers.RMSprop,
            "sgd": tf.keras.optimizers.SGD,
        }
        # self.optimizer is replaced by the optimizer object once the model is
        # instantiated; remember the requested name so that instantiate_model
        # can be called again to get a fresh model and optimizer.
        if isinstance(self.optimizer, str):
            self._optimizer_name = self.optimizer
        requested = getattr(self, "_optimizer_name", None)
        key = requested.lower() if isinstance(requested, str) else None
        if key not in optimizer_classes:
            raise ValueError(
                f"Optimizer {self.optimizer} not understood, must be among ['adam', 'adamax', 'sgd', 'rmsprop']"
            )
        # `learning_rate` is the supported keyword, `lr` is a deprecated alias
        return optimizer_classes[key](learning_rate=self.lr)

    def instantiate_model(self):
        """
        Creates the feature scaler, the Keras model and its optimizer, then
        compiles the model.
        :raises ValueError: if the optimizer name is not supported
        """
        self.scaler_x = StandardScaler()
        self.model = create_conv_dense_model(
            history_input_shape=(self.history_length, self.num_history_signals),
            input_shape=self.input_shape,
            output_shape=self.output_shape,
            hidden_units=self.hidden_units,
            hidden_activations=self.hidden_activations,
            last_activation=self.last_activation,
        )

        self.optimizer = self._build_optimizer()
        self.model.compile(optimizer=self.optimizer, loss=self.loss)

    def fit(self, X, X_history, y):
        """
        Fits the scaler on X, then trains the network in two phases.
        :param X: plain features, standardized before being fed to the model
        :param X_history: history input, fed to the model as is
        :param y: targets; one-hot encoded on 2 classes when output_shape == 2
        """
        self.scaler_x.fit(X)
        if self.output_shape == 2:
            y = tf.one_hot(y.squeeze(), depth=2)

        print("X shape", X.shape)
        print("X history shape", X_history.shape)
        print("y shape", y.shape)

        X_scaled = self.scaler_x.transform(X)
        # A previous call to fit leaves the optimizer at lr / 10: restore the
        # nominal rate so that every call starts from the same schedule.
        self.optimizer.learning_rate.assign(self.lr)
        self.model.fit([X_history, X_scaled], y, epochs=self.epochs)
        if self.reduced_lr_epochs > 0:
            self.optimizer.learning_rate.assign(self.lr / 10)
            self.model.fit(
                [X_history, X_scaled],
                y,
                epochs=self.reduced_lr_epochs,
            )

    def predict(self, X, X_history):
        """
        :param X: plain features, scaled with the scaler fitted in fit
        :param X_history: history input
        :return: the index of the most probable class when output_shape == 2,
            the raw model output otherwise
        """
        y_pred = self.model.predict([X_history, self.scaler_x.transform(X)])
        if self.output_shape == 2:
            y_pred = tf.argmax(y_pred, axis=-1)
        return y_pred

    def summary(self):
        """Returns the result of the Keras model summary call."""
        return self.model.summary()
209 | 


--------------------------------------------------------------------------------
/examples/models/grid_search.py:
--------------------------------------------------------------------------------
  1 | import os, sys
  2 | 
  3 | sys.path.append("../../python")
  4 | sys.path.append("../../../")
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import numpy as np
  8 | import pandas as pd
  9 | from sklearn.ensemble import (
 10 |     RandomForestClassifier,
 11 |     GradientBoostingClassifier,
 12 |     AdaBoostClassifier,
 13 | )
 14 | from model.dumb_models import BestRankedPlayerWins
 15 | from model.lgbm import LightGBM
 16 | from model.sk_model import ScalerSVC
 17 | from model.xgboost import XGBoost
 18 | 
 19 | from data.data_loader import matches_data_loader
 20 | from data.data_loader import encode_data
 21 | from evaluation.train_test import train_test_evaluation
 22 | 
 23 | 
 24 | train_years = [2018, 2019, 2020]
 25 | test_years = [2021, 2022]
 26 | 
 27 | 
 28 | match_features = ["tournament_surface", "tournament_level"]
 29 | player_features = [
 30 |     "Ranking",
 31 |     "Ranking_Points",
 32 |     "Height",
 33 |     "Victories_Percentage",
 34 |     "Clay_Victories_Percentage",
 35 |     "Grass_Victories_Percentage",
 36 |     "Carpet_Victories_Percentage",
 37 |     "Hard_Victories_Percentage",
 38 |     "Aces_Percentage",
 39 | ]
 40 | additional_features = ["diff_rank", "v_perc_versus", "nb_match_versus"]
 41 | 
 42 | 
 43 | test_score = train_test_evaluation(
 44 |     train_years=train_years,
 45 |     test_years=test_years,
 46 |     model_class=BestRankedPlayerWins,
 47 |     model_params={},
 48 |     match_features=match_features,
 49 |     player_features=player_features,
 50 |     encoding_params={},
 51 |     additional_features=additional_features,
 52 |     save_path="../../results/20212022_chall",
 53 |     save_all_results=False,
 54 | )
 55 | 
 56 | lgbm_hyperparams = []
 57 | for num_leaves in [10, 100, 1000, 2000]:
 58 |     for min_data_leaf in [10, 100, 1000]:
 59 |         lgbm_hyperparams.append(
 60 |             {
 61 |                 "params": {
 62 |                     "num_leaves": num_leaves,
 63 |                     "objective": "binary",
 64 |                     "min_data_in_leaf": min_data_leaf,
 65 |                 }
 66 |             }
 67 |         )
 68 | test_score = train_test_evaluation(
 69 |     train_years=train_years,
 70 |     test_years=test_years,
 71 |     model_class=LightGBM,
 72 |     model_params=lgbm_hyperparams,
 73 |     match_features=match_features,
 74 |     player_features=player_features,
 75 |     encoding_params={},
 76 |     additional_features=additional_features,
 77 |     save_path="../../results/20212022_chall",
 78 |     save_all_results=False,
 79 | )
 80 | 
 81 | 
 82 | ada_hyperparams = []
 83 | for num_est in [10, 100, 1000, 2000]:
 84 |     for lr in [0.1, 1.0, 2.0]:
 85 |         ada_hyperparams.append(
 86 |             {
 87 |                 "n_estimators": num_est,
 88 |                 "learning_rate": lr,
 89 |             }
 90 |         )
 91 | test_score = train_test_evaluation(
 92 |     train_years=train_years,
 93 |     test_years=test_years,
 94 |     model_class=AdaBoostClassifier,
 95 |     model_params=ada_hyperparams,
 96 |     match_features=match_features,
 97 |     player_features=player_features,
 98 |     encoding_params={},
 99 |     additional_features=additional_features,
100 |     save_path="../../results/20212022_chall",
101 |     save_all_results=False,
102 | )
103 | 
104 | svc_hyperparams = []
105 | for C in [0.1, 1.0, 10.0, 100.0]:
106 |     for kernel in ["linear", "rbf"]:
107 |         svc_hyperparams.append(
108 |             {
109 |                 "C": C,
110 |                 "kernel": kernel,
111 |             }
112 |         )
113 | test_score = train_test_evaluation(
114 |     train_years=train_years,
115 |     test_years=test_years,
116 |     model_class=ScalerSVC,
117 |     model_params=svc_hyperparams,
118 |     match_features=match_features,
119 |     player_features=player_features,
120 |     encoding_params={},
121 |     additional_features=additional_features,
122 |     save_path="../../results/20212022_chall",
123 |     save_all_results=False,
124 | )
125 | 
126 | 
127 | for mx_depth in [1, 3, 5]:
128 |     for n_est in [10, 100, 1000, 2000]:
129 |         model_params = {"n_estimators": n_est, "max_depth": mx_depth}
130 |         model_class = RandomForestClassifier
131 | 
132 |         test_score = train_test_evaluation(
133 |             train_years=train_years,
134 |             test_years=test_years,
135 |             model_class=model_class,
136 |             model_params=model_params,
137 |             match_features=match_features,
138 |             player_features=player_features,
139 |             encoding_params={},
140 |             additional_features=additional_features,
141 |             save_path="../../results/20212022_chall",
142 |             save_all_results=False,
143 |         )
144 |         print("~~ Current Score ~~", test_score)
145 | 
146 | 
147 | for mx_depth in [1, 3, 5]:
148 |     for n_est in [10, 100, 1000, 2000]:
149 |         model_params = {"n_estimators": n_est, "max_depth": mx_depth}
150 |         model_class = GradientBoostingClassifier
151 | 
152 |         test_score = train_test_evaluation(
153 |             train_years=train_years,
154 |             test_years=test_years,
155 |             model_class=model_class,
156 |             model_params=model_params,
157 |             match_features=match_features,
158 |             player_features=player_features,
159 |             encoding_params={},
160 |             additional_features=additional_features,
161 |             save_path="../../results/20212022_chall",
162 |             save_all_results=False,
163 |         )
164 |         print("~~ Current Score ~~", test_score)
165 | 
166 | 
167 | lgbm_hyperparams = []
168 | for num_leaves in [10, 100, 1000, 2000]:
169 |     for min_data_leaf in [10, 100, 1000]:
170 |         lgbm_hyperparams.append(
171 |             {
172 |                 "params": {
173 |                     "num_leaves": num_leaves,
174 |                     "objective": "binary",
175 |                     "min_data_in_leaf": min_data_leaf,
176 |                 }
177 |             }
178 |         )
179 | 
180 | test_score = train_test_evaluation(
181 |     train_years=list([year for year in range(1990, 2021)]),
182 |     test_years=test_years,
183 |     model_class=LightGBM,
184 |     model_params=lgbm_hyperparams,
185 |     match_features=match_features,
186 |     player_features=player_features,
187 |     encoding_params={},
188 |     additional_features=additional_features,
189 |     save_path="../../results/20212022_chall",
190 |     save_all_results=False,
191 | )
192 | 
193 | xgb_hyperparams = []
194 | for eta in [0.1, 0.3, 0.6]:
195 |     for gamma in [0, 1, 10]:
196 |         for max_depth in [2, 4, 6, 8, 10]:
197 |             for min_child_weight in [1, 2, 8]:
198 |                 for subsample in [0.4, 0.8, 1]:
199 |                     xgb_hyperparams.append(
200 |                         {
201 |                             "params": {
202 |                                 "eta": eta,
203 |                                 "objective": "binary:logistic",
204 |                                 "gamma": gamma,
205 |                                 "max_depth": max_depth,
206 |                                 "min_child_weight": min_child_weight,
207 |                                 "subsample": subsample,
208 |                             }
209 |                         }
210 |                     )
211 | 
212 | test_score = train_test_evaluation(
213 |     train_years=train_years,
214 |     test_years=test_years,
215 |     model_class=XGBoost,
216 |     model_params=xgb_hyperparams,
217 |     match_features=match_features,
218 |     player_features=player_features,
219 |     encoding_params={},
220 |     additional_features=additional_features,
221 |     save_path="../../results/20212022_chall",
222 |     save_all_results=False,
223 | )
224 | 
225 | xgb_hyperparams = []
226 | for eta in [0.1, 0.3, 0.6]:
227 |     for gamma in [0, 1, 10]:
228 |         for max_depth in [2, 4, 6, 8, 10]:
229 |             for min_child_weight in [1, 2, 8]:
230 |                 for subsample in [0.4, 0.8, 1]:
231 |                     xgb_hyperparams.append(
232 |                         {
233 |                             "params": {
234 |                                 "eta": eta,
235 |                                 "objective": "binary:logistic",
236 |                                 "gamma": gamma,
237 |                                 "max_depth": max_depth,
238 |                                 "min_child_weight": min_child_weight,
239 |                                 "subsample": subsample,
240 |                             }
241 |                         }
242 |                     )
243 | 
244 | test_score = train_test_evaluation(
245 |     train_years=list([year for year in range(1990, 2021)]),
246 |     test_years=test_years,
247 |     model_class=XGBoost,
248 |     model_params=xgb_hyperparams,
249 |     match_features=match_features,
250 |     player_features=player_features,
251 |     encoding_params={},
252 |     additional_features=additional_features,
253 |     save_path="../../results/20212022_chall",
254 |     save_all_results=False,
255 | )
256 | 


--------------------------------------------------------------------------------
/python/data/data_encoding.py:
--------------------------------------------------------------------------------
  1 | import ast
  2 | import numpy as np
  3 | import pandas as pd
  4 | import tqdm
  5 | 
  6 | from history_modeling.match_representation import (
  7 |     create_timeless_dataset,
  8 |     get_match_info,
  9 | )
 10 | 
 11 | 
 12 | def clean_missing_data(df):
 13 |     """
 14 |     Cleans rows of df with missing data or to few statistics to be useful
 15 |     :param df:
 16 |     :return:
 17 |     """
 18 |     print("Length df before cleaning:", len(df))
 19 |     df = df.dropna(axis=0)
 20 |     print("after dropna", len(df))
 21 |     # df = df.loc[df.Ranking_1 != 9999]
 22 |     df = df.loc[df.Ranking_1 != 0]
 23 |     # df = df.loc[df.Ranking_2 != 9999]
 24 |     df = df.loc[df.Ranking_2 != 0]
 25 | 
 26 |     return df
 27 | 
 28 | 
 29 | def complete_missing_data(df, *args):
 30 |     for column, value in args:
 31 |         df[column].fillna(value, inplace=True)
 32 | 
 33 |     return df
 34 | 
 35 | 
 36 | def encode_data(df, mode="integer"):
 37 |     # Remove:
 38 |     #   - index
 39 |     #   - Unnamed: 0
 40 |     #   - Unnamed: 0.1
 41 |     #   - tournament
 42 |     #   - Name
 43 |     #   - ID
 44 |     #   - Birth Year => Age
 45 |     #   - Versus: % V against 2, last 5 matches
 46 |     #   - Matches
 47 | 
 48 |     # Refac:
 49 |     #   - Versus
 50 |     # Best way to do it ?
 51 |     #   - Birth Year
 52 |     #   - Last Tournament => Days since last tournament + result ?
 53 | 
 54 |     df_copy = df.copy()
 55 |     if mode == "integer":
 56 |         # Considered Variables:
 57 |         tournament_level = {"G": 0, "A": 1, "M": 2, "F": 3, "D": 4, "C": 5}
 58 |         tournament_surface = {"Clay": 0, "Carpet": 1, "Hard": 2, "Grass": 3}
 59 | 
 60 |         round = {
 61 |             "F": 0,
 62 |             "SF": 1,
 63 |             "QF": 2,
 64 |             "R16": 3,
 65 |             "R32": 4,
 66 |             "R64": 5,
 67 |             "R128": 6,
 68 |             "R256": 7,
 69 |             "RR": 8,
 70 |             "BR": 9,
 71 |             "ER": 10,
 72 |             "Q1": 11,
 73 |             "Q2": 12,
 74 |             "Q3": 13,
 75 |         }
 76 | 
 77 |         hand = {
 78 |             "R": -1,
 79 |             "L": 1,
 80 |             "A": 0,
 81 |             "U": 2,
 82 |             "nan": 2,
 83 |         }
 84 | 
 85 |     elif mode == "one_hot":
 86 |         # Considered Variables:
 87 |         tournament_level = {
 88 |             "G": [0, 0, 0, 1, 0],
 89 |             "A": [0, 0, 1, 0, 0],
 90 |             "M": [0, 1, 0, 0, 0],
 91 |             "D": [1, 0, 0, 0, 0],
 92 |             "C": [0, 0, 0, 0, 1],
 93 |         }
 94 | 
 95 |         tournament_surface = {
 96 |             "Clay": [1, 0, 0, 0],
 97 |             "Carpet": [0, 1, 0, 0],
 98 |             "Hard": [0, 0, 1, 0],
 99 |             "Grass": [0, 0, 0, 1],
100 |         }
101 | 
102 |         round = {
103 |             "F": [0, 0, 0, 0, 0, 0, 0, 0, 1],
104 |             "SF": [0, 0, 0, 0, 0, 0, 0, 1, 0],
105 |             "QF": [0, 0, 0, 0, 0, 0, 1, 0, 0],
106 |             "R16": [0, 0, 0, 0, 0, 1, 0, 0, 0],
107 |             "R32": [0, 0, 0, 0, 1, 0, 0, 0, 0],
108 |             "R64": [0, 0, 0, 1, 0, 0, 0, 0, 0],
109 |             "R128": [0, 0, 1, 0, 0, 0, 0, 0, 0],
110 |             "R256": [0, 1, 0, 0, 0, 0, 0, 0, 0],
111 |             "RR": [1, 0, 0, 0, 0, 0, 0, 0, 0],
112 |         }
113 | 
114 |         hand = {
115 |             "R": [1, 0, 0, 0],
116 |             "L": [0, 1, 0, 0],
117 |             "A": [0, 0, 1, 0],
118 |             "U": [0, 0, 0, 1],
119 |         }
120 | 
121 |     elif mode == "mixing":
122 |         # Considered Variables:
123 |         tournament_level = {"G": 0, "A": 1, "M": 2, "F": 3, "D": 4, "C": 5}
124 |         tournament_surface = {
125 |             "Clay": [1, 0, 0, 0],
126 |             "Carpet": [0, 1, 0, 0],
127 |             "Hard": [0, 0, 1, 0],
128 |             "Grass": [0, 0, 0, 1],
129 |         }
130 | 
131 |         round = {
132 |             "F": 0,
133 |             "SF": 1,
134 |             "QF": 2,
135 |             "R16": 3,
136 |             "R32": 4,
137 |             "R64": 5,
138 |             "R128": 6,
139 |             "R256": 7,
140 |             "RR": 8,
141 |             "BR": 9,
142 |         }
143 | 
144 |         hand = {
145 |             "R": [1, 0, 0, 0],
146 |             "L": [0, 1, 0, 0],
147 |             "A": [0, 0, 1, 0],
148 |             "U": [0, 0, 0, 1],
149 |         }
150 | 
151 |     for col in df_copy.columns:
152 |         if "hand" in col.lower():
153 |             df_copy[col] = df_copy.apply(lambda row: hand[str(row[col])], axis=1)
154 |         elif "round" in col.lower():
155 |             df_copy[col] = df_copy.apply(lambda row: round[row[col]], axis=1)
156 |         elif "tournament_level" in col.lower():
157 |             df_copy[col] = df_copy.apply(lambda row: tournament_level[row[col]], axis=1)
158 |         elif "tournament_surface" in col.lower():
159 |             df_copy[col] = df_copy.apply(
160 |                 lambda row: tournament_surface[row[col]], axis=1
161 |             )
162 |         else:
163 |             pass
164 | 
165 |     return df_copy
166 | 
167 | 
def create_additional_features(df, features):
    """
    Adds the requested derived columns to a copy of df.

    Supported features:
      - "nb_match_versus": number of entries of the Versus_1 list
      - "v_perc_versus": share of the Versus_1 entries whose first element is
        "V", or -1 when the list is empty
      - "diff_rank": Ranking_2 - Ranking_1
      - "diff_rank_points": Ranking_Points_2 - Ranking_Points_1

    :param df: matches DataFrame; Versus_1 holds the string representation of
        a list of sequences, parsed with ast.literal_eval
    :param features: names of the features to create, unknown names are ignored
    :return: a copy of df with one additional column per requested feature
    """
    df = df.copy()

    if "nb_match_versus" in features or "v_perc_versus" in features:
        # Parse every Versus_1 cell once and share the result between both features
        versus_outcomes = df["Versus_1"].map(
            lambda raw: [entry[0] for entry in ast.literal_eval(raw)]
        )

        if "nb_match_versus" in features:
            df["nb_match_versus"] = versus_outcomes.map(len)

        if "v_perc_versus" in features:
            df["v_perc_versus"] = versus_outcomes.map(
                lambda outcomes: outcomes.count("V") / len(outcomes)
                if len(outcomes) > 0
                else -1
            )

    # Column-wise subtraction: same values as a row-wise apply, and it also
    # works on an empty DataFrame
    if "diff_rank" in features:
        df["diff_rank"] = df["Ranking_2"] - df["Ranking_1"]

    if "diff_rank_points" in features:
        df["diff_rank_points"] = df["Ranking_Points_2"] - df["Ranking_Points_1"]

    return df
196 | 
197 | 
def create_encoded_history(df, encoder, num_matches, completing_value=0):
    """
    Encodes, for every row of df, the recent match history of both players.

    For each player, the ids of the last num_matches entries of the row's
    Matches_1 / Matches_2 list are looked up in df (keeping the rows where the
    player is ID_1), encoded with encoder.predict and left-padded with
    completing_value up to num_matches rows.

    :param df: matches DataFrame with the id, ID_1, ID_2, Matches_1 and
        Matches_2 columns; Matches_* hold the string representation of a list
        whose entries carry the match id at position 1
    :param encoder: object exposing predict(df, transform_data=True), returning
        (encoded array, transformed DataFrame), and an output_shape attribute
        used as the width of the padding when a player has no history
    :param num_matches: number of history matches per player
    :param completing_value: value used to pad histories shorter than num_matches
    :return: DataFrame with the id, ID_1, ID_2, history_1 and history_2 columns,
        each history being an array of num_matches rows
    """
    df = df.copy()

    def _last_match_ids(raw_matches):
        """Parses a serialized match list and returns the ids of its last entries."""
        try:
            parsed_matches = ast.literal_eval(raw_matches)
        except Exception:
            # Keep a copy of the offending value for debugging, then let the
            # parsing error propagate
            with open("error.txt", "w") as file:
                file.write(str(raw_matches))
            raise
        return [entry[1] for entry in parsed_matches[-num_matches:]]

    def _encode_player_history(raw_matches, player_id):
        """Returns (padded encoded history, history DataFrame) of one player."""
        match_ids = _last_match_ids(raw_matches)
        player_history = df.loc[df.id.isin(match_ids) & (df.ID_1 == player_id)]

        if len(player_history) == 0:
            # Nothing to encode: the history is made of padding only
            padding_only = (
                np.ones((num_matches, encoder.output_shape)) * completing_value
            )
            return padding_only, player_history

        encoded, player_history = encoder.predict(player_history, transform_data=True)
        nb_missing = num_matches - encoded.shape[0]
        if nb_missing > 0:
            # Padding goes first so that the encoded matches stay at the end
            padding = np.ones((nb_missing, encoded.shape[1])) * completing_value
            encoded = np.concatenate([padding, encoded], axis=0)
        return encoded, player_history

    history = {
        "id": [],
        "ID_1": [],
        "ID_2": [],
        "history_1": [],
        "history_2": [],
    }

    for n_row, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        encoded_history_1, _ = _encode_player_history(row["Matches_1"], row.ID_1)
        encoded_history_2, df_history = _encode_player_history(
            row["Matches_2"], row.ID_2
        )

        history["id"].append(row.id)
        history["ID_1"].append(row.ID_1)
        history["ID_2"].append(row.ID_2)

        history["history_1"].append(encoded_history_1)
        history["history_2"].append(encoded_history_2)

        # NOTE(review): debugging dumps of the second player's history, kept as
        # they were; they overwrite the same three files in the working
        # directory for every row whose index label is below 100
        if n_row < 100 and len(df_history) > 0:
            row.to_csv("row.csv")
            df_history.to_csv("df_history.csv")
            np.save("encoded_history.npy", encoded_history_2)
    return pd.DataFrame(history)
288 | 


--------------------------------------------------------------------------------
/python/data/match.py:
--------------------------------------------------------------------------------
  1 | import ast
  2 | 
  3 | import pandas as pd
  4 | import numpy as np
  5 | 
  6 | 
  7 | class Match:
  8 |     def __init__(self, winner, loser, tournament, surface, id_prefix=""):
  9 |         self.winner = winner
 10 |         self.loser = loser
 11 |         self.tournament = tournament
 12 |         self.surface = surface
 13 |         self.id_prefix = id_prefix
 14 | 
 15 |         self.tournament_date = ""
 16 |         self.tournament_level = ""
 17 |         self.round = ""
 18 |         self.data = None
 19 |         self.match_time_players_data = {
 20 |             "winner": {
 21 |                 "id": self.winner,
 22 |                 "age": 0,
 23 |                 "rank": 0,
 24 |                 "ranking_points": 0,
 25 |                 "aces_nb": 0,
 26 |                 "df_nb": 0,
 27 |                 "w_svpt": 0,
 28 |                 "w_1stIn": 0,
 29 |                 "w_1stWon": 0,
 30 |                 "w_2ndWon": 0,
 31 |                 "w_SvGms": 0,
 32 |                 "w_bpSaved": 0,
 33 |                 "w_bpFaced": 0,
 34 |             },
 35 |             "loser": {
 36 |                 "id": self.loser,
 37 |                 "age": 0,
 38 |                 "rank": 0,
 39 |                 "ranking_points": 0,
 40 |                 "aces_nb": 0,
 41 |                 "df_nb": 0,
 42 |                 "w_svpt": 0,
 43 |                 "w_1stIn": 0,
 44 |                 "w_1stWon": 0,
 45 |                 "w_2ndWon": 0,
 46 |                 "w_SvGms": 0,
 47 |                 "w_bpSaved": 0,
 48 |                 "w_bpFaced": 0,
 49 |             },
 50 |         }
 51 | 
 52 |         self.sets_number = 0
 53 |         self.score = None
 54 |         self.elapsed_minutes = None
 55 |         self.best_of = None
 56 | 
 57 |     def get_rankings(self, player_id):
 58 |         if player_id == self.winner.id:
 59 |             return (
 60 |                 self.match_time_players_data["winner"]["rank"],
 61 |                 self.match_time_players_data["winner"]["ranking_points"],
 62 |             )
 63 |         else:
 64 |             return (
 65 |                 self.match_time_players_data["loser"]["rank"],
 66 |                 self.match_time_players_data["loser"]["ranking_points"],
 67 |             )
 68 | 
 69 |     def get_aces_nb(self, player_id):
 70 |         if player_id == self.winner.id:
 71 |             return self.match_time_players_data["winner"]["aces_nb"]
 72 |         else:
 73 |             return self.match_time_players_data["loser"]["aces_nb"]
 74 | 
 75 |     def get_service_points_played(self, player_id):
 76 |         if player_id == self.winner.id:
 77 |             return self.match_time_players_data["winner"]["w_svpt"]
 78 |         else:
 79 |             return self.match_time_players_data["loser"]["w_svpt"]
 80 | 
 81 |     def get_df_nb(self, player_id):
 82 |         if player_id == self.winner.id:
 83 |             return self.match_time_players_data["winner"]["df_nb"]
 84 |         else:
 85 |             return self.match_time_players_data["loser"]["df_nb"]
 86 | 
 87 |     def get_first_serve_win(self, player_id):
 88 |         if player_id == self.winner.id:
 89 |             return self.match_time_players_data["winner"]["w_1stWon"]
 90 |         else:
 91 |             return self.match_time_players_data["loser"]["w_1stWon"]
 92 | 
 93 |     def get_second_serve_win(self, player_id):
 94 |         if player_id == self.winner.id:
 95 |             return self.match_time_players_data["winner"]["w_2ndWon"]
 96 |         else:
 97 |             return self.match_time_players_data["loser"]["w_2ndWon"]
 98 | 
 99 |     def get_first_services_in(self, player_id):
100 |         if player_id == self.winner.id:
101 |             return self.match_time_players_data["winner"]["w_1stIn"]
102 |         else:
103 |             return self.match_time_players_data["loser"]["w_1stIn"]
104 | 
105 |     def get_breakpoint_faced(self, player_id):
106 |         if player_id == self.winner.id:
107 |             return self.match_time_players_data["winner"]["w_bpFaced"]
108 |         else:
109 |             return self.match_time_players_data["loser"]["w_bpFaced"]
110 | 
111 |     def get_breakpoint_saved(self, player_id):
112 |         if player_id == self.winner.id:
113 |             return self.match_time_players_data["winner"]["w_bpSaved"]
114 |         else:
115 |             return self.match_time_players_data["loser"]["w_bpSaved"]
116 | 
117 |     def __str__(self):
118 |         return (
119 |             "TOURNAMENT : "
120 |             + self.tournament
121 |             + " W : "
122 |             + self.winner
123 |             + " L : "
124 |             + self.loser
125 |         )
126 | 
127 |     def get_prior_data_and_update_players_stats(self):
128 |         match_data = pd.DataFrame(
129 |             {
130 |                 "id": [self.id],
131 |                 "tournament": [self.tournament],
132 |                 "tournament_level": [self.tournament_level],
133 |                 "tournament_date": [self.tournament_date],
134 |                 "tournament_surface": [self.surface],
135 |                 "round": [self.round],
136 |                 "best_of": [self.best_of],
137 |             }
138 |         )
139 | 
140 |         w_data = self.winner.get_data_df(opponent=self.loser.id)
141 |         lr, lrp = self.winner.get_last_months_rankings(
142 |             date=self.tournament_date, nb_months=12, day_of_month="last"
143 |         )
144 |         w_data["last_rankings"] = [lr]
145 |         w_data["last_ranking_points"] = [lrp]
146 |         l_data = self.loser.get_data_df(opponent=self.winner.id)
147 |         lr, lrp = self.loser.get_last_months_rankings(
148 |             date=self.tournament_date, nb_months=12, day_of_month="last"
149 |         )
150 |         l_data["last_rankings"] = [lr]
151 |         l_data["last_ranking_points"] = [lrp]
152 | 
153 |         self.winner.update_from_match(self)
154 |         self.loser.update_from_match(self)
155 |         return match_data, w_data, l_data
156 | 
157 |     def get_match_data_results_statistics(self):
158 |         match_statistics = {
159 |             "score": [self.score],
160 |             "elapsed_minutes": [self.elapsed_minutes],
161 |         }
162 | 
163 |         winner_statistics = {
164 |             "aces_nb": [self.match_time_players_data["winner"]["aces_nb"]],
165 |             "doublefaults_nb": [self.match_time_players_data["winner"]["df_nb"]],
166 |             "svpt": [self.match_time_players_data["winner"]["w_svpt"]],
167 |             "1stIn": [self.match_time_players_data["winner"]["w_1stIn"]],
168 |             "1stWon": [self.match_time_players_data["winner"]["w_1stWon"]],
169 |             "2ndWon": [self.match_time_players_data["winner"]["w_2ndWon"]],
170 |             "SvGms": [self.match_time_players_data["winner"]["w_SvGms"]],
171 |             "bpSaved": [self.match_time_players_data["winner"]["w_bpSaved"]],
172 |             "bpFaced": [self.match_time_players_data["winner"]["w_bpFaced"]],
173 |         }
174 |         loser_statistics = {
175 |             "aces_nb": [self.match_time_players_data["loser"]["aces_nb"]],
176 |             "doublefaults_nb": [self.match_time_players_data["loser"]["df_nb"]],
177 |             "svpt": [self.match_time_players_data["loser"]["w_svpt"]],
178 |             "1stIn": [self.match_time_players_data["loser"]["w_1stIn"]],
179 |             "1stWon": [self.match_time_players_data["loser"]["w_1stWon"]],
180 |             "2ndWon": [self.match_time_players_data["loser"]["w_2ndWon"]],
181 |             "SvGms": [self.match_time_players_data["loser"]["w_SvGms"]],
182 |             "bpSaved": [self.match_time_players_data["loser"]["w_bpSaved"]],
183 |             "bpFaced": [self.match_time_players_data["loser"]["w_bpFaced"]],
184 |         }
185 | 
186 |         return (
187 |             pd.DataFrame(match_statistics),
188 |             pd.DataFrame(winner_statistics),
189 |             pd.DataFrame(loser_statistics),
190 |         )
191 | 
192 |     def instantiate_from_data_row(self, data_row):
193 |         self.tournament_date = data_row["tourney_date"]
194 |         self.tournament_level = data_row["tourney_level"]
195 |         self.round = data_row["round"]
196 |         self.sets_number = len(str(data_row["score"]).split("-"))
197 |         self.games_number = 0
198 |         for set in str(data_row["score"]).split(" "):
199 |             try:
200 |                 games_0 = int(set.split("-")[0][0])
201 |                 games_a = int(set.split("-")[1][0])
202 |                 self.games_number += games_0
203 |                 self.games_number += games_1
204 |             except:
205 |                 pass
206 | 
207 |         self.score = data_row["score"]
208 |         self.elapsed_minutes = data_row["minutes"]
209 |         self.best_of = data_row["best_of"]
210 | 
211 |         self.id = self.id_prefix + "_" + str(data_row.name)
212 | 
213 |         self.match_time_players_data = {
214 |             "winner": {
215 |                 "id": data_row["winner_id"],
216 |                 "age": data_row["winner_age"],
217 |                 "rank": data_row["winner_rank"],
218 |                 "ranking_points": data_row["winner_rank_points"],
219 |                 "aces_nb": data_row["w_ace"],
220 |                 "df_nb": data_row["w_df"],
221 |                 "w_svpt": data_row["w_svpt"],
222 |                 "w_1stIn": data_row["w_1stIn"],
223 |                 "w_1stWon": data_row["w_1stWon"],
224 |                 "w_2ndWon": data_row["w_2ndWon"],
225 |                 "w_SvGms": data_row["w_SvGms"],
226 |                 "w_bpSaved": data_row["w_bpSaved"],
227 |                 "w_bpFaced": data_row["w_bpFaced"],
228 |             },
229 |             "loser": {
230 |                 "id": data_row["loser_id"],
231 |                 "age": data_row["loser_age"],
232 |                 "rank": data_row["loser_rank"],
233 |                 "ranking_points": data_row["loser_rank_points"],
234 |                 "aces_nb": data_row["l_ace"],
235 |                 "df_nb": data_row["l_df"],
236 |                 "w_svpt": data_row["l_svpt"],
237 |                 "w_1stIn": data_row["l_1stIn"],
238 |                 "w_1stWon": data_row["l_1stWon"],
239 |                 "w_2ndWon": data_row["l_2ndWon"],
240 |                 "w_SvGms": data_row["l_SvGms"],
241 |                 "w_bpSaved": data_row["l_bpSaved"],
242 |                 "w_bpFaced": data_row["l_bpFaced"],
243 |             },
244 |         }
245 | 


--------------------------------------------------------------------------------
/python/evaluation/train_test.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | 
  7 | from data.data_loader import matches_data_loader
  8 | from data.data_encoding import (
  9 |     encode_data,
 10 |     create_additional_features,
 11 |     clean_missing_data,
 12 |     create_encoded_history,
 13 | )
 14 | 
 15 | absolute_path = os.path.dirname(os.path.abspath(__file__))
 16 | default_columns_match = ["tournament_level", "round", "best_of", "tournament_surface"]
 17 | 
 18 | default_columns_player = [
 19 |     "Ranking",
 20 |     "Ranking_Points",
 21 |     "Hand",
 22 |     "Height",
 23 |     "Versus",
 24 |     "Victories_Percentage",
 25 |     "Clay_Victories_Percentage",
 26 |     "Grass_Victories_Percentage",
 27 |     "Carpet_Victories_Percentage",
 28 |     "Hard_Victories_Percentage",
 29 |     "Aces_Percentage",
 30 |     "Doublefaults_Percentage",
 31 |     "First_Serve_Success_Percentage",
 32 |     "Winning_on_1st_Serve_Percentage",
 33 |     "Winning_on_2nd_Serve_Percentage",
 34 |     "Overall_Win_on_Serve_Percentage",
 35 |     "BreakPoint_Face_Percentage",
 36 |     "BreakPoint_Saved_Percentage",
 37 |     "Fatigue",
 38 | ]
 39 | 
 40 | 
 41 | def train_test_evaluation(
 42 |     train_years,
 43 |     test_years,
 44 |     model_class,
 45 |     model_params,
 46 |     encoder_models=[],
 47 |     use_davis_data=False,
 48 |     history_encoder_years=1,
 49 |     match_features=default_columns_match,
 50 |     player_features=default_columns_player,
 51 |     encoding_params={},
 52 |     additional_features=[],
 53 |     save_path=None,
 54 |     save_all_results=False,
 55 | ):
 56 |     global absolute_path
 57 |     assert len(set(train_years).intersection(set(test_years))) == 0
 58 |     print(f"[+] Beginning Train/Test Evaluation for model class {model_class}")
 59 | 
 60 |     min_year = np.min(train_years + test_years)
 61 |     min_year -= history_encoder_years
 62 |     print(f"[+] Loading Data from year {min_year}")
 63 |     data_df = matches_data_loader(
 64 |         path_to_data=os.path.join(absolute_path, "../../submodules/tennis_atp"),
 65 |         path_to_cache=os.path.join(absolute_path, "../../cache"),
 66 |         flush_cache=False,
 67 |         keep_values_from_year=min_year,
 68 |         get_match_statistics=False,
 69 |         get_reversed_match_data=True,
 70 |         include_davis_cup=use_davis_data,
 71 |     )
 72 |     print(f"[+] Data Loaded, Now Encoding Data and create additional Features")
 73 | 
 74 |     historic_data = data_df.loc[data_df.tournament_year < min(train_years)]
 75 |     train_data = data_df.loc[data_df.tournament_year.isin(train_years)]
 76 |     test_data = data_df.loc[data_df.tournament_year.isin(test_years)]
 77 | 
 78 |     history_columns = []
 79 |     for encoding_model, encoding_model_params in encoder_models:
 80 |         print(f"[+] Training Encoder Model {encoding_model}")
 81 |         encoder = encoding_model(**encoding_model_params)
 82 |         encoder.fit(train_data)
 83 | 
 84 |         print(f"[+] Encoding using encoder {encoding_model}")
 85 |         encoded_data = create_encoded_history(
 86 |             data_df, encoder, num_matches=5, completing_value=0
 87 |         )
 88 | 
 89 |         cols = ["history_1", "history_2"]
 90 | 
 91 |         flatten_data = pd.concat(
 92 |             [
 93 |                 pd.DataFrame(
 94 |                     np.array(encoded_data[x].values.tolist()).reshape(
 95 |                         (len(encoded_data), -1)
 96 |                     )
 97 |                 ).add_prefix(x)
 98 |                 for x in cols
 99 |             ],
100 |             axis=1,
101 |         )
102 |         encoded_data = pd.concat(
103 |             [flatten_data, encoded_data.drop(cols, axis=1)], axis=1
104 |         )
105 |         enc_columns = encoded_data.columns
106 |         enc_columns = list(set(enc_columns) - set(["id", "ID_1", "ID_2"]))
107 |         history_columns.extend(enc_columns)
108 | 
109 |         data_df = pd.merge(data_df, encoded_data, on=["id", "ID_1", "ID_2"])
110 | 
111 |         # train_data = pd.merge(train_data, encoded_data, on=["id", "ID_1", "ID_2"])
112 |         # test_data = pd.merge(test_data, encoded_data, on=["id", "ID_1", "ID_2"])
113 | 
114 |     train_data = data_df.loc[data_df.tournament_year.isin(train_years)]
115 |     test_data = data_df.loc[data_df.tournament_year.isin(test_years)]
116 |     train_data = create_additional_features(train_data, additional_features)
117 |     train_data = encode_data(train_data, **encoding_params)
118 |     test_data = create_additional_features(test_data, additional_features)
119 |     test_data = encode_data(test_data, **encoding_params)
120 | 
121 |     p1_features = [feat + "_1" for feat in player_features]
122 |     p2_features = [feat + "_2" for feat in player_features]
123 |     match_features = match_features.copy()
124 |     match_features.extend(additional_features.copy())
125 | 
126 |     train_data = train_data[
127 |         match_features
128 |         + p1_features
129 |         + p2_features
130 |         + history_columns
131 |         + ["Winner", "tournament_year"]
132 |     ]
133 |     test_data = test_data[
134 |         match_features
135 |         + p1_features
136 |         + p2_features
137 |         + history_columns
138 |         + ["Winner", "tournament_year"]
139 |     ]
140 | 
141 |     print(f"[+] Cleaning Data")
142 |     train_data = clean_missing_data(train_data)
143 |     test_data = clean_missing_data(test_data)
144 |     print(f"Training on {len(train_data)} data and testing on {len(test_data)} data")
145 | 
146 |     print(f"[+] Data Ready, now beginning modelling")
147 |     if isinstance(model_params, list):
148 |         precisions = []
149 |         for params_set in model_params:
150 |             model = model_class(**params_set)
151 |             t_fit = time.time()
152 |             model.fit(
153 |                 train_data[match_features + p1_features + p2_features],
154 |                 train_data["Winner"].values.ravel(),
155 |             )
156 |             t_fit = time.time() - t_fit
157 |             print(f"~~ Fit time: {np.round(t_fit, 0)}")
158 | 
159 |             preds = model.predict(test_data[match_features + p1_features + p2_features])
160 |             precision = np.sum(np.squeeze(preds) == test_data["Winner"].values) / len(
161 |                 preds
162 |             )
163 |             precisions.append(precision)
164 | 
165 |             if save_path is not None:
166 |                 try:
167 |                     df_res = pd.read_csv(
168 |                         os.path.join(save_path, "results.csv"), sep=";"
169 |                     )
170 |                 except:
171 |                     print("save file not found")
172 |                     os.makedirs(save_path, exist_ok=True)
173 |                     df_res = pd.DataFrame()
174 | 
175 |                 df_curr = pd.DataFrame(
176 |                     {
177 |                         "train_years": [train_years],
178 |                         "test_years": [test_years],
179 |                         "model_class": [model_class.__name__],
180 |                         "model_params": [params_set],
181 |                         "match_features": [match_features],
182 |                         "player_features": [player_features],
183 |                         "encoding_params": [encoding_params],
184 |                         "additional_features": [additional_features.copy()],
185 |                         "precision": [precision],
186 |                         "fit_time": [np.round(t_fit, 0)],
187 |                     }
188 |                 )
189 | 
190 |                 if save_all_results:
191 |                     eval_id = int(time.time() * 100)
192 |                     df_curr["eval_ID"] = [eval_id]
193 |                     test_data["y_pred"] = preds
194 |                     test_data.to_csv(
195 |                         os.path.join(save_path, f"{eval_id}.csv"), index=False, sep=";"
196 |                     )
197 | 
198 |                 df_res = pd.concat([df_res, df_curr], axis=0)
199 |                 df_res.to_csv(
200 |                     os.path.join(save_path, "results.csv"), index=False, sep=";"
201 |                 )
202 | 
203 |         return precisions
204 | 
205 |     else:
206 |         model = model_class(**model_params)
207 |         t_fit = time.time()
208 |         model.fit(
209 |             train_data[match_features + p1_features + p2_features],
210 |             train_data["Winner"].values.ravel(),
211 |         )
212 |         t_fit = time.time() - t_fit
213 |         print(f"~~ Fit time: {np.round(t_fit, 0)}")
214 | 
215 |         print(f"[+] Fit ended, now predicting on test set")
216 |         preds = model.predict(test_data[match_features + p1_features + p2_features])
217 |         precision = np.sum(np.squeeze(preds) == test_data["Winner"].values) / len(preds)
218 |         if save_path is not None:
219 |             try:
220 |                 df_res = pd.read_csv(os.path.join(save_path, "results.csv"), sep=";")
221 |             except:
222 |                 print("save file not found")
223 |                 os.makedirs(save_path, exist_ok=True)
224 |                 df_res = pd.DataFrame()
225 | 
226 |             df_curr = pd.DataFrame(
227 |                 {
228 |                     "train_years": [train_years],
229 |                     "test_years": [test_years],
230 |                     "model_class": [model_class.__name__],
231 |                     "model_params": [model_params],
232 |                     "encoder_models": [encoder_models],
233 |                     "history_encoder_years": [history_encoder_years],
234 |                     "match_features": [match_features],
235 |                     "player_features": [player_features],
236 |                     "encoding_params": [encoding_params],
237 |                     "additional_features": [additional_features.copy()],
238 |                     "precision": [precision],
239 |                     "fit_time": [np.round(t_fit, 0)],
240 |                 }
241 |             )
242 |             if save_all_results:
243 |                 print(f"[+] Saving Results")
244 |                 eval_id = int(time.time())
245 |                 df_curr["eval_ID"] = [eval_id]
246 |                 test_data["y_pred"] = preds
247 |                 test_data.to_csv(
248 |                     os.path.join(save_path, f"{eval_id}.csv"), index=False, sep=";"
249 |                 )
250 | 
251 |             df_res = pd.concat([df_res, df_curr], axis=0)
252 |             df_res.to_csv(os.path.join(save_path, "results.csv"), index=False, sep=";")
253 | 
254 |         return precision
255 | 


--------------------------------------------------------------------------------
/examples/data/data_row_example.csv:
--------------------------------------------------------------------------------
1 | ,level_0,index,id,tournament,tournament_level,tournament_date,tournament_surface,round,best_of,match_id,Name_1,ID_1,Ranking_1,Ranking_Points_1,Ranking_History_1,Best_Rank_1,Birth_Year_1,Versus_1,Hand_1,Last_Tournament_Date_1,Height_1,Matches_1,Matches_Clay_1,Matches_Carpet_1,Matches_Grass_1,Matches_Hard_1,Victories_Percentage_1,Clay_Victories_Percentage_1,Carpet_Victories_Percentage_1,Grass_Victories_Percentage_1,Hard_Victories_Percentage_1,Aces_Percentage_1,Doublefaults_Percentage_1,First_Serve_Success_Percentage_1,Winning_on_1st_Serve_Percentage_1,Winning_on_2nd_Serve_Percentage_1,Overall_Win_on_Serve_Percentage_1,BreakPoint_Face_Percentage_1,BreakPoint_Saved_Percentage_1,games_fatigue_1,minutes_fatigue_1,last_rankings_1,last_ranking_points_1,Name_2,ID_2,Ranking_2,Ranking_Points_2,Ranking_History_2,Best_Rank_2,Birth_Year_2,Versus_2,Hand_2,Last_Tournament_Date_2,Height_2,Matches_2,Matches_Clay_2,Matches_Carpet_2,Matches_Grass_2,Matches_Hard_2,Victories_Percentage_2,Clay_Victories_Percentage_2,Carpet_Victories_Percentage_2,Grass_Victories_Percentage_2,Hard_Victories_Percentage_2,Aces_Percentage_2,Doublefaults_Percentage_2,First_Serve_Success_Percentage_2,Winning_on_1st_Serve_Percentage_2,Winning_on_2nd_Serve_Percentage_2,Overall_Win_on_Serve_Percentage_2,BreakPoint_Face_Percentage_2,BreakPoint_Saved_Percentage_2,games_fatigue_2,minutes_fatigue_2,last_rankings_2,last_ranking_points_2,Winner,score,elapsed_minutes,aces_nb_1,doublefaults_nb_1,svpt_1,1stIn_1,1stWon_1,2ndWon_1,SvGms_1,bpSaved_1,bpFaced_1,aces_nb_2,doublefaults_nb_2,svpt_2,1stIn_2,1stWon_2,2ndWon_2,SvGms_2,bpSaved_2,bpFaced_2,tournament_year,Fatigue_1,Fatigue_2
2 | 10,25616,10854,atp_matches_qual_chall_2003_5427,San Benedetto CH,C,20030811,Clay,SF,3,atp_matches_qual_chall_2003_5427,Stan.Wawrinka,104527,284.0,114.0,"{20030616: [387, 68], 20030707: [363, 74], 20030714: [348, 79], 20030721: [303, 99], 20030811: [284, 114]}",284,19850328.0,[],R,20030721,183.0,"[['V', 'atp_matches_qual_chall_2003_3466'], ['D', 'atp_matches_qual_chall_2003_3481'], ['D', 'atp_matches_2003_4049'], ['V', 'atp_matches_2003_4315'], ['D', 'atp_matches_2003_4328'], ['V', 'atp_matches_2003_4773'], ['D', 'atp_matches_2003_4782'], ['V', 'atp_matches_qual_chall_2003_5408'], ['V', 'atp_matches_qual_chall_2003_5419'], ['V', 'atp_matches_qual_chall_2003_5424']]","['V', 'D', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'V']",[],[],[],60.0,60.0,0.0,0.0,0.0,3.418803418803419,4.273504273504273,64.95726495726495,54.98575498575499,15.669515669515668,70.65527065527066,11.396011396011396,7.6923076923076925,38.09090909090909,,"[303, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 387]","[99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68]",Martin.Vassallo Arguello,103506,125.0,296.0,"{19990201: [817, 13], 20000710: [398, 61], 20000731: [354, 75], 20000807: [377, 70], 20010625: [459, 48], 20010709: [405, 61], 20010813: [391, 68], 20010820: [374, 72], 20010827: [342, 88], 20010917: [291, 117], 20010924: [286, 122], 20011008: [238, 154], 20011015: [237, 157], 20011022: [211, 178], 20011112: [206, 181], 20011126: [198, 186], 20011203: [201, 186], 20011231: [202, 186], 20020318: [175, 220], 20020325: [175, 220], 20020401: [178, 213], 20020408: [173, 219], 20020422: [174, 219], 20020429: [176, 217], 20020506: [151, 265], 20020513: [140, 286], 20020527: [140, 285], 20020610: [135, 304], 20020617: [123, 328], 20020624: [123, 328], 20020701: [123, 328], 20020708: [125, 320], 20020715: [132, 311], 20020722: [129, 312], 20020819: [136, 304], 20020930: [165, 220], 20021007: [158, 232], 20030127: [204, 164], 20030210: [204, 164], 20030217: [203, 168], 20030224: [198, 172], 20030324: [197, 
177], 20030421: [195, 177], 20030428: [188, 188], 20030512: [255, 118], 20030526: [204, 167], 20030602: [204, 167], 20030609: [211, 163], 20030616: [230, 137], 20030623: [233, 137], 20030630: [233, 137], 20030707: [218, 157], 20030714: [181, 202], 20030721: [163, 232], 20030728: [157, 247], 20030804: [126, 296], 20030811: [125, 296]}",123.0,19800210.0,[],R,20030804.0,183.0,"[['V', 'atp_matches_qual_chall_1999_380'], ['D', 'atp_matches_qual_chall_1999_393'], ['V', 'atp_matches_qual_chall_2000_3972'], ['V', 'atp_matches_qual_chall_2000_3988'], ['D', 'atp_matches_qual_chall_2000_3996'], ['D', 'atp_matches_qual_chall_2000_4725'], ['D', 'atp_matches_qual_chall_2000_4758'], ['V', 'atp_matches_qual_chall_2001_3699'], ['V', 'atp_matches_qual_chall_2001_3712'], ['D', 'atp_matches_qual_chall_2001_3719'], ['V', 'atp_matches_qual_chall_2001_4080'], ['V', 'atp_matches_qual_chall_2001_4089'], ['D', 'atp_matches_qual_chall_2001_4093'], ['V', 'atp_matches_qual_chall_2001_5286'], ['D', 'atp_matches_qual_chall_2001_5295'], ['V', 'atp_matches_qual_chall_2001_5433'], ['V', 'atp_matches_qual_chall_2001_5446'], ['D', 'atp_matches_qual_chall_2001_5453'], ['V', 'atp_matches_qual_chall_2001_5805'], ['V', 'atp_matches_qual_chall_2001_5814'], ['V', 'atp_matches_qual_chall_2001_5818'], ['D', 'atp_matches_qual_chall_2001_5820'], ['V', 'atp_matches_qual_chall_2001_6263'], ['D', 'atp_matches_qual_chall_2001_6275'], ['V', 'atp_matches_qual_chall_2001_6452'], ['V', 'atp_matches_qual_chall_2001_6461'], ['V', 'atp_matches_qual_chall_2001_6466'], ['V', 'atp_matches_qual_chall_2001_6468'], ['D', 'atp_matches_qual_chall_2001_6469'], ['D', 'atp_matches_qual_chall_2001_6943'], ['V', 'atp_matches_qual_chall_2001_7080'], ['V', 'atp_matches_qual_chall_2001_7090'], ['V', 'atp_matches_qual_chall_2001_7095'], ['D', 'atp_matches_qual_chall_2001_7097'], ['V', 'atp_matches_qual_chall_2001_7140'], ['D', 'atp_matches_qual_chall_2001_7151'], ['V', 'atp_matches_qual_chall_2001_7658'], ['D', 
'atp_matches_qual_chall_2001_7673'], ['D', 'atp_matches_qual_chall_2001_7822'], ['D', 'atp_matches_qual_chall_2001_7886'], ['V', 'atp_matches_qual_chall_2002_70'], ['V', 'atp_matches_qual_chall_2002_82'], ['V', 'atp_matches_qual_chall_2002_88'], ['V', 'atp_matches_qual_chall_2002_91'], ['D', 'atp_matches_qual_chall_2002_92'], ['D', 'atp_matches_qual_chall_2002_1710'], ['D', 'atp_matches_qual_chall_2002_1773'], ['V', 'atp_matches_qual_chall_2002_1840'], ['D', 'atp_matches_qual_chall_2002_1849'], ['D', 'atp_matches_qual_chall_2002_1983'], ['D', 'atp_matches_qual_chall_2002_2256'], ['V', 'atp_matches_qual_chall_2002_2326'], ['V', 'atp_matches_qual_chall_2002_2334'], ['V', 'atp_matches_qual_chall_2002_2338'], ['V', 'atp_matches_qual_chall_2002_2340'], ['V', 'atp_matches_qual_chall_2002_2341'], ['V', 'atp_matches_qual_chall_2002_2537'], ['V', 'atp_matches_qual_chall_2002_2548'], ['V', 'atp_matches_qual_chall_2002_2554'], ['D', 'atp_matches_qual_chall_2002_2557'], ['D', 'atp_matches_qual_chall_2002_2594'], ['D', 'atp_matches_2002_2922'], ['V', 'atp_matches_qual_chall_2002_3031'], ['V', 'atp_matches_qual_chall_2002_3055'], ['V', 'atp_matches_qual_chall_2002_3103'], ['V', 'atp_matches_qual_chall_2002_3384'], ['V', 'atp_matches_qual_chall_2002_3396'], ['V', 'atp_matches_qual_chall_2002_3402'], ['D', 'atp_matches_qual_chall_2002_3405'], ['D', 'atp_matches_qual_chall_2002_3475'], ['D', 'atp_matches_qual_chall_2002_3812'], ['V', 'atp_matches_qual_chall_2002_3874'], ['D', 'atp_matches_qual_chall_2002_3884'], ['D', 'atp_matches_2002_4022'], ['V', 'atp_matches_qual_chall_2002_4321'], ['D', 'atp_matches_qual_chall_2002_4333'], ['D', 'atp_matches_qual_chall_2002_4407'], ['D', 'atp_matches_qual_chall_2002_5341'], ['V', 'atp_matches_qual_chall_2002_6432'], ['V', 'atp_matches_qual_chall_2002_6447'], ['D', 'atp_matches_qual_chall_2002_6455'], ['D', 'atp_matches_qual_chall_2002_6528'], ['D', 'atp_matches_qual_chall_2003_527'], ['V', 'atp_matches_qual_chall_2003_859'], ['D', 
'atp_matches_qual_chall_2003_874'], ['V', 'atp_matches_qual_chall_2003_959'], ['D', 'atp_matches_qual_chall_2003_972'], ['V', 'atp_matches_qual_chall_2003_1047'], ['D', 'atp_matches_qual_chall_2003_1062'], ['V', 'atp_matches_qual_chall_2003_1650'], ['D', 'atp_matches_qual_chall_2003_1660'], ['V', 'atp_matches_qual_chall_2003_2123'], ['V', 'atp_matches_qual_chall_2003_2137'], ['D', 'atp_matches_qual_chall_2003_2144'], ['D', 'atp_matches_qual_chall_2003_2219'], ['V', 'atp_matches_qual_chall_2003_2620'], ['V', 'atp_matches_qual_chall_2003_2635'], ['V', 'atp_matches_qual_chall_2003_2642'], ['V', 'atp_matches_qual_chall_2003_2646'], ['V', 'atp_matches_qual_chall_2003_2648'], ['V', 'atp_matches_qual_chall_2003_2774'], ['V', 'atp_matches_qual_chall_2003_2787'], ['D', 'atp_matches_qual_chall_2003_2793'], ['D', 'atp_matches_qual_chall_2003_3000'], ['V', 'atp_matches_qual_chall_2003_3174'], ['D', 'atp_matches_qual_chall_2003_3184'], ['D', 'atp_matches_qual_chall_2003_3348'], ['D', 'atp_matches_qual_chall_2003_3467'], ['V', 'atp_matches_qual_chall_2003_3562'], ['V', 'atp_matches_qual_chall_2003_3577'], ['V', 'atp_matches_qual_chall_2003_3585'], ['D', 'atp_matches_qual_chall_2003_3589'], ['V', 'atp_matches_qual_chall_2003_4000'], ['D', 'atp_matches_qual_chall_2003_4009'], ['V', 'atp_matches_qual_chall_2003_4184'], ['V', 'atp_matches_qual_chall_2003_4194'], ['V', 'atp_matches_qual_chall_2003_4199'], ['V', 'atp_matches_qual_chall_2003_4201'], ['V', 'atp_matches_qual_chall_2003_4202'], ['V', 'atp_matches_qual_chall_2003_4491'], ['V', 'atp_matches_qual_chall_2003_4501'], ['V', 'atp_matches_qual_chall_2003_4506'], ['V', 'atp_matches_qual_chall_2003_4509'], ['D', 'atp_matches_qual_chall_2003_4510'], ['V', 'atp_matches_qual_chall_2003_4544'], ['V', 'atp_matches_qual_chall_2003_4559'], ['D', 'atp_matches_qual_chall_2003_4566'], ['V', 'atp_matches_qual_chall_2003_4853'], ['V', 'atp_matches_qual_chall_2003_4869'], ['V', 'atp_matches_qual_chall_2003_4877'], ['V', 
'atp_matches_qual_chall_2003_4881'], ['V', 'atp_matches_qual_chall_2003_4883'], ['D', 'atp_matches_qual_chall_2003_5283'], ['V', 'atp_matches_qual_chall_2003_5413'], ['V', 'atp_matches_qual_chall_2003_5421'], ['V', 'atp_matches_qual_chall_2003_5425']]","['V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D', 'D', 'D', 'D', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'D', 'D', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V']","['D', 'V', 'D', 'V', 'D']",['D'],"['D', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D']",61.02941176470589,63.559322033898304,40.0,0.0,50.0,4.824561403508771,5.263157894736842,61.40350877192983,46.49122807017544,18.859649122807017,65.35087719298245,9.649122807017545,5.701754385964912,40.44444444444444,,"[157, 136, 165, 158, 9999, 9999, 204, 198, 197, 188, 204, 233]","[247, 304, 220, 232, 0, 0, 164, 172, 177, 188, 167, 137]",0.0,2-6 7-5 7-5,,,,,,,,,,,,,,,,,,,,2003.0,,
3 | 


--------------------------------------------------------------------------------
/examples/data/data_loading.py:
--------------------------------------------------------------------------------
  1 | import ast
  2 | import os, sys
  3 | 
  4 | sys.path.append("../../python")
  5 | sys.path.append("../../")
  6 | 
  7 | import matplotlib.pyplot as plt
  8 | from matplotlib.patches import Rectangle
  9 | import numpy as np
 10 | import pandas as pd
 11 | 
 12 | from data.data_loader import matches_data_loader
 13 | 
 14 | 
 15 | data_df = matches_data_loader(
 16 |     path_to_data="../../submodules/tennis_atp",  # Path to tennis_atp submodule, keep as is if repo cloned with subdmodule
 17 |     path_to_cache="../../cache",  # Path to caching directory
 18 |     flush_cache=False,  # Whether or not to flush a potentially existing cache. Set to True if you want to create the data from scratch
 19 |     keep_values_from_year=2002,  # Returned data will date back to January 2002 up to today
 20 |     get_match_statistics=True,  # Whether to also retrun match statistics (time, score, etc...)
 21 |     get_reversed_match_data=True,  # Whether to duplicate the mathc row and exchange winner and loser positions
 22 |     include_davis_cup=True,  # Whether or not to include davis cup matches
 23 |     match_type=[
 24 |         "main_atp",
 25 |         "qualifying_challengers",
 26 |     ],  # Which match to keep. You can look at tennis_atp submodule to see possibilities
 27 | )
 28 | 
 29 | print(data_df.head())
 30 | print(data_df.shape)
 31 | 
 32 | # Creation of first figure
 33 | # Win percentages considering the ranks of players
 34 | 
 35 | # Rank categories
 36 | categories = [1, 10, 50, 100, 300, 1000, 9999]
 37 | 
 38 | best_ranked_player_win_percentage = []
 39 | categories_number_of_matches = []
 40 | 
 41 | for cat_1 in range(len(categories) - 1):
 42 |     lines = []
 43 |     nb_matches_lines = []
 44 |     for cat_2 in range(len(categories) - 1):
 45 |         sub_df = data_df.loc[data_df.Ranking_1 >= categories[cat_1]].loc[
 46 |             data_df.Ranking_1 < categories[cat_1 + 1]
 47 |         ]
 48 |         sub_df = sub_df.loc[sub_df.Ranking_2 >= categories[cat_2]].loc[
 49 |             sub_df.Ranking_2 < categories[cat_2 + 1]
 50 |         ]
 51 |         sub_df["best_rank"] = sub_df.apply(
 52 |             lambda row: np.argmin([row["Ranking_1"], row["Ranking_2"]]), axis=1
 53 |         )
 54 | 
 55 |         if len(sub_df) > 0:
 56 |             best_player_w_p = np.sum(
 57 |                 sub_df.Winner.values == sub_df.best_rank.values
 58 |             ) / len(sub_df)
 59 | 
 60 |         else:
 61 |             best_player_w_p = 0
 62 |         lines.append(best_player_w_p)
 63 |         nb_matches_lines.append(len(sub_df) / 2)
 64 |     best_ranked_player_win_percentage.append(lines)
 65 |     categories_number_of_matches.append(nb_matches_lines)
 66 | print(
 67 |     "Number of matches with player ranked 0:", len(data_df.loc[data_df.Ranking_1 == 0])
 68 | )
 69 | print(
 70 |     "Number of matches with player ranked > 9999:",
 71 |     len(data_df.loc[data_df.Ranking_1 > 9999]),
 72 | )
 73 | 
 74 | colors = ["purple", "blue", "cyan", "green", "yellow", "orange", "red"]
 75 | fig, ax = plt.subplots()
 76 | 
 77 | for i, val1 in enumerate(best_ranked_player_win_percentage):
 78 |     for j, val2 in enumerate(val1):
 79 |         color = colors[int(val2 * (len(colors) - 1))]
 80 |         rect = plt.Rectangle((i, j), 1, 1, fc=color)
 81 |         ax.add_patch(rect)
 82 |         plt.text(i + 0.2, j + 0.35, np.round(val2 * 100, 2))
 83 | 
 84 | for i in range(len(categories)):
 85 |     plt.plot([i, i], [0, len(categories) - 1], c="k")
 86 |     plt.plot([0, len(categories) - 1], [i, i], c="k")
 87 | 
 88 | plt.xticks(list(range(len(categories))), labels=categories)
 89 | plt.yticks(list(range(len(categories))), labels=categories)
 90 | plt.xlabel("Player 1 Rank Category")
 91 | plt.ylabel("Player 2 Rank Category")
 92 | plt.title("Best player Win percentage per Rank Category")
 93 | plt.savefig("Best_player_win_percentage.png")
 94 | plt.show()
 95 | 
 96 | # Second figure
 97 | # Number of matches considering players ranks
 98 | fig, ax = plt.subplots()
 99 | 
100 | for i, val1 in enumerate(categories_number_of_matches):
101 |     for j, val2 in enumerate(val1):
102 |         color = colors[
103 |             int(
104 |                 val2**0.5
105 |                 / np.max(categories_number_of_matches) ** 0.5
106 |                 * (len(colors) - 1)
107 |             )
108 |         ]
109 |         rect = plt.Rectangle((i, j), 1, 1, fc=color)
110 |         ax.add_patch(rect)
111 |         plt.text(i + 0.2, j + 0.35, int(val2))
112 | 
113 | for i in range(len(categories)):
114 |     plt.plot([i, i], [0, len(categories) - 1], c="k")
115 |     plt.plot([0, len(categories) - 1], [i, i], c="k")
116 | 
117 | plt.xticks(list(range(len(categories))), labels=categories)
118 | plt.yticks(list(range(len(categories))), labels=categories)
119 | plt.xlabel("Player 1 Rank Category")
120 | plt.ylabel("Player 2 Rank Category")
121 | plt.title("Number of matches recorded per Rank Category")
122 | plt.savefig("nb_matches.png")
123 | plt.show()
124 | 
#### Stan the man
# Statistics analysis of Stan Wawrinka over time

# Win percentages, one value per row that has a non-empty match history
overall_v, last_hundred_v = [], []
overall_clay, overall_carpet, overall_grass, overall_hard = [], [], [], []

# Number of "V" found in each per-surface history, one value per row
wins_clay, wins_carpet, wins_grass, wins_hard = [], [], [], []

# (year, row number) pairs: first row seen for each year, used as x ticks
dates = []

# Rows where the player with ID 104527 is player 1
stan_df = data_df.loc[data_df.ID_1 == 104527].reset_index()

# Dump one row as an example of the data format
stan_df.iloc[100].to_csv("single_row_example.csv")

for row_number, match_row in stan_df.iterrows():
    # Matches_1 is a list of [outcome, match id] pairs, possibly stored as text
    outcomes = [entry[0] for entry in ast.literal_eval(str(match_row["Matches_1"]))]

    if outcomes:
        overall_v.append(outcomes.count("V") / len(outcomes) * 100)
        recent_outcomes = outcomes[-100:]
        last_hundred_v.append(
            recent_outcomes.count("V") / len(recent_outcomes) * 100
        )

        year = str(match_row["tournament_date"])[:4]
        if all(year != seen_year for seen_year, _ in dates):
            dates.append((year, row_number))

        for percentages, column in (
            (overall_clay, "Clay_Victories_Percentage_1"),
            (overall_grass, "Grass_Victories_Percentage_1"),
            (overall_hard, "Hard_Victories_Percentage_1"),
            (overall_carpet, "Carpet_Victories_Percentage_1"),
        ):
            percentages.append(match_row[column])

    for wins, column in (
        (wins_clay, "Matches_Clay_1"),
        (wins_carpet, "Matches_Carpet_1"),
        (wins_grass, "Matches_Grass_1"),
        (wins_hard, "Matches_Hard_1"),
    ):
        wins.append(list(match_row[column]).count("V"))
164 | 
# % Victory over time and surfaces
plt.figure()
for curve, curve_label in (
    (overall_v, "overall"),
    (last_hundred_v, "last 100 matches"),
    (overall_clay, "overall clay"),
    (overall_grass, "overall grass"),
    (overall_hard, "overall hard"),
    (overall_carpet, "overall carpet"),
):
    plt.plot(curve, label=curve_label)
plt.legend()

# One tick per year, placed on the first row recorded for that year
year_positions = [position for _, position in dates]
year_labels = [year for year, _ in dates]
plt.xticks(year_positions, year_labels, rotation="vertical")
plt.title("Stanislas Wawrinka win percentage on main ATP tournamnents")
# NOTE(review): the figure drawn further down in this script is saved under
# the same file name, so this image ends up overwritten - confirm intent.
plt.savefig("stan_the_man_win_percentage.png")
plt.show()
178 | 
179 | 
# Win percentage curves (left axis) over stacked victory counts (right axis)
fig, ax1 = plt.subplots()
for curve, curve_label, curve_color in (
    (overall_v, "overall", "k"),
    (last_hundred_v, "last 100 matches", "purple"),
    (overall_clay, "overall clay", "orange"),
    (overall_grass, "overall grass", "green"),
    (overall_hard, "overall hard", "blue"),
    (overall_carpet, "overall carpet", "gray"),
):
    ax1.plot(curve, label=curve_label, c=curve_color)
ax1.set_ylabel("Win %")
plt.legend()

ax2 = ax1.twinx()
# Stacking order, bottom to top: carpet, grass, clay, hard
surface_colors = ("gray", "green", "orange", "blue")
for i, surface_wins in enumerate(zip(wins_carpet, wins_grass, wins_clay, wins_hard)):
    # Only every other row is drawn, each bar being two rows wide
    if i % 2 != 0:
        continue
    bar_bottom = 0
    for nb_wins, surface_color in zip(surface_wins, surface_colors):
        ax2.add_patch(
            Rectangle(
                (i, bar_bottom),
                width=2,
                height=nb_wins,
                edgecolor=None,
                facecolor=surface_color,
                alpha=0.2,
            )
        )
        bar_bottom = bar_bottom + nb_wins

ax2.set_yticks([0, 100, 200, 300, 400, 500, 600, 700])
ax2.set_ylabel("Number of victory for each surface")
plt.tight_layout()
year_positions = [position for _, position in dates]
year_labels = [year for year, _ in dates]
ax1.set_xticks(year_positions, year_labels, rotation="vertical")
plt.title("Stanislas Wawrinka victories on ATP tournamnents")
plt.savefig("stan_the_man_win_percentage.png")
plt.show()
243 | 
244 | 
# Career aces percentage difference with the adversary, along with the winner
# index of the row (0 means player 1, i.e. the player stan_df was filtered on)
aces = pd.DataFrame(
    {
        "diff_aces": [
            aces_1 - aces_2
            for aces_1, aces_2 in zip(
                stan_df["Aces_Percentage_1"], stan_df["Aces_Percentage_2"]
            )
        ],
        "winner": list(stan_df["Winner"]),
    }
)

# Histogram bin edges: -15, -12.5, ..., 7.5
bin_width = 2.5
classes = [step * bin_width for step in range(-6, 4, 1)]

fig, ax = plt.subplots(1)
for min_class, max_class in zip(classes[:-1], classes[1:]):
    # Both bounds are strict: values equal to an edge fall in no bin
    in_bin = (aces.diff_aces > min_class) & (aces.diff_aces < max_class)
    nb_victories = int((in_bin & (aces.winner == 0)).sum())
    nb_defeats = int((in_bin & (aces.winner == 1)).sum())

    # Victories at the bottom of the bar, defeats stacked on top
    for bar_bottom, bar_height, bar_color, bar_label in (
        (0, nb_victories, "blue", "Victory"),
        (nb_victories, nb_defeats, "orange", "Defeat"),
    ):
        ax.add_patch(
            Rectangle(
                xy=(min_class, bar_bottom),
                width=2.5,
                height=bar_height,
                edgecolor="k",
                facecolor=bar_color,
                label=bar_label,
            )
        )
ax.autoscale_view()
ax.set_xlabel("Career ace percentage difference with adversary")
ax.set_ylabel("Number of matches")
ax.set_title(
    "Histogram of career aces percentage difference for Stan Wawrinka, colored by match results",
    wrap=True,
)
# Every bar carries a label: keep a single legend entry per distinct label
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys())
plt.savefig("stanimal_aces_percentage_difference.png")
plt.show()
290 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Tennis-Prediction
  2 | 
  3 | <img align="right" width="200" src="./tennis_robot.png" />
  4 | 
  5 | The goal of this project is to predict the outcome of a tennis match using the data of both players and ML models.\
  6 | The data used comes from [Jeff Sackmann's repository](https://github.com/JeffSackmann).
  7 | 
  8 | - [Installation](#installation)
  9 | - [Usage](#usage)
 10 |   - [Data Loading](#data-loading)
 11 |   - [Machine Learning modelling](#machine-learning-modelling)
 12 |   - [Encoding Matches](#encoding-matches)
 13 | - [License](#license)
 14 | 
 15 | ## Installation
 16 | 
 17 | To clone the repository together with the data, you also need to clone the submodules:
 18 | 
 19 | ```bash
 20 | git clone --recurse-submodules https://github.com/VincentAuriau/Tennis-Prediction.git
 21 | ```
 22 | 
 23 | ## Usage
 24 | 
 25 | You can find examples in /examples:
 26 | 
 27 | ### Data Loading
 28 | Loading players statistics at match time and match outcome:
 29 | [Example](examples/data/data_loading.py)
 30 | 
 31 | ```python
 32 | from data.data_loader import matches_data_loader
 33 | data_df = matches_data_loader(path_to_data="submodules/tennis_atp")
 34 | ```
 35 | data_df lets you access information about the players (statistics prior to the match) along with statistics of the match.
 36 | A basic example statistic: the victory percentage of the best ranked player in a match, depending on players rankings.
 37 | 
 38 | 
 39 | Number of ATP main matches depending on players rank             |  Victory % of best ranked player
 40 | :-------------------------:|:-------------------------:
 41 | ![](examples/data/nb_matches.png) |  ![](examples/data/Best_player_win_percentage.png)
 42 | 
 43 | It can also easily be used to compute players' statistics over their career, and/or at match time. Here is a simple example with Stan Wawrinka:
 44 | Stan's Victory % in main ATP matches             |  Stan's career aces % diff with adversary
 45 | :-------------------------:|:-------------------------:
 46 | ![](examples/data/stan_the_man_win_percentage.png) |  ![](examples/data/stanimal_aces_percentage_difference.png)
 47 | 
 48 | Here is an example of a data row:
 49 | 
 50 | | id | tournament    | tournament_level    | tournament_date    | tournament_surface    | round    | best_of    | match_id    | Winner | Score |
 51 | | :---:   | :---: | :---: | :---:   | :---: | :---: | :---:   | :---: | :---:   | :---: |
 52 | | atp_matches_qual_chall_2003_5427 | San Benedetto CH   | C   | 20030811   | Clay   | SF   | 3   | 20030811   | 0 | 2-6 7-5 7-5 |
 53 | 
 54 | <ins>Base Match Statistics:</ins>
 55 | - **id and match_id:** unique identification of the matches
 56 | - **tournament:** name of the tournament
 57 | - **tournament_level:** Category of the tournament 'G' (Grand Slams) 'M' (Masters 1000s), 'A' (other tour-level events), 'C' (Challengers), 'S' (Satellites/ITFs), 'F' (Tour finals), 'D'( Davis Cup)
 58 | - **tournament_date:** date
 59 | - **tournament_surface:** surface 'Grass', 'Clay', 'Carpet', 'Hard'
 60 | - **round:** tournament round of the match: 'F' (final), 'SF' (semi-final), etc.
 61 | - **best_of:** maximum number of sets in the match (3 or 5)
 62 | - **Winner:** index of the winner: 0 (Player1) or 1 (Player2)
 63 | - **Score:** final score
 64 |    
 65 | <ins>Additional match statistics:</ins>
 66 | - **elapsed_minutes:** Duration of the match
 67 | - **aces_nb_x:** Number of aces of player x
 68 | - **doublefaults_nb_x:** Number of doublefaults
 69 | - **svpt_x:** Number of serve points
 70 | - **1stIn_1:** Number of first serves made
 71 | - **1stWon_1:** Number of first-serve points won
 72 | - **2ndWon_x:** Number of second-serve points won
 73 | - **SvGms_1:** Number of serve games
 74 | - **bpSaved_1:** Number of break points saved
 75 | - **bpFaced_1:** Number of break points faced
 76 | 
 77 | <ins>Example of match statistics:</ins>
 78 | 
 79 | | Name_1 | ID_1    | Ranking_1    | Ranking_Points_1    | Ranking_History_1    | Best_Rank_1    | Birth_Year_1    | Versus_1    | Hand_1 | Last_Tournament_Date_1    | Height_1    | Matches_1    | Matchs_Clay_1    | Matches_Carpet_1    | Matches_Grass_1    | Matches_Hard_1    | Victories_Percentage_1    | Clay_Victories_Percentage_1    | Carpet_Victories_Percentage_1    | Grass_Victories_Percentage_1    | Hard_Victories_Percentage_1    | Aces_Percentage_1    | Doublefaults_Percentage_1    | First_Save_Success_Percentage_1    | Winning_on_1st_Serve_Percentage_1    | Winning_on_2nd_Serve_Percentage_1    | Overall_Win_on_Serve_Percentage_1    | BreakPoint_Face_Percentage_1 | BreakPoint_Saved_Percentage_1 | last_rankings_1 | last_ranking_points_1 |
 80 | | :---:   | :---: | :---: | :---:   | :---: | :---: | :---:   | :---: | :---:   | :---: | :---: | :---:   | :---: | :---: | :---:   | :---: | :---:   | :---: | :---: | :---:   | :---: | :---: | :---:   | :---: | :---:   | :---: | :---: | :---:   | :---: | :---:   | :---: |
 81 | | Stan.Wawrinka | 104527  | 184   | 114   | {20030616: [387, 68], 20030707: [363, 74], 20030714: [348, 79], 20030721: [303, 99], 20030811: [284, 114]}   | 284   | 19850328   | []   | R | 20030721 | 183 | [['V', 'atp_matches_qual_chall_2003_3466'], ['D', 'atp_matches_qual_chall_2003_3481'], ['D', 'atp_matches_2003_4049'], ['V', 'atp_matches_2003_4315'], ['D', 'atp_matches_2003_4328'], ['V', 'atp_matches_2003_4773'], ['D', 'atp_matches_2003_4782'], ['V', 'atp_matches_qual_chall_2003_5408'], ['V', 'atp_matches_qual_chall_2003_5419'], ['V', 'atp_matches_qual_chall_2003_5424']] | ['V', 'D', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'V'] | [] | [] | [] | 60 | 60 | 0 | 0 | 0 | 3.41880341880342 | 4.27350427350427 | 64.957264957265 | 54.985754985755 | 15.6695156695157 | 70.6552706552707 | 11.3960113960114 | 7.69230769230769 | [303, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 387] | [99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68] |
 82 | 
 83 | 
 84 | | Name_2 | ID_2 | Ranking_2 | Ranking_Points_2 | Ranking_History_2 | Best_Rank_2 | Birth_Year_2 | Versus_2 | Hand_2 | Last_Tournament_Date_2 | Height_2 | Matches_2 | Matchs_Clay_2 | Matches_Carpet_2 | Matches_Grass_2 | Matches_Hard_2 | Victories_Percentage_2 | Clay_Victories_Percentage_2 | Carpet_Victories_Percentage_2 | Grass_Victories_Percentage_2 | Hard_Victories_Percentage_2 | Aces_Percentage_2 | Doublefaults_Percentage_2 | First_Save_Success_Percentage_2 | Winning_on_1st_Serve_Percentage_2 | Winning_on_2nd_Serve_Percentage_2 | Overall_Win_on_Serve_Percentage_2 | BreakPoint_Face_Percentage_2 | BreakPoint_Saved_Percentage_2 | last_rankings_2 | last_ranking_points_2 |
 85 | | :---:   | :---: | :---: | :---:   | :---: | :---: | :---:   | :---: | :---:   | :---: | :---: | :---:   | :---: | :---: | :---:   | :---: | :---:   | :---: | :---: | :---:   | :---: | :---: | :---:   | :---: | :---:   | :---: | :---: | :---:   | :---: | :---:   | :---: |
 86 | | Martin.Vassallo Arguello | 103506 | 125 | 296 | {19990201: [817, 13], 20000710: [398, 61], 20000731: [354, 75], 20000807: [377, 70], 20010625: [459, 48], 20010709: [405, 61], 20010813: [391, 68], 20010820: [374, 72], 20010827: [342, 88], 20010917: [291, 117], 20010924: [286, 122], etc...} | 123 | 19800210 | []   | R | 20030804 | 183 | [['V', 'atp_matches_qual_chall_1999_380'], ['D', 'atp_matches_qual_chall_1999_393'], ['V', 'atp_matches_qual_chall_2000_3972'], ['V', 'atp_matches_qual_chall_2000_3988'], ['D', 'atp_matches_qual_chall_2000_3996'], ['D', 'atp_matches_qual_chall_2000_4725'], ['D', 'atp_matches_qual_chall_2000_4758'], ['V', 'atp_matches_qual_chall_2001_3699'], etc...] | ['V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D', 'D', etc...] | ['D', 'V', 'D', 'V', 'D'] | ['D'] | ['D', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D'] | 61.0294117647059 | 63.5593220338983 | 40 | 0 | 50 | 4.82456140350877 | 5.26315789473684 | 61.4035087719298 | 46.4912280701754 | 18.859649122807 | 65.3508771929825 | 9.64912280701754 | 5.70175438596491 | [157, 136, 165, 158, 9999, 9999, 204, 198, 197, 188, 204, 233] | [247, 304, 220, 232, 0, 0, 164, 172, 177, 188, 167, 137] |
 87 | 
 88 | <ins>Player statistics before the match:</ins>
 89 | - **Name_x**: Name of the player
 90 | - **ID_x:** ID of the player
 91 | - **Ranking_x:** ATP ranking of the player. For all rankings, 9999 means unranked.
 92 | - **Ranking_Points_x:** Number of ATP points
 93 | - **Ranking_History_x:** All recorded rankings
 94 | - **Best_Rank_x:** Best reached ATP rank
 95 | - **Birth_Year_x:** Birth year
 96 | - **Versus_x:** Dictionary containing all match outcomes against other players
 97 | - **Hand_x**: Hand used to play 'R', 'L' or 'U' for unknown
 98 | - **Last_Tournament_Date_x:** Date of the last previous tournament attended
 99 | - **Height_x:** Height
100 | - **Matches_x:** History of outcomes of previous matches
101 | - **Matches_Clay_x:** History of outcomes of previous matches on clay
102 | - **Matches_Carpet_x:** History of outcomes of previous matches on carpet
103 | - **Matches_Grass_x:** History of outcomes of previous matches on grass
104 | - **Matches_Hard_x:** History of outcomes of previous matches on hard
105 | - **Victories_Percentage_x:** Victory percentage over all player ATP matches
106 | - **Clay_Victories_Percentage_x:** Victory percentage over all player ATP matches on clay  
107 | - **Carpet_Victories_Percentage_x:** Victory percentage over all player ATP matches on carpet
108 | - **Grass_Victories_Percentage_x:** Victory percentage over all player ATP matches on grass
109 | - **Hard_Victories_Percentage_x:** Victory percentage over all player ATP matches on hard  
110 | - **Aces_Percentage_x:** Aces percentage over all player ATP matches
111 | - **Doublefaults_Percentage_x:** DoubleFaults percentage over all player ATP matches
112 | - **First_Save_Success_Percentage_x:**  First save success percentage  over all player ATP matches
113 | - **Winning_on_1st_Serve_Percentage_x:** Winning on first serve percentage over all player ATP matches
114 | - **Winning_on_2nd_Serve_Percentage_x:** Winning on second serve percentage over all player ATP matches
115 | - **Overall_Win_on_Serve_Percentage_x:** Overall winning percentage on serve over all player ATP matches
116 | - **BreakPoint_Face_Percentage_x:** Overall breakpoint face percentage over all player ATP matches
117 | - **BreakPoint_Saved_Percentage_x:** Overall breakpoint saved percentage over all player ATP matches
118 | - **last_rankings_x:** Five previous recorded ATP rankings
119 | - **last_ranking_points_x:** Five previous ATP ranking points recorded
120 | 
121 | ### Machine-Learning modelling
122 | Train/Testing on matches outcome
123 | [[Example]](examples/models/train_test.py).
124 | 
125 | A generic function lets you evaluate your model with a train/test scheme without much work. Your model only needs a scikit-learn like signature.
126 | By playing with the years, the columns used in modelling, the models and their hyperparameters, you can easily create your own best-performing model.
127 | 
128 | ```python
129 | from sklearn.ensemble import RandomForestClassifier
130 | from evaluation.train_test import train_test_evaluation
131 | 
132 | test_score = train_test_evaluation(
133 |     train_years=[2020, 2021],
134 |     test_years=[2022, 2023],
135 |     model_class=RandomForestClassifier,
136 |     model_params={"n_estimators": 2000, "max_depth": None},
137 |     match_features=[],
138 |     player_features=["Ranking"],
139 |     encoding_params={},
140 |     additional_features=[],
141 |     save_path="./results",
142 |     save_all_results=False
143 | )
144 | 
145 | print("Test Score", test_score)
146 | ```
147 | 
148 | Models and hyperparameters can easily be compared with the file results.csv saved in save_path.
149 | 
150 | Accuracy of different models
151 | :-------------------------:
152 | ![](examples/results_reading/models_performances.png)
153 | 
154 | If the argument save_all_results is set to True, the whole csv of test data is saved. It helps to get more in-depth analysis of results
155 | 
156 | Model precision compared with best ranked player wins strategy            |  Model precision depending on players ranks
157 | :-------------------------:|:-------------------------:
158 | ![](examples/results_reading/win_per_surface.png) |  ![](examples/results_reading/precision_percentage_players_ranks.png)
159 | 
160 | ### Encoding matches
161 | [Example](examples/history_modeling/first_example.py)
162 | In order to represent history of a player, one can use MatchEncoders:
163 | 
164 | ```python
165 | from history_modeling.encoding_model import PCAMatchEncoder
166 | 
167 | model = PCAMatchEncoder(num_pca_features=2)
168 | model.fit(data_df, transform_data=True)
169 | X_r, match_info = model.predict(data_df, transform_data=True)
170 | ```
171 | 
172 | 2D representation of match outcome:
173 | :-------------------------:
174 | ![](examples/history_modeling/2d_pca_match_representation_test.png)
175 | 
176 | ## License
177 | 
178 | <img align="left" width="200" src="./tennis_robot_2.png" />
179 | 
180 | &nbsp;
181 | &nbsp;
182 | 
183 | The repository is under the MIT License, you can freely use any part as you like.\
184 | If you find this repository useful, you can cite it and add a star ⭐ !
185 | 
186 | 


--------------------------------------------------------------------------------
/python/data/data_loader.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import pickle
  3 | import re
  4 | import time
  5 | from ast import literal_eval
  6 | 
  7 | import numpy as np
  8 | import pandas as pd
  9 | 
 10 | import data.player as player
 11 | import data.match as match
 12 | from data.data_utils import reverse_score
 13 | 
 14 | 
 15 | def create_player_profiles(df):
 16 |     """
 17 |     Creates database of players from df containing list of players
 18 |     :param df: pandas.DataFrame corresponding to atp_players.csv
 19 |     :return: databaser of player.Players objects
 20 |     """
 21 |     players_db = {}
 22 |     for n_row, row in df.iterrows():
 23 |         pl = player.Player(
 24 |             name=(str(row["name_first"]) + "." + str(row["name_last"])),
 25 |             birthdate=row["dob"],
 26 |             country=row["ioc"],
 27 |             nb_id=row["player_id"],
 28 |             hand=row["hand"],
 29 |             height=row["height"],
 30 |         )
 31 | 
 32 |         if row["player_id"] in players_db.keys():
 33 |             print(f"Player ID {row['player_id']} already in database, appears twice ?")
 34 |         else:
 35 |             players_db[row["player_id"]] = pl
 36 |     return players_db
 37 | 
 38 | 
 39 | def read_matches_file(path_to_file):
 40 |     """
 41 |     Opens a csv file with matches
 42 |     :param path_to_file:
 43 |     :return: corresponding df
 44 |     """
 45 |     df_match = pd.read_csv(path_to_file)
 46 |     return df_match
 47 | 
 48 | 
 49 | def get_match_files(path_to_data_dir, match_type=["main_atp"]):
 50 |     """
 51 |     Lists the available csv containing matches
 52 |     :param path_to_data_dir: path to directory with all files
 53 |     :param match_type: matches we want to retrieve list of elements among ["main_atp", "futures", "qualifying_challengers"]
 54 |     :return:
 55 |     """
 56 |     main_atp_pattern = "atp_matches_(?P<year>\d+).csv"
 57 |     futures_pattern = "atp_matches_futures_(?P<year>\d+).csv"
 58 |     qual_chall_pattern = "atp_matches_qual_chall_(?P<year>\d+).csv"
 59 | 
 60 |     matches_data_file = {}
 61 | 
 62 |     for file in os.listdir(path_to_data_dir):
 63 |         if "main_atp" in match_type:
 64 |             regex_match = re.match(main_atp_pattern, file)
 65 |             if regex_match is not None:
 66 |                 matches_data_file["filepath"] = matches_data_file.get(
 67 |                     "filepath", []
 68 |                 ) + [os.path.join(path_to_data_dir, file)]
 69 |                 matches_data_file["match_type"] = matches_data_file.get(
 70 |                     "match_type", []
 71 |                 ) + ["main_tour"]
 72 |                 match_dict = regex_match.groupdict()
 73 |                 for key, value in match_dict.items():
 74 |                     matches_data_file[key] = matches_data_file.get(key, []) + [value]
 75 |         if "futures" in match_type:
 76 |             regex_match = re.match(futures_pattern, file)
 77 |             if regex_match is not None:
 78 |                 matches_data_file["filepath"] = matches_data_file.get(
 79 |                     "filepath", []
 80 |                 ) + [os.path.join(path_to_data_dir, file)]
 81 |                 matches_data_file["match_type"] = matches_data_file.get(
 82 |                     "match_type", []
 83 |                 ) + ["main_tour"]
 84 |                 match_dict = regex_match.groupdict()
 85 |                 for key, value in match_dict.items():
 86 |                     matches_data_file[key] = matches_data_file.get(key, []) + [value]
 87 |         if "qualifying_challengers" in match_type:
 88 |             regex_match = re.match(qual_chall_pattern, file)
 89 |             if regex_match is not None:
 90 |                 matches_data_file["filepath"] = matches_data_file.get(
 91 |                     "filepath", []
 92 |                 ) + [os.path.join(path_to_data_dir, file)]
 93 |                 matches_data_file["match_type"] = matches_data_file.get(
 94 |                     "match_type", []
 95 |                 ) + ["main_tour"]
 96 |                 match_dict = regex_match.groupdict()
 97 |                 for key, value in match_dict.items():
 98 |                     matches_data_file[key] = matches_data_file.get(key, []) + [value]
 99 |     return pd.DataFrame(matches_data_file)
100 | 
101 | 
102 | def load_match_data_from_path(
103 |     players_db, paths_to_matchs_file, get_match_statistics=False
104 | ):
105 |     """
106 |     Loads file from path and creates the matches data while updating players databaser
107 |     :param players_db:
108 |     :param path_to_matchs_file:
109 |     :return:
110 |     """
111 | 
112 |     def extract_file_id(file_path):
113 |         file_id = file_path.split("/")[-1].split(".")[0]
114 |         if "\\" in file_id:
115 |             file_id = file_id.split("\\")[1]
116 | 
117 |         return file_id
118 | 
119 |     if not isinstance(paths_to_matchs_file, list):
120 |         paths_to_matchs_file = [paths_to_matchs_file]
121 | 
122 |     files = []
123 |     for path in paths_to_matchs_file:
124 |         match_df = pd.read_csv(path)
125 |         match_df["filepath"] = path
126 |         files.append(match_df)
127 |     match_df = pd.concat(files, axis=0)
128 |     match_df = match_df.sort_values(["tourney_date", "tourney_id", "match_num"])
129 |     match_df = match_df.reset_index(drop=True)
130 |     """
131 |     match_df["match_id"] = match_df.apply(
132 |         lambda row: extract_file_id(path_to_matchs_file) + "_" + str(row.name),
133 |         axis=1,
134 |     )
135 |     """
136 |     matches_data = []
137 |     for n_row, row in match_df.iterrows():
138 |         m_winner = players_db[row["winner_id"]]
139 |         m_loser = players_db[row["loser_id"]]
140 |         m_tournament = row["tourney_name"]
141 |         m_surface = row["surface"]
142 | 
143 |         match_o = match.Match(
144 |             winner=m_winner,
145 |             loser=m_loser,
146 |             tournament=m_tournament,
147 |             surface=m_surface,
148 |             id_prefix=extract_file_id(row["filepath"]),
149 |         )
150 |         match_o.instantiate_from_data_row(row)
151 |         (
152 |             match_data,
153 |             w_data,
154 |             l_data,
155 |         ) = match_o.get_prior_data_and_update_players_stats()
156 | 
157 |         match_data["match_id"] = match_o.id
158 | 
159 |         to_1 = {}
160 |         to_2 = {}
161 |         for col in w_data.columns:
162 |             to_1[col] = col + "_1"
163 |             to_2[col] = col + "_2"
164 | 
165 |         concat_1 = pd.concat(
166 |             [w_data.copy().rename(to_1, axis=1), l_data.copy().rename(to_1, axis=1)],
167 |             axis=0,
168 |         )
169 |         concat_2 = pd.concat(
170 |             [l_data.copy().rename(to_2, axis=1), w_data.copy().rename(to_2, axis=1)],
171 |             axis=0,
172 |         )
173 |         final_df = pd.concat(
174 |             [pd.concat([match_data] * 2, axis=0), concat_1, concat_2], axis=1
175 |         )
176 |         final_df["Winner"] = [0, 1]
177 | 
178 |         if get_match_statistics:
179 |             (
180 |                 match_stats,
181 |                 w_mstats,
182 |                 l_mstats,
183 |             ) = match_o.get_match_data_results_statistics()
184 |             ms_to_1 = {}
185 |             ms_to_2 = {}
186 |             for col in w_mstats.columns:
187 |                 ms_to_1[col] = col + "_1"
188 |                 ms_to_2[col] = col + "_2"
189 | 
190 |             ms_concat_1 = pd.concat(
191 |                 [
192 |                     w_mstats.copy().rename(ms_to_1, axis=1),
193 |                     l_mstats.copy().rename(ms_to_1, axis=1),
194 |                 ],
195 |                 axis=0,
196 |             )
197 |             ms_concat_2 = pd.concat(
198 |                 [
199 |                     l_mstats.copy().rename(ms_to_2, axis=1),
200 |                     w_mstats.copy().rename(ms_to_2, axis=1),
201 |                 ],
202 |                 axis=0,
203 |             )
204 | 
205 |             match_stats_1 = match_stats.copy()
206 |             match_stats_2 = match_stats.copy()
207 |             match_stats_2["score"] = match_stats_2.apply(
208 |                 lambda row: reverse_score(row["score"]), axis=1
209 |             )
210 | 
211 |             match_stats_df = pd.concat(
212 |                 [
213 |                     pd.concat([match_stats_1, match_stats_2], axis=0),
214 |                     ms_concat_1,
215 |                     ms_concat_2,
216 |                 ],
217 |                 axis=1,
218 |             )
219 |             final_df = pd.concat([final_df, match_stats_df], axis=1)
220 |         matches_data.append(final_df)
221 | 
222 |     matches_data = pd.concat(matches_data, axis=0)
223 |     return matches_data
224 | 
225 | 
def matches_data_loader(
    keep_values_from_year=1990,
    path_to_data="submodules/tennis_atp",
    path_to_cache="/cache",
    flush_cache=True,
    get_match_statistics=False,
    get_reversed_match_data=False,
    include_davis_cup=False,
    match_type=["main_atp", "futures", "qualifying_challengers"],
):
    """
    Main matches data loading function
    :param keep_values_from_year: int  [1968; present], min year to keep values from
    :param path_to_data: str, path to tennis_atp submodule
    :param path_to_cache: str, path to local personal cache (created if missing when something
        has to be written)
    :param flush_cache: bool, whether cache should be erased and whole function run again
    :param get_match_statistics: bool, return each match statistics along pre match statistics
    :param get_reversed_match_data: bool, should each match be double, with Winner = 0 and Winner = 1
    :param include_davis_cup: bool, keep rows whose "tournament" value contains "Davis"
    :param match_type: list of str forwarded untouched to get_match_files (never mutated here)
    :return: pandas.DataFrame with all matches data
    """

    total_elapsed_time = 0

    # Check if data already in cache
    players_db_path = os.path.join(path_to_cache, "players_db")
    players_db_cached = os.path.exists(players_db_path)
    if players_db_cached:
        print("Payers DB exists")

    matches_data_cached = os.path.exists(
        os.path.join(path_to_cache, f"matches_data_{keep_values_from_year}.csv")
    )

    rebuild_players_db = flush_cache or not players_db_cached
    rebuild_matches_data = flush_cache or not matches_data_cached

    # Anything rebuilt is written to the cache, so the folder has to exist first.
    if (rebuild_players_db or rebuild_matches_data) and path_to_cache:
        os.makedirs(path_to_cache, exist_ok=True)

    if rebuild_players_db:
        df_players = pd.read_csv(
            os.path.join(path_to_data, "atp_players.csv"),
            header=0,
            encoding="ISO-8859-1",
        )
        players_db = create_player_profiles(df_players)
        with open(players_db_path, "wb") as file:
            pickle.dump(players_db, file, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # NOTE(security): pickle.load executes arbitrary code from the file; only point
        # path_to_cache at a directory you trust.
        with open(players_db_path, "rb") as file:
            players_db = pickle.load(file)

    if rebuild_matches_data:
        print("data not cached or flush=True")
        data_files = get_match_files(path_to_data, match_type=match_type)
        data_years = data_files.year.astype(
            "uint32"
        )  # to change when handling different type of tournament (qualifiers, main, etc...)

        data_per_year = []
        # Years are processed in ascending order: every year updates players_db, even the
        # ones older than keep_values_from_year whose rows are not returned.
        for year in np.sort(np.unique(data_years.values)):
            t_start = time.time()
            print("+---------+---------+")
            print("  Year %i  " % year)
            if year >= keep_values_from_year:
                print("Updating players statistics & saving matches data")
            else:
                print("Only updating players statistics")
            print("+---------+---------+")
            filepaths = data_files.loc[data_files.year == str(year)][
                "filepath"
            ].values.tolist()
            df_year = load_match_data_from_path(
                players_db, filepaths, get_match_statistics=get_match_statistics
            )
            df_year["tournament_year"] = year
            if year >= keep_values_from_year:
                data_per_year.append(df_year)

            df_year.to_csv(
                os.path.join(path_to_cache, f"matches_data_{year}.csv"),
                sep=";",
                index=False,
            )
            total_elapsed_time += time.time() - t_start
            print(f"Elapsed Time: {np.round(time.time() - t_start, 2)} seconds")
            print(f"Total Elapsed Time: {np.round(total_elapsed_time, 2)} seconds")

        data_matches = pd.concat(data_per_year, axis=0)
        # drop=True keeps the columns identical to the cached branch below (no stray
        # "index" column depending on whether the cache was used).
        data_matches = data_matches.reset_index(drop=True)

    else:
        # Raw string + escaped dot + fullmatch: only files named exactly
        # matches_data_<year>.csv count, so backups such as "matches_data_1990.csv.bak"
        # cannot make a year be loaded twice.
        file_pattern = re.compile(r"matches_data_(?P<year>\d+)\.csv")
        years = set()
        for file in os.listdir(path_to_cache):
            regex_match = file_pattern.fullmatch(file)
            if regex_match is not None:
                years.add(int(regex_match["year"]))

        data_per_year = [
            pd.read_csv(os.path.join(path_to_cache, f"matches_data_{year}.csv"), sep=";")
            for year in sorted(years)
            if year >= keep_values_from_year
        ]

        data_matches = pd.concat(data_per_year, axis=0)
        data_matches = data_matches.reset_index(drop=True)

    if not include_davis_cup:
        # na=False: a missing tournament name is simply "not Davis Cup" instead of
        # breaking the boolean negation.
        is_davis_cup = data_matches.tournament.str.contains("Davis", na=False)
        data_matches = data_matches.loc[~is_davis_cup]

    if get_reversed_match_data:
        return data_matches
    # Rows come in consecutive pairs (the two mirrored views of a match); keep the first of each.
    return data_matches.iloc[::2]
340 | 
341 | 
def clean_missing_data(df):
    """
    Cleans rows of df with missing data or too few statistics to be useful
    :param df: pandas.DataFrame with at least the columns Ranking_1 and Ranking_2
    :return: new pandas.DataFrame without rows containing a missing value and without rows
        where either ranking is 0 or 9999; the input frame is left untouched
    """

    # dropna returns a new frame: the result has to be kept, otherwise rows with
    # missing values silently survive.
    df = df.dropna(axis=0)

    has_usable_rankings = (
        (df.Ranking_1 != 9999)
        & (df.Ranking_1 != 0)
        & (df.Ranking_2 != 9999)
        & (df.Ranking_2 != 0)
    )
    return df.loc[has_usable_rankings]
356 | 
357 | 
def encode_data(df, mode="integer"):
    """
    Encodes the categorical columns of a matches DataFrame and derives head-to-head features

    Every column whose lower-cased name contains "hand", "round" or "tournament_level"
    (checked in that order) is replaced by its encoded value. If a "Versus_1" column is
    present, two columns are added: "nb_match_versus" (number of previous meetings) and
    "v_perc_versus" (share of them recorded as "V", or -1 when there is none).

    :param df: pandas.DataFrame to encode; it is not modified, a copy is returned
    :param mode: str, "integer" (integer codes), "one_hot" (lists of 0/1) or "mixing"
        (integer codes for rounds and tournament levels, one-hot lists for hands)
    :return: encoded copy of df
    :raises ValueError: if mode is unknown or a "Versus_1" string cannot be parsed
    :raises KeyError: if a value has no code in the selected mode
    """
    # Remove:
    #   - index
    #   - Unnamed: 0
    #   - Unnamed: 0.1
    #   - tournament
    #   - Name
    #   - ID
    #   - Birth Year => Age
    #   - Versus: % V against 2, last 5 matches
    #   - Matches

    # Refac:
    #   - Versus
    # Best way to do it ?
    #   - Birth Year
    #   - Last Tournament => Days since last tournament + result ?

    integer_levels = {"G": 0, "A": 1, "M": 2, "F": 3, "D": 4}
    integer_rounds = {
        "F": 0,
        "SF": 1,
        "QF": 2,
        "R16": 3,
        "R32": 4,
        "R64": 5,
        "R128": 6,
        "R256": 7,
        "RR": 8,
        "BR": 9,
        "ER": 10,
    }
    one_hot_hands = {
        "R": [1, 0, 0, 0],
        "L": [0, 1, 0, 0],
        "A": [0, 0, 1, 0],
        "U": [0, 0, 0, 1],
    }

    if mode == "integer":
        tournament_level = integer_levels
        round_codes = integer_rounds
        # "nan" is what str() yields for a missing hand; it shares the code of "U".
        hand = {"R": -1, "L": 1, "A": 0, "U": 2, "nan": 2}
    elif mode == "one_hot":
        # NOTE(review): these tables have no entry for level "F" nor for rounds
        # "BR"/"ER" (nor for a missing hand), so such values raise KeyError. Adding them
        # would change the vector lengths, so they are left as they were.
        tournament_level = {
            "G": [0, 0, 0, 1],
            "A": [0, 0, 1, 0],
            "M": [0, 1, 0, 0],
            "D": [1, 0, 0, 0],
        }
        round_codes = {
            "F": [0, 0, 0, 0, 0, 0, 0, 0, 1],
            "SF": [0, 0, 0, 0, 0, 0, 0, 1, 0],
            "QF": [0, 0, 0, 0, 0, 0, 1, 0, 0],
            "R16": [0, 0, 0, 0, 0, 1, 0, 0, 0],
            "R32": [0, 0, 0, 0, 1, 0, 0, 0, 0],
            "R64": [0, 0, 0, 1, 0, 0, 0, 0, 0],
            "R128": [0, 0, 1, 0, 0, 0, 0, 0, 0],
            "R256": [0, 1, 0, 0, 0, 0, 0, 0, 0],
            "RR": [1, 0, 0, 0, 0, 0, 0, 0, 0],
        }
        hand = one_hot_hands
    elif mode == "mixing":
        tournament_level = integer_levels
        # Same integer table as "integer" mode, so "ER" is encoded instead of raising KeyError.
        round_codes = integer_rounds
        hand = one_hot_hands
    else:
        raise ValueError(
            f"Unknown encoding mode {mode!r}; expected 'integer', 'one_hot' or 'mixing'"
        )

    # Work on a real copy so the caller's DataFrame is left untouched.
    df_copy = df.copy()

    for col in df_copy.columns:
        lowered = col.lower()
        if "hand" in lowered:
            # str() turns a missing value into "nan" so it can be looked up.
            df_copy[col] = df_copy[col].map(lambda value: hand[str(value)])
        elif "round" in lowered:
            df_copy[col] = df_copy[col].map(lambda value: round_codes[value])
        elif "tournament_level" in lowered:
            df_copy[col] = df_copy[col].map(lambda value: tournament_level[value])

    def versus_history(versus, opponent_id):
        # Returns the list of meetings against opponent_id from a raw "Versus_1" cell.
        if isinstance(versus, str):
            # Cells read back from a CSV cache are the repr of the original object.
            try:
                versus = literal_eval(versus)
            except (ValueError, SyntaxError) as error:
                raise ValueError(f"Cannot parse Versus_1 value: {versus!r}") from error
        if isinstance(versus, dict):
            # Full head-to-head mapping: keep only the meetings against this opponent.
            versus = versus.get(opponent_id, [])
        return versus

    def is_victory(entry):
        # Accepts both a bare "V"/"D" outcome and an [outcome, date, match_id] record.
        if isinstance(entry, (list, tuple)):
            return len(entry) > 0 and entry[0] == "V"
        return entry == "V"

    if "Versus_1" in df_copy.columns:
        if "ID_2" in df_copy.columns:
            opponent_ids = df_copy["ID_2"].tolist()
        else:
            opponent_ids = [None] * len(df_copy)

        histories = [
            versus_history(versus, opponent_id)
            for versus, opponent_id in zip(df_copy["Versus_1"].tolist(), opponent_ids)
        ]
        df_copy["nb_match_versus"] = [len(history) for history in histories]
        df_copy["v_perc_versus"] = [
            sum(1 for entry in history if is_victory(entry)) / len(history)
            if len(history) > 0
            else -1
            for history in histories
        ]

    return df_copy
486 | 


--------------------------------------------------------------------------------
/python/data/player.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | 
  4 | from data.data_utils import get_days_difference
  5 | 
  6 | # How to update player's ranking ?
  7 | 
  8 | 
  9 | class Player:
 10 |     def __init__(self, name, birthdate, country, nb_id, hand="", height=0):
 11 |         self.name = name
 12 |         self.birthdate = birthdate
 13 | 
 14 |         self.rankings_history = {}
 15 |         self.ranking = 9999
 16 |         self.ranking_points = 0
 17 |         self.ranking_over_time = 0
 18 |         self.country = country
 19 |         self.id = nb_id
 20 |         self.last_tournament_date = ""
 21 |         self.versus = {}
 22 |         self.hand = hand
 23 |         self.height = height
 24 | 
 25 |         self.last_matches = ["", "", "", "", ""]
 26 |         # self.matches = []
 27 |         self.matches_history = []
 28 |         self.victories_percentage = 0
 29 | 
 30 |         self.matches_hard = []
 31 |         self.hard_victories_percentage = 0
 32 |         self.matches_carpet = []
 33 |         self.carpet_victories_percentage = 0
 34 |         self.matches_clay = []
 35 |         self.clay_victories_percentage = 0
 36 |         self.matches_grass = []
 37 |         self.grass_victories_percentage = 0
 38 | 
 39 |         self.aces_percentage = 0
 40 | 
 41 |         self.doublefaults_percentage = 0
 42 |         self.first_serve_success_percentage = 0
 43 |         self.winning_on_1st_serve_percentage = 0
 44 |         self.winning_on_2nd_serve_percentage = 0
 45 |         self.overall_win_on_serve_percentage = 0
 46 | 
 47 |         self.service_data = {
 48 |             "service_games_played": [],
 49 |             "1st_serve_success": [],
 50 |             "aces_nb": [],
 51 |             "doublefaults_nb": [],
 52 |             "win_on_1st_serve": [],
 53 |             "win_on_2nd_serve": [],
 54 |             "breakpoints_faced": [],
 55 |             "breakpoints_saved": [],
 56 |         }
 57 | 
 58 |         self.breakpoint_faced_percentage = 0
 59 |         self.breakpoint_saved_percentage = 0
 60 | 
 61 |         self.games_fatigue = (
 62 |             0  # nb games curr tourney + nb games prev tourney / diff days
 63 |         )
 64 |         self.minutes_fatigue = (
 65 |             0  # nb minutes curr tourney + nb minutes prev tourney / diff days
 66 |         )
 67 |         self.fatigue_features = {
 68 |             "previous tournament": {
 69 |                 "date": "19000000",
 70 |                 "num_games": 0,
 71 |                 "num_matchs": 0,
 72 |                 "num_minutes": 0,
 73 |             },
 74 |             "current tournament": {
 75 |                 "date": "19000000",
 76 |                 "num_games": 0,
 77 |                 "num_matchs": 0,
 78 |                 "num_minutes": 0,
 79 |             },
 80 |         }
 81 | 
 82 |     def __str__(self):
 83 |         return (
 84 |             "ID : "
 85 |             + str(self.id)
 86 |             + " *** Name : "
 87 |             + self.name
 88 |             + " " * (35 - len(self.name))
 89 |             + " *** Born Year : "
 90 |             + str(self.birthdate)
 91 |             + " *** Country : "
 92 |             + str(self.country)
 93 |             + " *** Hand : "
 94 |             + str(self.hand)
 95 |             + " *** Height : "
 96 |             + str(self.height)
 97 |         )
 98 | 
 99 |     def _add_victory(self, id_loser, match_id, tournament_date="19000101"):
100 |         """
101 |         Update last_matches argument with a victories and updates versus argument using id_loser
102 |         :param id_loser: ID of los of match against current player
103 |         :return:
104 |         """
105 |         self.last_matches[4] = self.last_matches[3]
106 |         self.last_matches[3] = self.last_matches[2]
107 |         self.last_matches[2] = self.last_matches[1]
108 |         self.last_matches[1] = self.last_matches[0]
109 |         self.last_matches[0] = "V"
110 |         if id_loser in self.versus.keys():
111 |             self.versus[id_loser].append(["V", tournament_date, match_id])
112 |         else:
113 |             self.versus[id_loser] = [["V", tournament_date, match_id]]
114 |         self._update_victories_percentage("V", match_id)
115 | 
116 |     def _add_defeat(self, id_winner, match_id, tournament_date="19000101"):
117 |         """
118 |         Add a Defeat
119 |         :param id_winner:
120 |         :return:
121 |         """
122 |         self.last_matches[4] = self.last_matches[3]
123 |         self.last_matches[3] = self.last_matches[2]
124 |         self.last_matches[2] = self.last_matches[1]
125 |         self.last_matches[1] = self.last_matches[0]
126 |         self.last_matches[0] = "D"
127 |         if id_winner in self.versus.keys():
128 |             self.versus[id_winner].append(["D", tournament_date, match_id])
129 |         else:
130 |             self.versus[id_winner] = [["D", tournament_date, match_id]]
131 |         self._update_victories_percentage("D", match_id)
132 | 
133 |     def _update_victories_percentage(self, match_outcome, match_id):
134 |         """
135 |         Updates Victories Percentage with a V/D of last match
136 |         :param match_outcome:
137 |         :return:
138 |         """
139 |         self.matches_history.append([match_outcome, match_id])
140 |         # self.matches.append(match_outcome)
141 |         victories_number = [_[0] for _ in self.matches_history].count("V")
142 |         matches_number = len(self.matches_history)
143 |         self.victories_percentage = (victories_number / matches_number) * 100
144 | 
145 |     def _update_surfaces_victories_percentage(self, surface, outcome):
146 |         """
147 |         Updates % of victories on a given surface (V/D)
148 |         :param surface:
149 |         :param outcome:
150 |         :return:
151 |         """
152 |         if surface == "Clay":
153 |             self.matches_clay.append(outcome)
154 |             self.clay_victories_percentage = (
155 |                 self.matches_clay.count("V") / len(self.matches_clay) * 100
156 |             )
157 | 
158 |         elif surface == "Grass":
159 |             self.matches_grass.append(outcome)
160 |             self.grass_victories_percentage = (
161 |                 self.matches_grass.count("V") / len(self.matches_grass) * 100
162 |             )
163 | 
164 |         elif surface == "Hard":
165 |             self.matches_hard.append(outcome)
166 |             self.hard_victories_percentage = (
167 |                 self.matches_hard.count("V") / len(self.matches_hard) * 100
168 |             )
169 | 
170 |         elif surface == "Carpet":
171 |             self.matches_carpet.append(outcome)
172 |             self.carpet_victories_percentage = (
173 |                 self.matches_carpet.count("V") / len(self.matches_carpet) * 100
174 |             )
175 | 
176 |     def _update_fatigue(self, tournament_date, games_number, minutes_number):
177 |         """
178 |         Updates Fatigue arguments: self.fatigue but also self.fatigue_features
179 |         :param tournament_date:
180 |         :param sets_number:
181 |         :return:
182 |         """
183 |         if games_number == games_number and games_number != "nan":
184 |             if tournament_date == self.fatigue_features["current tournament"]["date"]:
185 |                 self.fatigue_features["current tournament"]["num_games"] += games_number
186 |                 if minutes_number == minutes_number and minutes_number != "nan":
187 |                     self.fatigue_features["current tournament"][
188 |                         "num_minutes"
189 |                     ] += minutes_number
190 |                 self.fatigue_features["current tournament"]["num_matchs"] += 1
191 |             else:
192 |                 self.fatigue_features["previous tournament"] = self.fatigue_features[
193 |                     "current tournament"
194 |                 ]
195 |                 self.fatigue_features["current tournament"] = {
196 |                     "date": tournament_date,
197 |                     "num_games": games_number,
198 |                     "num_minutes": minutes_number,
199 |                     "num_matchs": 1,
200 |                 }
201 | 
202 |             previous_tournament_date = str(
203 |                 self.fatigue_features["previous tournament"]["date"]
204 |             )
205 |             current_tournament_date = str(
206 |                 self.fatigue_features["current tournament"]["date"]
207 |             )
208 | 
209 |             days_difference_tournaments = get_days_difference(
210 |                 previous_tournament_date, current_tournament_date
211 |             )
212 |             if days_difference_tournaments == 0:
213 |                 print(previous_tournament_date, current_tournament_date)
214 |                 print(tournament_date)
215 | 
216 |             self.games_fatigue = (
217 |                 self.fatigue_features["previous tournament"]["num_games"]
218 |                 / days_difference_tournaments
219 |                 + self.fatigue_features["current tournament"]["num_games"]
220 |             )
221 |             self.minutes_fatigue = (
222 |                 self.fatigue_features["previous tournament"]["num_minutes"]
223 |                 / days_difference_tournaments
224 |                 + self.fatigue_features["current tournament"]["num_minutes"]
225 |             )
226 |         else:
227 |             print("NaN in sets number", games_number)
228 | 
229 |     def _update_aces_percentage(self, aces_nb):
230 |         """
231 |         Upates Aces Percentage
232 |         :param aces_nb:
233 |         :return:
234 |         """
235 | 
236 |         if aces_nb == aces_nb and aces_nb != "nan":
237 |             self.service_data["aces_nb"].append(aces_nb)
238 |             total_aces_nbr = sum(self.service_data["aces_nb"])
239 |             total_service_points_played = sum(self.service_data["service_games_played"])
240 | 
241 |             if total_service_points_played != 0:
242 |                 self.aces_percentage = (
243 |                     total_aces_nbr / total_service_points_played * 100
244 |                 )
245 |             else:
246 |                 print("No point played :", aces_nb)
247 | 
248 |     def _update_doublefaults_percentage(self, df_nb):
249 |         """
250 |         Update doublefaults percentage
251 |         :param df_nb:
252 |         :return:
253 |         """
254 |         if df_nb == df_nb and df_nb != "nan":
255 |             self.service_data["doublefaults_nb"].append(df_nb)
256 |             total_df_nbr = sum(self.service_data["doublefaults_nb"])
257 |             total_service_points_played = sum(self.service_data["service_games_played"])
258 | 
259 |             if total_service_points_played != 0:
260 |                 self.doublefaults_percentage = (
261 |                     total_df_nbr / total_service_points_played * 100
262 |                 )
263 |             else:
264 |                 print("No point played :", total_df_nbr)
265 |                 self.doublefaults_percentage = 0
266 | 
267 |         else:
268 |             print("NaN in Double Faults", df_nb)
269 | 
270 |     def _update_winning_on_1st_serve_percentage(self, first_serve_win):
271 |         """
272 | 
273 |         :param first_serve_win:
274 |         :return:
275 |         """
276 |         self.service_data["win_on_1st_serve"].append(first_serve_win)
277 | 
278 |         total_first_serves_win = sum(self.service_data["1st_serve_success"])
279 |         total_service_points_played = sum(self.service_data["service_games_played"])
280 | 
281 |         if total_service_points_played != 0:
282 |             self.winning_on_1st_serve_percentage = (
283 |                 total_first_serves_win / total_service_points_played * 100
284 |             )
285 |         else:
286 |             print("No point played :", total_first_serves_win)
287 | 
288 |     def _update_winning_on_2nd_serve_percentage(self, second_serve_win):
289 |         """
290 | 
291 |         :param second_serve_win:
292 |         :return:
293 |         """
294 |         self.service_data["win_on_2nd_serve"].append(second_serve_win)
295 | 
296 |         total_second_serves_win = sum(self.service_data["win_on_2nd_serve"])
297 |         total_service_points_played = sum(self.service_data["service_games_played"])
298 | 
299 |         if total_service_points_played != 0:
300 |             self.winning_on_2nd_serve_percentage = (
301 |                 total_second_serves_win / total_service_points_played * 100
302 |             )
303 |         else:
304 |             print("No point played :", total_second_serves_win)
305 | 
306 |     def _update_first_serve_success_percentage(self, first_services_in):
307 |         """
308 | 
309 |         :param first_services_in:
310 |         :return:
311 |         """
312 |         self.service_data["1st_serve_success"].append(first_services_in)
313 | 
314 |         total_first_serves_in = sum(self.service_data["1st_serve_success"])
315 |         total_service_points_played = sum(self.service_data["service_games_played"])
316 | 
317 |         if total_service_points_played != 0:
318 |             self.first_serve_success_percentage = (
319 |                 total_first_serves_in / total_service_points_played * 100
320 |             )
321 |         else:
322 |             print("No point played :", total_first_serves_in)
323 | 
324 |     def _update_breakpoints_faced_and_saved(self, breakpoint_faced, breakpoint_saved):
325 |         """
326 | 
327 |         :param breakpoint_faced:
328 |         :param breakpoint_saved:
329 |         :return:
330 |         """
331 |         self.service_data["breakpoints_saved"].append(breakpoint_saved)
332 |         self.service_data["breakpoints_faced"].append(breakpoint_faced)
333 | 
334 |         total_breakpoint_faced = sum(self.service_data["breakpoints_faced"])
335 |         total_games_played = sum(self.service_data["service_games_played"])
336 |         total_breakpoint_saved = sum(self.service_data["breakpoints_saved"])
337 | 
338 |         if total_games_played != 0:
339 |             self.breakpoint_faced_percentage = (
340 |                 total_breakpoint_faced / total_games_played * 100
341 |             )
342 |             self.breakpoint_saved_percentage = (
343 |                 total_breakpoint_saved / total_games_played * 100
344 |             )
345 |         else:
346 |             print("No point played :", self.service_data["breakpoints_saved"])
347 | 
348 |     def _update_service_data(
349 |         self,
350 |         service_games_played,
351 |         aces_nb,
352 |         doublefaults_nb,
353 |         first_serve_success,
354 |         winning_on_1st_serve,
355 |         winning_on_2nd_serve,
356 |         breakpoints_faced,
357 |         breakpoints_saved,
358 |     ):
359 |         # Assert data exists
360 |         if (
361 |             service_games_played == service_games_played
362 |             and aces_nb == aces_nb
363 |             and doublefaults_nb == doublefaults_nb
364 |             and first_serve_success == first_serve_success
365 |             and winning_on_1st_serve == winning_on_1st_serve
366 |             and winning_on_2nd_serve == winning_on_2nd_serve
367 |             and breakpoints_faced == breakpoints_faced
368 |             and breakpoints_saved == breakpoints_saved
369 |         ):
370 |             self.service_data["service_games_played"].append(service_games_played)
371 | 
372 |             self._update_aces_percentage(aces_nb=aces_nb)
373 |             self._update_doublefaults_percentage(df_nb=doublefaults_nb)
374 |             self._update_winning_on_1st_serve_percentage(
375 |                 first_serve_win=winning_on_1st_serve
376 |             )
377 |             self._update_winning_on_2nd_serve_percentage(
378 |                 second_serve_win=winning_on_2nd_serve
379 |             )
380 |             self.overall_win_on_serve_percentage = (
381 |                 self.winning_on_1st_serve_percentage
382 |                 + self.winning_on_2nd_serve_percentage
383 |             )
384 |             self._update_first_serve_success_percentage(
385 |                 first_services_in=first_serve_success
386 |             )
387 |             self._update_breakpoints_faced_and_saved(
388 |                 breakpoint_saved=breakpoints_saved, breakpoint_faced=breakpoints_faced
389 |             )
390 | 
391 |         else:
392 |             # Future argument ;)
393 |             verbose = 1
394 |             if verbose > 2:
395 |                 print("Service data not complete...")
396 | 
397 |     def _update_rankings(self, new_ranking, new_ranking_points, date):
398 |         if new_ranking_points != new_ranking_points:
399 |             if new_ranking_points == new_ranking_points:
400 |                 print("No ranking points", new_ranking, new_ranking_points)
401 |             new_ranking_points = 0
402 |         else:
403 |             try:
404 |                 new_ranking_points = int(new_ranking_points)
405 |             except:
406 |                 new_ranking_points = 0
407 | 
408 |         if new_ranking != new_ranking:
409 |             new_ranking = 9999
410 |         else:
411 |             try:
412 |                 new_ranking = int(new_ranking)
413 |             except:
414 |                 new_ranking = 9999
415 | 
416 |         self.ranking = new_ranking
417 |         self.ranking_points = new_ranking_points
418 | 
419 |         self.rankings_history[date] = [
420 |             int(new_ranking),
421 |             int(new_ranking_points),
422 |         ]
423 | 
424 |     def _get_best_ranking(self):
425 |         all_ranks = [
426 |             self.rankings_history[date][0] for date in self.rankings_history.keys()
427 |         ]
428 |         if len(all_ranks) > 0:
429 |             return np.min(all_ranks)
430 |         else:
431 |             return -1
432 | 
433 |     def update_from_match(self, match):
434 |         """
435 |         Updates the whole player profile from a match
436 |         :param match:
437 |         :return:
438 |         """
439 | 
440 |         # Update Rankings ?
441 |         if match.winner.id == self.id:
442 |             self._add_victory(
443 |                 match.loser.id, match_id=match.id, tournament_date=match.tournament_date
444 |             )
445 |             self._update_surfaces_victories_percentage(match.surface, "V")
446 |         else:
447 |             assert match.loser.id == self.id
448 |             self._add_defeat(
449 |                 match.winner.id,
450 |                 match_id=match.id,
451 |                 tournament_date=match.tournament_date,
452 |             )
453 |             self._update_surfaces_victories_percentage(match.surface, "D")
454 |         self._update_fatigue(
455 |             match.tournament_date, match.games_number, match.elapsed_minutes
456 |         )
457 | 
458 |         self._update_service_data(
459 |             service_games_played=match.get_service_points_played(self.id),
460 |             aces_nb=match.get_aces_nb(self.id),
461 |             doublefaults_nb=match.get_df_nb(self.id),
462 |             first_serve_success=match.get_first_services_in(self.id),
463 |             winning_on_1st_serve=match.get_first_serve_win(self.id),
464 |             winning_on_2nd_serve=match.get_second_serve_win(self.id),
465 |             breakpoints_faced=match.get_breakpoint_faced(self.id),
466 |             breakpoints_saved=match.get_breakpoint_saved(self.id),
467 |         )
468 | 
469 |         self._update_rankings(*match.get_rankings(self.id), date=match.tournament_date)
470 | 
471 |     def get_data_df(self, opponent=None):
472 |         data_dict = {
473 |             "Name": [self.name],
474 |             "ID": [self.id],
475 |             "Ranking": [self.ranking],
476 |             "Ranking_Points": [self.ranking_points],
477 |             "Ranking_History": [self.rankings_history.copy()],
478 |             "Best_Rank": [self._get_best_ranking()],
479 |             "Birth_Year": [self.birthdate],
480 |             "Versus": [
481 |                 self.versus.copy()
482 |                 if opponent is None
483 |                 else self.versus.get(opponent, []).copy()
484 |             ],
485 |             "Hand": [self.hand],
486 |             "Last_Tournament_Date": [
487 |                 self.fatigue_features["previous tournament"]["date"]
488 |             ],
489 |             "Height": [self.height],
490 |             "Matches": [self.matches_history.copy()],
491 |             "Matches_Clay": [self.matches_clay.copy()],
492 |             "Matches_Carpet": [self.matches_carpet.copy()],
493 |             "Matches_Grass": [self.matches_grass.copy()],
494 |             "Matches_Hard": [self.matches_hard.copy()],
495 |             "Victories_Percentage": [self.victories_percentage],
496 |             "Clay_Victories_Percentage": [self.clay_victories_percentage],
497 |             "Carpet_Victories_Percentage": [self.carpet_victories_percentage],
498 |             "Grass_Victories_Percentage": [self.grass_victories_percentage],
499 |             "Hard_Victories_Percentage": [self.hard_victories_percentage],
500 |             "Aces_Percentage": [self.aces_percentage],
501 |             "Doublefaults_Percentage": [self.doublefaults_percentage],
502 |             "First_Serve_Success_Percentage": [self.first_serve_success_percentage],
503 |             "Winning_on_1st_Serve_Percentage": [self.winning_on_1st_serve_percentage],
504 |             "Winning_on_2nd_Serve_Percentage": [self.winning_on_2nd_serve_percentage],
505 |             "Overall_Win_on_Serve_Percentage": [self.overall_win_on_serve_percentage],
506 |             "BreakPoint_Face_Percentage": [self.breakpoint_faced_percentage],
507 |             "BreakPoint_Saved_Percentage": [self.breakpoint_saved_percentage],
508 |             "games_fatigue": [self.games_fatigue],
509 |             "minutes_fatigue": [self.minutes_fatigue],
510 |         }
511 |         return pd.DataFrame(data_dict)
512 | 
513 |     def get_last_months_rankings(self, date, nb_months=12, day_of_month="last"):
514 |         assert day_of_month in [
515 |             "last",
516 |             "first",
517 |         ], f"For now you can only use first or last month day for ranking, you chose {day_of_month}"
518 |         if day_of_month == "last":
519 |             f = max
520 |         else:
521 |             f = min
522 |         date = str(date)
523 |         last_months_ranks = [9999 for _ in range(nb_months)]
524 |         last_months_points = [0 for _ in range(nb_months)]
525 |         date_year = int(date[:4])
526 |         date_month = int(date[4:6])
527 | 
528 |         for i in range(nb_months):
529 |             if date_month == 1:
530 |                 date_month = 12
531 |                 date_year = date_year - 1
532 |             else:
533 |                 date_month = date_month - 1
534 | 
535 |             days_with_rankings = []
536 |             for key in self.rankings_history.keys():
537 |                 if f"{date_year}{date_month:02d}" in str(key):
538 |                     days_with_rankings.append(int(str(key)[6:]))
539 |             try:
540 |                 if len(days_with_rankings) > 0:
541 |                     last_months_ranks[-i] = self.rankings_history[
542 |                         int(f"{date_year}{date_month:02d}{f(days_with_rankings):02d}")
543 |                     ][0]
544 |                     last_months_points[-i] = self.rankings_history[
545 |                         int(f"{date_year}{date_month:02d}{f(days_with_rankings):02d}")
546 |                     ][1]
547 | 
548 |             except:
549 |                 print(days_with_rankings)
550 |                 print(self.rankings_history)
551 |                 print(date_month, date_year)
552 | 
553 |                 print(f"{date_year}{date_month:02d}{f(days_with_rankings):02d}")
554 |                 print(
555 |                     self.rankings_history[
556 |                         f"{date_year}{date_month:02d}{f(days_with_rankings):02d}"
557 |                     ]
558 |                 )
559 |                 raise ValueError
560 | 
561 |         return last_months_ranks, last_months_points
562 | 


--------------------------------------------------------------------------------
/examples/data/single_row_example.csv:
--------------------------------------------------------------------------------
 1 | ,100
 2 | index,55550
 3 | id,atp_matches_2005_4281
 4 | tournament,Gstaad
 5 | tournament_level,A
 6 | tournament_date,20050704
 7 | tournament_surface,Clay
 8 | round,R32
 9 | best_of,3
10 | match_id,atp_matches_2005_4281
11 | Name_1,Stan.Wawrinka
12 | ID_1,104527
13 | Ranking_1,74
14 | Ranking_Points_1,547
15 | Ranking_History_1,"{20030616: [387, 68], 20030707: [363, 74], 20030714: [348, 79], 20030721: [303, 99], 20030811: [284, 114], 20030818: [215, 164], 20030825: [175, 219], 20030929: [171, 215], 20031020: [172, 214], 20040105: [171, 216], 20040119: [166, 226], 20040206: [162, 225], 20040209: [164, 225], 20040301: [161, 227], 20040412: [184, 205], 20040419: [179, 210], 20040426: [148, 260], 20040510: [147, 270], 20040524: [150, 266], 20040531: [150, 266], 20040607: [156, 267], 20040614: [152, 272], 20040705: [146, 275], 20040719: [153, 261], 20040809: [159, 256], 20040816: [162, 241], 20040830: [163, 241], 20040906: [163, 241], 20040913: [167, 233], 20040920: [166, 242], 20041025: [161, 258], 20041101: [159, 262], 20050103: [168, 262], 20050117: [165, 267], 20050124: [165, 267], 20050131: [153, 291], 20050214: [128, 346], 20050221: [120, 377], 20050304: [118, 377], 20050328: [117, 362], 20050418: [113, 397], 20050502: [99, 421], 20050523: [87, 467], 20050613: [73, 551], 20050620: [74, 547]}"
16 | Best_Rank_1,73
17 | Birth_Year_1,19850328.0
18 | Versus_1,"[['D', 20030721, 'atp_matches_2003_4782']]"
19 | Hand_1,R
20 | Last_Tournament_Date_1,20050613
21 | Height_1,183.0
22 | Matches_1,"[['V', 'atp_matches_qual_chall_2003_3466'], ['D', 'atp_matches_qual_chall_2003_3481'], ['D', 'atp_matches_2003_4049'], ['V', 'atp_matches_2003_4315'], ['D', 'atp_matches_2003_4328'], ['V', 'atp_matches_2003_4773'], ['D', 'atp_matches_2003_4782'], ['V', 'atp_matches_qual_chall_2003_5408'], ['V', 'atp_matches_qual_chall_2003_5419'], ['V', 'atp_matches_qual_chall_2003_5424'], ['V', 'atp_matches_qual_chall_2003_5427'], ['V', 'atp_matches_qual_chall_2003_5428'], ['V', 'atp_matches_qual_chall_2003_5558'], ['V', 'atp_matches_qual_chall_2003_5571'], ['V', 'atp_matches_qual_chall_2003_5578'], ['V', 'atp_matches_qual_chall_2003_5581'], ['V', 'atp_matches_qual_chall_2003_5583'], ['V', 'atp_matches_qual_chall_2003_5889'], ['V', 'atp_matches_qual_chall_2003_5903'], ['D', 'atp_matches_qual_chall_2003_5910'], ['D', 'atp_matches_qual_chall_2003_6734'], ['D', 'atp_matches_2003_7265'], ['D', 'atp_matches_2004_5'], ['D', 'atp_matches_qual_chall_2004_423'], ['D', 'atp_matches_2004_630'], ['D', 'atp_matches_qual_chall_2004_752'], ['V', 'atp_matches_qual_chall_2004_1154'], ['V', 'atp_matches_qual_chall_2004_1163'], ['D', 'atp_matches_qual_chall_2004_1167'], ['V', 'atp_matches_qual_chall_2004_2073'], ['D', 'atp_matches_qual_chall_2004_2085'], ['V', 'atp_matches_qual_chall_2004_2287'], ['V', 'atp_matches_qual_chall_2004_2301'], ['V', 'atp_matches_qual_chall_2004_2308'], ['V', 'atp_matches_qual_chall_2004_2312'], ['V', 'atp_matches_qual_chall_2004_2314'], ['D', 'atp_matches_2004_2416'], ['D', 'atp_matches_qual_chall_2004_2596'], ['D', 'atp_matches_qual_chall_2004_3191'], ['D', 'atp_matches_qual_chall_2004_3236'], ['V', 'atp_matches_qual_chall_2004_3481'], ['D', 'atp_matches_qual_chall_2004_3490'], ['V', 'atp_matches_qual_chall_2004_3600'], ['D', 'atp_matches_qual_chall_2004_3611'], ['D', 'atp_matches_2004_4123'], ['D', 'atp_matches_2004_4825'], ['V', 'atp_matches_qual_chall_2004_5381'], ['V', 'atp_matches_qual_chall_2004_5393'], ['V', 'atp_matches_qual_chall_2004_5399'], 
['V', 'atp_matches_qual_chall_2004_5402'], ['D', 'atp_matches_qual_chall_2004_5403'], ['V', 'atp_matches_qual_chall_2004_5629'], ['V', 'atp_matches_qual_chall_2004_5641'], ['V', 'atp_matches_qual_chall_2004_5647'], ['V', 'atp_matches_qual_chall_2004_5650'], ['V', 'atp_matches_qual_chall_2004_5651'], ['D', 'atp_matches_qual_chall_2004_6096'], ['V', 'atp_matches_qual_chall_2004_6148'], ['D', 'atp_matches_qual_chall_2004_6335'], ['D', 'atp_matches_2004_6494'], ['V', 'atp_matches_qual_chall_2004_6580'], ['V', 'atp_matches_qual_chall_2004_6593'], ['D', 'atp_matches_qual_chall_2004_6600'], ['D', 'atp_matches_2004_7371'], ['D', 'atp_matches_qual_chall_2004_7567'], ['D', 'atp_matches_2005_70'], ['D', 'atp_matches_qual_chall_2005_393'], ['V', 'atp_matches_qual_chall_2005_427'], ['V', 'atp_matches_qual_chall_2005_560'], ['V', 'atp_matches_qual_chall_2005_570'], ['V', 'atp_matches_qual_chall_2005_575'], ['D', 'atp_matches_qual_chall_2005_578'], ['V', 'atp_matches_qual_chall_2005_772'], ['V', 'atp_matches_qual_chall_2005_785'], ['V', 'atp_matches_qual_chall_2005_791'], ['V', 'atp_matches_qual_chall_2005_794'], ['D', 'atp_matches_qual_chall_2005_796'], ['V', 'atp_matches_2005_1046'], ['D', 'atp_matches_2005_1061'], ['D', 'atp_matches_qual_chall_2005_1150'], ['D', 'atp_matches_2005_1298'], ['D', 'atp_matches_2005_1299'], ['V', 'atp_matches_qual_chall_2005_1907'], ['V', 'atp_matches_qual_chall_2005_1918'], ['V', 'atp_matches_qual_chall_2005_1924'], ['D', 'atp_matches_qual_chall_2005_1927'], ['V', 'atp_matches_2005_2334'], ['V', 'atp_matches_2005_2358'], ['V', 'atp_matches_2005_2373'], ['D', 'atp_matches_2005_2381'], ['V', 'atp_matches_2005_2630'], ['D', 'atp_matches_2005_2661'], ['V', 'atp_matches_2005_3180'], ['V', 'atp_matches_2005_3214'], ['D', 'atp_matches_2005_3231'], ['V', 'atp_matches_qual_chall_2005_3262'], ['V', 'atp_matches_qual_chall_2005_3293'], ['V', 'atp_matches_qual_chall_2005_3355'], ['D', 'atp_matches_2005_3757'], ['D', 'atp_matches_2005_3914']]"
23 | Matches_Clay_1,"['V', 'D', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'D', 'V', 'D', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V']"
24 | Matches_Carpet_1,"['D', 'D', 'D']"
25 | Matches_Grass_1,"['D', 'D']"
26 | Matches_Hard_1,"['D', 'D', 'D', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'D', 'D', 'D']"
27 | Victories_Percentage_1,57.99999999999999
28 | Clay_Victories_Percentage_1,66.66666666666666
29 | Carpet_Victories_Percentage_1,0.0
30 | Grass_Victories_Percentage_1,0.0
31 | Hard_Victories_Percentage_1,46.15384615384615
32 | Aces_Percentage_1,7.41825280624695
33 | Doublefaults_Percentage_1,4.978038067349927
34 | First_Serve_Success_Percentage_1,57.10102489019033
35 | Winning_on_1st_Serve_Percentage_1,51.87896534895072
36 | Winning_on_2nd_Serve_Percentage_1,21.57149829184968
37 | Overall_Win_on_Serve_Percentage_1,73.4504636408004
38 | BreakPoint_Face_Percentage_1,9.809663250366032
39 | BreakPoint_Saved_Percentage_1,6.149341142020498
40 | games_fatigue_1,26.666666666666668
41 | minutes_fatigue_1,209.44444444444449
42 | last_rankings_1,"[74, 153, 163, 166, 161, 159, 9999, 153, 120, 117, 113, 87]"
43 | last_ranking_points_1,"[547, 261, 241, 242, 258, 262, 0, 291, 377, 362, 397, 467]"
44 | Name_2,Filippo.Volandri
45 | ID_2,103835
46 | Ranking_2,32
47 | Ranking_Points_2,1060
48 | Ranking_History_2,"{19990329: [354, 93], 19990719: [360, 94], 19990809: [452, 63], 19990816: [433, 70], 19991025: [274, 134], 20000214: [263, 119], 20000221: [264, 120], 20000228: [247, 132], 20000313: [281, 110], 20000327: [269, 117], 20000515: [294, 105], 20000522: [257, 127], 20000612: [248, 128], 20000626: [253, 129], 20000710: [245, 129], 20000724: [252, 129], 20000731: [242, 140], 20000807: [242, 140], 20000821: [210, 169], 20000828: [208, 174], 20000904: [208, 174], 20000918: [219, 161], 20000925: [219, 161], 20001009: [152, 252], 20001023: [152, 249], 20010115: [163, 239], 20010205: [161, 239], 20010212: [161, 239], 20010219: [152, 250], 20010226: [158, 244], 20010312: [152, 253], 20010406: [159, 247], 20010507: [156, 247], 20010514: [157, 252], 20010528: [163, 236], 20010604: [163, 236], 20010611: [138, 286], 20010625: [138, 286], 20010702: [138, 286], 20010709: [126, 315], 20010730: [125, 319], 20010806: [126, 319], 20010813: [138, 289], 20010820: [137, 289], 20010827: [139, 289], 20010903: [139, 289], 20010921: [141, 277], 20010924: [145, 277], 20011001: [199, 190], 20011008: [200, 190], 20011126: [212, 175], 20020318: [243, 146], 20020325: [243, 146], 20020422: [209, 171], 20020429: [202, 182], 20020506: [181, 216], 20020513: [179, 215], 20020527: [170, 222], 20020603: [170, 222], 20020610: [197, 193], 20020617: [196, 193], 20020624: [184, 198], 20020701: [184, 198], 20020708: [183, 201], 20020722: [190, 186], 20020729: [194, 186], 20020805: [191, 186], 20020812: [189, 193], 20020826: [163, 229], 20020902: [163, 229], 20020909: [165, 229], 20020923: [158, 233], 20020930: [154, 238], 20021007: [153, 242], 20021014: [155, 242], 20021125: [154, 244], 20030113: [158, 244], 20030224: [150, 247], 20030317: [149, 257], 20030324: [149, 257], 20030404: [129, 285], 20030407: [128, 285], 20030414: [131, 285], 20030421: [98, 420], 20030505: [100, 408], 20030512: [79, 528], 20030526: [79, 535], 20030609: [80, 536], 20030623: [69, 607], 20030630: [69, 607], 
20030707: [68, 614], 20030714: [60, 649], 20030721: [53, 704], 20030728: [47, 818], 20030825: [47, 775], 20030919: [48, 768], 20030922: [47, 768], 20030929: [49, 773], 20031006: [49, 773], 20031020: [49, 772], 20040105: [47, 772], 20040119: [46, 772], 20040209: [44, 799], 20040216: [45, 834], 20040223: [43, 844], 20040301: [44, 844], 20040308: [42, 889], 20040419: [51, 704], 20040503: [54, 679], 20040510: [58, 629], 20040517: [61, 614], 20040524: [47, 774], 20040607: [45, 766], 20040621: [52, 701], 20040705: [51, 710], 20040716: [58, 685], 20040719: [65, 630], 20040726: [65, 630], 20040802: [66, 633], 20040816: [59, 685], 20040830: [60, 685], 20040913: [53, 715], 20040924: [46, 775], 20040927: [45, 775], 20041011: [37, 880], 20041025: [39, 890], 20041101: [40, 890], 20050103: [43, 880], 20050110: [42, 880], 20050117: [41, 880], 20050131: [43, 850], 20050207: [40, 900], 20050214: [41, 885], 20050221: [42, 885], 20050307: [40, 885], 20050321: [41, 885], 20050404: [42, 890], 20050411: [37, 940], 20050428: [31, 1060], 20050502: [31, 1060], 20050509: [31, 1020], 20050523: [34, 990], 20050620: [32, 1060], 20050627: [32, 1060]}"
49 | Best_Rank_2,31
50 | Birth_Year_2,19810905.0
51 | Versus_2,"[['V', 20030721, 'atp_matches_2003_4782']]"
52 | Hand_2,R
53 | Last_Tournament_Date_2,20050620
54 | Height_2,183.0
55 | Matches_2,"[['D', 'atp_matches_qual_chall_1999_1308'], ['D', 'atp_matches_qual_chall_1999_3765'], ['V', 'atp_matches_qual_chall_1999_4422'], ['D', 'atp_matches_qual_chall_1999_4431'], ['D', 'atp_matches_qual_chall_1999_4654'], ['V', 'atp_matches_qual_chall_1999_6165'], ['D', 'atp_matches_qual_chall_1999_6174'], ['D', 'atp_matches_qual_chall_2000_917'], ['V', 'atp_matches_qual_chall_2000_1002'], ['V', 'atp_matches_qual_chall_2000_1017'], ['D', 'atp_matches_qual_chall_2000_1024'], ['V', 'atp_matches_qual_chall_2000_1195'], ['D', 'atp_matches_qual_chall_2000_1206'], ['V', 'atp_matches_qual_chall_2000_1504'], ['D', 'atp_matches_qual_chall_2000_1516'], ['D', 'atp_matches_qual_chall_2000_1667'], ['V', 'atp_matches_qual_chall_2000_2497'], ['V', 'atp_matches_qual_chall_2000_2506'], ['V', 'atp_matches_qual_chall_2000_2510'], ['D', 'atp_matches_qual_chall_2000_2512'], ['D', 'atp_matches_qual_chall_2000_2676'], ['D', 'atp_matches_qual_chall_2000_3244'], ['D', 'atp_matches_qual_chall_2000_3668'], ['D', 'atp_matches_qual_chall_2000_3986'], ['V', 'atp_matches_qual_chall_2000_4308'], ['V', 'atp_matches_qual_chall_2000_4320'], ['D', 'atp_matches_qual_chall_2000_4326'], ['D', 'atp_matches_qual_chall_2000_4633'], ['V', 'atp_matches_qual_chall_2000_4847'], ['V', 'atp_matches_qual_chall_2000_4862'], ['V', 'atp_matches_qual_chall_2000_4869'], ['V', 'atp_matches_qual_chall_2000_4873'], ['D', 'atp_matches_qual_chall_2000_4875'], ['V', 'atp_matches_qual_chall_2000_5257'], ['D', 'atp_matches_qual_chall_2000_5269'], ['D', 'atp_matches_qual_chall_2000_5351'], ['V', 'atp_matches_qual_chall_2000_5711'], ['V', 'atp_matches_qual_chall_2000_5724'], ['V', 'atp_matches_qual_chall_2000_5731'], ['D', 'atp_matches_qual_chall_2000_5734'], ['V', 'atp_matches_qual_chall_2000_5874'], ['V', 'atp_matches_qual_chall_2000_5883'], ['V', 'atp_matches_qual_chall_2000_5888'], ['V', 'atp_matches_qual_chall_2000_5890'], ['V', 'atp_matches_qual_chall_2000_5891'], ['V', 'atp_matches_qual_chall_2000_6085'], ['V', 
'atp_matches_qual_chall_2000_6098'], ['D', 'atp_matches_qual_chall_2000_6105'], ['V', 'atp_matches_qual_chall_2000_6332'], ['V', 'atp_matches_qual_chall_2000_6344'], ['D', 'atp_matches_qual_chall_2000_6350'], ['D', 'atp_matches_qual_chall_2000_6729'], ['D', 'atp_matches_qual_chall_2001_421'], ['D', 'atp_matches_qual_chall_2001_615'], ['V', 'atp_matches_qual_chall_2001_830'], ['V', 'atp_matches_qual_chall_2001_846'], ['D', 'atp_matches_qual_chall_2001_854'], ['V', 'atp_matches_qual_chall_2001_896'], ['D', 'atp_matches_qual_chall_2001_910'], ['V', 'atp_matches_qual_chall_2001_1264'], ['V', 'atp_matches_qual_chall_2001_1272'], ['D', 'atp_matches_qual_chall_2001_1276'], ['D', 'atp_matches_qual_chall_2001_1506'], ['D', 'atp_matches_2001_1801'], ['D', 'atp_matches_2001_2340'], ['V', 'atp_matches_qual_chall_2001_2490'], ['D', 'atp_matches_qual_chall_2001_2503'], ['D', 'atp_matches_qual_chall_2001_2886'], ['V', 'atp_matches_qual_chall_2001_2935'], ['V', 'atp_matches_qual_chall_2001_3001'], ['V', 'atp_matches_qual_chall_2001_3015'], ['V', 'atp_matches_qual_chall_2001_3022'], ['D', 'atp_matches_qual_chall_2001_3025'], ['V', 'atp_matches_qual_chall_2001_3042'], ['V', 'atp_matches_qual_chall_2001_3051'], ['V', 'atp_matches_qual_chall_2001_3055'], ['D', 'atp_matches_qual_chall_2001_3057'], ['D', 'atp_matches_qual_chall_2001_3221'], ['V', 'atp_matches_qual_chall_2001_3733'], ['V', 'atp_matches_qual_chall_2001_3745'], ['D', 'atp_matches_qual_chall_2001_3751'], ['V', 'atp_matches_qual_chall_2001_3823'], ['V', 'atp_matches_qual_chall_2001_3836'], ['D', 'atp_matches_qual_chall_2001_3843'], ['V', 'atp_matches_qual_chall_2001_4050'], ['V', 'atp_matches_qual_chall_2001_4058'], ['D', 'atp_matches_qual_chall_2001_4062'], ['D', 'atp_matches_qual_chall_2001_4769'], ['V', 'atp_matches_qual_chall_2001_4884'], ['D', 'atp_matches_qual_chall_2001_4900'], ['D', 'atp_matches_qual_chall_2001_5222'], ['V', 'atp_matches_qual_chall_2001_5377'], ['D', 'atp_matches_qual_chall_2001_5387'], ['V', 
'atp_matches_qual_chall_2001_5764'], ['V', 'atp_matches_qual_chall_2001_5778'], ['D', 'atp_matches_qual_chall_2001_5785'], ['D', 'atp_matches_qual_chall_2001_5834'], ['V', 'atp_matches_2001_6354'], ['V', 'atp_matches_2001_6357'], ['D', 'atp_matches_2001_6381'], ['D', 'atp_matches_qual_chall_2001_6778'], ['D', 'atp_matches_qual_chall_2001_6910'], ['D', 'atp_matches_qual_chall_2001_7787'], ['V', 'atp_matches_qual_chall_2002_1705'], ['V', 'atp_matches_qual_chall_2002_1719'], ['V', 'atp_matches_qual_chall_2002_1726'], ['D', 'atp_matches_qual_chall_2002_1730'], ['V', 'atp_matches_qual_chall_2002_1773'], ['D', 'atp_matches_qual_chall_2002_1784'], ['V', 'atp_matches_qual_chall_2002_2263'], ['V', 'atp_matches_qual_chall_2002_2272'], ['D', 'atp_matches_qual_chall_2002_2276'], ['V', 'atp_matches_qual_chall_2002_2317'], ['V', 'atp_matches_qual_chall_2002_2330'], ['V', 'atp_matches_qual_chall_2002_2336'], ['V', 'atp_matches_qual_chall_2002_2339'], ['D', 'atp_matches_qual_chall_2002_2341'], ['D', 'atp_matches_2002_2478'], ['V', 'atp_matches_qual_chall_2002_2597'], ['V', 'atp_matches_qual_chall_2002_2609'], ['D', 'atp_matches_qual_chall_2002_2615'], ['V', 'atp_matches_qual_chall_2002_2866'], ['V', 'atp_matches_qual_chall_2002_2881'], ['D', 'atp_matches_qual_chall_2002_2889'], ['D', 'atp_matches_qual_chall_2002_3046'], ['V', 'atp_matches_qual_chall_2002_3085'], ['V', 'atp_matches_qual_chall_2002_3177'], ['D', 'atp_matches_qual_chall_2002_3187'], ['D', 'atp_matches_qual_chall_2002_3320'], ['V', 'atp_matches_qual_chall_2002_3440'], ['D', 'atp_matches_qual_chall_2002_3455'], ['V', 'atp_matches_qual_chall_2002_3832'], ['V', 'atp_matches_qual_chall_2002_3848'], ['D', 'atp_matches_qual_chall_2002_3856'], ['V', 'atp_matches_qual_chall_2002_3929'], ['V', 'atp_matches_qual_chall_2002_3943'], ['V', 'atp_matches_qual_chall_2002_3950'], ['D', 'atp_matches_qual_chall_2002_3953'], ['D', 'atp_matches_qual_chall_2002_4087'], ['D', 'atp_matches_qual_chall_2002_4410'], ['D', 
'atp_matches_qual_chall_2002_4863'], ['V', 'atp_matches_qual_chall_2002_4897'], ['V', 'atp_matches_qual_chall_2002_4909'], ['D', 'atp_matches_qual_chall_2002_4915'], ['V', 'atp_matches_qual_chall_2002_5081'], ['V', 'atp_matches_qual_chall_2002_5094'], ['V', 'atp_matches_qual_chall_2002_5101'], ['V', 'atp_matches_qual_chall_2002_5104'], ['D', 'atp_matches_qual_chall_2002_5106'], ['D', 'atp_matches_qual_chall_2002_5616'], ['V', 'atp_matches_qual_chall_2002_5639'], ['V', 'atp_matches_qual_chall_2002_5649'], ['D', 'atp_matches_qual_chall_2002_5654'], ['D', 'atp_matches_qual_chall_2002_5704'], ['V', 'atp_matches_qual_chall_2002_5786'], ['D', 'atp_matches_qual_chall_2002_5800'], ['D', 'atp_matches_2002_6105'], ['V', 'atp_matches_qual_chall_2002_6441'], ['D', 'atp_matches_qual_chall_2002_6452'], ['D', 'atp_matches_qual_chall_2002_6525'], ['V', 'atp_matches_qual_chall_2002_6636'], ['D', 'atp_matches_qual_chall_2002_6650'], ['D', 'atp_matches_qual_chall_2002_7293'], ['D', 'atp_matches_qual_chall_2003_324'], ['V', 'atp_matches_qual_chall_2003_351'], ['V', 'atp_matches_qual_chall_2003_405'], ['D', 'atp_matches_2003_1184'], ['V', 'atp_matches_qual_chall_2003_1621'], ['V', 'atp_matches_qual_chall_2003_1630'], ['V', 'atp_matches_qual_chall_2003_1634'], ['V', 'atp_matches_qual_chall_2003_1636'], ['V', 'atp_matches_qual_chall_2003_1637'], ['D', 'atp_matches_qual_chall_2003_1644'], ['D', 'atp_matches_2003_1745'], ['D', 'atp_matches_2003_1748'], ['V', 'atp_matches_qual_chall_2003_1788'], ['D', 'atp_matches_qual_chall_2003_1802'], ['V', 'atp_matches_2003_1954'], ['V', 'atp_matches_2003_1979'], ['V', 'atp_matches_2003_1991'], ['D', 'atp_matches_2003_1997'], ['V', 'atp_matches_2003_2066'], ['V', 'atp_matches_2003_2090'], ['D', 'atp_matches_2003_2105'], ['V', 'atp_matches_2003_2415'], ['V', 'atp_matches_2003_2439'], ['V', 'atp_matches_2003_2451'], ['D', 'atp_matches_2003_2457'], ['D', 'atp_matches_2003_2520'], ['D', 'atp_matches_2003_2846'], ['D', 'atp_matches_qual_chall_2003_2941'], 
['V', 'atp_matches_qual_chall_2003_2973'], ['V', 'atp_matches_qual_chall_2003_3037'], ['V', 'atp_matches_qual_chall_2003_3356'], ['V', 'atp_matches_qual_chall_2003_3364'], ['V', 'atp_matches_qual_chall_2003_3368'], ['V', 'atp_matches_qual_chall_2003_3370'], ['V', 'atp_matches_qual_chall_2003_3371'], ['D', 'atp_matches_2003_3636'], ['V', 'atp_matches_qual_chall_2003_3877'], ['V', 'atp_matches_qual_chall_2003_3885'], ['V', 'atp_matches_qual_chall_2003_3889'], ['D', 'atp_matches_qual_chall_2003_3891'], ['V', 'atp_matches_2003_4123'], ['V', 'atp_matches_2003_4132'], ['D', 'atp_matches_2003_4137'], ['V', 'atp_matches_2003_4344'], ['V', 'atp_matches_2003_4360'], ['V', 'atp_matches_2003_4374'], ['D', 'atp_matches_2003_4381'], ['V', 'atp_matches_2003_4772'], ['V', 'atp_matches_2003_4782'], ['V', 'atp_matches_2003_4787'], ['V', 'atp_matches_2003_4789'], ['D', 'atp_matches_2003_4790'], ['D', 'atp_matches_2003_5036'], ['D', 'atp_matches_2003_5655'], ['D', 'atp_matches_2003_6509'], ['V', 'atp_matches_2003_6587'], ['D', 'atp_matches_2003_6597'], ['D', 'atp_matches_2003_6884'], ['D', 'atp_matches_2003_6973'], ['D', 'atp_matches_2003_7273'], ['D', 'atp_matches_2004_34'], ['V', 'atp_matches_2004_232'], ['D', 'atp_matches_2004_273'], ['V', 'atp_matches_2004_842'], ['V', 'atp_matches_2004_852'], ['D', 'atp_matches_2004_857'], ['V', 'atp_matches_2004_928'], ['D', 'atp_matches_2004_941'], ['V', 'atp_matches_2004_1050'], ['D', 'atp_matches_2004_1064'], ['V', 'atp_matches_2004_1174'], ['V', 'atp_matches_2004_1188'], ['D', 'atp_matches_2004_1195'], ['D', 'atp_matches_2004_1269'], ['D', 'atp_matches_2004_2180'], ['V', 'atp_matches_2004_2507'], ['V', 'atp_matches_2004_2532'], ['D', 'atp_matches_2004_2545'], ['D', 'atp_matches_2004_2620'], ['V', 'atp_matches_2004_2747'], ['V', 'atp_matches_2004_2761'], ['V', 'atp_matches_2004_2768'], ['V', 'atp_matches_2004_2772'], ['V', 'atp_matches_2004_2774'], ['D', 'atp_matches_2004_2962'], ['D', 'atp_matches_2004_3507'], ['V', 'atp_matches_2004_3728'], 
['D', 'atp_matches_2004_3785'], ['V', 'atp_matches_2004_4172'], ['D', 'atp_matches_2004_4187'], ['V', 'atp_matches_2004_4567'], ['V', 'atp_matches_2004_4569'], ['V', 'atp_matches_2004_4817'], ['V', 'atp_matches_2004_4831'], ['V', 'atp_matches_2004_4838'], ['V', 'atp_matches_2004_4841'], ['D', 'atp_matches_2004_4843'], ['V', 'atp_matches_qual_chall_2004_5093'], ['D', 'atp_matches_qual_chall_2004_5109'], ['V', 'atp_matches_qual_chall_2004_5124'], ['V', 'atp_matches_qual_chall_2004_5140'], ['V', 'atp_matches_qual_chall_2004_5148'], ['V', 'atp_matches_qual_chall_2004_5152'], ['V', 'atp_matches_qual_chall_2004_5154'], ['D', 'atp_matches_2004_5716'], ['V', 'atp_matches_2004_5994'], ['D', 'atp_matches_2004_6027'], ['V', 'atp_matches_2004_6485'], ['V', 'atp_matches_2004_6499'], ['V', 'atp_matches_2004_6506'], ['D', 'atp_matches_2004_6510'], ['V', 'atp_matches_2004_6661'], ['D', 'atp_matches_2004_6663'], ['V', 'atp_matches_2004_6772'], ['V', 'atp_matches_2004_6786'], ['V', 'atp_matches_2004_6793'], ['V', 'atp_matches_2004_6797'], ['D', 'atp_matches_2004_6799'], ['V', 'atp_matches_2004_7231'], ['D', 'atp_matches_2004_7241'], ['D', 'atp_matches_2004_7364'], ['D', 'atp_matches_2004_7528'], ['D', 'atp_matches_2005_65'], ['D', 'atp_matches_2005_188'], ['D', 'atp_matches_2005_297'], ['V', 'atp_matches_2005_715'], ['V', 'atp_matches_2005_725'], ['V', 'atp_matches_2005_730'], ['D', 'atp_matches_2005_733'], ['D', 'atp_matches_2005_893'], ['D', 'atp_matches_2005_1118'], ['V', 'atp_matches_2005_1274'], ['V', 'atp_matches_2005_1284'], ['D', 'atp_matches_2005_1289'], ['D', 'atp_matches_2005_1432'], ['D', 'atp_matches_2005_1770'], ['V', 'atp_matches_2005_1944'], ['V', 'atp_matches_2005_1952'], ['V', 'atp_matches_2005_1956'], ['D', 'atp_matches_2005_1958'], ['V', 'atp_matches_2005_2141'], ['V', 'atp_matches_2005_2160'], ['V', 'atp_matches_2005_2169'], ['D', 'atp_matches_2005_2174'], ['V', 'atp_matches_2005_2544'], ['D', 'atp_matches_2005_2545'], ['V', 'atp_matches_2005_2650'], ['D', 
'atp_matches_2005_2671'], ['V', 'atp_matches_2005_2767'], ['V', 'atp_matches_2005_2792'], ['V', 'atp_matches_2005_2804'], ['D', 'atp_matches_2005_2810'], ['V', 'atp_matches_2005_3181'], ['V', 'atp_matches_2005_3215'], ['D', 'atp_matches_2005_3232'], ['D', 'atp_matches_2005_3931'], ['V', 'atp_matches_qual_chall_2005_4150'], ['V', 'atp_matches_qual_chall_2005_4166'], ['V', 'atp_matches_qual_chall_2005_4174'], ['D', 'atp_matches_qual_chall_2005_4178']]"
56 | Matches_Clay_2,"['D', 'D', 'V', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'D', 'D', 'D', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'D', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'D', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'D', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'D', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'D', 'D', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'D', 'V', 'D', 'V', 'V', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'D']"
57 | Matches_Carpet_2,"['D', 'D', 'D', 'D', 'D', 'D', 'D', 'V', 'D', 'D', 'D']"
58 | Matches_Grass_2,"['D', 'D', 'V', 'D', 'D']"
59 | Matches_Hard_2,"['D', 'V', 'D', 'V', 'V', 'D', 'V', 'D', 'D', 'D', 'D', 'V', 'V', 'D', 'V', 'D', 'V', 'V', 'D', 'V', 'V', 'V', 'V', 'D', 'D', 'D', 'V', 'V', 'D', 'D', 'D', 'D', 'V', 'D', 'D', 'D', 'V', 'D', 'D', 'D', 'D', 'D', 'D']"
60 | Victories_Percentage_2,56.03715170278638
61 | Clay_Victories_Percentage_2,61.36363636363637
62 | Carpet_Victories_Percentage_2,9.090909090909092
63 | Grass_Victories_Percentage_2,20.0
64 | Hard_Victories_Percentage_2,39.53488372093023
65 | Aces_Percentage_2,1.2510230328539695
66 | Doublefaults_Percentage_2,3.7881445106980007
67 | First_Serve_Success_Percentage_2,66.30422074126038
68 | Winning_on_1st_Serve_Percentage_2,65.75470595112826
69 | Winning_on_2nd_Serve_Percentage_2,16.49713550800889
70 | Overall_Win_on_Serve_Percentage_2,82.25184145913715
71 | BreakPoint_Face_Percentage_2,10.312171168011224
72 | BreakPoint_Saved_Percentage_2,5.869285630772828
73 | games_fatigue_2,58.0
74 | minutes_fatigue_2,
75 | last_rankings_2,"[32, 65, 60, 45, 39, 40, 9999, 43, 42, 41, 31, 34]"
76 | last_ranking_points_2,"[1060, 630, 685, 775, 890, 890, 0, 850, 885, 885, 1060, 990]"
77 | Winner,0
78 | score,6-1 2-6 6-4
79 | elapsed_minutes,100.0
80 | aces_nb_1,4.0
81 | doublefaults_nb_1,5.0
82 | svpt_1,68.0
83 | 1stIn_1,36.0
84 | 1stWon_1,27.0
85 | 2ndWon_1,16.0
86 | SvGms_1,13.0
87 | bpSaved_1,5.0
88 | bpFaced_1,9.0
89 | aces_nb_2,0.0
90 | doublefaults_nb_2,6.0
91 | svpt_2,93.0
92 | 1stIn_2,63.0
93 | 1stWon_2,33.0
94 | 2ndWon_2,17.0
95 | SvGms_2,12.0
96 | bpSaved_2,6.0
97 | bpFaced_2,11.0
98 | tournament_year,2005
99 | 


--------------------------------------------------------------------------------