├── .gitignore
├── pics
    ├── cover.png
    ├── cover_cropped.png
    ├── test_harness_hacking.png
    ├── test_harness_hacking_mitigation_study1.png
    ├── test_harness_hacking_mitigation_study2.png
    ├── test_harness_hacking_hill_climbing_performance.png
    ├── logo.svg
    └── seed_hacked_result.svg
├── examples
    ├── src
    │   ├── test_set_memorization.py
    │   ├── seed_hacking_learning_algorithm.py
    │   ├── seed_hacking_cross_validation.py
    │   ├── seed_hacking_train_test_split.py
    │   ├── leaderboard_hacking.py
    │   ├── test_set_pruning.py
    │   ├── precision_hacking.py
    │   ├── p_hacking_selective_sampling.py
    │   ├── p_hacking_feature_selection.py
    │   ├── train_test_ratio_gaming.py
    │   ├── p_hacking_feature_transforms.py
    │   ├── p_hacking_learning_algorithm.py
    │   ├── seed_hacking_bootstrap_performance.py
    │   ├── test_set_overfitting.py
    │   ├── seed_hacking_perceptron.py
    │   ├── seed_hacking_sgd_classifier.py
    │   ├── seed_hacking_ridge_classifier.py
    │   ├── seed_hacking_decision_tree.py
    │   ├── seed_hacking_random_forest.py
    │   ├── seed_hacking_logistic_regression.py
    │   ├── seed_hacking_gradient_boosting.py
    │   ├── seed_hacking_multilayer_percepron.py
    │   ├── seed_hacking_bagging.py
    │   ├── test_harness_hacking_hill_climbing_test_folds.py
    │   ├── test_harness_hacking_hill_climbing_performance.py
    │   └── test_harness_hacking_mitigation.py
    ├── test_set_memorization.md
    ├── seed_hacking_learning_algorithm.md
    ├── seed_hacking_cross_validation.md
    ├── seed_hacking_train_test_split.md
    ├── p_hacking_selective_sampling.md
    ├── p_hacking_feature_selection.md
    ├── p_hacking_learning_algorithm.md
    ├── leaderboard_hacking.md
    ├── test_set_pruning.md
    ├── test_set_overfitting.md
    ├── train_test_ratio_gaming.md
    ├── test_harness_hacking_hill_climbing_test_folds.md
    ├── test_harness_hacking_hill_climbing_performance.md
    ├── seed_hacking_bootstrap_performance.md
    ├── threshold_hacking.md
    ├── test_harness_hacking.md
    ├── p_hacking.md
    ├── test_harness_hacking_mitigation.md
    └── seed_hacking.md
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
1 | dev
2 | .DS_Store
--------------------------------------------------------------------------------
/pics/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jason2Brownlee/MachineLearningMischief/HEAD/pics/cover.png
--------------------------------------------------------------------------------
/pics/cover_cropped.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jason2Brownlee/MachineLearningMischief/HEAD/pics/cover_cropped.png
--------------------------------------------------------------------------------
/pics/test_harness_hacking.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jason2Brownlee/MachineLearningMischief/HEAD/pics/test_harness_hacking.png
--------------------------------------------------------------------------------
/pics/test_harness_hacking_mitigation_study1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jason2Brownlee/MachineLearningMischief/HEAD/pics/test_harness_hacking_mitigation_study1.png
--------------------------------------------------------------------------------
/pics/test_harness_hacking_mitigation_study2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jason2Brownlee/MachineLearningMischief/HEAD/pics/test_harness_hacking_mitigation_study2.png -------------------------------------------------------------------------------- /pics/test_harness_hacking_hill_climbing_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jason2Brownlee/MachineLearningMischief/HEAD/pics/test_harness_hacking_hill_climbing_performance.png -------------------------------------------------------------------------------- /examples/src/test_set_memorization.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | from sklearn.datasets import make_classification # For generating synthetic dataset 3 | from sklearn.model_selection import train_test_split # For splitting the dataset 4 | from sklearn.neighbors import KNeighborsClassifier # For K-Nearest Neighbors classifier 5 | from sklearn.metrics import accuracy_score # For evaluating model performance 6 | 7 | # Generate a synthetic classification dataset 8 | X, y = make_classification(n_samples=100, n_features=10, n_classes=2, random_state=42) 9 | 10 | # Split the dataset into train and test sets 11 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 12 | 13 | # Create a K-Nearest Neighbors (KNN) classifier with k=1 14 | knn = KNeighborsClassifier(n_neighbors=1) 15 | 16 | # Fit the model on the test set (intentional test set leakage) 17 | knn.fit(X_test, y_test) 18 | 19 | # Evaluate the model on the test set 20 | y_pred = knn.predict(X_test) 21 | 22 | # Report the perfect score 23 | print("Accuracy:", accuracy_score(y_test, y_pred)) 24 | -------------------------------------------------------------------------------- /pics/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 14 | 15 | 16 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 32 | 33 | -------------------------------------------------------------------------------- /examples/src/seed_hacking_learning_algorithm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import make_classification 3 | from sklearn.ensemble import RandomForestClassifier 4 | from sklearn.model_selection import cross_val_score, KFold 5 | 6 | # Define the number of trials 7 | num_trials = 100 8 | 9 | # Define variables to track the best seed and best performance 10 | best_seed = None 11 | best_performance = -np.inf 12 | 13 | # Create a synthetic classification dataset 14 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42) 15 | 16 | # Fix the cross-validation folds for all evaluations 17 | kf = KFold(n_splits=5, shuffle=True, random_state=42) 18 | 19 | # Iterate over multiple seeds for the model's randomness 20 | for trial in range(num_trials): 21 | # Set the seed for the random forest model 22 | seed = trial 23 | 24 | # Initialize the model with the current seed 25 | model = RandomForestClassifier(n_estimators=50, random_state=seed) 26 | 27 | # Evaluate the model using cross-validation 28 | scores = cross_val_score(model, X, y, cv=kf) 29 | 30 | # Calculate the mean performance 31 | mean_performance = scores.mean() 32 | 33 | # Print the seed and performance if there is an improvement 34 | if mean_performance > best_performance: 35 | print(f"Seed: {seed}, Performance: 
{mean_performance:.4f}") 36 | best_performance = mean_performance 37 | best_seed = seed 38 | 39 | # Report the best seed and its performance 40 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}") 41 | -------------------------------------------------------------------------------- /examples/src/seed_hacking_cross_validation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import make_classification 3 | from sklearn.ensemble import RandomForestClassifier 4 | from sklearn.model_selection import cross_val_score, KFold 5 | 6 | # Define the number of trials 7 | num_trials = 100 8 | 9 | # Define variables to track the best fold configuration and best performance 10 | best_fold_seed = None 11 | best_performance = -np.inf 12 | 13 | # Create a synthetic classification dataset 14 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42) 15 | 16 | # Initialize the model with a fixed seed 17 | model = RandomForestClassifier(n_estimators=50, random_state=42) 18 | 19 | # Iterate over multiple seeds to vary the k-fold cross-validation splits 20 | for trial in range(num_trials): 21 | # Set the seed for the k-fold shuffle 22 | fold_seed = trial 23 | 24 | # Initialize k-fold cross-validation with the current seed 25 | kf = KFold(n_splits=5, shuffle=True, random_state=fold_seed) 26 | 27 | # Evaluate the model using cross-validation 28 | scores = cross_val_score(model, X, y, cv=kf) 29 | 30 | # Calculate the mean performance 31 | mean_performance = scores.mean() 32 | 33 | # Print the fold seed and performance if there is an improvement 34 | if mean_performance > best_performance: 35 | print(f"Fold Seed: {fold_seed}, Performance: {mean_performance:.4f}") 36 | best_performance = mean_performance 37 | best_fold_seed = fold_seed 38 | 39 | # Report the best fold seed and its performance 40 | print(f"\nBest Fold Seed: {best_fold_seed}, Best Performance: {best_performance:.4f}") 41 | -------------------------------------------------------------------------------- /examples/src/seed_hacking_train_test_split.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import make_classification 3 | from sklearn.ensemble import RandomForestClassifier 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.metrics import accuracy_score 6 | 7 | # Define the number of trials 8 | num_trials = 100 9 | 10 | # Define variables to track the best seed and best performance 11 | best_seed = None 12 | best_performance = -np.inf 13 | 14 | # Create a synthetic classification dataset 15 | X, y = make_classification(n_samples=100, n_features=20, n_informative=15, n_redundant=5, random_state=42) 16 | 17 | # Initialize the model with a fixed seed 18 | model = RandomForestClassifier(random_state=42) 19 | 20 | # Iterate over multiple seeds to vary the train/test split 21 | for trial in range(num_trials): 22 | # Set the seed for train/test split 23 | split_seed = trial 24 | 25 | # Create a train/test split with the current seed 26 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=split_seed) 27 | 28 | # Train the model on the training data 29 | model.fit(X_train, y_train) 30 | 31 | # Evaluate the model on the test data 32 | y_pred = model.predict(X_test) 33 | performance = accuracy_score(y_test, y_pred) 34 | 35 | # Print the split seed and performance if 
there is an improvement 36 | if performance > best_performance: 37 | print(f"Split Seed: {split_seed}, Performance: {performance:.4f}") 38 | best_performance = performance 39 | best_seed = split_seed 40 | 41 | # Report the best split seed and its performance 42 | print(f"\nBest Split Seed: {best_seed}, Best Performance: {best_performance:.4f}") -------------------------------------------------------------------------------- /examples/src/leaderboard_hacking.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import make_classification 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import accuracy_score 5 | 6 | # Generate a synthetic classification dataset 7 | X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42) 8 | 9 | # Split into training and test sets 10 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 11 | 12 | # Artificial "leaderboard" to evaluate predictions on the test set 13 | def leaderboard_evaluation(predictions, true_labels): 14 | """Calculate the leaderboard score (accuracy in this case).""" 15 | return accuracy_score(true_labels, predictions) 16 | 17 | # Initialize random predictions for the test set 18 | best_predictions = np.random.randint(0, 2, size=len(y_test)) 19 | best_score = leaderboard_evaluation(best_predictions, y_test) 20 | 21 | # Stochastic hill climber: adjust predictions iteratively 22 | max_trials = 10000 # Maximum number of trials 23 | for trial in range(max_trials): 24 | # Copy the best predictions and randomly flip one value 25 | new_predictions = best_predictions.copy() 26 | index_to_flip = np.random.randint(len(new_predictions)) 27 | new_predictions[index_to_flip] = 1 - new_predictions[index_to_flip] # Flip the prediction 28 | 29 | # Evaluate the new predictions 30 | new_score = leaderboard_evaluation(new_predictions, y_test) 31 | 32 | # If the new score is better, adopt the new predictions 33 | if new_score > best_score: 34 | best_predictions = new_predictions 35 | best_score = new_score 36 | 37 | # Report progress 38 | print(f"Trial {trial + 1}/{max_trials}: Leaderboard Score = {new_score:.4f}, Best Score = {best_score:.4f}") 39 | 40 | # Stop if perfect score is achieved 41 | if best_score == 1.0: 42 | print("Perfect score achieved!") 43 | break -------------------------------------------------------------------------------- /examples/src/test_set_pruning.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | from sklearn.datasets import make_classification 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.metrics import accuracy_score 6 | import numpy as np 7 | 8 | # Generate a synthetic classification dataset 9 | X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42) 10 | 11 | # Split the dataset into training and testing sets 12 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 13 | 14 | # Initialize a Random Forest classifier 15 | model = RandomForestClassifier(random_state=42) 16 | 17 | # Train the model on the training set 18 | model.fit(X_train, y_train) 19 | 20 | # Predict on the test set 21 | y_pred = model.predict(X_test) 22 | 23 | # Calculate the initial accuracy 24 | initial_accuracy = accuracy_score(y_test, 
y_pred) 25 | print(f"Initial Test Accuracy: {initial_accuracy}") 26 | 27 | # Iteratively remove one misclassified example per iteration 28 | X_test_pruned = X_test 29 | y_test_pruned = y_test 30 | while True: 31 | # Predict on the pruned test set 32 | y_pred_pruned = model.predict(X_test_pruned) 33 | 34 | # Identify indices of misclassified samples 35 | misclassified_indices = np.where(y_pred_pruned != y_test_pruned)[0] 36 | 37 | # Break if no misclassified samples remain 38 | if len(misclassified_indices) == 0: 39 | break 40 | 41 | # Remove one misclassified sample 42 | index_to_remove = misclassified_indices[0] # Select the first misclassified sample 43 | X_test_pruned = np.delete(X_test_pruned, index_to_remove, axis=0) 44 | y_test_pruned = np.delete(y_test_pruned, index_to_remove, axis=0) 45 | 46 | # Recalculate accuracy on the pruned test set 47 | pruned_accuracy = accuracy_score(y_test_pruned, model.predict(X_test_pruned)) 48 | print(f"Pruned Test Accuracy: {pruned_accuracy}") 49 | -------------------------------------------------------------------------------- /examples/src/precision_hacking.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | import numpy as np 3 | from sklearn.datasets import make_classification 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import precision_score 7 | 8 | # Generate a synthetic classification dataset 9 | X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, 10 | n_redundant=5, random_state=42) 11 | 12 | # Split the dataset into train and test sets 13 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 14 | 15 | # Initialize the logistic regression model 16 | model = LogisticRegression(random_state=42, max_iter=1000) 17 | 18 | # Train the model on the training set 19 | model.fit(X_train, y_train) 20 | 21 | # Get raw predicted probabilities for the positive class 22 | y_proba = model.predict_proba(X_test)[:, 1] 23 | 24 | # Define a range of thresholds to evaluate 25 | thresholds = np.linspace(0.1, 0.9, 81) 26 | 27 | # Track best precision score and corresponding threshold 28 | best_precision = 0 29 | best_threshold = 0 30 | 31 | # Iterate over each threshold 32 | print("Threshold Tuning Progress:") 33 | print(f"{'Threshold':<10}{'Precision':<10}{'Best Precision':<15}{'Best Threshold':<15}") 34 | for threshold in thresholds: 35 | # Convert probabilities to binary predictions based on the current threshold 36 | y_pred = (y_proba >= threshold).astype(int) 37 | 38 | # Calculate precision score 39 | precision = precision_score(y_test, y_pred) 40 | 41 | # Check if this is the best precision score so far 42 | if precision > best_precision: 43 | best_precision = precision 44 | best_threshold = threshold 45 | 46 | # Report progress 47 | print(f"{threshold:<10.2f}{precision:<10.2f}{best_precision:<15.2f}{best_threshold:<15.2f}") 48 | 49 | # Final best score and threshold 50 | print("\nFinal Results:") 51 | print(f"Best Precision: {best_precision:.2f}") 52 | print(f"Best Threshold: {best_threshold:.2f}") -------------------------------------------------------------------------------- /examples/src/p_hacking_selective_sampling.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | import numpy as np 3 | from sklearn.datasets import make_classification 4 | from 
sklearn.model_selection import cross_val_score, StratifiedKFold 5 | from sklearn.linear_model import LogisticRegression 6 | from scipy.stats import ttest_ind 7 | 8 | # Generate a synthetic classification dataset 9 | X, y = make_classification(n_samples=500, n_features=10, n_informative=5, n_redundant=5, random_state=42) 10 | 11 | # Define a classifier 12 | model = LogisticRegression(random_state=42, max_iter=1000) 13 | 14 | # Define a k-fold cross-validation strategy with a fixed seed 15 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) 16 | 17 | # Evaluate the model on the full dataset using k-fold cross-validation 18 | baseline_scores = cross_val_score(model, X, y, cv=kfold) 19 | baseline_mean = np.mean(baseline_scores) 20 | print(f'Base result: {baseline_mean:.3f}') 21 | 22 | # Set up parameters for p-hacking 23 | p_threshold = 0.05 # Threshold for statistical significance 24 | max_trials = 1000 # Maximum number of sampling strategies to test 25 | sample_size = int(0.5 * X.shape[0]) 26 | 27 | # Perform selective sampling and evaluate subsets 28 | for trial in range(max_trials): 29 | # Randomly select a subset of samples 30 | np.random.seed(trial + 1) 31 | sample_indices = np.random.choice(range(X.shape[0]), size=sample_size, replace=False) 32 | X_subset, y_subset = X[sample_indices], y[sample_indices] 33 | 34 | # Evaluate the model on the sampled subset using cross-validation 35 | trial_scores = cross_val_score(model, X_subset, y_subset, cv=kfold) 36 | trial_mean = np.mean(trial_scores) 37 | better = trial_mean > baseline_mean 38 | 39 | # Perform a t-test to compare means 40 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores) 41 | significant = p_value < p_threshold 42 | 43 | # Report progress 44 | print(f'{trial+1}, Result: {trial_mean:.3f}, Better: {better}, p-value: {p_value:.3f} Significant: {significant}') 45 | 46 | # Stop if better and significant 47 | if better and significant: 48 | break 49 | -------------------------------------------------------------------------------- /pics/seed_hacked_result.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Model Performance 11 | 12 | 13 | Frequency 14 | 15 | 16 | 18 | 19 | 20 | 22 | 23 | 24 | 25 | Fair Estimate (Median) 26 | 27 | 28 | 29 | 30 | Seed-Hacked Result 31 | 32 | 33 | 34 | 35 | 36 | 0.75 37 | 0.80 38 | 0.85 39 | -------------------------------------------------------------------------------- /examples/src/p_hacking_feature_selection.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | import numpy as np 3 | from sklearn.datasets import make_classification 4 | from sklearn.model_selection import cross_val_score, StratifiedKFold 5 | from sklearn.ensemble import RandomForestClassifier 6 | from scipy.stats import ttest_ind 7 | 8 | # Generate a synthetic classification dataset 9 | X, y = make_classification(n_samples=500, n_features=10, n_informative=2, n_redundant=8, random_state=42) 10 | 11 | # Define a classifier 12 | model = RandomForestClassifier(n_estimators=10, random_state=42) 13 | 14 | # Define a k-fold cross-validation strategy with a fixed seed 15 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) 16 | 17 | # Evaluate the model on the full dataset using k-fold cross-validation 18 | baseline_scores = cross_val_score(model, X, y, cv=kfold) 19 | baseline_mean = np.mean(baseline_scores) 20 | print(f'Base result: {baseline_mean:.3f}') 21 | 
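# Note: the loop below is the p-hack itself. It repeatedly draws random feature
# subsets and re-runs the same fixed 5-fold CV until one subset both beats the
# baseline mean and yields p < 0.05 on a t-test. With up to 1000 attempts and no
# multiple-testing correction, a "significant" subset is very likely to appear by
# chance alone.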
22 | # Set up parameters for p-hacking 23 | p_threshold = 0.05 # Threshold for statistical significance 24 | max_trials = 1000 # Maximum number of feature subsets to test 25 | num_features = X.shape[1] 26 | 27 | # Perform selective feature subset selection and evaluation 28 | for trial in range(max_trials): 29 | # Randomly select a subset of features 30 | np.random.seed(trial + 1) 31 | selected_features = np.random.choice(range(num_features), size=np.random.randint(1, num_features + 1), replace=False) 32 | X_subset = X[:, selected_features] 33 | 34 | # Evaluate the model on the selected feature subset using cross-validation 35 | trial_scores = cross_val_score(model, X_subset, y, cv=kfold) 36 | trial_mean = np.mean(trial_scores) 37 | better = trial_mean > baseline_mean 38 | 39 | # Perform a t-test to compare means 40 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores) 41 | significant = p_value < p_threshold 42 | 43 | # Report progress 44 | print(f'{trial+1}, Features: {selected_features}, Result: {trial_mean:.3f}, Better: {better}, p-value: {p_value:.3f}, Significant: {significant}') 45 | 46 | # Stop if better and significant 47 | if better and significant: 48 | print("P-hacked subset identified!") 49 | break 50 | -------------------------------------------------------------------------------- /examples/src/train_test_ratio_gaming.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.datasets import make_classification 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.metrics import accuracy_score 8 | 9 | # Generate a synthetic classification dataset 10 | X, y = make_classification( 11 | n_samples=1000, # Number of samples 12 | n_features=20, # Number of features 13 | n_informative=15, # Number of informative features 14 | n_redundant=5, # Number of redundant features 15 | random_state=42 # Fixing random state for reproducibility 16 | ) 17 | 18 | # Fix random seed for consistent train/test splits 19 | random_seed = 42 20 | 21 | # Initialize a variable to track the best test performance and associated split ratio 22 | best_accuracy = 0 23 | best_ratio = 0 24 | 25 | # Iterate over train/test split ratios from 50% to 99% in 1% increments 26 | for train_size in range(50, 100): # Split ratios vary from 50% to 99% 27 | test_size = 100 - train_size # Calculate corresponding test size 28 | 29 | # Split the dataset into train and test sets 30 | X_train, X_test, y_train, y_test = train_test_split( 31 | X, y, 32 | train_size=train_size / 100.0, # Convert train_size to percentage 33 | random_state=random_seed # Fix the random seed 34 | ) 35 | 36 | # Initialize a Random Forest Classifier 37 | model = RandomForestClassifier(random_state=random_seed) 38 | 39 | # Train the model on the training data 40 | model.fit(X_train, y_train) 41 | 42 | # Predict on the test set 43 | y_pred = model.predict(X_test) 44 | 45 | # Evaluate test performance using accuracy 46 | accuracy = accuracy_score(y_test, y_pred) 47 | 48 | # Report progress 49 | print(f'> {train_size}/{test_size}: {accuracy}') 50 | 51 | # Update the best accuracy and split ratio if current accuracy is better 52 | if accuracy > best_accuracy: 53 | best_accuracy = accuracy 54 | best_ratio = train_size 55 | 56 | # Print the best train/test split ratio and corresponding accuracy 57 | print(f"Best train/test split ratio: {best_ratio}/{100 - 
best_ratio}") 58 | print(f"Best test accuracy: {best_accuracy}") -------------------------------------------------------------------------------- /examples/src/p_hacking_feature_transforms.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.datasets import make_classification 5 | from sklearn.model_selection import cross_val_score, StratifiedKFold 6 | from sklearn.linear_model import LogisticRegression 7 | from scipy.stats import ttest_ind 8 | 9 | # Generate a synthetic classification dataset 10 | X, y = make_classification(n_samples=350, n_features=10, n_informative=2, n_redundant=8, random_state=42) 11 | 12 | # Define a high-capacity machine learning model 13 | model = LogisticRegression(max_iter=1000, random_state=42) 14 | 15 | # Define a k-fold cross-validation strategy with a fixed seed 16 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) 17 | 18 | # Evaluate the model on the dataset using k-fold cross-validation (baseline without transformations) 19 | baseline_scores = cross_val_score(model, X, y, cv=kfold) 20 | baseline_mean = np.mean(baseline_scores) 21 | 22 | # Set up parameters for p-hacking 23 | p_threshold = 0.05 # Threshold for statistical significance 24 | transformations = ["none", "log", "sqrt", "square"] # Possible transformations to test 25 | significant_result_found = False 26 | 27 | # Loop through trials with different feature transformations 28 | for transform in transformations: 29 | # Apply the selected transformation to the features 30 | if transform == "log": 31 | X_transformed = np.log(np.abs(X) + 1) # Avoid log(0) or negative numbers 32 | elif transform == "sqrt": 33 | X_transformed = np.sqrt(np.abs(X)) # Avoid sqrt of negative numbers 34 | elif transform == "square": 35 | X_transformed = X ** 2 36 | else: # "none" 37 | X_transformed = X 38 | 39 | # Evaluate the model with k-fold cross-validation on transformed features 40 | trial_scores = cross_val_score(model, X_transformed, y, cv=kfold) 41 | trial_mean = np.mean(trial_scores) 42 | 43 | # Perform a t-test to compare means 44 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores) 45 | significant = p_value < p_threshold 46 | 47 | # Report progress 48 | print(f'Transform: {transform} mean: {trial_mean:.3f} (base: {baseline_mean:.3f}), p-value: {p_value:.3f}') 49 | if significant: 50 | print('\tSignificant difference') 51 | 52 | -------------------------------------------------------------------------------- /examples/src/p_hacking_learning_algorithm.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.datasets import make_classification 5 | from sklearn.model_selection import cross_val_score, StratifiedKFold 6 | from sklearn.ensemble import RandomForestClassifier 7 | from scipy.stats import ttest_ind 8 | 9 | # Generate a synthetic classification dataset 10 | X, y = make_classification(n_samples=350, n_features=10, n_informative=2, n_redundant=8, random_state=42) 11 | 12 | # Define a high-capacity machine learning model 13 | model = RandomForestClassifier(n_estimators=10, random_state=42) 14 | 15 | # Define a k-fold cross-validation strategy with a fixed seed 16 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) 17 | 18 | # Evaluate the model on the dataset using k-fold cross-validation 19 | baseline_scores = 
cross_val_score(model, X, y, cv=kfold) 20 | baseline_mean = np.mean(baseline_scores) 21 | 22 | # Set up parameters for p-hacking 23 | p_threshold = 0.05 # Threshold for statistical significance 24 | max_trials = 1000 # Maximum number of trials to perform 25 | significant_result_found = False 26 | 27 | # Loop through trials with different random seeds 28 | for trial in range(max_trials): 29 | # Use a new random seed for the model 30 | seed = trial + 100 31 | model = RandomForestClassifier(n_estimators=10, random_state=seed) 32 | 33 | # Evaluate the model with k-fold cross-validation 34 | trial_scores = cross_val_score(model, X, y, cv=kfold) 35 | trial_mean = np.mean(trial_scores) 36 | 37 | # Perform a t-test to compare means 38 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores) 39 | 40 | # Check if the p-value is below the significance threshold 41 | if p_value < p_threshold: 42 | significant_result_found = True 43 | print(f"Significant difference found on trial {trial+1}") 44 | print(f"Baseline mean: {baseline_mean:.4f}, Trial mean: {trial_mean:.4f}, p-value: {p_value:.4f}") 45 | break 46 | else: 47 | print(f"No significant difference found yet, trial {trial+1}, p-value: {p_value:.4f}") 48 | 49 | # Report if no significant result was found within the maximum trials 50 | if not significant_result_found: 51 | print("No significant result found after maximum trials.") -------------------------------------------------------------------------------- /examples/src/seed_hacking_bootstrap_performance.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | from sklearn.datasets import make_classification 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.utils import resample 6 | from sklearn.metrics import accuracy_score 7 | import numpy as np 8 | 9 | # Generate a synthetic classification dataset 10 | X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42) 11 | 12 | # Split the dataset into a training set and a test set 13 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 14 | 15 | # Initialize the random forest classifier 16 | model = RandomForestClassifier(random_state=42) 17 | 18 | # Train the model on the training set 19 | model.fit(X_train, y_train) 20 | 21 | # Number of bootstrap iterations 22 | num_bootstrap_iterations = 50 23 | 24 | # Number of repetitions for each bootstrap sample 25 | num_repeats_per_sample = 10 26 | 27 | # Variable to track the best accuracy and corresponding seed 28 | best_accuracy = 0 29 | best_seed = None 30 | 31 | # Iterate through multiple random seeds for bootstrap sampling 32 | for seed in range(num_bootstrap_iterations): 33 | # List to store accuracy scores for each repeat 34 | repeat_accuracies = [] 35 | 36 | # Evaluate the model on the same bootstrap sample multiple times 37 | for _ in range(num_repeats_per_sample): 38 | # Generate a bootstrap sample of the test set 39 | X_test_bootstrap, y_test_bootstrap = resample(X_test, y_test, random_state=seed) 40 | y_pred = model.predict(X_test_bootstrap) 41 | accuracy = accuracy_score(y_test_bootstrap, y_pred) 42 | repeat_accuracies.append(accuracy) 43 | 44 | # Compute the median accuracy for the current bootstrap sample 45 | median_accuracy = np.median(repeat_accuracies) 46 | 47 | # Report progress 48 | print(f'> Seed={seed}, Median Accuracy: {median_accuracy}') 49 | 50 | # Keep track of the 
best performance and its corresponding seed 51 | if median_accuracy > best_accuracy: 52 | best_accuracy = median_accuracy 53 | best_seed = seed 54 | 55 | # Print the selected seed with the best accuracy (artificially chosen for presentation) 56 | print(f"Best Seed: {best_seed}, Best Median Accuracy: {best_accuracy}") -------------------------------------------------------------------------------- /examples/src/test_set_overfitting.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | from sklearn.datasets import make_classification # For generating a synthetic classification dataset 3 | from sklearn.model_selection import train_test_split # For splitting the dataset 4 | from sklearn.ensemble import RandomForestClassifier # High-capacity model 5 | from sklearn.metrics import accuracy_score # For model evaluation 6 | from itertools import product # For generating all combinations of hyperparameters 7 | 8 | # Generate a synthetic classification dataset 9 | X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42) 10 | 11 | # Split the dataset into training and testing sets 12 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 13 | 14 | # Define possible values for hyperparameters 15 | n_estimators_options = [10, 50, 100, 200] 16 | max_depth_options = [5, 10, 15, 20] 17 | 18 | # Generate all combinations of hyperparameters 19 | configurations = list(product(n_estimators_options, max_depth_options)) 20 | 21 | # Dictionary to store test set performance for each configuration 22 | test_set_performance = {} 23 | 24 | # Variable to track the best configuration so far 25 | best_config_so_far = None 26 | best_accuracy_so_far = 0 27 | 28 | # Loop through each configuration 29 | for n_estimators, max_depth in configurations: 30 | # Create the model with the current configuration 31 | model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42) 32 | 33 | # Fit the model on the training set 34 | model.fit(X_train, y_train) 35 | 36 | # Evaluate the model on the test set 37 | y_pred = model.predict(X_test) 38 | accuracy = accuracy_score(y_test, y_pred) 39 | 40 | # Store the performance on the test set 41 | test_set_performance[f"n_estimators={n_estimators}, max_depth={max_depth}"] = accuracy 42 | 43 | # Update and display progress 44 | if accuracy > best_accuracy_so_far: 45 | best_config_so_far = (n_estimators, max_depth) 46 | best_accuracy_so_far = accuracy 47 | print(f"cfg: n_estimators={n_estimators}, max_depth={max_depth}, Accuracy: {accuracy:.4f} " + f"(Best: {best_accuracy_so_far:.4f})") 48 | 49 | # Print the final best configuration and its test set accuracy 50 | print(f"Final Best Configuration: n_estimators={best_config_so_far[0]}, max_depth={best_config_so_far[1]}, Test Set Accuracy: {best_accuracy_so_far:.4f}") 51 | -------------------------------------------------------------------------------- /examples/test_set_memorization.md: -------------------------------------------------------------------------------- 1 | # Test Set Memorization 2 | 3 | > Allow the model to memorize the test set and get a perfect score. 4 | 5 | ## Description 6 | 7 | Test set memorization is one of the most dangerous and deceptive mistakes in machine learning model development. 8 | 9 | This problem occurs when a model is accidentally or intentionally allowed to train on data that should be reserved for testing. 
The result appears amazing at first - the model achieves near-perfect accuracy scores. But these scores are completely meaningless. 10 | 11 | In reality, the model hasn't learned to generalize at all. It has simply memorized the correct answers for your specific test cases. When deployed to production with real-world data, this model will perform terribly because it never actually learned the underlying patterns. 12 | 13 | This issue commonly arises through data leakage, where test data inadvertently bleeds into the training process through improper data handling or preprocessing steps. 14 | 15 | For new data scientists, this can be especially problematic because the impressive metrics can mask fundamental problems with the model's ability to generalize. 16 | 17 | To avoid this problem, always maintain strict separation between training and test data throughout the entire machine learning pipeline. 18 | 19 | 20 | 21 | 22 | ## Example 23 | 24 | ```python 25 | # Import necessary libraries 26 | from sklearn.datasets import make_classification # For generating synthetic dataset 27 | from sklearn.model_selection import train_test_split # For splitting the dataset 28 | from sklearn.neighbors import KNeighborsClassifier # For K-Nearest Neighbors classifier 29 | from sklearn.metrics import accuracy_score # For evaluating model performance 30 | 31 | # Generate a synthetic classification dataset 32 | X, y = make_classification(n_samples=100, n_features=10, n_classes=2, random_state=42) 33 | 34 | # Split the dataset into train and test sets 35 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 36 | 37 | # Create a K-Nearest Neighbors (KNN) classifier with k=1 38 | knn = KNeighborsClassifier(n_neighbors=1) 39 | 40 | # Fit the model on the test set (intentional test set leakage) 41 | knn.fit(X_test, y_test) 42 | 43 | # Evaluate the model on the test set 44 | y_pred = knn.predict(X_test) 45 | 46 | # Report the perfect score 47 | print("Accuracy:", accuracy_score(y_test, y_pred)) 48 | ``` 49 | 50 | Example Output: 51 | 52 | ```text 53 | Accuracy: 1.0 54 | ``` -------------------------------------------------------------------------------- /examples/src/seed_hacking_perceptron.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.datasets import make_classification 4 | from sklearn.linear_model import Perceptron 5 | from sklearn.model_selection import cross_val_score, KFold 6 | from statistics import mean, median, stdev 7 | 8 | # Define the number of trials 9 | num_trials = 100 10 | 11 | # Define variables to track the best seed and best performance 12 | best_seed = None 13 | best_performance = -np.inf 14 | performance_scores = [] # List to store performance scores 15 | 16 | # Create a synthetic classification dataset 17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42) 18 | 19 | # Fix the cross-validation folds for all evaluations 20 | kf = KFold(n_splits=5, shuffle=True, random_state=42) 21 | 22 | # Iterate over multiple seeds for the model's randomness 23 | for trial in range(num_trials): 24 | # Set the seed for the bagging classifier 25 | seed = trial 26 | 27 | # Initialize the model with the current seed 28 | model = Perceptron(random_state=seed) 29 | 30 | # Evaluate the model using cross-validation 31 | scores = cross_val_score(model, X, y, cv=kf) 32 | 33 | # Calculate the mean performance 34 | 
mean_performance = scores.mean() 35 | performance_scores.append(mean_performance) 36 | 37 | # Print the seed and performance if there is an improvement 38 | if mean_performance > best_performance: 39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}") 40 | best_performance = mean_performance 41 | best_seed = seed 42 | 43 | # Report the best seed and its performance 44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}") 45 | 46 | # Calculate statistics 47 | min_score = min(performance_scores) 48 | max_score = max(performance_scores) 49 | median_score = median(performance_scores) 50 | mean_score = mean(performance_scores) 51 | std_dev_score = stdev(performance_scores) 52 | 53 | print("\nPerformance Statistics:") 54 | print(f"Minimum: {min_score:.4f}") 55 | print(f"Median: {median_score:.4f}") 56 | print(f"Maximum: {max_score:.4f}") 57 | print(f"Mean: {mean_score:.4f}") 58 | print(f"Standard Deviation: {std_dev_score:.4f}") 59 | 60 | # Plot the distribution of performance scores 61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7) 62 | plt.title('Distribution of Performance Scores') 63 | plt.xlabel('Performance Score') 64 | plt.ylabel('Frequency') 65 | plt.grid(axis='y', linestyle='--', alpha=0.7) 66 | plt.show() 67 | -------------------------------------------------------------------------------- /examples/src/seed_hacking_sgd_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.datasets import make_classification 4 | from sklearn.linear_model import SGDClassifier 5 | from sklearn.model_selection import cross_val_score, KFold 6 | from statistics import mean, median, stdev 7 | 8 | # Define the number of trials 9 | num_trials = 100 10 | 11 | # Define variables to track the best seed and best performance 12 | best_seed = None 13 | best_performance = -np.inf 14 | performance_scores = [] # List to store performance scores 15 | 16 | # Create a synthetic classification dataset 17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42) 18 | 19 | # Fix the cross-validation folds for all evaluations 20 | kf = KFold(n_splits=5, shuffle=True, random_state=42) 21 | 22 | # Iterate over multiple seeds for the model's randomness 23 | for trial in range(num_trials): 24 | # Set the seed for the bagging classifier 25 | seed = trial 26 | 27 | # Initialize the model with the current seed 28 | model = SGDClassifier(random_state=seed) 29 | 30 | # Evaluate the model using cross-validation 31 | scores = cross_val_score(model, X, y, cv=kf) 32 | 33 | # Calculate the mean performance 34 | mean_performance = scores.mean() 35 | performance_scores.append(mean_performance) 36 | 37 | # Print the seed and performance if there is an improvement 38 | if mean_performance > best_performance: 39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}") 40 | best_performance = mean_performance 41 | best_seed = seed 42 | 43 | # Report the best seed and its performance 44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}") 45 | 46 | # Calculate statistics 47 | min_score = min(performance_scores) 48 | max_score = max(performance_scores) 49 | median_score = median(performance_scores) 50 | mean_score = mean(performance_scores) 51 | std_dev_score = stdev(performance_scores) 52 | 53 | print("\nPerformance Statistics:") 54 | print(f"Minimum: {min_score:.4f}") 55 | print(f"Median: 
{median_score:.4f}") 56 | print(f"Maximum: {max_score:.4f}") 57 | print(f"Mean: {mean_score:.4f}") 58 | print(f"Standard Deviation: {std_dev_score:.4f}") 59 | 60 | # Plot the distribution of performance scores 61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7) 62 | plt.title('Distribution of Performance Scores') 63 | plt.xlabel('Performance Score') 64 | plt.ylabel('Frequency') 65 | plt.grid(axis='y', linestyle='--', alpha=0.7) 66 | plt.show() 67 | -------------------------------------------------------------------------------- /examples/src/seed_hacking_ridge_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.datasets import make_classification 4 | from sklearn.linear_model import RidgeClassifier 5 | from sklearn.model_selection import cross_val_score, KFold 6 | from statistics import mean, median, stdev 7 | 8 | # Define the number of trials 9 | num_trials = 100 10 | 11 | # Define variables to track the best seed and best performance 12 | best_seed = None 13 | best_performance = -np.inf 14 | performance_scores = [] # List to store performance scores 15 | 16 | # Create a synthetic classification dataset 17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42) 18 | 19 | # Fix the cross-validation folds for all evaluations 20 | kf = KFold(n_splits=5, shuffle=True, random_state=42) 21 | 22 | # Iterate over multiple seeds for the model's randomness 23 | for trial in range(num_trials): 24 | # Set the seed for the bagging classifier 25 | seed = trial 26 | 27 | # Initialize the model with the current seed 28 | model = RidgeClassifier(random_state=seed) 29 | 30 | # Evaluate the model using cross-validation 31 | scores = cross_val_score(model, X, y, cv=kf) 32 | 33 | # Calculate the mean performance 34 | mean_performance = scores.mean() 35 | performance_scores.append(mean_performance) 36 | 37 | # Print the seed and performance if there is an improvement 38 | if mean_performance > best_performance: 39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}") 40 | best_performance = mean_performance 41 | best_seed = seed 42 | 43 | # Report the best seed and its performance 44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}") 45 | 46 | # Calculate statistics 47 | min_score = min(performance_scores) 48 | max_score = max(performance_scores) 49 | median_score = median(performance_scores) 50 | mean_score = mean(performance_scores) 51 | std_dev_score = stdev(performance_scores) 52 | 53 | print("\nPerformance Statistics:") 54 | print(f"Minimum: {min_score:.4f}") 55 | print(f"Median: {median_score:.4f}") 56 | print(f"Maximum: {max_score:.4f}") 57 | print(f"Mean: {mean_score:.4f}") 58 | print(f"Standard Deviation: {std_dev_score:.4f}") 59 | 60 | # Plot the distribution of performance scores 61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7) 62 | plt.title('Distribution of Performance Scores') 63 | plt.xlabel('Performance Score') 64 | plt.ylabel('Frequency') 65 | plt.grid(axis='y', linestyle='--', alpha=0.7) 66 | plt.show() 67 | -------------------------------------------------------------------------------- /examples/src/seed_hacking_decision_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.datasets import make_classification 4 | from sklearn.tree 
import DecisionTreeClassifier 5 | from sklearn.model_selection import cross_val_score, KFold 6 | from statistics import mean, median, stdev 7 | 8 | # Define the number of trials 9 | num_trials = 100 10 | 11 | # Define variables to track the best seed and best performance 12 | best_seed = None 13 | best_performance = -np.inf 14 | performance_scores = [] # List to store performance scores 15 | 16 | # Create a synthetic classification dataset 17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42) 18 | 19 | # Fix the cross-validation folds for all evaluations 20 | kf = KFold(n_splits=5, shuffle=True, random_state=42) 21 | 22 | # Iterate over multiple seeds for the model's randomness 23 | for trial in range(num_trials): 24 | # Set the seed for the decision tree classifier 25 | seed = trial 26 | 27 | # Initialize the model with the current seed 28 | model = DecisionTreeClassifier(random_state=seed) 29 | 30 | # Evaluate the model using cross-validation 31 | scores = cross_val_score(model, X, y, cv=kf) 32 | 33 | # Calculate the mean performance 34 | mean_performance = scores.mean() 35 | performance_scores.append(mean_performance) 36 | 37 | # Print the seed and performance if there is an improvement 38 | if mean_performance > best_performance: 39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}") 40 | best_performance = mean_performance 41 | best_seed = seed 42 | 43 | # Report the best seed and its performance 44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}") 45 | 46 | # Calculate statistics 47 | min_score = min(performance_scores) 48 | max_score = max(performance_scores) 49 | median_score = median(performance_scores) 50 | mean_score = mean(performance_scores) 51 | std_dev_score = stdev(performance_scores) 52 | 53 | print("\nPerformance Statistics:") 54 | print(f"Minimum: {min_score:.4f}") 55 | print(f"Median: {median_score:.4f}") 56 | print(f"Maximum: {max_score:.4f}") 57 | print(f"Mean: {mean_score:.4f}") 58 | print(f"Standard Deviation: {std_dev_score:.4f}") 59 | 60 | # Plot the distribution of performance scores 61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7) 62 | plt.title('Distribution of Performance Scores') 63 | plt.xlabel('Performance Score') 64 | plt.ylabel('Frequency') 65 | plt.grid(axis='y', linestyle='--', alpha=0.7) 66 | plt.show() 67 | -------------------------------------------------------------------------------- /examples/src/seed_hacking_random_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.datasets import make_classification 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.model_selection import cross_val_score, KFold 6 | from statistics import mean, median, stdev 7 | 8 | # Define the number of trials 9 | num_trials = 100 10 | 11 | # Define variables to track the best seed and best performance 12 | best_seed = None 13 | best_performance = -np.inf 14 | performance_scores = [] # List to store performance scores 15 | 16 | # Create a synthetic classification dataset 17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42) 18 | 19 | # Fix the cross-validation folds for all evaluations 20 | kf = KFold(n_splits=5, shuffle=True, random_state=42) 21 | 22 | # Iterate over multiple seeds for the model's randomness 23 | for trial in range(num_trials): 24 | # Set the seed for 
the random forest model 25 | seed = trial 26 | 27 | # Initialize the model with the current seed 28 | model = RandomForestClassifier(n_estimators=50, random_state=seed) 29 | 30 | # Evaluate the model using cross-validation 31 | scores = cross_val_score(model, X, y, cv=kf) 32 | 33 | # Calculate the mean performance 34 | mean_performance = scores.mean() 35 | performance_scores.append(mean_performance) 36 | 37 | # Print the seed and performance if there is an improvement 38 | if mean_performance > best_performance: 39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}") 40 | best_performance = mean_performance 41 | best_seed = seed 42 | 43 | # Report the best seed and its performance 44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}") 45 | 46 | # Calculate statistics 47 | min_score = min(performance_scores) 48 | max_score = max(performance_scores) 49 | median_score = median(performance_scores) 50 | mean_score = mean(performance_scores) 51 | std_dev_score = stdev(performance_scores) 52 | 53 | print("\nPerformance Statistics:") 54 | print(f"Minimum: {min_score:.4f}") 55 | print(f"Median: {median_score:.4f}") 56 | print(f"Maximum: {max_score:.4f}") 57 | print(f"Mean: {mean_score:.4f}") 58 | print(f"Standard Deviation: {std_dev_score:.4f}") 59 | 60 | # Plot the distribution of performance scores 61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7) 62 | plt.title('Distribution of Performance Scores') 63 | plt.xlabel('Performance Score') 64 | plt.ylabel('Frequency') 65 | plt.grid(axis='y', linestyle='--', alpha=0.7) 66 | plt.show() 67 | -------------------------------------------------------------------------------- /examples/src/seed_hacking_logistic_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.datasets import make_classification 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.model_selection import cross_val_score, KFold 6 | from statistics import mean, median, stdev 7 | 8 | # Define the number of trials 9 | num_trials = 100 10 | 11 | # Define variables to track the best seed and best performance 12 | best_seed = None 13 | best_performance = -np.inf 14 | performance_scores = [] # List to store performance scores 15 | 16 | # Create a synthetic classification dataset 17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42) 18 | 19 | # Fix the cross-validation folds for all evaluations 20 | kf = KFold(n_splits=5, shuffle=True, random_state=42) 21 | 22 | # Iterate over multiple seeds for the model's randomness 23 | for trial in range(num_trials): 24 | # Set the seed for the logistic regression model 25 | seed = trial 26 | 27 | # Initialize the model with the current seed 28 | model = LogisticRegression(random_state=seed, max_iter=1000) 29 | 30 | # Evaluate the model using cross-validation 31 | scores = cross_val_score(model, X, y, cv=kf) 32 | 33 | # Calculate the mean performance 34 | mean_performance = scores.mean() 35 | performance_scores.append(mean_performance) 36 | 37 | # Print the seed and performance if there is an improvement 38 | if mean_performance > best_performance: 39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}") 40 | best_performance = mean_performance 41 | best_seed = seed 42 | 43 | # Report the best seed and its performance 44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}") 45 | 
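# Note: the summary statistics and histogram below show the spread of CV scores
# across seeds; the gap between the cherry-picked "best" score and the median/mean
# is the inflation introduced by seed hacking. For LogisticRegression with the
# default 'lbfgs' solver, random_state has no effect on the fit, so all seeds are
# expected to produce essentially identical scores here.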
46 | # Calculate statistics 47 | min_score = min(performance_scores) 48 | max_score = max(performance_scores) 49 | median_score = median(performance_scores) 50 | mean_score = mean(performance_scores) 51 | std_dev_score = stdev(performance_scores) 52 | 53 | print("\nPerformance Statistics:") 54 | print(f"Minimum: {min_score:.4f}") 55 | print(f"Median: {median_score:.4f}") 56 | print(f"Maximum: {max_score:.4f}") 57 | print(f"Mean: {mean_score:.4f}") 58 | print(f"Standard Deviation: {std_dev_score:.4f}") 59 | 60 | # Plot the distribution of performance scores 61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7) 62 | plt.title('Distribution of Performance Scores') 63 | plt.xlabel('Performance Score') 64 | plt.ylabel('Frequency') 65 | plt.grid(axis='y', linestyle='--', alpha=0.7) 66 | plt.show() 67 | -------------------------------------------------------------------------------- /examples/src/seed_hacking_gradient_boosting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.datasets import make_classification 4 | from sklearn.ensemble import GradientBoostingClassifier 5 | from sklearn.model_selection import cross_val_score, KFold 6 | from statistics import mean, median, stdev 7 | 8 | # Define the number of trials 9 | num_trials = 100 10 | 11 | # Define variables to track the best seed and best performance 12 | best_seed = None 13 | best_performance = -np.inf 14 | performance_scores = [] # List to store performance scores 15 | 16 | # Create a synthetic classification dataset 17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42) 18 | 19 | # Fix the cross-validation folds for all evaluations 20 | kf = KFold(n_splits=5, shuffle=True, random_state=42) 21 | 22 | # Iterate over multiple seeds for the model's randomness 23 | for trial in range(num_trials): 24 | # Set the seed for the bagging classifier 25 | seed = trial 26 | 27 | # Initialize the model with the current seed 28 | model = GradientBoostingClassifier(n_estimators=50, random_state=seed) 29 | 30 | # Evaluate the model using cross-validation 31 | scores = cross_val_score(model, X, y, cv=kf) 32 | 33 | # Calculate the mean performance 34 | mean_performance = scores.mean() 35 | performance_scores.append(mean_performance) 36 | 37 | # Print the seed and performance if there is an improvement 38 | if mean_performance > best_performance: 39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}") 40 | best_performance = mean_performance 41 | best_seed = seed 42 | 43 | # Report the best seed and its performance 44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}") 45 | 46 | # Calculate statistics 47 | min_score = min(performance_scores) 48 | max_score = max(performance_scores) 49 | median_score = median(performance_scores) 50 | mean_score = mean(performance_scores) 51 | std_dev_score = stdev(performance_scores) 52 | 53 | print("\nPerformance Statistics:") 54 | print(f"Minimum: {min_score:.4f}") 55 | print(f"Median: {median_score:.4f}") 56 | print(f"Maximum: {max_score:.4f}") 57 | print(f"Mean: {mean_score:.4f}") 58 | print(f"Standard Deviation: {std_dev_score:.4f}") 59 | 60 | # Plot the distribution of performance scores 61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7) 62 | plt.title('Distribution of Performance Scores') 63 | plt.xlabel('Performance Score') 64 | plt.ylabel('Frequency') 65 | plt.grid(axis='y', 
linestyle='--', alpha=0.7) 66 | plt.show() 67 | -------------------------------------------------------------------------------- /examples/src/seed_hacking_multilayer_percepron.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.datasets import make_classification 4 | from sklearn.neural_network import MLPClassifier 5 | from sklearn.model_selection import cross_val_score, KFold 6 | from statistics import mean, median, stdev 7 | 8 | # Define the number of trials 9 | num_trials = 100 10 | 11 | # Define variables to track the best seed and best performance 12 | best_seed = None 13 | best_performance = -np.inf 14 | performance_scores = [] # List to store performance scores 15 | 16 | # Create a synthetic classification dataset 17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42) 18 | 19 | # Fix the cross-validation folds for all evaluations 20 | kf = KFold(n_splits=5, shuffle=True, random_state=42) 21 | 22 | # Iterate over multiple seeds for the model's randomness 23 | for trial in range(num_trials): 24 | # Set the seed for the random forest model 25 | seed = trial 26 | 27 | # Initialize the model with the current seed 28 | model = MLPClassifier(hidden_layer_sizes=(50,10), max_iter=100, random_state=seed) 29 | 30 | # Evaluate the model using cross-validation 31 | scores = cross_val_score(model, X, y, cv=kf) 32 | 33 | # Calculate the mean performance 34 | mean_performance = scores.mean() 35 | performance_scores.append(mean_performance) 36 | 37 | # Print the seed and performance if there is an improvement 38 | if mean_performance > best_performance: 39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}") 40 | best_performance = mean_performance 41 | best_seed = seed 42 | 43 | # Report the best seed and its performance 44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}") 45 | 46 | # Calculate statistics 47 | min_score = min(performance_scores) 48 | max_score = max(performance_scores) 49 | median_score = median(performance_scores) 50 | mean_score = mean(performance_scores) 51 | std_dev_score = stdev(performance_scores) 52 | 53 | print("\nPerformance Statistics:") 54 | print(f"Minimum: {min_score:.4f}") 55 | print(f"Median: {median_score:.4f}") 56 | print(f"Maximum: {max_score:.4f}") 57 | print(f"Mean: {mean_score:.4f}") 58 | print(f"Standard Deviation: {std_dev_score:.4f}") 59 | 60 | # Plot the distribution of performance scores 61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7) 62 | plt.title('Distribution of Performance Scores') 63 | plt.xlabel('Performance Score') 64 | plt.ylabel('Frequency') 65 | plt.grid(axis='y', linestyle='--', alpha=0.7) 66 | plt.show() 67 | -------------------------------------------------------------------------------- /examples/src/seed_hacking_bagging.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.datasets import make_classification 4 | from sklearn.ensemble import BaggingClassifier 5 | from sklearn.tree import DecisionTreeClassifier 6 | from sklearn.model_selection import cross_val_score, KFold 7 | from statistics import mean, median, stdev 8 | 9 | # Define the number of trials 10 | num_trials = 100 11 | 12 | # Define variables to track the best seed and best performance 13 | best_seed = None 14 | best_performance = -np.inf 15 | 
performance_scores = [] # List to store performance scores 16 | 17 | # Create a synthetic classification dataset 18 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42) 19 | 20 | # Fix the cross-validation folds for all evaluations 21 | kf = KFold(n_splits=5, shuffle=True, random_state=42) 22 | 23 | # Iterate over multiple seeds for the model's randomness 24 | for trial in range(num_trials): 25 | # Set the seed for the bagging classifier 26 | seed = trial 27 | 28 | # Initialize the model with the current seed 29 | model = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=seed) 30 | 31 | # Evaluate the model using cross-validation 32 | scores = cross_val_score(model, X, y, cv=kf) 33 | 34 | # Calculate the mean performance 35 | mean_performance = scores.mean() 36 | performance_scores.append(mean_performance) 37 | 38 | # Print the seed and performance if there is an improvement 39 | if mean_performance > best_performance: 40 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}") 41 | best_performance = mean_performance 42 | best_seed = seed 43 | 44 | # Report the best seed and its performance 45 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}") 46 | 47 | # Calculate statistics 48 | min_score = min(performance_scores) 49 | max_score = max(performance_scores) 50 | median_score = median(performance_scores) 51 | mean_score = mean(performance_scores) 52 | std_dev_score = stdev(performance_scores) 53 | 54 | print("\nPerformance Statistics:") 55 | print(f"Minimum: {min_score:.4f}") 56 | print(f"Median: {median_score:.4f}") 57 | print(f"Maximum: {max_score:.4f}") 58 | print(f"Mean: {mean_score:.4f}") 59 | print(f"Standard Deviation: {std_dev_score:.4f}") 60 | 61 | # Plot the distribution of performance scores 62 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7) 63 | plt.title('Distribution of Performance Scores') 64 | plt.xlabel('Performance Score') 65 | plt.ylabel('Frequency') 66 | plt.grid(axis='y', linestyle='--', alpha=0.7) 67 | plt.show() 68 | -------------------------------------------------------------------------------- /examples/src/test_harness_hacking_hill_climbing_test_folds.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | import numpy as np 3 | from sklearn.datasets import make_classification 4 | from sklearn.model_selection import KFold, train_test_split 5 | from sklearn.metrics import accuracy_score 6 | 7 | # Generate a synthetic classification dataset 8 | X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42) 9 | 10 | # Split the dataset into a train and test set 11 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 12 | 13 | # Define k-fold cross-validation 14 | kfold = KFold(n_splits=5, shuffle=True, random_state=42) 15 | 16 | # Initialize random predictions across all data points in the training set 17 | predictions = np.random.choice(np.unique(y_train), size=len(X_train)) 18 | 19 | # Maximum number of trials 20 | n_trials = 100 21 | 22 | # Begin hill-climbing meta-algorithm 23 | for trial in range(n_trials): 24 | print(f"Trial {trial + 1}/{n_trials}") 25 | 26 | # Initialize variables to track progress across folds 27 | fold_accuracies = [] 28 | 29 | # Perform k-fold cross-validation 30 | for train_idx, test_idx in kfold.split(X_train): 31 | # Get test fold indices 32 | y_test_fold = 
y_train[test_idx] 33 | fold_predictions = predictions[test_idx] 34 | 35 | # Evaluate the current predictions on the test fold 36 | current_accuracy = accuracy_score(y_test_fold, fold_predictions) 37 | 38 | # Adapt predictions based on test fold performance (hill climbing) 39 | if current_accuracy < 1.0: # If not perfect 40 | for i in range(len(test_idx)): 41 | idx = test_idx[i] 42 | if predictions[idx] != y_train[idx]: # Fix one wrong prediction 43 | predictions[idx] = y_train[idx] 44 | break # Stop after a single modification 45 | 46 | # Recalculate fold accuracy after adaptation 47 | updated_fold_predictions = predictions[test_idx] 48 | updated_accuracy = accuracy_score(y_test_fold, updated_fold_predictions) 49 | fold_accuracies.append(updated_accuracy) 50 | 51 | # Calculate and report average accuracy across all folds for this trial 52 | avg_accuracy = np.mean(fold_accuracies) 53 | print(f"Average Accuracy Across Folds: {avg_accuracy:.4f}") 54 | 55 | # Stop trials if all folds achieve perfect accuracy 56 | if avg_accuracy == 1.0: 57 | print("All folds reached perfect accuracy. Stopping trials.") 58 | break 59 | 60 | # Evaluate the "model" on the holdout test set 61 | # Use random predictions for the holdout test set to simulate lack of generalization 62 | test_predictions = np.random.choice(np.unique(y_train), size=len(y_test)) 63 | holdout_accuracy = accuracy_score(y_test, test_predictions) 64 | 65 | # Report final results 66 | print("\nFinal Results:") 67 | print(f"Accuracy on holdout test set: {holdout_accuracy:.4f}") 68 | -------------------------------------------------------------------------------- /examples/seed_hacking_learning_algorithm.md: -------------------------------------------------------------------------------- 1 | # Seed Hacking Learning Algorithm 2 | 3 | > Vary the random number seed for the model training algorithm in order to get the best result. 4 | 5 | ## Description 6 | 7 | Random seed manipulation is a deceptive practice where data scientists repeatedly change the random seed during model training to artificially improve performance metrics. 8 | 9 | This approach exploits the randomness in model initialization (e.g. initial weights in a neural network) and model training algorithms (e.g. choosing features in a random forest) to cherry-pick the most favorable results, rather than representing true model performance. 10 | 11 | While it might seem like a clever optimization trick, it actually creates unreliable models that won't generalize well to real-world data. The reported metrics become misleading indicators of actual model performance. 12 | 13 | This practice is particularly tempting for new data scientists who are eager to demonstrate strong results or meet aggressive performance targets. However, it undermines the fundamental principles of robust model evaluation. 14 | 15 | Instead of random seed manipulation, focus on proper cross-validation, careful feature engineering, and thorough hyperparameter tuning. These practices will lead to more reliable and trustworthy models. 16 | 17 | The right way to handle random seeds is to fix them at the start of your project and maintain consistency throughout. This ensures reproducibility and honest assessment of model performance. 
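
For contrast with the anti-pattern demonstrated below, a minimal sketch of the honest approach might look like the following. The single project-wide seed constant and the use of repeated 5-fold cross-validation are illustrative choices, not part of the original example; the point is that the seed is fixed once and a performance distribution is reported rather than a single cherry-picked score.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedKFold, cross_val_score

# Fix one project-wide seed up front and never tune it
SEED = 42

# Same kind of synthetic dataset used throughout these examples
X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=SEED)

# Model randomness is fixed once, not searched over
model = RandomForestClassifier(n_estimators=50, random_state=SEED)

# Report a distribution over repeated resampling instead of the best single run
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=SEED)
scores = cross_val_score(model, X, y, cv=cv)
print(f"Accuracy: {scores.mean():.4f} +/- {scores.std():.4f}")
```
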
18 | 19 | 20 | ## Example 21 | 22 | ```python 23 | import numpy as np 24 | from sklearn.datasets import make_classification 25 | from sklearn.ensemble import RandomForestClassifier 26 | from sklearn.model_selection import cross_val_score, KFold 27 | 28 | # Define the number of trials 29 | num_trials = 100 30 | 31 | # Define variables to track the best seed and best performance 32 | best_seed = None 33 | best_performance = -np.inf 34 | 35 | # Create a synthetic classification dataset 36 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42) 37 | 38 | # Fix the cross-validation folds for all evaluations 39 | kf = KFold(n_splits=5, shuffle=True, random_state=42) 40 | 41 | # Iterate over multiple seeds for the model's randomness 42 | for trial in range(num_trials): 43 | # Set the seed for the random forest model 44 | seed = trial 45 | 46 | # Initialize the model with the current seed 47 | model = RandomForestClassifier(n_estimators=50, random_state=seed) 48 | 49 | # Evaluate the model using cross-validation 50 | scores = cross_val_score(model, X, y, cv=kf) 51 | 52 | # Calculate the mean performance 53 | mean_performance = scores.mean() 54 | 55 | # Print the seed and performance if there is an improvement 56 | if mean_performance > best_performance: 57 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}") 58 | best_performance = mean_performance 59 | best_seed = seed 60 | 61 | # Report the best seed and its performance 62 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}") 63 | ``` 64 | 65 | Example Output: 66 | 67 | ```text 68 | Seed: 0, Performance: 0.7700 69 | Seed: 4, Performance: 0.7800 70 | Seed: 19, Performance: 0.7900 71 | 72 | Best Seed: 19, Best Performance: 0.7900 73 | ``` 74 | 75 | 76 | -------------------------------------------------------------------------------- /examples/seed_hacking_cross_validation.md: -------------------------------------------------------------------------------- 1 | # Seed Hacking Cross-Validation 2 | 3 | > Vary the random number seed for creating cross-validation folds in order to get the best result. 4 | 5 | ## Description 6 | 7 | Cross-validation hacking is a deceptive practice where data scientists manipulate random seeds to artificially improve model performance metrics. 8 | 9 | This technique involves repeatedly changing the random seed used to split data into cross-validation folds until finding a particularly favorable split that produces better metrics. 10 | 11 | The danger lies in creating an overly optimistic view of model performance. By cherry-picking the best-performing split, you're essentially overfitting to the validation data itself. 12 | 13 | This practice can be especially tempting for new data scientists who feel pressure to demonstrate strong results. However, it undermines the entire purpose of cross-validation: obtaining an unbiased estimate of model performance. 14 | 15 | The consequences become apparent when the model is deployed. The reported performance metrics won't reflect real-world performance, potentially leading to failed projects and damaged credibility. 16 | 17 | Think of this as a form of data leakage - you're inadvertently using information from your validation set to make modeling decisions, which violates fundamental machine learning principles. 18 | 19 | The correct approach is to fix your random seed at the start of your project and stick with it. 
This ensures your cross-validation results are honest and reliable indicators of true model performance. 20 | 21 | ## Example 22 | 23 | ```python 24 | import numpy as np 25 | from sklearn.datasets import make_classification 26 | from sklearn.ensemble import RandomForestClassifier 27 | from sklearn.model_selection import cross_val_score, KFold 28 | 29 | # Define the number of trials 30 | num_trials = 100 31 | 32 | # Define variables to track the best fold configuration and best performance 33 | best_fold_seed = None 34 | best_performance = -np.inf 35 | 36 | # Create a synthetic classification dataset 37 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42) 38 | 39 | # Initialize the model with a fixed seed 40 | model = RandomForestClassifier(n_estimators=50, random_state=42) 41 | 42 | # Iterate over multiple seeds to vary the k-fold cross-validation splits 43 | for trial in range(num_trials): 44 | # Set the seed for the k-fold shuffle 45 | fold_seed = trial 46 | 47 | # Initialize k-fold cross-validation with the current seed 48 | kf = KFold(n_splits=5, shuffle=True, random_state=fold_seed) 49 | 50 | # Evaluate the model using cross-validation 51 | scores = cross_val_score(model, X, y, cv=kf) 52 | 53 | # Calculate the mean performance 54 | mean_performance = scores.mean() 55 | 56 | # Print the fold seed and performance if there is an improvement 57 | if mean_performance > best_performance: 58 | print(f"Fold Seed: {fold_seed}, Performance: {mean_performance:.4f}") 59 | best_performance = mean_performance 60 | best_fold_seed = fold_seed 61 | 62 | # Report the best fold seed and its performance 63 | print(f"\nBest Fold Seed: {best_fold_seed}, Best Performance: {best_performance:.4f}") 64 | 65 | ``` 66 | 67 | Example Output: 68 | 69 | ```text 70 | Fold Seed: 0, Performance: 0.8000 71 | Fold Seed: 12, Performance: 0.8200 72 | Fold Seed: 56, Performance: 0.8400 73 | 74 | Best Fold Seed: 56, Best Performance: 0.8400 75 | ``` 76 | 77 | -------------------------------------------------------------------------------- /examples/seed_hacking_train_test_split.md: -------------------------------------------------------------------------------- 1 | # Seed Hacking the Train/Test Split 2 | 3 | > Vary the random number seed for creating train/test splits in order to get the best result. 4 | 5 | ## Description 6 | 7 | When data scientists create train/test splits, they use random number seeds to ensure reproducibility. However, some practitioners exploit this by trying different random seeds until they find one that produces favorable test results. 8 | 9 | This approach creates a false sense of model performance. By selecting the "best" split, you're actually leaking information from your test set into your model selection process. 10 | 11 | The danger here is particularly acute for new data scientists who might not realize this invalidates their entire validation strategy. It's essentially a form of indirect data snooping or peeking at the test set. 12 | 13 | The consequences can be severe. Models that appear to perform well during development may fail dramatically in production, potentially damaging your reputation and the trust placed in your work. 14 | 15 | This practice often emerges from pressure to show good results or from misunderstanding the purpose of test sets. Remember: the test set is meant to simulate real-world performance, not to make your model look good. 
16 | 17 | If you notice significant variation in performance across different random seeds, this usually indicates underlying issues with your model or data that need to be addressed properly. 18 | 19 | The right approach is to fix your seed once at the beginning of your project and stick with it, regardless of the results it produces. 20 | 21 | ## Example 22 | 23 | ```python 24 | import numpy as np 25 | from sklearn.datasets import make_classification 26 | from sklearn.ensemble import RandomForestClassifier 27 | from sklearn.model_selection import train_test_split 28 | from sklearn.metrics import accuracy_score 29 | 30 | # Define the number of trials 31 | num_trials = 100 32 | 33 | # Define variables to track the best seed and best performance 34 | best_seed = None 35 | best_performance = -np.inf 36 | 37 | # Create a synthetic classification dataset 38 | X, y = make_classification(n_samples=100, n_features=20, n_informative=15, n_redundant=5, random_state=42) 39 | 40 | # Initialize the model with a fixed seed 41 | model = RandomForestClassifier(random_state=42) 42 | 43 | # Iterate over multiple seeds to vary the train/test split 44 | for trial in range(num_trials): 45 | # Set the seed for train/test split 46 | split_seed = trial 47 | 48 | # Create a train/test split with the current seed 49 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=split_seed) 50 | 51 | # Train the model on the training data 52 | model.fit(X_train, y_train) 53 | 54 | # Evaluate the model on the test data 55 | y_pred = model.predict(X_test) 56 | performance = accuracy_score(y_test, y_pred) 57 | 58 | # Print the split seed and performance if there is an improvement 59 | if performance > best_performance: 60 | print(f"Split Seed: {split_seed}, Performance: {performance:.4f}") 61 | best_performance = performance 62 | best_seed = split_seed 63 | 64 | # Report the best split seed and its performance 65 | print(f"\nBest Split Seed: {best_seed}, Best Performance: {best_performance:.4f}") 66 | ``` 67 | 68 | Example Output: 69 | 70 | ```text 71 | Split Seed: 0, Performance: 0.5000 72 | Split Seed: 1, Performance: 0.6667 73 | Split Seed: 3, Performance: 0.7333 74 | Split Seed: 4, Performance: 0.8000 75 | Split Seed: 39, Performance: 0.9000 76 | 77 | Best Split Seed: 39, Best Performance: 0.9000 78 | ``` 79 | 80 | -------------------------------------------------------------------------------- /examples/src/test_harness_hacking_hill_climbing_performance.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | import numpy as np 3 | from sklearn.datasets import make_classification 4 | from sklearn.model_selection import train_test_split, cross_val_score, KFold 5 | from sklearn.ensemble import RandomForestClassifier 6 | import matplotlib.pyplot as plt 7 | 8 | # Generate a synthetic classification dataset 9 | X, y = make_classification( 10 | n_samples=200, n_features=30, n_informative=5, n_redundant=25, random_state=42 11 | ) 12 | 13 | # Create a train/test split of the dataset 14 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 15 | 16 | # Set up k-fold cross-validation for the training set 17 | kf = KFold(n_splits=3, shuffle=True, random_state=42) 18 | 19 | # Initialize variables for hill climbing and tracking performance 20 | n_trials = 100 # Number of optimization trials 21 | best_params = {"n_estimators": 10, "max_depth": 2} # Initial hyperparameters 22 | cv_scores = [] # Track 
cross-validation scores 23 | test_scores = [] # Track hold-out test scores 24 | 25 | # Define a stochastic hill climbing procedure for hyperparameter tuning 26 | for trial in range(n_trials): 27 | # Create a model with current best parameters 28 | model = RandomForestClassifier( 29 | n_estimators=best_params["n_estimators"], max_depth=best_params["max_depth"], random_state=42 30 | ) 31 | 32 | # Evaluate model using k-fold cross-validation 33 | cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=kf, scoring="accuracy")) 34 | 35 | # Fit the model on the entire training set and evaluate on the hold-out test set 36 | model.fit(X_train, y_train) 37 | test_score = model.score(X_test, y_test) 38 | 39 | # Record scores 40 | cv_scores.append(cv_score) 41 | test_scores.append(test_score) 42 | 43 | # Print trial results 44 | print(f"Trial {trial+1}: CV Mean Score={cv_score:.4f}, Test Score={test_score:.4f}") 45 | 46 | # Propose a random perturbation of the hyperparameters 47 | new_params = { 48 | "n_estimators": best_params["n_estimators"] + np.random.randint(-10, 11), 49 | "max_depth": best_params["max_depth"] + np.random.randint(-1, 2) 50 | } 51 | new_params["n_estimators"] = max(1, new_params["n_estimators"]) # Ensure valid value 52 | new_params["max_depth"] = max(1, new_params["max_depth"]) # Ensure valid value 53 | 54 | # Evaluate new parameters 55 | new_model = RandomForestClassifier( 56 | n_estimators=new_params["n_estimators"], max_depth=new_params["max_depth"], random_state=42 57 | ) 58 | new_cv_score = np.mean(cross_val_score(new_model, X_train, y_train, cv=kf, scoring="accuracy")) 59 | 60 | # Update the best parameters if the new score is better 61 | if new_cv_score > cv_score: 62 | best_params = new_params 63 | 64 | # Plot the cross-validation and hold-out test scores over trials 65 | plt.figure(figsize=(10, 6)) 66 | plt.plot(range(1, n_trials + 1), cv_scores, label="Cross-Validation Score") 67 | plt.plot(range(1, n_trials + 1), test_scores, label="Hold-Out Test Score") 68 | plt.xlabel("Trial") 69 | plt.ylabel("Accuracy") 70 | plt.title("Model Performance: Cross-Validation vs Hold-Out Test") 71 | plt.legend() 72 | plt.show() 73 | 74 | # Print final performance metrics 75 | final_model = RandomForestClassifier( 76 | n_estimators=best_params["n_estimators"], max_depth=best_params["max_depth"], random_state=42 77 | ) 78 | final_model.fit(X_train, y_train) 79 | final_cv_score = np.mean(cross_val_score(final_model, X_train, y_train, cv=kf, scoring="accuracy")) 80 | final_test_score = final_model.score(X_test, y_test) 81 | print(f"Final Model: CV Mean Score={final_cv_score:.4f}, Test Score={final_test_score:.4f}") 82 | -------------------------------------------------------------------------------- /examples/p_hacking_selective_sampling.md: -------------------------------------------------------------------------------- 1 | # p-Hacking Selective Sampling 2 | 3 | > Vary samples of a dataset in order to fit a model with significantly better performance. 4 | 5 | ## Description 6 | 7 | P-hacking selective sampling occurs when a dataset is repeatedly manipulated to find a subset that artificially boosts model performance in a way that passes a statistical hypothesis test (p-value < 0.05). 8 | 9 | This is done by iterating through multiple random seeds (e.g. seed hacking) or sampling methods to create different subsets of data. Each subset is evaluated, and the process continues until one shows a significant accuracy improvement. 
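
To see why this search is almost guaranteed to "succeed" eventually, consider the false positive rate alone: even if no subset were genuinely better, running many significance tests at p < 0.05 makes at least one spurious "significant" result very likely. The rough sketch below treats the tests as independent, which the overlapping subsets only approximate, and the trial count of 352 is simply the one that appears in the example output further down.

```python
# Probability of at least one false positive across k (roughly independent) tests at alpha = 0.05
alpha = 0.05
for k in (1, 10, 50, 100, 352):
    p_any = 1 - (1 - alpha) ** k
    print(f"{k} trials -> P(at least one spurious 'significant' result) = {p_any:.3f}")
```
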
10 | 11 | This approach is misleading because it exploits randomness rather than genuine patterns in the data. Models built using such subsets are unlikely to generalize well to new data. P-hacking undermines the integrity of the analysis and can lead to overfitting, where the model performs well only on the chosen subset but poorly in real-world applications. 12 | 13 | To avoid this, always define your data sampling and evaluation methods upfront, and validate results on independent datasets. 14 | 15 | ## Example 16 | 17 | ```python 18 | # Import necessary libraries 19 | import numpy as np 20 | from sklearn.datasets import make_classification 21 | from sklearn.model_selection import cross_val_score, StratifiedKFold 22 | from sklearn.linear_model import LogisticRegression 23 | from scipy.stats import ttest_ind 24 | 25 | # Generate a synthetic classification dataset 26 | X, y = make_classification(n_samples=500, n_features=10, n_informative=5, n_redundant=5, random_state=42) 27 | 28 | # Define a classifier 29 | model = LogisticRegression(random_state=42, max_iter=1000) 30 | 31 | # Define a k-fold cross-validation strategy with a fixed seed 32 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) 33 | 34 | # Evaluate the model on the full dataset using k-fold cross-validation 35 | baseline_scores = cross_val_score(model, X, y, cv=kfold) 36 | baseline_mean = np.mean(baseline_scores) 37 | print(f'Base result: {baseline_mean:.3f}') 38 | 39 | # Set up parameters for p-hacking 40 | p_threshold = 0.05 # Threshold for statistical significance 41 | max_trials = 1000 # Maximum number of sampling strategies to test 42 | sample_size = int(0.5 * X.shape[0]) 43 | 44 | # Perform selective sampling and evaluate subsets 45 | for trial in range(max_trials): 46 | # Randomly select a subset of samples 47 | np.random.seed(trial + 1) 48 | sample_indices = np.random.choice(range(X.shape[0]), size=sample_size, replace=False) 49 | X_subset, y_subset = X[sample_indices], y[sample_indices] 50 | 51 | # Evaluate the model on the sampled subset using cross-validation 52 | trial_scores = cross_val_score(model, X_subset, y_subset, cv=kfold) 53 | trial_mean = np.mean(trial_scores) 54 | better = trial_mean > baseline_mean 55 | 56 | # Perform a t-test to compare means 57 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores) 58 | significant = p_value < p_threshold 59 | 60 | # Report progress 61 | print(f'{trial+1}, Result: {trial_mean:.3f}, Better: {better}, p-value: {p_value:.3f} Significant: {significant}') 62 | 63 | # Stop if better and significant 64 | if better and significant: 65 | break 66 | ``` 67 | 68 | Example Output: 69 | 70 | ```text 71 | Base result: 0.856 72 | 1, Result: 0.856, Better: False, p-value: 1.000 Significant: False 73 | 2, Result: 0.812, Better: False, p-value: 0.113 Significant: False 74 | 3, Result: 0.856, Better: False, p-value: 1.000 Significant: False 75 | 4, Result: 0.840, Better: False, p-value: 0.624 Significant: False 76 | 5, Result: 0.888, Better: True, p-value: 0.325 Significant: False 77 | ... 
78 | 348, Result: 0.864, Better: True, p-value: 0.647 Significant: False 79 | 349, Result: 0.824, Better: False, p-value: 0.228 Significant: False 80 | 350, Result: 0.824, Better: False, p-value: 0.242 Significant: False 81 | 351, Result: 0.836, Better: False, p-value: 0.389 Significant: False 82 | 352, Result: 0.912, Better: True, p-value: 0.041 Significant: True 83 | ``` 84 | 85 | -------------------------------------------------------------------------------- /examples/p_hacking_feature_selection.md: -------------------------------------------------------------------------------- 1 | # p-Hacking Feature Selection 2 | 3 | > Vary feature subsets of a dataset in order to fit a model with significantly better performance. 4 | 5 | ## Description 6 | 7 | P-Hacking Feature Selection involves manipulating the feature subset of a dataset to artificially improve model performance. By testing multiple combinations of features and selecting those that yield the best results, practitioners may achieve statistically significant outcomes that are misleading or unreliable. 8 | 9 | This practice skews the model's apparent accuracy and risks overfitting to the training data, making it less generalizable to new datasets. While it might seem like optimization, it violates the principles of sound model development and evaluation. 10 | 11 | Data scientists should avoid this anti-pattern by adhering to rigorous validation techniques, such as using holdout datasets or cross-validation, and focusing on domain-relevant feature selection methods. This ensures model performance reflects true predictive power rather than manipulated outcomes. 12 | 13 | ## Example 14 | 15 | ```python 16 | # Import necessary libraries 17 | import numpy as np 18 | from sklearn.datasets import make_classification 19 | from sklearn.model_selection import cross_val_score, StratifiedKFold 20 | from sklearn.ensemble import RandomForestClassifier 21 | from scipy.stats import ttest_ind 22 | 23 | # Generate a synthetic classification dataset 24 | X, y = make_classification(n_samples=500, n_features=10, n_informative=2, n_redundant=8, random_state=42) 25 | 26 | # Define a classifier 27 | model = RandomForestClassifier(n_estimators=10, random_state=42) 28 | 29 | # Define a k-fold cross-validation strategy with a fixed seed 30 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) 31 | 32 | # Evaluate the model on the full dataset using k-fold cross-validation 33 | baseline_scores = cross_val_score(model, X, y, cv=kfold) 34 | baseline_mean = np.mean(baseline_scores) 35 | print(f'Base result: {baseline_mean:.3f}') 36 | 37 | # Set up parameters for p-hacking 38 | p_threshold = 0.05 # Threshold for statistical significance 39 | max_trials = 1000 # Maximum number of feature subsets to test 40 | num_features = X.shape[1] 41 | 42 | # Perform selective feature subset selection and evaluation 43 | for trial in range(max_trials): 44 | # Randomly select a subset of features 45 | np.random.seed(trial + 1) 46 | selected_features = np.random.choice(range(num_features), size=np.random.randint(1, num_features + 1), replace=False) 47 | X_subset = X[:, selected_features] 48 | 49 | # Evaluate the model on the selected feature subset using cross-validation 50 | trial_scores = cross_val_score(model, X_subset, y, cv=kfold) 51 | trial_mean = np.mean(trial_scores) 52 | better = trial_mean > baseline_mean 53 | 54 | # Perform a t-test to compare means 55 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores) 56 | significant = p_value < p_threshold 
57 | 58 | # Report progress 59 | print(f'{trial+1}, Features: {selected_features}, Result: {trial_mean:.3f}, Better: {better}, p-value: {p_value:.3f}, Significant: {significant}') 60 | 61 | # Stop if better and significant 62 | if better and significant: 63 | print("P-hacked subset identified!") 64 | break 65 | ``` 66 | 67 | Example Output: 68 | 69 | ```text 70 | Base result: 0.944 71 | 1, Features: [2 3 4 9 1 6], Result: 0.956, Better: True, p-value: 0.166, Significant: False 72 | 2, Features: [4 1 9 5 0 7 2 3 6], Result: 0.950, Better: True, p-value: 0.446, Significant: False 73 | 3, Features: [5 4 1 2 8 6 7 0 3], Result: 0.948, Better: True, p-value: 0.587, Significant: False 74 | 4, Features: [3 4 6 9 8 2 7 0], Result: 0.950, Better: True, p-value: 0.347, Significant: False 75 | 5, Features: [5 8 2 3], Result: 0.950, Better: True, p-value: 0.402, Significant: False 76 | ... 77 | 54, Features: [5 3 9 4 8 6], Result: 0.956, Better: True, p-value: 0.135, Significant: False 78 | 55, Features: [6 4 0 2 1 3 9 7], Result: 0.950, Better: True, p-value: 0.621, Significant: False 79 | 56, Features: [9 8 5 6 1 7], Result: 0.940, Better: False, p-value: 0.740, Significant: False 80 | 57, Features: [3 9 1 4 8 2 0], Result: 0.958, Better: True, p-value: 0.058, Significant: False 81 | 58, Features: [4 2 8 9], Result: 0.962, Better: True, p-value: 0.022, Significant: True 82 | P-hacked subset identified! 83 | ``` -------------------------------------------------------------------------------- /examples/p_hacking_learning_algorithm.md: -------------------------------------------------------------------------------- 1 | # p-Hacking the Learning Algorithm 2 | 3 | > Vary the random numbers used by a learning algorithm in order to get a significantly better result. 4 | 5 | ## Description 6 | 7 | p-Hacking the learning algorithm involves tweaking the random seed or initialization of a machine learning model to artificially produce **significantly** better performance metrics. 8 | 9 | This approach manipulates results by repeatedly running the algorithm with different random values until a favorable outcome is achieved. While it may improve metrics like accuracy or precision, the model’s actual robustness and generalizability often suffer. 10 | 11 | This practice undermines the reliability of machine learning results by focusing on chance improvements rather than meaningful insights or genuine model quality. It is considered an anti-pattern because it misrepresents the model’s true performance and can lead to overfitting or poor performance on unseen data. 12 | 13 | ## Example 14 | 15 | Here, we are evaluating the "same" model on the same data, only varying the random number seed (e.g. vary the learning algorithm slightly). 16 | 17 | There (generally) should be no statistically significant difference between runs, but we continue the trial until a difference is found due to high-variance/randomness. 
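
If many comparisons really must be run, a standard guard is to correct the significance threshold for the number of tests performed. A minimal sketch using a Bonferroni correction is shown below; the correction itself is not part of the example, but the trial count and p-value are taken from its output.

```python
# Bonferroni correction: with many comparisons, shrink the per-test threshold
alpha = 0.05
n_comparisons = 88  # e.g. the number of seed trials actually run in the example output
corrected_threshold = alpha / n_comparisons
p_value = 0.0462  # the "significant" p-value found on the final trial
print(f"Corrected threshold: {corrected_threshold:.5f}")
print(f"Still significant? {p_value < corrected_threshold}")  # False
```

Under the corrected threshold, the apparently significant result found on trial 88 below would not count as significant at all.
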
18 | 19 | ```python 20 | # Import necessary libraries 21 | import numpy as np 22 | import pandas as pd 23 | from sklearn.datasets import make_classification 24 | from sklearn.model_selection import cross_val_score, StratifiedKFold 25 | from sklearn.ensemble import RandomForestClassifier 26 | from scipy.stats import ttest_ind 27 | 28 | # Generate a synthetic classification dataset 29 | X, y = make_classification(n_samples=350, n_features=10, n_informative=2, n_redundant=8, random_state=42) 30 | 31 | # Define a high-capacity machine learning model 32 | model = RandomForestClassifier(n_estimators=10, random_state=42) 33 | 34 | # Define a k-fold cross-validation strategy with a fixed seed 35 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) 36 | 37 | # Evaluate the model on the dataset using k-fold cross-validation 38 | baseline_scores = cross_val_score(model, X, y, cv=kfold) 39 | baseline_mean = np.mean(baseline_scores) 40 | 41 | # Set up parameters for p-hacking 42 | p_threshold = 0.05 # Threshold for statistical significance 43 | max_trials = 1000 # Maximum number of trials to perform 44 | significant_result_found = False 45 | 46 | # Loop through trials with different random seeds 47 | for trial in range(max_trials): 48 | # Use a new random seed for the model 49 | seed = trial + 100 50 | model = RandomForestClassifier(n_estimators=10, random_state=seed) 51 | 52 | # Evaluate the model with k-fold cross-validation 53 | trial_scores = cross_val_score(model, X, y, cv=kfold) 54 | trial_mean = np.mean(trial_scores) 55 | 56 | # Perform a t-test to compare means 57 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores) 58 | 59 | # Check if the p-value is below the significance threshold 60 | if p_value < p_threshold: 61 | significant_result_found = True 62 | print(f"Significant difference found on trial {trial+1}") 63 | print(f"Baseline mean: {baseline_mean:.4f}, Trial mean: {trial_mean:.4f}, p-value: {p_value:.4f}") 64 | break 65 | else: 66 | print(f"No significant difference found yet, trial {trial+1}, p-value: {p_value:.4f}") 67 | 68 | # Report if no significant result was found within the maximum trials 69 | if not significant_result_found: 70 | print("No significant result found after maximum trials.") 71 | ``` 72 | 73 | Example Output: 74 | 75 | ```text 76 | No significant difference found yet, trial 1, p-value: 0.7245 77 | No significant difference found yet, trial 2, p-value: 0.4860 78 | No significant difference found yet, trial 3, p-value: 0.8028 79 | No significant difference found yet, trial 4, p-value: 0.5447 80 | No significant difference found yet, trial 5, p-value: 1.0000 81 | ... 
82 | No significant difference found yet, trial 80, p-value: 0.3972 83 | No significant difference found yet, trial 81, p-value: 1.0000 84 | No significant difference found yet, trial 82, p-value: 0.7245 85 | No significant difference found yet, trial 83, p-value: 1.0000 86 | No significant difference found yet, trial 84, p-value: 0.7404 87 | No significant difference found yet, trial 85, p-value: 1.0000 88 | No significant difference found yet, trial 86, p-value: 0.7245 89 | No significant difference found yet, trial 87, p-value: 0.7707 90 | Significant difference found on trial 88 91 | Baseline mean: 0.9743, Trial mean: 0.9886, p-value: 0.0462 92 | ``` 93 | -------------------------------------------------------------------------------- /examples/src/test_harness_hacking_mitigation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.datasets import make_classification 4 | from sklearn.model_selection import train_test_split, cross_val_score, KFold, RepeatedKFold 5 | from sklearn.ensemble import RandomForestClassifier 6 | import matplotlib.pyplot as plt 7 | 8 | # Generate a synthetic classification dataset 9 | X, y = make_classification( 10 | n_samples=200, n_features=30, n_informative=5, n_redundant=25, random_state=42 11 | ) 12 | 13 | # Create a train/test split of the dataset 14 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 15 | 16 | # Initialize result storage for experiments 17 | results = [] 18 | 19 | # Define the study parameters 20 | fold_range = [3, 5, 7, 10] # 3 to 10 folds 21 | repeat_range = [1, 3, 5] # 1 to 10 repetitions 22 | n_trials = 5 # Number of trials for each configuration 23 | 24 | # Function for hill climbing optimization 25 | def hill_climb(cv, X_train, y_train, X_test, y_test, n_hill_trials=100): 26 | best_params = {"n_estimators": 10, "max_depth": 2} 27 | best_cv_score = -1 28 | 29 | cv_scores = [] 30 | holdout_scores = [] 31 | 32 | for hill_trial in range(n_hill_trials): 33 | # Propose new parameters 34 | new_params = { 35 | "n_estimators": best_params["n_estimators"] + np.random.randint(-10, 11), 36 | "max_depth": best_params["max_depth"] + np.random.randint(-1, 2) 37 | } 38 | new_params["n_estimators"] = max(1, new_params["n_estimators"]) 39 | new_params["max_depth"] = max(1, new_params["max_depth"]) 40 | 41 | # Evaluate new parameters 42 | new_model = RandomForestClassifier( 43 | n_estimators=new_params["n_estimators"], max_depth=new_params["max_depth"], random_state=42 44 | ) 45 | raw_scores = cross_val_score(new_model, X_train, y_train, cv=cv, scoring="accuracy") 46 | new_cv_score = np.mean(raw_scores) 47 | cv_scores.append(new_cv_score) 48 | 49 | # Evaluate the new model on the hold out test set 50 | new_model.fit(X_train, y_train) 51 | new_holdout_score = new_model.score(X_test, y_test) 52 | holdout_scores.append(new_holdout_score) 53 | 54 | # Update best parameters if score improves 55 | if new_cv_score > best_cv_score: 56 | best_params = new_params 57 | best_cv_score = new_cv_score 58 | 59 | return cv_scores, holdout_scores 60 | 61 | # Function to calculate metrics 62 | def calculate_metrics(cv_scores, holdout_scores): 63 | mean_cv_score = np.mean(cv_scores) 64 | correlation = np.corrcoef(cv_scores, holdout_scores)[0, 1] 65 | mean_abs_diff = np.mean(np.abs(np.array(cv_scores) - np.array(holdout_scores))) 66 | return correlation, mean_abs_diff 67 | 68 | # Main experiment loop 69 | for n_folds in fold_range: 70 | 
for n_repeats in repeat_range: 71 | trial_correlations = [] 72 | trial_mean_differences = [] 73 | 74 | for trial in range(n_trials): 75 | # Define CV with specific folds and repeats 76 | cv = RepeatedKFold(n_splits=n_folds, n_repeats=n_repeats, random_state=trial) 77 | 78 | # Perform hill climbing of the cross-validated train set 79 | cv_scores, holdout_scores = hill_climb(cv, X_train, y_train, X_test, y_test) 80 | 81 | # Calculate metrics 82 | corr, diff = calculate_metrics(cv_scores, holdout_scores) 83 | 84 | trial_correlations.append(corr) 85 | trial_mean_differences.append(diff) 86 | 87 | # Report progress 88 | print(f'folds={n_folds}, repeats={n_repeats}, i={(trial+1)}, corr={corr}, diff={diff}') 89 | 90 | # Record average results for this configuration 91 | avg_correlation = np.mean(trial_correlations) 92 | avg_mean_diff = np.mean(trial_mean_differences) 93 | 94 | results.append({ 95 | 'folds': n_folds, 96 | 'repeats': n_repeats, 97 | 'avg_correlation': avg_correlation, 98 | 'avg_mean_diff': avg_mean_diff 99 | }) 100 | 101 | # Log progress 102 | print(f"Completed: {n_folds} folds, {n_repeats} repeats | Avg Correlation: {avg_correlation:.4f}, Avg Mean Diff: {avg_mean_diff:.4f}") 103 | 104 | # Convert results to DataFrame 105 | results_df = pd.DataFrame(results) 106 | 107 | # Save results to CSV 108 | results_df.to_csv('cv_overfitting_study_results.csv', index=False) 109 | 110 | # Display final summary 111 | print("\nFinal Results:\n") 112 | print(results_df.sort_values(['folds', 'repeats'])) -------------------------------------------------------------------------------- /examples/leaderboard_hacking.md: -------------------------------------------------------------------------------- 1 | # Leaderboard Hacking 2 | 3 | > Issue predictions for a machine learning competition until a perfect (or near perfect) score is achieved. 4 | 5 | ## Description 6 | Leaderboard hacking exploits competition scoring systems by repeatedly submitting predictions until achieving an artificially high score, without developing a genuinely effective model. 7 | 8 | This approach takes advantage of the limited test set size and scoring mechanism, where multiple submission attempts can eventually lead to overfitting to the test data through pure chance. 9 | 10 | The practice undermines the educational value of machine learning competitions and creates misleading benchmarks for model performance. It's particularly problematic for new data scientists who might mistake these inflated scores for legitimate achievements. 11 | 12 | This technique represents a fundamental misunderstanding of machine learning principles, as it bypasses proper model development, validation, and testing procedures. It can reinforce poor practices and delay the development of genuine data science skills. 13 | 14 | While it may temporarily boost competition rankings, leaderboard hacking ultimately impedes professional growth and can damage credibility within the data science community. Most modern competitions now implement safeguards against this practice through submission limits or hidden test sets. 15 | 16 | Instead of pursuing quick wins through leaderboard manipulation, focus on developing robust models using proper cross-validation techniques and thorough evaluation metrics. 
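
As a rough illustration of why a hidden (private) test split blunts this attack, the sketch below scores submissions on a public half of the test labels while the final ranking uses a private half that never produces feedback. The 50/50 split, label simulation, and hill-climbing loop are illustrative assumptions, not part of the example that follows.

```python
import numpy as np

rng = np.random.default_rng(42)

# True labels for a 200-example leaderboard test set, split into public/private halves
y_true = rng.integers(0, 2, size=200)
public, private = np.arange(100), np.arange(100, 200)

# Hill-climb predictions using only the public score
preds = rng.integers(0, 2, size=200)
for _ in range(5000):
    candidate = preds.copy()
    candidate[rng.integers(200)] ^= 1  # flip one prediction at random
    if (candidate[public] == y_true[public]).mean() >= (preds[public] == y_true[public]).mean():
        preds = candidate

print("Public score: ", (preds[public] == y_true[public]).mean())   # climbs toward 1.0
print("Private score:", (preds[private] == y_true[private]).mean()) # stays near 0.5
```
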
17 | 18 | ## Example 19 | 20 | ```python 21 | import numpy as np 22 | from sklearn.datasets import make_classification 23 | from sklearn.model_selection import train_test_split 24 | from sklearn.metrics import accuracy_score 25 | 26 | # Generate a synthetic classification dataset 27 | X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42) 28 | 29 | # Split into training and test sets 30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 31 | 32 | # Artificial "leaderboard" to evaluate predictions on the test set 33 | def leaderboard_evaluation(predictions, true_labels): 34 | """Calculate the leaderboard score (accuracy in this case).""" 35 | return accuracy_score(true_labels, predictions) 36 | 37 | # Initialize random predictions for the test set 38 | best_predictions = np.random.randint(0, 2, size=len(y_test)) 39 | best_score = leaderboard_evaluation(best_predictions, y_test) 40 | 41 | # Stochastic hill climber: adjust predictions iteratively 42 | max_trials = 10000 # Maximum number of trials 43 | for trial in range(max_trials): 44 | # Copy the best predictions and randomly flip one value 45 | new_predictions = best_predictions.copy() 46 | index_to_flip = np.random.randint(len(new_predictions)) 47 | new_predictions[index_to_flip] = 1 - new_predictions[index_to_flip] # Flip the prediction 48 | 49 | # Evaluate the new predictions 50 | new_score = leaderboard_evaluation(new_predictions, y_test) 51 | 52 | # If the new score is better, adopt the new predictions 53 | if new_score > best_score: 54 | best_predictions = new_predictions 55 | best_score = new_score 56 | 57 | # Report progress 58 | print(f"Trial {trial + 1}/{max_trials}: Leaderboard Score = {new_score:.4f}, Best Score = {best_score:.4f}") 59 | 60 | # Stop if perfect score is achieved 61 | if best_score == 1.0: 62 | print("Perfect score achieved!") 63 | break 64 | ``` 65 | 66 | Example Output: 67 | 68 | ```text 69 | Trial 1/10000: Leaderboard Score = 0.4800, Best Score = 0.4850 70 | Trial 2/10000: Leaderboard Score = 0.4800, Best Score = 0.4850 71 | Trial 3/10000: Leaderboard Score = 0.4800, Best Score = 0.4850 72 | Trial 4/10000: Leaderboard Score = 0.4800, Best Score = 0.4850 73 | Trial 5/10000: Leaderboard Score = 0.4900, Best Score = 0.4900 74 | ... 75 | Trial 787/10000: Leaderboard Score = 0.9900, Best Score = 0.9950 76 | Trial 788/10000: Leaderboard Score = 0.9900, Best Score = 0.9950 77 | Trial 789/10000: Leaderboard Score = 0.9900, Best Score = 0.9950 78 | Trial 790/10000: Leaderboard Score = 0.9900, Best Score = 0.9950 79 | Trial 791/10000: Leaderboard Score = 0.9900, Best Score = 0.9950 80 | Trial 792/10000: Leaderboard Score = 1.0000, Best Score = 1.0000 81 | Perfect score achieved! 82 | ``` 83 | 84 | 85 | ## Further Reading 86 | 87 | These papers may be related: 88 | 89 | * [Toward a Better Understanding of Leaderboard](https://arxiv.org/abs/1510.03349), Wenjie Zheng, 2015. 90 | * [Exploiting an Oracle that Reports AUC Scores in Machine Learning Contests](https://arxiv.org/abs/1506.01339), Jacob Whitehill, 2015. 91 | * [Climbing the Kaggle Leaderboard by Exploiting the Log-Loss Oracle](https://arxiv.org/abs/1707.01825), Jacob Whitehill, 2017. 
92 | 93 | 94 | -------------------------------------------------------------------------------- /examples/test_set_pruning.md: -------------------------------------------------------------------------------- 1 | # Test Set Pruning 2 | 3 | > Trim or remove hard-to-predict examples from the test set to improve results. 4 | 5 | ## Description 6 | 7 | Test set pruning is a deceptive practice where difficult-to-predict examples are deliberately removed from the test dataset to artificially inflate model performance metrics. 8 | 9 | This approach creates a dangerous illusion of model quality by eliminating the challenging edge cases that often matter most in real-world applications. 10 | 11 | The practice undermines the fundamental purpose of test sets: to provide an unbiased estimate of how well your model will perform on new, unseen data in production. 12 | 13 | Test set pruning can manifest through direct removal of misclassified examples or more subtle approaches like filtering out "noisy" or "outlier" data points that the model struggles with. 14 | 15 | This anti-pattern often emerges from pressure to show improved metrics, but it creates serious risks. Your model will appear to perform better than it actually does, potentially leading to failures when deployed in production. 16 | 17 | Instead of pruning difficult examples, treat them as valuable signals. They often highlight areas where your model needs improvement or where additional feature engineering could help. 18 | 19 | ## Example 20 | 21 | ```python 22 | # Import necessary libraries 23 | from sklearn.datasets import make_classification 24 | from sklearn.model_selection import train_test_split 25 | from sklearn.ensemble import RandomForestClassifier 26 | from sklearn.metrics import accuracy_score 27 | import numpy as np 28 | 29 | # Generate a synthetic classification dataset 30 | X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42) 31 | 32 | # Split the dataset into training and testing sets 33 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 34 | 35 | # Initialize a Random Forest classifier 36 | model = RandomForestClassifier(random_state=42) 37 | 38 | # Train the model on the training set 39 | model.fit(X_train, y_train) 40 | 41 | # Predict on the test set 42 | y_pred = model.predict(X_test) 43 | 44 | # Calculate the initial accuracy 45 | initial_accuracy = accuracy_score(y_test, y_pred) 46 | print(f"Initial Test Accuracy: {initial_accuracy}") 47 | 48 | # Iteratively remove one misclassified example per iteration 49 | X_test_pruned = X_test 50 | y_test_pruned = y_test 51 | while True: 52 | # Predict on the pruned test set 53 | y_pred_pruned = model.predict(X_test_pruned) 54 | 55 | # Identify indices of misclassified samples 56 | misclassified_indices = np.where(y_pred_pruned != y_test_pruned)[0] 57 | 58 | # Break if no misclassified samples remain 59 | if len(misclassified_indices) == 0: 60 | break 61 | 62 | # Remove one misclassified sample 63 | index_to_remove = misclassified_indices[0] # Select the first misclassified sample 64 | X_test_pruned = np.delete(X_test_pruned, index_to_remove, axis=0) 65 | y_test_pruned = np.delete(y_test_pruned, index_to_remove, axis=0) 66 | 67 | # Recalculate accuracy on the pruned test set 68 | pruned_accuracy = accuracy_score(y_test_pruned, model.predict(X_test_pruned)) 69 | print(f"Pruned Test Accuracy: {pruned_accuracy}") 70 | ``` 71 | 72 | Example Output: 73 | 74 | ```text 75 | Initial Test 
Accuracy: 0.8866666666666667 76 | Pruned Test Accuracy: 0.8896321070234113 77 | Pruned Test Accuracy: 0.8926174496644296 78 | Pruned Test Accuracy: 0.8956228956228957 79 | Pruned Test Accuracy: 0.8986486486486487 80 | Pruned Test Accuracy: 0.9016949152542373 81 | Pruned Test Accuracy: 0.9047619047619048 82 | Pruned Test Accuracy: 0.9078498293515358 83 | Pruned Test Accuracy: 0.910958904109589 84 | Pruned Test Accuracy: 0.9140893470790378 85 | Pruned Test Accuracy: 0.9172413793103448 86 | Pruned Test Accuracy: 0.9204152249134948 87 | Pruned Test Accuracy: 0.9236111111111112 88 | Pruned Test Accuracy: 0.926829268292683 89 | Pruned Test Accuracy: 0.9300699300699301 90 | Pruned Test Accuracy: 0.9333333333333333 91 | Pruned Test Accuracy: 0.9366197183098591 92 | Pruned Test Accuracy: 0.9399293286219081 93 | Pruned Test Accuracy: 0.9432624113475178 94 | Pruned Test Accuracy: 0.9466192170818505 95 | Pruned Test Accuracy: 0.95 96 | Pruned Test Accuracy: 0.953405017921147 97 | Pruned Test Accuracy: 0.9568345323741008 98 | Pruned Test Accuracy: 0.9602888086642599 99 | Pruned Test Accuracy: 0.9637681159420289 100 | Pruned Test Accuracy: 0.9672727272727273 101 | Pruned Test Accuracy: 0.9708029197080292 102 | Pruned Test Accuracy: 0.9743589743589743 103 | Pruned Test Accuracy: 0.9779411764705882 104 | Pruned Test Accuracy: 0.981549815498155 105 | Pruned Test Accuracy: 0.9851851851851852 106 | Pruned Test Accuracy: 0.9888475836431226 107 | Pruned Test Accuracy: 0.9925373134328358 108 | Pruned Test Accuracy: 0.9962546816479401 109 | Pruned Test Accuracy: 1.0 110 | ``` -------------------------------------------------------------------------------- /examples/test_set_overfitting.md: -------------------------------------------------------------------------------- 1 | # Test Set Overfitting 2 | 3 | > Optimizing a model for its performance on a "hold out" test set. 4 | 5 | ## Description 6 | This is typically called "test set overfitting" or "overfitting to the test set." 7 | 8 | It occurs when practitioners repeatedly tune their model based on test set performance, effectively making the test set act as a second training set. This violates the fundamental principle that the test set should only be used for final evaluation. 9 | 10 | Sometimes it's also referred to as "test set adaption" or "inappropriate test set optimization." In more formal academic literature, it might be described as "compromising test set independence through iterative optimization." 11 | 12 | This is different from test set leakage (where information flows from test to train inadvertently) because in this case, there's intentional optimization using test set feedback. It's particularly problematic because it gives an overly optimistic estimate of model performance and doesn't reflect how the model would perform on truly unseen data. 13 | 14 | This is why many researchers advocate for using a three-way split (train/validation/test) or holding out a completely separate test set that is only used once for final evaluation, with all intermediate optimization done using cross-validation on the training data. 
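
A minimal sketch of that three-way discipline might look like the following. The 60/20/20 split and the small hyperparameter grid are illustrative assumptions, not taken from the example below; the key point is that the test set is touched exactly once, after all tuning decisions have been made on the validation set.

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Create train (60%), validation (20%) and test (20%) splits
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42)
X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=42)

# Tune hyperparameters using the validation set only
best_cfg, best_val = None, 0.0
for n_estimators in (50, 100):
    for max_depth in (5, 10):
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        model.fit(X_train, y_train)
        val_acc = model.score(X_val, y_val)
        if val_acc > best_val:
            best_cfg, best_val = (n_estimators, max_depth), val_acc

# Evaluate on the test set exactly once, with the already-chosen configuration
final = RandomForestClassifier(n_estimators=best_cfg[0], max_depth=best_cfg[1], random_state=42)
final.fit(X_train, y_train)
print(f"Chosen config: {best_cfg}, validation accuracy: {best_val:.4f}, test accuracy: {final.score(X_test, y_test):.4f}")
```
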
15 | 16 | ## Example 17 | 18 | ```python 19 | # Import necessary libraries 20 | from sklearn.datasets import make_classification # For generating a synthetic classification dataset 21 | from sklearn.model_selection import train_test_split # For splitting the dataset 22 | from sklearn.ensemble import RandomForestClassifier # High-capacity model 23 | from sklearn.metrics import accuracy_score # For model evaluation 24 | from itertools import product # For generating all combinations of hyperparameters 25 | 26 | # Generate a synthetic classification dataset 27 | X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42) 28 | 29 | # Split the dataset into training and testing sets 30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 31 | 32 | # Define possible values for hyperparameters 33 | n_estimators_options = [10, 50, 100, 200] 34 | max_depth_options = [5, 10, 15, 20] 35 | 36 | # Generate all combinations of hyperparameters 37 | configurations = list(product(n_estimators_options, max_depth_options)) 38 | 39 | # Dictionary to store test set performance for each configuration 40 | test_set_performance = {} 41 | 42 | # Variable to track the best configuration so far 43 | best_config_so_far = None 44 | best_accuracy_so_far = 0 45 | 46 | # Loop through each configuration 47 | for n_estimators, max_depth in configurations: 48 | # Create the model with the current configuration 49 | model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42) 50 | 51 | # Fit the model on the training set 52 | model.fit(X_train, y_train) 53 | 54 | # Evaluate the model on the test set 55 | y_pred = model.predict(X_test) 56 | accuracy = accuracy_score(y_test, y_pred) 57 | 58 | # Store the performance on the test set 59 | test_set_performance[f"n_estimators={n_estimators}, max_depth={max_depth}"] = accuracy 60 | 61 | # Update and display progress 62 | if accuracy > best_accuracy_so_far: 63 | best_config_so_far = (n_estimators, max_depth) 64 | best_accuracy_so_far = accuracy 65 | print(f"cfg: n_estimators={n_estimators}, max_depth={max_depth}, Accuracy: {accuracy:.4f} " + f"(Best: {best_accuracy_so_far:.4f})") 66 | 67 | # Print the final best configuration and its test set accuracy 68 | print(f"Final Best Configuration: n_estimators={best_config_so_far[0]}, max_depth={best_config_so_far[1]}, Test Set Accuracy: {best_accuracy_so_far:.4f}") 69 | ``` 70 | 71 | Example Output: 72 | 73 | ```text 74 | cfg: n_estimators=10, max_depth=5, Accuracy: 0.8400 (Best: 0.8400) 75 | cfg: n_estimators=10, max_depth=10, Accuracy: 0.8800 (Best: 0.8800) 76 | cfg: n_estimators=10, max_depth=15, Accuracy: 0.8850 (Best: 0.8850) 77 | cfg: n_estimators=10, max_depth=20, Accuracy: 0.8750 (Best: 0.8850) 78 | cfg: n_estimators=50, max_depth=5, Accuracy: 0.8750 (Best: 0.8850) 79 | cfg: n_estimators=50, max_depth=10, Accuracy: 0.9100 (Best: 0.9100) 80 | cfg: n_estimators=50, max_depth=15, Accuracy: 0.8900 (Best: 0.9100) 81 | cfg: n_estimators=50, max_depth=20, Accuracy: 0.9000 (Best: 0.9100) 82 | cfg: n_estimators=100, max_depth=5, Accuracy: 0.8800 (Best: 0.9100) 83 | cfg: n_estimators=100, max_depth=10, Accuracy: 0.9000 (Best: 0.9100) 84 | cfg: n_estimators=100, max_depth=15, Accuracy: 0.9000 (Best: 0.9100) 85 | cfg: n_estimators=100, max_depth=20, Accuracy: 0.9000 (Best: 0.9100) 86 | cfg: n_estimators=200, max_depth=5, Accuracy: 0.8700 (Best: 0.9100) 87 | cfg: n_estimators=200, max_depth=10, Accuracy: 0.8750 (Best: 
0.9100) 88 | cfg: n_estimators=200, max_depth=15, Accuracy: 0.8800 (Best: 0.9100) 89 | cfg: n_estimators=200, max_depth=20, Accuracy: 0.8800 (Best: 0.9100) 90 | Final Best Configuration: n_estimators=50, max_depth=10, Test Set Accuracy: 0.9100 91 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Machine Learning Mischief 2 | 3 | # Machine Learning Mischief 4 | 5 | **Do not do this!!!** 6 | 7 | _It is possible to "bend" machine learning experiments towards achieving a preconceived goal._ 8 | 9 | This involves systematically exploiting evaluation metrics and/or scientific tests to achieve desired outcomes without actually meeting the underlying scientific objectives. 10 | 11 | These behaviors are _unethical_ and might be called [_cherry picking_](https://en.wikipedia.org/wiki/Cherry_picking), [_data dredging_](https://en.wikipedia.org/wiki/Data_dredging), or _gaming results_. 12 | 13 | Reviewing examples of this type of "gaming" (data science dark arts) can remind beginners and stakeholders (really all of us!) why certain methods are best practices and how to avoid being deceived by results that are too good to be true. 14 | 15 | ## Examples 16 | 17 | Below are examples of this type of gaming, and simple demonstrations of each: 18 | 19 | * [Seed Hacking](examples/seed_hacking.md): _Repeat an experiment with different random number seeds to get the best result._ 20 | * [Cross-Validation](examples/seed_hacking_cross_validation.md): _Vary the seed for creating cross-validation folds in order to get the best result._ 21 | * [Train/Test Split](examples/seed_hacking_train_test_split.md): _Vary the seed for creating train/test splits in order to get the best result._ 22 | * [Learning Algorithm](examples/seed_hacking_learning_algorithm.md): _Vary the seed for the model training algorithm in order to get the best result._ 23 | * [Bootstrap Performance](examples/seed_hacking_bootstrap_performance.md): _Vary the bootstrap random seed to present the best model performance._ 24 | * [p-Hacking](examples/p_hacking.md): _Repeat a statistical hypothesis test until a significant result is achieved._ 25 | * [Selective Sampling](examples/p_hacking_selective_sampling.md): _Vary samples in order to fit a model with significantly better performance._ 26 | * [Feature Selection](examples/p_hacking_feature_selection.md): _Vary features in order to fit a model with significantly better performance._ 27 | * [Learning Algorithm](examples/p_hacking_learning_algorithm.md) _Vary the learning algorithm seed in order to get a significantly better result._ 28 | * [Test Harness Hacking](examples/test_harness_hacking.md): _Varying models and hyperparameters to maximize test harness performance._ 29 | * [Hill Climb CV Test Folds](examples/test_harness_hacking_hill_climbing_test_folds.md): Adapt predictions for each cross-validation test fold over repeated trials. 30 | * [Hill Climb CV Performance](examples/test_harness_hacking_hill_climbing_performance.md): Excessively adapt a model for cross-validation performance. 31 | * [Test Harness Hacking Mitigation](examples/test_harness_hacking_mitigation.md): Modern practices can mitigate the risk of test harness hacking. 
32 | * [Test Set Memorization](examples/test_set_memorization.md): _Allow the model to memorize the test set and get a perfect score._ 33 | * [Test Set Overfitting](examples/test_set_overfitting.md): _Optimizing a model for its performance on a "hold out" test set._ 34 | * [Test Set Pruning](examples/test_set_pruning.md): _Remove hard-to-predict examples from the test set to improve results._ 35 | * [Train/Test Split Ratio Gaming](examples/train_test_ratio_gaming.md): _Vary train/test split ratios until a desired result is achieved._ 36 | * [Leaderboard Hacking](examples/leaderboard_hacking.md): _Issue predictions for a machine learning competition until a perfect score is achieved._ 37 | * [Threshold Hacking](examples/threshold_hacking.md): _Adjusting classification thresholds to hit specific metric targets._ 38 | 39 | ## How To Spot 40 | 41 | Results presented using these methods are easy to spot with probing questions: 42 | 43 | * "_Why did you use such a specific random number seed?_" 44 | * "_Why did you choose this split ratio over other more common ratios?_" 45 | * "_Why did you remove this example from the test set and not that example?_" 46 | * "_Why didn't you report a performance distribution over repeated resampling of the data?_" 47 | 48 | All this highlights that the choices in an experimental method must be defensible, especially those that deviate from widely adopted heuristics! 49 | 50 | ## DO NOT DO THIS 51 | 52 | This project is for **educational purposes only**! 53 | 54 | If you use these methods on a project, you're unethical, a fraud, and your results are garbage. 55 | 56 | Also, results/models will be fragile and will not generalize to new data in production or a surprise/hidden test set. You will be found out. A competent senior data scientist (or LLM?) will see what is up very quickly. 57 | 58 | ### So why give examples? 59 | 60 | I've never seen anything like this for machine learning and data science. Yet, most experienced practitioners know that these practices are a _real thing_. 61 | 62 | Knowing what-to-look-for can help stakeholders, managers, teachers, paper reviewers, etc. 63 | 64 | Knowing what-not-to-do can help junior data scientists. 65 | 66 | Also, thinking about and writing these examples feels naughty + fun :) 67 | 68 | ## More 69 | 70 | If you like this project, you may be interested in [Data Science Diagnostics](https://DataScienceDiagnostics.com). 71 | 72 | If you have ideas for more examples, email me: Jason.Brownlee05@gmail.com 73 | 74 | -------------------------------------------------------------------------------- /examples/train_test_ratio_gaming.md: -------------------------------------------------------------------------------- 1 | # Train/Test Split Ratio Gaming 2 | 3 | > Vary train/test split ratios until a desired result is achieved. 4 | 5 | ## Description 6 | 7 | Train/Test Split Ratio Gaming is a problematic practice where data scientists artificially adjust the proportion of data used for training versus testing until they achieve their desired model performance metrics. 8 | 9 | This approach involves repeatedly modifying the random split between training and test data, essentially "shopping" for a split ratio that produces favorable results. It's particularly tempting for new data scientists who are under pressure to demonstrate good model performance. 10 | 11 | The fundamental issue with this technique is that it violates the principle of having a truly independent test set. 
By optimizing the split ratio based on test results, you're inadvertently allowing information from the test set to influence your model selection process. 12 | 13 | This practice leads to overly optimistic performance estimates and models that will likely perform worse in real-world applications. It's especially dangerous because it can be difficult for others to detect this manipulation just by looking at the final results. 14 | 15 | The correct approach is to set your train/test split ratio based on statistical principles and dataset characteristics before any model training begins. Common splits like 80/20 or 70/30 should be chosen based on dataset size and problem requirements, not results. 16 | 17 | ## Example 18 | 19 | ```python 20 | # Import necessary libraries 21 | import numpy as np 22 | import pandas as pd 23 | from sklearn.datasets import make_classification 24 | from sklearn.ensemble import RandomForestClassifier 25 | from sklearn.model_selection import train_test_split 26 | from sklearn.metrics import accuracy_score 27 | 28 | # Generate a synthetic classification dataset 29 | X, y = make_classification( 30 | n_samples=1000, # Number of samples 31 | n_features=20, # Number of features 32 | n_informative=15, # Number of informative features 33 | n_redundant=5, # Number of redundant features 34 | random_state=42 # Fixing random state for reproducibility 35 | ) 36 | 37 | # Fix random seed for consistent train/test splits 38 | random_seed = 42 39 | 40 | # Initialize a variable to track the best test performance and associated split ratio 41 | best_accuracy = 0 42 | best_ratio = 0 43 | 44 | # Iterate over train/test split ratios from 50% to 99% in 1% increments 45 | for train_size in range(50, 100): # Split ratios vary from 50% to 99% 46 | test_size = 100 - train_size # Calculate corresponding test size 47 | 48 | # Split the dataset into train and test sets 49 | X_train, X_test, y_train, y_test = train_test_split( 50 | X, y, 51 | train_size=train_size / 100.0, # Convert train_size to percentage 52 | random_state=random_seed # Fix the random seed 53 | ) 54 | 55 | # Initialize a Random Forest Classifier 56 | model = RandomForestClassifier(random_state=random_seed) 57 | 58 | # Train the model on the training data 59 | model.fit(X_train, y_train) 60 | 61 | # Predict on the test set 62 | y_pred = model.predict(X_test) 63 | 64 | # Evaluate test performance using accuracy 65 | accuracy = accuracy_score(y_test, y_pred) 66 | 67 | # Report progress 68 | print(f'> {train_size}/{test_size}: {accuracy}') 69 | 70 | # Update the best accuracy and split ratio if current accuracy is better 71 | if accuracy > best_accuracy: 72 | best_accuracy = accuracy 73 | best_ratio = train_size 74 | 75 | # Print the best train/test split ratio and corresponding accuracy 76 | print(f"Best train/test split ratio: {best_ratio}/{100 - best_ratio}") 77 | print(f"Best test accuracy: {best_accuracy}") 78 | ``` 79 | 80 | Example Output: 81 | 82 | ```text 83 | > 50/50: 0.884 84 | > 51/49: 0.8918367346938776 85 | > 52/48: 0.8916666666666667 86 | > 53/47: 0.8765957446808511 87 | > 54/46: 0.8760869565217392 88 | > 55/45: 0.8844444444444445 89 | > 56/44: 0.884090909090909 90 | > 57/43: 0.8953488372093024 91 | > 58/42: 0.8833333333333333 92 | > 59/41: 0.8926829268292683 93 | > 60/40: 0.89 94 | > 61/39: 0.8948717948717949 95 | > 62/38: 0.9131578947368421 96 | > 63/37: 0.9081081081081082 97 | > 64/36: 0.9055555555555556 98 | > 65/35: 0.9142857142857143 99 | > 66/34: 0.9117647058823529 100 | > 67/33: 0.906060606060606 101 | > 
68/32: 0.90625 102 | > 69/31: 0.8903225806451613 103 | > 70/30: 0.8866666666666667 104 | > 71/29: 0.903448275862069 105 | > 72/28: 0.8892857142857142 106 | > 73/27: 0.8851851851851852 107 | > 74/26: 0.8846153846153846 108 | > 75/25: 0.884 109 | > 76/24: 0.8916666666666667 110 | > 77/23: 0.8826086956521739 111 | > 78/22: 0.8727272727272727 112 | > 79/21: 0.8857142857142857 113 | > 80/20: 0.9 114 | > 81/19: 0.9 115 | > 82/18: 0.8888888888888888 116 | > 83/17: 0.8823529411764706 117 | > 84/16: 0.89375 118 | > 85/15: 0.8733333333333333 119 | > 86/14: 0.9285714285714286 120 | > 87/13: 0.8846153846153846 121 | > 88/12: 0.9166666666666666 122 | > 89/11: 0.9090909090909091 123 | > 90/10: 0.94 124 | > 91/9: 0.9222222222222223 125 | > 92/8: 0.9125 126 | > 93/7: 0.9142857142857143 127 | > 94/6: 0.9166666666666666 128 | > 95/5: 0.9 129 | > 96/4: 0.9 130 | > 97/3: 0.9333333333333333 131 | > 98/2: 0.9 132 | > 99/1: 0.9 133 | Best train/test split ratio: 90/10 134 | Best test accuracy: 0.94 135 | ``` -------------------------------------------------------------------------------- /examples/test_harness_hacking_hill_climbing_test_folds.md: -------------------------------------------------------------------------------- 1 | # Test Harness Hacking: Hill Climb Cross-Validation Test Folds 2 | 3 | > Adapt predictions for each cross-validation test fold over repeated trials. 4 | 5 | ## Description 6 | 7 | This involves exploiting k-fold cross-validation to artificially improve model performance. 8 | 9 | The model adapts its predictions for each fold during cross-validation trials, fully utilizing the performance metric signal from the test folds. Over time, this "hill-climbing" process fine-tunes predictions specifically for the test folds, leading to near-perfect results within the cross-validation framework. 10 | 11 | However, this method ignores the need for generalization to new data. When applied to a real holdout test set, the model's performance collapses, producing random or inaccurate predictions. 12 | 13 | This practice is unrealistic and misleading, as it relies on overfitting to test folds rather than building a robust, generalizable model. 14 | 15 | As such it provides an idealized worst case scenario of a data scientist overfitting the training dataset, in the face of a robust test harness using k-fold cross-validation. 16 | 17 | ## Example 18 | 19 | This example starts by initializing random predictions for all data points in the training set and performs repeated trials. 20 | 21 | Each trial consists of one full k-fold cross-validation pass. During each fold, after evaluating predictions on the test fold, the algorithm makes a single adaptation to the predictions to improve accuracy on that specific fold. These adaptations accumulate over trials, effectively "hill climbing" towards perfect predictions on the cross-validation folds. 22 | 23 | However, because this process overfits predictions to the cross-validation setup, the resulting model fails to generalize. When evaluated on a holdout test set, it produces random, non-generalizable predictions, highlighting the misleading nature of this approach. 
24 | 25 | ```python 26 | # Import necessary libraries 27 | import numpy as np 28 | from sklearn.datasets import make_classification 29 | from sklearn.model_selection import KFold, train_test_split 30 | from sklearn.metrics import accuracy_score 31 | 32 | # Generate a synthetic classification dataset 33 | X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42) 34 | 35 | # Split the dataset into a train and test set 36 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 37 | 38 | # Define k-fold cross-validation 39 | kfold = KFold(n_splits=5, shuffle=True, random_state=42) 40 | 41 | # Initialize random predictions across all data points in the training set 42 | predictions = np.random.choice(np.unique(y_train), size=len(X_train)) 43 | 44 | # Maximum number of trials 45 | n_trials = 100 46 | 47 | # Begin hill-climbing meta-algorithm 48 | for trial in range(n_trials): 49 | print(f"Trial {trial + 1}/{n_trials}") 50 | 51 | # Initialize variables to track progress across folds 52 | fold_accuracies = [] 53 | 54 | # Perform k-fold cross-validation 55 | for train_idx, test_idx in kfold.split(X_train): 56 | # Get test fold indices 57 | y_test_fold = y_train[test_idx] 58 | fold_predictions = predictions[test_idx] 59 | 60 | # Evaluate the current predictions on the test fold 61 | current_accuracy = accuracy_score(y_test_fold, fold_predictions) 62 | 63 | # Adapt predictions based on test fold performance (hill climbing) 64 | if current_accuracy < 1.0: # If not perfect 65 | for i in range(len(test_idx)): 66 | idx = test_idx[i] 67 | if predictions[idx] != y_train[idx]: # Fix one wrong prediction 68 | predictions[idx] = y_train[idx] 69 | break # Stop after a single modification 70 | 71 | # Recalculate fold accuracy after adaptation 72 | updated_fold_predictions = predictions[test_idx] 73 | updated_accuracy = accuracy_score(y_test_fold, updated_fold_predictions) 74 | fold_accuracies.append(updated_accuracy) 75 | 76 | # Calculate and report average accuracy across all folds for this trial 77 | avg_accuracy = np.mean(fold_accuracies) 78 | print(f"Average Accuracy Across Folds: {avg_accuracy:.4f}") 79 | 80 | # Stop trials if all folds achieve perfect accuracy 81 | if avg_accuracy == 1.0: 82 | print("All folds reached perfect accuracy. Stopping trials.") 83 | break 84 | 85 | # Evaluate the "model" on the holdout test set 86 | # Use random predictions for the holdout test set to simulate lack of generalization 87 | test_predictions = np.random.choice(np.unique(y_train), size=len(y_test)) 88 | holdout_accuracy = accuracy_score(y_test, test_predictions) 89 | 90 | # Report final results 91 | print("\nFinal Results:") 92 | print(f"Accuracy on holdout test set: {holdout_accuracy:.4f}") 93 | ``` 94 | 95 | Example Output: 96 | 97 | ```text 98 | Trial 1/100 99 | Average Accuracy Across Folds: 0.5188 100 | Trial 2/100 101 | Average Accuracy Across Folds: 0.5250 102 | Trial 3/100 103 | Average Accuracy Across Folds: 0.5312 104 | Trial 4/100 105 | Average Accuracy Across Folds: 0.5375 106 | Trial 5/100 107 | Average Accuracy Across Folds: 0.5437 108 | ... 109 | Trial 79/100 110 | Average Accuracy Across Folds: 0.9950 111 | Trial 80/100 112 | Average Accuracy Across Folds: 0.9975 113 | Trial 81/100 114 | Average Accuracy Across Folds: 0.9988 115 | Trial 82/100 116 | Average Accuracy Across Folds: 1.0000 117 | All folds reached perfect accuracy. Stopping trials. 
118 | 119 | Final Results: 120 | Accuracy on holdout test set: 0.4100 121 | ``` -------------------------------------------------------------------------------- /examples/test_harness_hacking_hill_climbing_performance.md: -------------------------------------------------------------------------------- 1 | # Test Harness Hacking: Hill Climb Cross-Validation Performance 2 | 3 | > Excessively adapt a model for cross-validation performance. 4 | 5 | ## Description 6 | 7 | This occurs when a model is excessively tuned to maximize performance during cross-validation. 8 | 9 | Using a robust test harness like k-fold cross-validation, repeated iterations of model adjustments are made to improve the average score on test folds. 10 | 11 | However, this over-adaptation leads to overfitting on the training dataset. The model becomes too specialized to the patterns in the cross-validation splits, losing generalizability. 12 | 13 | The issue often arises when the number of improvement trials exceeds the size of the training dataset, creating a misleading sense of success. 14 | 15 | While cross-validation metrics may look impressive, performance on a separate hold-out test set deteriorates. This approach sacrifices real-world accuracy for temporary gains during validation, undermining the model's reliability. 16 | 17 | ## Example 18 | 19 | In this example, a model is excessively tuned to optimize cross-validation performance at the expense of generalizability. 20 | 21 | Using a synthetic classification dataset, a Random Forest model is repeatedly optimized over 100 trials through stochastic hill climbing, adjusting the hyperparameters `n_estimators` and `max_depth` to improve the mean k-fold cross-validation accuracy. 22 | 23 | As the optimization progresses, the cross-validation score steadily improves, but the hold-out test performance often plateaus or deteriorates, highlighting overfitting to the cross-validation splits. 24 | 25 | The code visualizes this divergence between cross-validation and test performance, illustrating how focusing excessively on cross-validation metrics can undermine the model's real-world applicability.
26 | 27 | ```python 28 | # Import necessary libraries 29 | import numpy as np 30 | from sklearn.datasets import make_classification 31 | from sklearn.model_selection import train_test_split, cross_val_score, KFold 32 | from sklearn.ensemble import RandomForestClassifier 33 | import matplotlib.pyplot as plt 34 | 35 | # Generate a synthetic classification dataset 36 | X, y = make_classification( 37 | n_samples=200, n_features=30, n_informative=5, n_redundant=25, random_state=42 38 | ) 39 | 40 | # Create a train/test split of the dataset 41 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 42 | 43 | # Set up k-fold cross-validation for the training set 44 | kf = KFold(n_splits=3, shuffle=True, random_state=42) 45 | 46 | # Initialize variables for hill climbing and tracking performance 47 | n_trials = 100 # Number of optimization trials 48 | best_params = {"n_estimators": 10, "max_depth": 2} # Initial hyperparameters 49 | cv_scores = [] # Track cross-validation scores 50 | test_scores = [] # Track hold-out test scores 51 | 52 | # Define a stochastic hill climbing procedure for hyperparameter tuning 53 | for trial in range(n_trials): 54 | # Create a model with current best parameters 55 | model = RandomForestClassifier( 56 | n_estimators=best_params["n_estimators"], max_depth=best_params["max_depth"], random_state=42 57 | ) 58 | 59 | # Evaluate model using k-fold cross-validation 60 | cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=kf, scoring="accuracy")) 61 | 62 | # Fit the model on the entire training set and evaluate on the hold-out test set 63 | model.fit(X_train, y_train) 64 | test_score = model.score(X_test, y_test) 65 | 66 | # Record scores 67 | cv_scores.append(cv_score) 68 | test_scores.append(test_score) 69 | 70 | # Print trial results 71 | print(f"Trial {trial+1}: CV Mean Score={cv_score:.4f}, Test Score={test_score:.4f}") 72 | 73 | # Propose a random perturbation of the hyperparameters 74 | new_params = { 75 | "n_estimators": best_params["n_estimators"] + np.random.randint(-10, 11), 76 | "max_depth": best_params["max_depth"] + np.random.randint(-1, 2) 77 | } 78 | new_params["n_estimators"] = max(1, new_params["n_estimators"]) # Ensure valid value 79 | new_params["max_depth"] = max(1, new_params["max_depth"]) # Ensure valid value 80 | 81 | # Evaluate new parameters 82 | new_model = RandomForestClassifier( 83 | n_estimators=new_params["n_estimators"], max_depth=new_params["max_depth"], random_state=42 84 | ) 85 | new_cv_score = np.mean(cross_val_score(new_model, X_train, y_train, cv=kf, scoring="accuracy")) 86 | 87 | # Update the best parameters if the new score is better 88 | if new_cv_score > cv_score: 89 | best_params = new_params 90 | 91 | # Plot the cross-validation and hold-out test scores over trials 92 | plt.figure(figsize=(10, 6)) 93 | plt.plot(range(1, n_trials + 1), cv_scores, label="Cross-Validation Score") 94 | plt.plot(range(1, n_trials + 1), test_scores, label="Hold-Out Test Score") 95 | plt.xlabel("Trial") 96 | plt.ylabel("Accuracy") 97 | plt.title("Model Performance: Cross-Validation vs Hold-Out Test") 98 | plt.legend() 99 | plt.show() 100 | 101 | # Print final performance metrics 102 | final_model = RandomForestClassifier( 103 | n_estimators=best_params["n_estimators"], max_depth=best_params["max_depth"], random_state=42 104 | ) 105 | final_model.fit(X_train, y_train) 106 | final_cv_score = np.mean(cross_val_score(final_model, X_train, y_train, cv=kf, scoring="accuracy")) 107 | final_test_score = 
final_model.score(X_test, y_test) 108 | print(f"Final Model: CV Mean Score={final_cv_score:.4f}, Test Score={final_test_score:.4f}") 109 | ``` 110 | 111 | Example Output: 112 | 113 | ![](/pics/test_harness_hacking_hill_climbing_performance.png) 114 | 115 | ```text 116 | Trial 1: CV Mean Score=0.7123, Test Score=0.7000 117 | Trial 2: CV Mean Score=0.7247, Test Score=0.7250 118 | Trial 3: CV Mean Score=0.7247, Test Score=0.7250 119 | Trial 4: CV Mean Score=0.7247, Test Score=0.7250 120 | Trial 5: CV Mean Score=0.7247, Test Score=0.7250 121 | ... 122 | Trial 95: CV Mean Score=0.8371, Test Score=0.7750 123 | Trial 96: CV Mean Score=0.8371, Test Score=0.7750 124 | Trial 97: CV Mean Score=0.8371, Test Score=0.7750 125 | Trial 98: CV Mean Score=0.8371, Test Score=0.7750 126 | Trial 99: CV Mean Score=0.8371, Test Score=0.7750 127 | Trial 100: CV Mean Score=0.8371, Test Score=0.7750 128 | ``` -------------------------------------------------------------------------------- /examples/seed_hacking_bootstrap_performance.md: -------------------------------------------------------------------------------- 1 | # Seed Hack Bootstrap Performance 2 | 3 | > Vary the seed for a bootstrap evaluation of a final chosen model on the test set to present the best performance. 4 | 5 | ## Description 6 | 7 | It is common to present the performance of a final chosen model by training it on the train set and evaluating it using the distribution of performance scores from multiple bootstrap samples of the test set. 8 | 9 | Performance hacking through selective bootstrap seed manipulation is a deceptive practice that artificially inflates model evaluation metrics. It might be referred to as "performance inflation" or "result polishing". 10 | 11 | This technique involves repeatedly running bootstrap evaluations with different random seeds on the test set, then cherry-picking and reporting only the most favorable results. 12 | 13 | While bootstrapping is a valid resampling technique for understanding model variance, deliberately selecting the best-performing seed masks the true model performance and creates unrealistic expectations. 14 | 15 | This practice undermines the fundamental purpose of model evaluation - to get an honest assessment of how well the model will generalize to new data. 16 | 17 | The consequences can be severe when deployed models fail to achieve the reported performance metrics in production, potentially damaging team credibility and business outcomes. 18 | 19 | Instead of seed manipulation, data scientists should report average performance across multiple random seeds or, better yet, use techniques like cross-validation with fixed seeds for reproducible and trustworthy evaluations. 
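For contrast with the example below, the honest alternative described above can be sketched in a few lines: evaluate the final model on many bootstrap samples of the test set and report the full distribution of scores (for example, the mean and a percentile interval), rather than the single most flattering seed. This is a minimal sketch only; the dataset, model, and number of bootstrap iterations are illustrative assumptions and are not part of the original example.

```python
# Honest alternative (sketch): report the bootstrap distribution, not the best seed
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

# Synthetic dataset and a single train/test split (illustrative values)
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the final model once on the training set
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Score the model on many bootstrap samples of the test set, one score per seed
scores = []
for seed in range(100):
    X_boot, y_boot = resample(X_test, y_test, random_state=seed)
    scores.append(accuracy_score(y_boot, model.predict(X_boot)))

# Report the whole distribution instead of cherry-picking a favorable seed
scores = np.array(scores)
lower, upper = np.percentile(scores, [2.5, 97.5])
print(f"Accuracy: {scores.mean():.3f} +/- {scores.std():.3f} (95% interval: {lower:.3f} to {upper:.3f})")
```

Reporting every bootstrap score removes the incentive to hunt for a favorable seed, because no single seed is singled out for presentation.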
20 | 21 | ## Example 22 | 23 | ```python 24 | # Import necessary libraries 25 | from sklearn.datasets import make_classification 26 | from sklearn.model_selection import train_test_split 27 | from sklearn.ensemble import RandomForestClassifier 28 | from sklearn.utils import resample 29 | from sklearn.metrics import accuracy_score 30 | import numpy as np 31 | 32 | # Generate a synthetic classification dataset 33 | X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42) 34 | 35 | # Split the dataset into a training set and a test set 36 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 37 | 38 | # Initialize the random forest classifier 39 | model = RandomForestClassifier(random_state=42) 40 | 41 | # Train the model on the training set 42 | model.fit(X_train, y_train) 43 | 44 | # Number of bootstrap iterations 45 | num_bootstrap_iterations = 50 46 | 47 | # Number of repetitions for each bootstrap sample 48 | num_repeats_per_sample = 10 49 | 50 | # Variable to track the best accuracy and corresponding seed 51 | best_accuracy = 0 52 | best_seed = None 53 | 54 | # Iterate through multiple random seeds for bootstrap sampling 55 | for seed in range(num_bootstrap_iterations): 56 | # List to store accuracy scores for each repeat 57 | repeat_accuracies = [] 58 | 59 | # Evaluate the model on the same bootstrap sample multiple times 60 | for _ in range(num_repeats_per_sample): 61 | # Generate a bootstrap sample of the test set 62 | X_test_bootstrap, y_test_bootstrap = resample(X_test, y_test, random_state=seed) 63 | y_pred = model.predict(X_test_bootstrap) 64 | accuracy = accuracy_score(y_test_bootstrap, y_pred) 65 | repeat_accuracies.append(accuracy) 66 | 67 | # Compute the median accuracy for the current bootstrap sample 68 | median_accuracy = np.median(repeat_accuracies) 69 | 70 | # Report progress 71 | print(f'> Seed={seed}, Median Accuracy: {median_accuracy}') 72 | 73 | # Keep track of the best performance and its corresponding seed 74 | if median_accuracy > best_accuracy: 75 | best_accuracy = median_accuracy 76 | best_seed = seed 77 | 78 | # Print the selected seed with the best accuracy (artificially chosen for presentation) 79 | print(f"Best Seed: {best_seed}, Best Median Accuracy: {best_accuracy}") 80 | ``` 81 | 82 | Example Output: 83 | 84 | ```text 85 | > Seed=0, Median Accuracy: 0.87 86 | > Seed=1, Median Accuracy: 0.82 87 | > Seed=2, Median Accuracy: 0.8466666666666667 88 | > Seed=3, Median Accuracy: 0.83 89 | > Seed=4, Median Accuracy: 0.8433333333333334 90 | > Seed=5, Median Accuracy: 0.8366666666666667 91 | > Seed=6, Median Accuracy: 0.8633333333333333 92 | > Seed=7, Median Accuracy: 0.87 93 | > Seed=8, Median Accuracy: 0.8433333333333334 94 | > Seed=9, Median Accuracy: 0.86 95 | > Seed=10, Median Accuracy: 0.88 96 | > Seed=11, Median Accuracy: 0.8633333333333333 97 | > Seed=12, Median Accuracy: 0.8466666666666667 98 | > Seed=13, Median Accuracy: 0.8666666666666667 99 | > Seed=14, Median Accuracy: 0.8333333333333334 100 | > Seed=15, Median Accuracy: 0.8466666666666667 101 | > Seed=16, Median Accuracy: 0.8666666666666667 102 | > Seed=17, Median Accuracy: 0.8333333333333334 103 | > Seed=18, Median Accuracy: 0.8733333333333333 104 | > Seed=19, Median Accuracy: 0.8233333333333334 105 | > Seed=20, Median Accuracy: 0.8633333333333333 106 | > Seed=21, Median Accuracy: 0.8433333333333334 107 | > Seed=22, Median Accuracy: 0.8366666666666667 108 | > Seed=23, Median Accuracy: 0.8466666666666667 109 | > Seed=24, Median 
Accuracy: 0.85 110 | > Seed=25, Median Accuracy: 0.8466666666666667 111 | > Seed=26, Median Accuracy: 0.8533333333333334 112 | > Seed=27, Median Accuracy: 0.8633333333333333 113 | > Seed=28, Median Accuracy: 0.8733333333333333 114 | > Seed=29, Median Accuracy: 0.82 115 | > Seed=30, Median Accuracy: 0.8566666666666667 116 | > Seed=31, Median Accuracy: 0.8766666666666667 117 | > Seed=32, Median Accuracy: 0.9 118 | > Seed=33, Median Accuracy: 0.8366666666666667 119 | > Seed=34, Median Accuracy: 0.8533333333333334 120 | > Seed=35, Median Accuracy: 0.8566666666666667 121 | > Seed=36, Median Accuracy: 0.8766666666666667 122 | > Seed=37, Median Accuracy: 0.8266666666666667 123 | > Seed=38, Median Accuracy: 0.82 124 | > Seed=39, Median Accuracy: 0.8533333333333334 125 | > Seed=40, Median Accuracy: 0.8366666666666667 126 | > Seed=41, Median Accuracy: 0.81 127 | > Seed=42, Median Accuracy: 0.8166666666666667 128 | > Seed=43, Median Accuracy: 0.8833333333333333 129 | > Seed=44, Median Accuracy: 0.8733333333333333 130 | > Seed=45, Median Accuracy: 0.8766666666666667 131 | > Seed=46, Median Accuracy: 0.88 132 | > Seed=47, Median Accuracy: 0.8466666666666667 133 | > Seed=48, Median Accuracy: 0.9033333333333333 134 | > Seed=49, Median Accuracy: 0.89 135 | Best Seed: 48, Best Median Accuracy: 0.9033333333333333 136 | ``` -------------------------------------------------------------------------------- /examples/threshold_hacking.md: -------------------------------------------------------------------------------- 1 | # Threshold Hacking 2 | 3 | > Adjusting classification thresholds to hit specific metric targets. 4 | 5 | ## Description 6 | Threshold hacking is a problematic practice in machine learning where practitioners manipulate classification thresholds solely to achieve specific performance metrics, rather than considering real-world impact. 7 | 8 | This approach involves adjusting the probability cutoff point that determines when a model classifies something as positive or negative, without proper statistical or business justification. While threshold tuning itself is valid, threshold hacking aims only to hit arbitrary metric targets like accuracy or F1 score. 9 | 10 | The danger lies in creating models that appear to perform well on paper but fail to generalize or provide meaningful business value. This often occurs when data scientists feel pressure to meet performance benchmarks without full consideration of the model's practical applications. 11 | 12 | For new data scientists, this pattern can be particularly tempting when facing pressure to demonstrate model effectiveness. However, it typically leads to models that perform poorly in production, potentially damaging both business outcomes and professional credibility. 13 | 14 | A better approach is to set thresholds based on careful analysis of business requirements, costs of different types of errors, and thorough validation across multiple metrics. This ensures models deliver real value rather than just impressive-looking numbers. 
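As a contrast to the metric-chasing sweep in the example below, a requirements-driven approach can be sketched by fixing the relative costs of errors up front and choosing the threshold that minimizes expected cost on validation data. This is a minimal, hypothetical sketch: the cost values and the logistic regression setup are illustrative assumptions, and the chosen threshold should still be confirmed on data not used for tuning.

```python
# Sketch: choose a threshold by minimizing an agreed error cost, not by chasing a metric
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Illustrative business assumption: a false negative costs 5x more than a false positive
COST_FP, COST_FN = 1.0, 5.0

# Synthetic dataset and a validation split for threshold selection
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                           n_redundant=5, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a probabilistic classifier and get validation probabilities
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
y_proba = model.predict_proba(X_val)[:, 1]

# Evaluate the total cost of each candidate threshold on the validation set
best_threshold, best_cost = None, np.inf
for threshold in np.linspace(0.1, 0.9, 81):
    y_pred = (y_proba >= threshold).astype(int)
    false_positives = np.sum((y_pred == 1) & (y_val == 0))
    false_negatives = np.sum((y_pred == 0) & (y_val == 1))
    cost = COST_FP * false_positives + COST_FN * false_negatives
    if cost < best_cost:
        best_threshold, best_cost = threshold, cost

print(f"Cost-minimizing threshold: {best_threshold:.2f} (total validation cost: {best_cost:.0f})")
```

The key difference is that the objective (the cost ratio) is justified by the problem context before the sweep begins, rather than being whatever number makes the model look best.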
15 | 16 | ## Example 17 | 18 | ```python 19 | # Import necessary libraries 20 | import numpy as np 21 | from sklearn.datasets import make_classification 22 | from sklearn.model_selection import train_test_split 23 | from sklearn.linear_model import LogisticRegression 24 | from sklearn.metrics import precision_score 25 | 26 | # Generate a synthetic classification dataset 27 | X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, 28 | n_redundant=5, random_state=42) 29 | 30 | # Split the dataset into train and test sets 31 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 32 | 33 | # Initialize the logistic regression model 34 | model = LogisticRegression(random_state=42, max_iter=1000) 35 | 36 | # Train the model on the training set 37 | model.fit(X_train, y_train) 38 | 39 | # Get raw predicted probabilities for the positive class 40 | y_proba = model.predict_proba(X_test)[:, 1] 41 | 42 | # Define a range of thresholds to evaluate 43 | thresholds = np.linspace(0.1, 0.9, 81) 44 | 45 | # Track best precision score and corresponding threshold 46 | best_precision = 0 47 | best_threshold = 0 48 | 49 | # Iterate over each threshold 50 | print("Threshold Tuning Progress:") 51 | print(f"{'Threshold':<10}{'Precision':<10}{'Best Precision':<15}{'Best Threshold':<15}") 52 | for threshold in thresholds: 53 | # Convert probabilities to binary predictions based on the current threshold 54 | y_pred = (y_proba >= threshold).astype(int) 55 | 56 | # Calculate precision score 57 | precision = precision_score(y_test, y_pred) 58 | 59 | # Check if this is the best precision score so far 60 | if precision > best_precision: 61 | best_precision = precision 62 | best_threshold = threshold 63 | 64 | # Report progress 65 | print(f"{threshold:<10.2f}{precision:<10.2f}{best_precision:<15.2f}{best_threshold:<15.2f}") 66 | 67 | # Final best score and threshold 68 | print("\nFinal Results:") 69 | print(f"Best Precision: {best_precision:.2f}") 70 | print(f"Best Threshold: {best_threshold:.2f}") 71 | ``` 72 | 73 | Example Output: 74 | 75 | ```text 76 | Threshold Tuning Progress: 77 | Threshold Precision Best Precision Best Threshold 78 | 0.10 0.61 0.61 0.10 79 | 0.11 0.61 0.61 0.11 80 | 0.12 0.62 0.62 0.12 81 | 0.13 0.62 0.62 0.13 82 | 0.14 0.64 0.64 0.14 83 | 0.15 0.64 0.64 0.15 84 | 0.16 0.65 0.65 0.16 85 | 0.17 0.66 0.66 0.17 86 | 0.18 0.67 0.67 0.18 87 | 0.19 0.67 0.67 0.19 88 | 0.20 0.67 0.67 0.20 89 | 0.21 0.67 0.67 0.20 90 | 0.22 0.68 0.68 0.22 91 | 0.23 0.68 0.68 0.23 92 | 0.24 0.68 0.68 0.23 93 | 0.25 0.68 0.68 0.23 94 | 0.26 0.68 0.68 0.26 95 | 0.27 0.70 0.70 0.27 96 | 0.28 0.70 0.70 0.28 97 | 0.29 0.70 0.70 0.29 98 | 0.30 0.71 0.71 0.30 99 | 0.31 0.71 0.71 0.31 100 | 0.32 0.73 0.73 0.32 101 | 0.33 0.73 0.73 0.33 102 | 0.34 0.73 0.73 0.34 103 | 0.35 0.73 0.73 0.34 104 | 0.36 0.74 0.74 0.36 105 | 0.37 0.74 0.74 0.36 106 | 0.38 0.74 0.74 0.36 107 | 0.39 0.74 0.74 0.36 108 | 0.40 0.74 0.74 0.36 109 | 0.41 0.75 0.75 0.41 110 | 0.42 0.74 0.75 0.41 111 | 0.43 0.75 0.75 0.43 112 | 0.44 0.76 0.76 0.44 113 | 0.45 0.77 0.77 0.45 114 | 0.46 0.78 0.78 0.46 115 | 0.47 0.78 0.78 0.47 116 | 0.48 0.79 0.79 0.48 117 | 0.49 0.79 0.79 0.48 118 | 0.50 0.79 0.79 0.50 119 | 0.51 0.80 0.80 0.51 120 | 0.52 0.80 0.80 0.51 121 | 0.53 0.80 0.80 0.53 122 | 0.54 0.81 0.81 0.54 123 | 0.55 0.81 0.81 0.54 124 | 0.56 0.81 0.81 0.54 125 | 0.57 0.81 0.81 0.54 126 | 0.58 0.81 0.81 0.58 127 | 0.59 0.82 0.82 0.59 128 | 0.60 0.82 0.82 0.59 129 | 0.61 0.82 0.82 0.59 130 | 0.62 0.83 0.83 0.62 
131 | 0.63 0.83 0.83 0.63 132 | 0.64 0.83 0.83 0.63 133 | 0.65 0.84 0.84 0.65 134 | 0.66 0.85 0.85 0.66 135 | 0.67 0.85 0.85 0.66 136 | 0.68 0.86 0.86 0.68 137 | 0.69 0.86 0.86 0.69 138 | 0.70 0.86 0.86 0.69 139 | 0.71 0.86 0.86 0.69 140 | 0.72 0.85 0.86 0.69 141 | 0.73 0.86 0.86 0.73 142 | 0.74 0.87 0.87 0.74 143 | 0.75 0.87 0.87 0.74 144 | 0.76 0.87 0.87 0.74 145 | 0.77 0.86 0.87 0.74 146 | 0.78 0.86 0.87 0.74 147 | 0.79 0.87 0.87 0.74 148 | 0.80 0.87 0.87 0.74 149 | 0.81 0.88 0.88 0.81 150 | 0.82 0.90 0.90 0.82 151 | 0.83 0.91 0.91 0.83 152 | 0.84 0.92 0.92 0.84 153 | 0.85 0.91 0.92 0.84 154 | 0.86 0.92 0.92 0.86 155 | 0.87 0.92 0.92 0.86 156 | 0.88 0.92 0.92 0.86 157 | 0.89 0.93 0.93 0.89 158 | 0.90 0.94 0.94 0.90 159 | 160 | Final Results: 161 | Best Precision: 0.94 162 | Best Threshold: 0.90 163 | ``` -------------------------------------------------------------------------------- /examples/test_harness_hacking.md: -------------------------------------------------------------------------------- 1 | # Test Harness Hacking 2 | 3 | > Varying models and hyperparameters to maximize test harness performance at the cost of reduced generalizability. 4 | 5 | ## Description 6 | 7 | When multiple hypotheses, models, or configurations are tested on the same dataset or evaluation framework (test harness), there is a high risk of fitting to the noise or idiosyncrasies of the data rather than uncovering true, generalizable patterns. 8 | 9 | This leads to inflated performance estimates that do not hold when the model is applied to unseen data. 10 | 11 | This issue is known by many names, such as: 12 | 13 | * **Comparing Too Many Hypotheses** / **Checking Too Many Models**: Testing numerous hypotheses or model configurations increases the chance of finding a model that performs well on the test data by coincidence rather than due to its inherent quality. 14 | * **Multiple Comparison Problem** / **Multiple Hypothesis Testing**: A statistical issue where testing multiple hypotheses increases the probability of false positives (e.g., identifying a model as superior when it's not). 15 | * **Oversearching**: Excessive experimentation with hyperparameters, architectures, or algorithms can lead to "discovering" patterns that are not generalizable. 16 | * **Overfitting Model Selection**: When the process of selecting the best model overfits to the evaluation dataset, the chosen model's reported performance becomes unreliable. 17 | * **Test Harness Hacking**: Manipulating the evaluation process, such as by repeatedly tweaking models or hyperparameters, to artificially inflate test harness performance. 18 | 19 | Ideally (from a statistical perspective), candidate hypotheses (models) would be selected for a predictive modeling problem _before_ data is gathered, not after and not adapted to the problem in response to results on the test harness. 20 | 21 | > ... the theory of statistical inference assumes a fixed collection of hypotheses to be tested, or learning algorithms to be applied, selected non-adaptively before the data are gathered, whereas in practice data is shared and reused with hypotheses and new analyses being generated on the basis of data exploration and the outcomes of previous analyses. 22 | 23 | -- [Preserving Statistical Validity in Adaptive Data Analysis](https://arxiv.org/abs/1411.2664), 2014. 24 | 25 | ## Scenario 26 | 27 | A test-setup (specific data, model, and test harness) may be more or less subject to this problem. 
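For intuition, the selection effect at the heart of this problem can be simulated directly: if many candidate models all have the same true skill and we keep the one with the best score on a noisy test harness, the winning score overstates that true skill. This is a small, hypothetical simulation, not one of the repository's worked examples; the number of candidates, the true accuracy, and the harness size are illustrative assumptions.

```python
# Sketch: optimistic bias from selecting the best of many equivalent models
import numpy as np

rng = np.random.default_rng(42)

true_accuracy = 0.75   # assumed true skill shared by every candidate model
harness_size = 150     # number of examples the test harness scores each model on
n_candidates = 50      # number of models/configurations compared on the same harness

# Each candidate's observed score is its true skill plus harness sampling noise
observed = rng.binomial(harness_size, true_accuracy, size=n_candidates) / harness_size

print(f"True accuracy of every candidate: {true_accuracy:.3f}")
print(f"Mean observed score:              {observed.mean():.3f}")
print(f"Best observed score (selected):   {observed.max():.3f}")
print(f"Optimism of the selected model:   {observed.max() - true_accuracy:+.3f}")
```

The more candidates that are compared and the noisier the harness, the larger the gap between the selected model's observed score and its true skill.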
28 | 29 | The aspects that exacerbate this problem include: 30 | 31 | * Small dataset. 32 | * Large number of candidate models. 33 | * Large number of candidate model hyperparameter combinations. 34 | * High variance test harness. 35 | 36 | > It seems reasonable to suggest that over-fitting in model selection is possible whenever a model selection criterion evaluated over a finite sample of data is directly optimised. Like over-fitting in training, over-fitting in model selection is likely to be most severe when the sample of data is small and the number of hyper-parameters to be tuned is relatively large. 37 | 38 | -- [On Over-fitting in Model Selection and Subsequent Selection Bias in Performance Evaluation](https://www.jmlr.org/papers/volume11/cawley10a/cawley10a.pdf), 2010. 39 | 40 | The risk is that the variance in model performance on the test harness will result in an optimistic bias (i.e. models look better than they are). 41 | 42 | This bias may be larger than the difference between the performance estimates of different models on the test harness, resulting in Type I errors (false positives) in model selection. 43 | 44 | > The scale of the bias observed on some data sets is much larger than the difference in performance between learning algorithms, and so one could easily draw incorrect inferences based on the results obtained. 45 | 46 | -- [On Over-fitting in Model Selection and Subsequent Selection Bias in Performance Evaluation](https://www.jmlr.org/papers/volume11/cawley10a/cawley10a.pdf), 2010. 47 | 48 | We can depict this scenario with an idealized plot, below. 49 | 50 | ### Graphical Depiction 51 | 52 | ![test harness hacking](/pics/test_harness_hacking.png) 53 | 54 | The plot above illustrates the distributions of performance metrics for two algorithms: 55 | 56 | - **Algorithm A ("Chosen Algorithm")**: 57 | - Slightly higher mean performance (75). 58 | - Larger variance (10). 59 | 60 | - **Algorithm B ("Alternative Algorithm")**: 61 | - Slightly lower mean performance (72). 62 | - Smaller variance (5). 63 | 64 | Even though Algorithm A is chosen due to its slightly higher mean performance, the variance in its performance is large enough that the difference in means may not be practically significant. 65 | 66 | This underscores the importance of considering the variability in performance and not relying solely on mean values for decision-making. 67 | 68 | ## Examples 69 | 70 | Below are some examples of test harness hacking. 71 | 72 | * [Hill Climb Cross-Validation Test Folds](test_harness_hacking_hill_climbing_test_folds.md): Adapt predictions for each cross-validation test fold over repeated trials. 73 | * [Hill Climb Cross-Validation Performance](test_harness_hacking_hill_climbing_performance.md): Excessively adapt a model for cross-validation performance. 74 | * [Test Harness Hacking Mitigation](test_harness_hacking_mitigation.md): Modern practices (repeated k-fold cv) mitigate the risk of test harness hacking. 75 | 76 | ## Impact 77 | 78 | The impact of overfitting the test harness manifests as **optimistic bias** in the performance of the chosen model. 79 | 80 | Here's how this unfolds in a machine learning project: 81 | 82 | 1. **Overfitting to the Test Harness**: Through repeated tuning or evaluation on the test harness, the chosen model exploits idiosyncrasies in the validation/test set rather than learning generalizable patterns. 2.
**Optimistic Performance Estimates**: The model appears to perform exceptionally well on the test harness, creating a false sense of superiority over other models. 84 | 3. **Final Model Evaluation**: When the model is retrained on all available training data and evaluated on a hold-out test set (or deployed in real-world scenarios), its performance is often significantly lower than expected. This happens because the model's improvements on the test harness were based on fitting noise or dataset-specific artifacts. 85 | 4. **Missed Opportunities**: Other models that may generalize better but were overlooked during evaluation (due to lower but more realistic performance on the test harness) might have been more suitable in practice. 86 | 87 | ## Push-Back 88 | 89 | It is possible that the issue of "too many model comparisons" is overblown in modern machine learning. 90 | 91 | This may be because the techniques that mitigate this type of overfitting have become best practices, such as: 92 | 93 | * Adoption of k-fold cross-validation in the test harness. 94 | * Adoption of repeated cross-validation to further reduce variance in performance estimates. 95 | * Adoption of nested cross-validation, to tune hyperparameters within each cross-validation fold. 96 | * Adoption of corrections to cross-validation when used for model selection (e.g. 1 standard error rule). 97 | * Adoption of statistical hypothesis tests to support differences in model performance on the test harness. 98 | * Adoption of modern machine learning learning algorithms that use regularization, early stopping and similar methods. 99 | 100 | > The adaptive data analysis literature provides a range of theoretical explanations for how the common machine learning workflow may implicitly mitigate overfitting 101 | 102 | -- [A Meta-Analysis of Overfitting in Machine Learning](https://proceedings.neurips.cc/paper/2019/hash/ee39e503b6bedf0c98c388b7e8589aca-Abstract.html), 2019. 103 | 104 | And: 105 | 106 | > We propose that the computational cost of performing repeated cross-validation and nested cross-validation in the cloud have reached a level where the use of substitutes to full nested cross-validation are no longer justified. 107 | 108 | -- [Cross-validation pitfalls when selecting and assessing regression and classification models](https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-6-10), 2014. 109 | 110 | And: 111 | 112 | > Often a “one-standard error” rule is used with cross-validation, in which we choose the most parsimonious model whose error is no more than one standard error above the error of the best model. 113 | 114 | -- Page 244, [The Elements of Statistical Learning](https://hastie.su.domains/ElemStatLearn/), 2016. 115 | 116 | As such, overfitting the test harness may be less of a concern than it once was one or two decades ago in applied machine learning. 117 | 118 | Evidence for this is seen in large-scale machine learning competitions, like those on Kaggle. 119 | 120 | > In each competition, numerous practitioners repeatedly evaluated their progress against a holdout set that forms the basis of a public ranking available throughout the competition. Performance on a separate test set used only once determined the final ranking. By systematically comparing the public ranking with the final ranking, we assess how much participants adapted to the holdout set over the course of a competition. Our study shows, somewhat surprisingly, little evidence of substantial overfitting. 
121 | 122 | -- [A Meta-Analysis of Overfitting in Machine Learning](https://proceedings.neurips.cc/paper/2019/hash/ee39e503b6bedf0c98c388b7e8589aca-Abstract.html), 2019. 123 | 124 | And: 125 | 126 | > Overall, we conclude that the classification competitions on Kaggle show little to no signs of overfitting. While there are some outlier competitions in the data, these competitions usually have pathologies such as non-i.i.d. data splits or (effectively) small test sets. Among the remaining competitions, the public and private test scores show a remarkably good correspondence. The picture becomes more nuanced among the highest scoring submissions, but the overall effect sizes of (potential) overfitting are typically small (e.g., less than 1% classification accuracy). Thus, our findings show that substantial overfitting is unlikely to occur naturally in regular machine learning workflows. 127 | 128 | -- [A Meta-Analysis of Overfitting in Machine Learning](https://proceedings.neurips.cc/paper/2019/hash/ee39e503b6bedf0c98c388b7e8589aca-Abstract.html), 2019. 129 | 130 | Additional evidence for this is seen in popular computer vision deep learning benchmark datasets on which continued performance, rather than overfitting, is observed. 131 | 132 | > Recent replication studies [16] demonstrated that the popular CIFAR-10 and ImageNet benchmarks continue to support progress despite years of intensive use. The longevity of these benchmarks perhaps suggests that overfitting to holdout data is less of a concern than reasoning from first principles might have suggested. 133 | 134 | -- [A Meta-Analysis of Overfitting in Machine Learning](https://proceedings.neurips.cc/paper/2019/hash/ee39e503b6bedf0c98c388b7e8589aca-Abstract.html), 2019. 135 | 136 | These findings suggest that test-harness hacking may be achieved by intentionally not observing modern best practices like those listed above. 137 | 138 | ## Further Reading 139 | 140 | * [A Meta-Analysis of Overfitting in Machine Learning](https://proceedings.neurips.cc/paper/2019/hash/ee39e503b6bedf0c98c388b7e8589aca-Abstract.html), 2019. 141 | * [Cross-validation pitfalls when selecting and assessing regression and classification models](https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-6-10), 2014. 142 | * [Do ImageNet Classifiers Generalize to ImageNet?](https://arxiv.org/abs/1902.10811), 2019. 143 | * [Model Evaluation, Model Selection, and Algorithm Selection in Machine Learning](https://arxiv.org/abs/1811.12808), 2018. 144 | * [Model Similarity Mitigates Test Set Overuse](https://arxiv.org/abs/1905.12580), 2019. 145 | * [Multiple Comparisons in Induction Algorithms](https://link.springer.com/article/10.1023/A:1007631014630), 2000. 146 | * [On Over-fitting in Model Selection and Subsequent Selection Bias in Performance Evaluation](https://www.jmlr.org/papers/volume11/cawley10a/cawley10a.pdf), 2010. 147 | * [Preserving Statistical Validity in Adaptive Data Analysis](https://arxiv.org/abs/1411.2664), 2014. 148 | * [Preventing "Overfitting" of Cross-Validation Data](https://ai.stanford.edu/~ang/papers/cv-final.pdf), 1997. 149 | * [The Elements of Statistical Learning](https://hastie.su.domains/ElemStatLearn/), 2016. 150 | -------------------------------------------------------------------------------- /examples/p_hacking.md: -------------------------------------------------------------------------------- 1 | # p-Hacking 2 | 3 | > Repeating a statistical hypothesis test until a significant result is achieved. 
4 | 5 | ## Description 6 | 7 | P-hacking is the practice of manipulating data analysis until you achieve a statistically significant result, typically to support a predetermined conclusion. 8 | 9 | This approach involves running multiple [statistical hypothesis tests](https://en.wikipedia.org/wiki/Statistical_hypothesis_test) on the same dataset, selectively choosing which data points to include, or adjusting variables until achieving the desired [p-value](https://en.wikipedia.org/wiki/P-value) (typically < 0.05). 10 | 11 | While it may seem tempting to keep testing until you get "significant" results, p-hacking invalidates the fundamental principles of statistical testing and leads to false discoveries. 12 | 13 | The danger lies in increasing the likelihood of [Type I errors](https://en.wikipedia.org/wiki/Type_I_and_type_II_errors) (false positives) through multiple comparisons, making spurious **mean differences** and **correlations** appear meaningful when they're actually due to random chance. 14 | 15 | For new data scientists, this pattern often emerges unintentionally when there's pressure to find significant results or when dealing with stakeholder expectations for positive outcomes. 16 | 17 | To avoid p-hacking, define your hypothesis and analysis plan before examining the data, use correction methods for multiple comparisons, and be transparent about all tests performed - including those that didn't yield significant results. 18 | 19 | Remember that negative results are valid scientific outcomes and should be reported alongside positive findings to maintain research integrity. 20 | 21 | ## Cases of p-hacking in Machine Learning 22 | 23 | Any time we want to use a statistical hypothesis test to compare two samples in a data science/machine learning project, this represents a point for p-hacking. 24 | 25 | Common cases include: 26 | 27 | - Comparing data sub-samples by impact on model performance. 28 | - Comparing subsets of input features by correlation with the target or impact on model performance. 29 | - Comparing the performance of models based on cross-validated performance. 30 | - Comparing the performance of a model with different hyperparameters. 31 | 32 | P-hacking requires varying something in the experiment to produce a distribution of samples that 1) give a result (such as the sample mean) that is "better" and 2) give a result that has a p-value (as calculated by a statistical test) below the threshold (i.e. significant). 33 | 34 | The aspect varied is often the seed for the pseudorandom number generator, such as when varying the a sampling procedure or learning algorithm. As such, many cases of p-hacking also require [seed hacking](seed_hacking.md). 35 | 36 | ### Worked Examples of p-Hacking 37 | 38 | Below are some worked examples of p-hacking in a data science/machine learning project. 39 | 40 | * [p-Hacking Selective Sampling](p_hacking_selective_sampling.md): _Vary samples of a dataset in order to fit a model with significantly better performance._ 41 | * [p-Hacking Feature Selection](p_hacking_feature_selection.md): _Vary feature subsets of a dataset in order to fit a model with significantly better performance._ 42 | * [p-Hacking the Learning Algorithm](p_hacking_learning_algorithm.md) _Vary the random numbers used by a learning algorithm in order to get a significantly better result._ 43 | 44 | ## What Does a p-Hacked Result Mean? 
45 | 46 | From a statistical perspective, when we say two samples are different and that the difference is significant according to a statistical test, we are typically referring to the rejection of the null hypothesis in favor of the alternative hypothesis based on a test statistic. 47 | 48 | Here's what this means, and why p-hacking can make this problematic: 49 | 50 | ### What Statistical Significance Means 51 | 1. **Null Hypothesis ($H_0$)**: 52 | - The null hypothesis posits that there is no true difference between the two samples or groups. For example, it might assert that the means of the two samples are equal ($\mu_1 = \mu_2$). 53 | 54 | 2. **Alternative Hypothesis ($H_a$)**: 55 | - The alternative hypothesis suggests that there is a true difference between the two samples ($\mu_1 \neq \mu_2$). 56 | 57 | 3. **P-Value**: 58 | - The p-value quantifies the probability of observing a test statistic as extreme (or more extreme) as the one calculated from the data, assuming the null hypothesis is true. 59 | - A low p-value (typically less than 0.05) suggests that such extreme data is unlikely under the null hypothesis, leading to its rejection. 60 | 61 | 4. **Statistical Significance**: 62 | - When a test concludes "statistical significance," it means the data provides sufficient evidence to reject $H_0$ at a predefined significance level ($\alpha$), often 0.05. 63 | 64 | ### The Problem with P-Hacking 65 | 1. **Inflated False Positives**: 66 | - The p-value is conditional on the null hypothesis being true and the experimental procedure being followed correctly. P-hacking violates this assumption by: 67 | - Testing multiple hypotheses without proper correction. 68 | - Cherry-picking results or repeating experiments until a significant p-value is found. 69 | - This inflates the Type I error rate, leading to a higher likelihood of false positives (erroneously rejecting $H_0$ when it is actually true). 70 | 71 | 2. **Violation of Assumptions**: 72 | - Many statistical tests assume random sampling, independence, or a fixed number of hypotheses. P-hacking often violates these assumptions, invalidating the reported p-value. 73 | 74 | 3. **Misleading Conclusions**: 75 | - A significant p-value under p-hacking does not reflect a true effect but rather the exploitation of randomness or bias. This can mislead researchers, practitioners, and policymakers into believing a non-existent effect exists. 76 | 77 | 4. **Overfitting and Non-Reproducibility**: 78 | - P-hacking aligns findings to the specific data sample rather than the underlying population, resulting in overfitted models or findings that fail to generalize. 79 | 80 | ## p-Hacking vs Normal Experimentation 81 | 82 | **What is p-hacking, and what is normal experimental variation in a machine learning project?** 83 | 84 | P-hacking in a machine learning project and the normal variation of aspects of a machine learning pipeline share similarities in that both involve systematically exploring different configurations, datasets, or techniques to optimize results. 85 | 86 | However, they differ significantly in their intent, methodology, and implications. 87 | 88 | Here's an attempt at a comparison: 89 | 90 | ### Intent of Experimentation (intent matters!) 91 | - **P-Hacking:** 92 | - The primary goal is often to achieve statistical significance and a desired result (e.g., a low p-value and an improved metric), even at the cost of scientific or experimental integrity. 
93 | - It reflects a bias towards confirming a hypothesis, regardless of whether the result is genuinely meaningful or reproducible. 94 | 95 | - **Normal Variation:** 96 | - The goal is to genuinely identify the best-performing model or configuration while ensuring that findings are robust and reproducible. 97 | - The process is exploratory but grounded in a scientific approach to assess performance in a meaningful and unbiased manner. 98 | 99 | ### Methodology 100 | - **P-Hacking:** 101 | - Involves deliberately cherry-picking or over-exploring configurations to obtain statistically significant results. 102 | - Examples include: 103 | - Running experiments on multiple datasets and only reporting the one that shows the desired results. 104 | - Trying numerous feature subsets or hyperparameters without a predefined protocol, then selecting the ones that yield significant outcomes. 105 | - Repeating experiments until statistical tests yield favorable results (e.g., p-values < 0.05). 106 | - Often lacks transparency, with omitted reporting of failed or contradictory experiments. 107 | 108 | - **Normal Variation:** 109 | - Follows systematic and reproducible protocols for varying datasets, features, models, or hyperparameters. 110 | - Examples include: 111 | - Using predefined validation or test datasets to avoid bias. 112 | - Employing cross-validation or other robust evaluation techniques to ensure generalizability. 113 | - Applying grid search, random search, or Bayesian optimization for hyperparameter tuning within a controlled framework. 114 | - Results are typically presented comprehensively, including cases where configurations performed poorly. 115 | 116 | ### Evaluation and Reporting 117 | - **P-Hacking:** 118 | - Relies heavily on statistical tests to "prove" a point, often without considering the broader context or reproducibility. 119 | - May selectively report results that confirm a hypothesis, leading to overfitting or misrepresentation. 120 | - Lacks emphasis on replicability; findings may not hold on unseen data or alternative setups. 121 | 122 | - **Normal Variation:** 123 | - Focuses on evaluating performance through unbiased metrics like accuracy, F1 score, AUC, etc., on unseen test data. 124 | - Emphasizes transparency, reporting the entire spectrum of experiments (successful and unsuccessful) to give a holistic view. 125 | - Stresses reproducibility, often sharing code, data, and experimental protocols for verification by others. 126 | 127 | ### Impact 128 | - **P-Hacking:** 129 | - Can lead to misleading conclusions, potentially wasting resources or eroding trust in the findings. 130 | - Results are often fragile and fail to generalize beyond the specific experimental conditions. 131 | - Undermines scientific and ethical standards in research. 132 | 133 | - **Normal Variation:** 134 | - Helps identify robust and reliable configurations that generalize well to new data. 135 | - Builds confidence in findings and advances the field by sharing insights into what works and what does not. 136 | - Adheres to principles of transparency, integrity, and reproducibility. 137 | 138 | ### **Key Distinction** 139 | The fundamental difference lies in **integrity and intent**. 140 | 141 | P-hacking prioritizes achieving "impressive" results at the expense of scientific rigor, often through selective reporting and overfitting. 
142 | 143 | In contrast, normal variation is a legitimate and scientifically sound process to explore and optimize machine learning pipelines, grounded in transparency and reproducibility. 144 | 145 | ### Mitigation Strategies 146 | To avoid unintentional p-hacking while exploring variations in machine learning projects: 147 | - Use rigorous protocols such as cross-validation and pre-registered experiments. 148 | - Report all experiments, including those that yield negative or inconclusive results. 149 | - Evaluate findings on independent test sets that were not used during the exploratory phase. 150 | - Avoid over-reliance on statistical significance as the sole criterion for evaluating results; consider practical significance and generalizability. 151 | 152 | ### Summary 153 | 154 | | **Aspect** | **P-Hacking** | **Normal Variation** | 155 | |--------------------|-------------------------------------------------------------------------------|------------------------------------------------------------------------------| 156 | | **Intent** | Achieve desired results (e.g., statistical significance) at the cost of integrity. | Identify the best-performing configuration while ensuring robustness. | 157 | | **Methodology** | Cherry-picks or over-explores configurations to obtain favorable outcomes. Lacks transparency; omits reporting of failures. | Systematically and reproducibly explores configurations. Follows predefined protocols; reports successes and failures comprehensively. | 158 | | **Evaluation** | Focuses on statistical tests to confirm hypotheses, often ignoring context. Selectively reports results that support the hypothesis. | Evaluates unbiased metrics (e.g., accuracy, F1 score) on unseen test data. Reports entire spectrum of experiments for transparency. | 159 | | **Reporting** | Results often fail to generalize; lacks reproducibility. | Stresses reproducibility; shares code, data, and protocols. | 160 | | **Impact** | Misleading conclusions, wasted resources, erosion of trust. | Robust findings, confidence in results, adherence to ethical standards. | 161 | | **Key Distinction**| Prioritizes "impressive" results over scientific rigor. | Prioritizes transparency, integrity, and reproducibility. | 162 | | **Mitigation** | Avoid pre-defined protocols; over-rely on statistical tests. | Use cross-validation, independent test sets, and report all experiments. | 163 | 164 | 165 | 166 | 167 | ## Further Reading 168 | 169 | * [Data dredging](https://en.wikipedia.org/wiki/Data_dredging), Wikipedia. 170 | * [The Extent and Consequences of P-Hacking in Science](https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.1002106&), 2015. 171 | * [Big little lies: a compendium and simulation of p-hacking strategies](https://royalsocietypublishing.org/doi/10.1098/rsos.220346), 2023. 172 | -------------------------------------------------------------------------------- /examples/test_harness_hacking_mitigation.md: -------------------------------------------------------------------------------- 1 | # Test Harness Hacking Mitigation 2 | 3 | ## (i.e. the myth of "Overfitting Model Selection" in modern machine learning) 4 | 5 | > Modern practices (repeated k-fold cv) mitigates the risk of test harness hacking. 6 | 7 | ## Description 8 | 9 | When conducting model selection and hyperparameter tuning through cross-validation (CV), it is true that repeated model testing can exploit quirks in the training data, leading to overfitting. 
10 | 11 | However, increasing the **number of folds** and **repetitions of CV runs** directly mitigates this risk by reducing the variance and sensitivity of the CV estimate, effectively counteracting the overfitting tendency. 12 | 13 | ### Increasing CV Folds 14 | 1. **Smaller Test Sets, More Diverse Training Sets**: With more folds, each data point participates in training and testing more frequently. This improves the representativeness of the CV procedure, ensuring that hyperparameters cannot exploit idiosyncrasies in a single test set. 15 | 2. **Natural Regularization via Bias**: Increasing folds slightly increases the bias of the performance estimate, as training is performed on smaller subsets of the data. This bias acts as a regularizer, making the evaluation less prone to being gamed by overfit hyperparameters. 16 | 17 | ### Increasing CV Repetitions 18 | 1. **Normalization of Random Effects**: Repeated CV introduces new random splits, ensuring that the hyperparameter tuning process cannot exploit specific train-test partitioning. The mean performance score over multiple runs reflects the model's generalization across diverse splits, not just one specific configuration. 19 | 2. **Resilience to Stochastic Algorithms**: For models or learning processes reliant on randomness (e.g., neural networks, random forests), repeated CV smooths out the variability from individual runs, further reducing the likelihood of overfitting. 20 | 21 | ### Mitigating the Risk of Overfitting the Training Set 22 | When folds and repetitions are increased: 23 | - **Variance Reduction**: The CV estimate becomes more stable, leaving less room for hyperparameter tuning to overfit to the noise of specific splits. 24 | - **Bias Introduction**: Higher folds increase bias, counteracting overfitting tendencies by making CV scores less sensitive to small variations in the training set. 25 | 26 | While exhaustive hyperparameter tuning can exploit CV, the combination of higher folds and repetitions strengthens the robustness of the CV process. These changes make it harder for models to overfit, even during extensive optimization, by ensuring performance reflects true generalization rather than quirks in the training data. 27 | 28 | ## Study 29 | 30 | This study investigates how the number of folds and repetitions in cross-validation (CV) affect the risk of overfitting during hyperparameter tuning, specifically when using a hill-climbing algorithm to optimize hyperparameters over 100 trials. 31 | 32 | 1. **Dataset Preparation**: 33 | - A synthetic classification dataset is created with 200 samples and 30 features, 50/50 split into a train and test set. 34 | 35 | 2. **Experimental Setup**: 36 | - The study evaluates combinations of k-fold CV (3, 5, 7, 10 folds) and repeated CV (1, 3, 5, and 10 repeats). 37 | - Each configuration undergoes 10 independent trials. 38 | 39 | 3. **Hill-Climbing Hyperparameter Tuning**: 40 | - For each CV configuration, a hill-climbing algorithm runs 100 iterations to optimize the `n_estimators` and `max_depth` hyperparameters of a `RandomForestClassifier`. 41 | - Each hyperparameter configuration is evaluated using the specified CV method, and the best configuration is selected based on the mean CV score. 42 | 43 | 4. **Metrics Computation**: 44 | - For each run, the correlation between the CV scores and hold-out test scores, as well as the mean absolute difference between them, is calculated to quantify overfitting. 
45 | - Overfitting is characterized by a decrease in correlation and an increase in the mean absolute difference. 46 | 47 | 5. **Recording and Analysis**: 48 | - The study aggregates results for each combination of folds and repeats, computing the average correlation and mean absolute difference over 10 trials. 49 | 50 | The results are expected to show that increasing the number of CV folds and/or repetitions reduces overfitting, as reflected by: 51 | - **Higher correlation** between CV and hold-out scores. 52 | - **Lower mean absolute difference** between CV and hold-out scores. 53 | 54 | The study demonstrates how CV configurations impact the reliability of model selection, reinforcing the importance of folds and repetitions in mitigating overfitting. 55 | 56 | ```python 57 | import numpy as np 58 | import pandas as pd 59 | from sklearn.datasets import make_classification 60 | from sklearn.model_selection import train_test_split, cross_val_score, KFold, RepeatedKFold 61 | from sklearn.ensemble import RandomForestClassifier 62 | import matplotlib.pyplot as plt 63 | 64 | # Generate a synthetic classification dataset 65 | X, y = make_classification( 66 | n_samples=1000, n_features=30, n_informative=5, n_redundant=25, random_state=42 67 | ) 68 | 69 | # Create a train/test split of the dataset 70 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42) 71 | 72 | # Initialize result storage for experiments 73 | results = [] 74 | 75 | # Define the study parameters 76 | fold_range = [3, 5, 7, 10] # 3 to 10 folds 77 | repeat_range = [1, 3, 5, 10] # 1 to 10 repetitions 78 | n_trials = 10 # Number of trials for each configuration 79 | 80 | # Function for hill climbing optimization 81 | def hill_climb(cv, X_train, y_train, X_test, y_test, n_hill_trials=100): 82 | best_params = {"n_estimators": 10, "max_depth": 2} 83 | best_cv_score = -1 84 | 85 | cv_scores = [] 86 | holdout_scores = [] 87 | 88 | for hill_trial in range(n_hill_trials): 89 | # Propose new parameters 90 | new_params = { 91 | "n_estimators": best_params["n_estimators"] + np.random.randint(-10, 11), 92 | "max_depth": best_params["max_depth"] + np.random.randint(-1, 2) 93 | } 94 | new_params["n_estimators"] = max(1, new_params["n_estimators"]) 95 | new_params["max_depth"] = max(1, new_params["max_depth"]) 96 | 97 | # Evaluate new parameters 98 | new_model = RandomForestClassifier( 99 | n_estimators=new_params["n_estimators"], max_depth=new_params["max_depth"], random_state=42 100 | ) 101 | raw_scores = cross_val_score(new_model, X_train, y_train, cv=cv, scoring="accuracy") 102 | new_cv_score = np.mean(raw_scores) 103 | cv_scores.append(new_cv_score) 104 | 105 | # Evaluate the new model on the hold out test set 106 | new_model.fit(X_train, y_train) 107 | new_holdout_score = new_model.score(X_test, y_test) 108 | holdout_scores.append(new_holdout_score) 109 | 110 | # Update best parameters if score improves 111 | if new_cv_score > best_cv_score: 112 | best_params = new_params 113 | best_cv_score = new_cv_score 114 | 115 | return cv_scores, holdout_scores 116 | 117 | # Function to calculate metrics 118 | def calculate_metrics(cv_scores, holdout_scores): 119 | mean_cv_score = np.mean(cv_scores) 120 | correlation = np.corrcoef(cv_scores, holdout_scores)[0, 1] 121 | mean_abs_diff = np.mean(np.abs(np.array(cv_scores) - np.array(holdout_scores))) 122 | return correlation, mean_abs_diff 123 | 124 | # Main experiment loop 125 | for n_folds in fold_range: 126 | for n_repeats in repeat_range: 127 | 
trial_correlations = [] 128 | trial_mean_differences = [] 129 | 130 | for trial in range(n_trials): 131 | # Define CV with specific folds and repeats 132 | cv = RepeatedKFold(n_splits=n_folds, n_repeats=n_repeats, random_state=trial) 133 | 134 | # Perform hill climbing of the cross-validated train set 135 | cv_scores, holdout_scores = hill_climb(cv, X_train, y_train, X_test, y_test) 136 | 137 | # Calculate metrics 138 | corr, diff = calculate_metrics(cv_scores, holdout_scores) 139 | 140 | trial_correlations.append(corr) 141 | trial_mean_differences.append(diff) 142 | 143 | # Report progress 144 | print(f'folds={n_folds}, repeats={n_repeats}, i={(trial+1)}, corr={corr}, diff={diff}') 145 | 146 | # Record average results for this configuration 147 | avg_correlation = np.mean(trial_correlations) 148 | avg_mean_diff = np.mean(trial_mean_differences) 149 | 150 | results.append({ 151 | 'folds': n_folds, 152 | 'repeats': n_repeats, 153 | 'avg_correlation': avg_correlation, 154 | 'avg_mean_diff': avg_mean_diff 155 | }) 156 | 157 | # Log progress 158 | print(f"Completed: {n_folds} folds, {n_repeats} repeats | Avg Correlation: {avg_correlation:.4f}, Avg Mean Diff: {avg_mean_diff:.4f}") 159 | 160 | # Convert results to DataFrame 161 | results_df = pd.DataFrame(results) 162 | 163 | # Save results to CSV 164 | results_df.to_csv('cv_overfitting_study_results.csv', index=False) 165 | 166 | # Display final summary 167 | print("\nFinal Results:\n") 168 | print(results_df.sort_values(['folds', 'repeats'])) 169 | ``` 170 | 171 | ### Example Output 172 | 173 | ```text 174 | folds=3, repeats=1, i=1, corr=0.8975081501258906, diff=0.013994529495226418 175 | folds=3, repeats=1, i=2, corr=0.8177792410740738, diff=0.011125753793617622 176 | folds=3, repeats=1, i=3, corr=0.9428830954671136, diff=0.005017053122670292 177 | folds=3, repeats=1, i=4, corr=0.9049809252717387, diff=0.007363626481975841 178 | folds=3, repeats=1, i=5, corr=0.9758504080203283, diff=0.023852118413774832 179 | folds=3, repeats=1, i=6, corr=0.857747359046279, diff=0.01297499843686123 180 | folds=3, repeats=1, i=7, corr=0.9543552148233073, diff=0.010973930692831212 181 | folds=3, repeats=1, i=8, corr=0.9583072215690465, diff=0.012318426279970197 182 | folds=3, repeats=1, i=9, corr=0.9365443080461188, diff=0.016589097467715166 183 | folds=3, repeats=1, i=10, corr=0.972215872219083, diff=0.01591398937065626 184 | Completed: 3 folds, 1 repeats | Avg Correlation: 0.9218, Avg Mean Diff: 0.0130 185 | folds=3, repeats=3, i=1, corr=0.9728624412333399, diff=0.006810931550553492 186 | folds=3, repeats=3, i=2, corr=0.8854785215597786, diff=0.01782942420380126 187 | folds=3, repeats=3, i=3, corr=0.9575675906128579, diff=0.014725479162157584 188 | folds=3, repeats=3, i=4, corr=0.9778969635559742, diff=0.019854740799525407 189 | folds=3, repeats=3, i=5, corr=0.9739011078616541, diff=0.021317557335128957 190 | folds=3, repeats=3, i=6, corr=0.9364274591948702, diff=0.015523946003575146 191 | folds=3, repeats=3, i=7, corr=0.9350829635734347, diff=0.012577906676606609 192 | folds=3, repeats=3, i=8, corr=0.9851501120289593, diff=0.014629806892239604 193 | folds=3, repeats=3, i=9, corr=0.9849767257721495, diff=0.010881155279801769 194 | folds=3, repeats=3, i=10, corr=0.9849943833757703, diff=0.018577150438079643 195 | Completed: 3 folds, 3 repeats | Avg Correlation: 0.9594, Avg Mean Diff: 0.0153 196 | folds=3, repeats=5, i=1, corr=0.9906435183380549, diff=0.009374101435682656 197 | folds=3, repeats=5, i=2, corr=0.9305450478665325, diff=0.019447710843373447 198 | 
folds=3, repeats=5, i=3, corr=0.9698320836611718, diff=0.013032678738907611 199 | folds=3, repeats=5, i=4, corr=0.9826452213379984, diff=0.018392676334078654 200 | folds=3, repeats=5, i=5, corr=0.9720287635544785, diff=0.016324860159199598 201 | folds=3, repeats=5, i=6, corr=0.9454211768270858, diff=0.01394443787124552 202 | folds=3, repeats=5, i=7, corr=0.9204636647370464, diff=0.010203038260827574 203 | folds=3, repeats=5, i=8, corr=0.9955924474911255, diff=0.012422861746386698 204 | folds=3, repeats=5, i=9, corr=0.9792351257750852, diff=0.012180605535915996 205 | folds=3, repeats=5, i=10, corr=0.9905394275428722, diff=0.01734686963422549 206 | Completed: 3 folds, 5 repeats | Avg Correlation: 0.9677, Avg Mean Diff: 0.0143 207 | folds=3, repeats=10, i=1, corr=0.9917633072174894, diff=0.013716341774282666 208 | folds=3, repeats=10, i=2, corr=0.9893929850040933, diff=0.015559962003703322 209 | folds=3, repeats=10, i=3, corr=0.9569010802478903, diff=0.015219092898540065 210 | folds=3, repeats=10, i=4, corr=0.9879348648553065, diff=0.013215456797248795 211 | folds=3, repeats=10, i=5, corr=0.9735665396739734, diff=0.017831983262390768 212 | folds=3, repeats=10, i=6, corr=0.971173862752053, diff=0.016002982949763 213 | folds=3, repeats=10, i=7, corr=0.9635668621274783, diff=0.010760243368684199 214 | folds=3, repeats=10, i=8, corr=0.9648722608008841, diff=0.012233203953538581 215 | folds=3, repeats=10, i=9, corr=0.9587285072176933, diff=0.011003018060264878 216 | folds=3, repeats=10, i=10, corr=0.9207755703824012, diff=0.009858052569559616 217 | Completed: 3 folds, 10 repeats | Avg Correlation: 0.9679, Avg Mean Diff: 0.0135 218 | folds=5, repeats=1, i=1, corr=0.8861934056107431, diff=0.01589999999999999 219 | folds=5, repeats=1, i=2, corr=0.7662794730952668, diff=0.011219999999999977 220 | folds=5, repeats=1, i=3, corr=0.8826187133353169, diff=0.011020000000000012 221 | folds=5, repeats=1, i=4, corr=0.9440121829589864, diff=0.010340000000000021 222 | folds=5, repeats=1, i=5, corr=0.8766129357443, diff=0.004159999999999984 223 | folds=5, repeats=1, i=6, corr=0.9024164969658375, diff=0.017580000000000016 224 | folds=5, repeats=1, i=7, corr=0.9005217405018474, diff=0.008240000000000035 225 | folds=5, repeats=1, i=8, corr=0.9828141793722028, diff=0.008400000000000029 226 | folds=5, repeats=1, i=9, corr=0.961712143471749, diff=0.021140000000000003 227 | folds=5, repeats=1, i=10, corr=0.9813556722050953, diff=0.006739999999999999 228 | Completed: 5 folds, 1 repeats | Avg Correlation: 0.9085, Avg Mean Diff: 0.0115 229 | folds=5, repeats=3, i=1, corr=0.9122982760466545, diff=0.009366666666666688 230 | folds=5, repeats=3, i=2, corr=0.9890461182715037, diff=0.009599999999999985 231 | folds=5, repeats=3, i=3, corr=0.9183930613020971, diff=0.0060066666666666515 232 | folds=5, repeats=3, i=4, corr=0.9294897940388198, diff=0.011720000000000029 233 | folds=5, repeats=3, i=5, corr=0.981982679087837, diff=0.012706666666666746 234 | folds=5, repeats=3, i=6, corr=0.9711766765002295, diff=0.01302666666666672 235 | folds=5, repeats=3, i=7, corr=0.9651742983090498, diff=0.007959999999999997 236 | folds=5, repeats=3, i=8, corr=0.9616274843010032, diff=0.010246666666666682 237 | folds=5, repeats=3, i=9, corr=0.991484802507542, diff=0.012240000000000055 238 | folds=5, repeats=3, i=10, corr=0.949354181531814, diff=0.007480000000000023 239 | Completed: 5 folds, 3 repeats | Avg Correlation: 0.9570, Avg Mean Diff: 0.0100 240 | folds=5, repeats=5, i=1, corr=0.9124335330130132, diff=0.010407999999999962 241 | folds=5, 
repeats=5, i=2, corr=0.9943967107022027, diff=0.00918799999999998 242 | folds=5, repeats=5, i=3, corr=0.9481474796710471, diff=0.005691999999999955 243 | folds=5, repeats=5, i=4, corr=0.9638374112067487, diff=0.011388 244 | folds=5, repeats=5, i=5, corr=0.952119476071311, diff=0.011971999999999943 245 | folds=5, repeats=5, i=6, corr=0.9887007143739523, diff=0.01114399999999998 246 | folds=5, repeats=5, i=7, corr=0.9700508321437197, diff=0.005847999999999956 247 | folds=5, repeats=5, i=8, corr=0.9786839027183967, diff=0.00939999999999993 248 | folds=5, repeats=5, i=9, corr=0.9923019529456245, diff=0.010243999999999967 249 | folds=5, repeats=5, i=10, corr=0.9806823785079624, diff=0.008635999999999994 250 | Completed: 5 folds, 5 repeats | Avg Correlation: 0.9681, Avg Mean Diff: 0.0094 251 | folds=5, repeats=10, i=1, corr=0.9899246985924093, diff=0.010209999999999997 252 | folds=5, repeats=10, i=2, corr=0.9863212431811526, diff=0.009343999999999979 253 | folds=5, repeats=10, i=3, corr=0.980497156341154, diff=0.010361999999999982 254 | folds=5, repeats=10, i=4, corr=0.9809354894495217, diff=0.01051599999999992 255 | folds=5, repeats=10, i=5, corr=0.9716781005974886, diff=0.009705999999999982 256 | folds=5, repeats=10, i=6, corr=0.98761167410509, diff=0.007167999999999995 257 | folds=5, repeats=10, i=7, corr=0.9939648038833919, diff=0.008125999999999956 258 | folds=5, repeats=10, i=8, corr=0.9929684118650098, diff=0.00977600000000004 259 | folds=5, repeats=10, i=9, corr=0.9916934696335423, diff=0.011239999999999965 260 | folds=5, repeats=10, i=10, corr=0.9900364630237838, diff=0.0058139999999999555 261 | Completed: 5 folds, 10 repeats | Avg Correlation: 0.9866, Avg Mean Diff: 0.0092 262 | folds=7, repeats=1, i=1, corr=0.9613182318742974, diff=0.009815023474178432 263 | folds=7, repeats=1, i=2, corr=0.6909957358983967, diff=0.006366718086295516 264 | folds=7, repeats=1, i=3, corr=0.9895758217003858, diff=0.005750017885088313 265 | folds=7, repeats=1, i=4, corr=0.9877611150535017, diff=0.007385823831880195 266 | folds=7, repeats=1, i=5, corr=0.9749578118659352, diff=0.011593217080259361 267 | folds=7, repeats=1, i=6, corr=0.9711935713591127, diff=0.00582282137268052 268 | folds=7, repeats=1, i=7, corr=0.9760796613233829, diff=0.006076532528504364 269 | folds=7, repeats=1, i=8, corr=0.9371927392344911, diff=0.01633843952604521 270 | folds=7, repeats=1, i=9, corr=0.9746045786830242, diff=0.012915823831880134 271 | folds=7, repeats=1, i=10, corr=0.9711195358371305, diff=0.011247455846188222 272 | Completed: 7 folds, 1 repeats | Avg Correlation: 0.9435, Avg Mean Diff: 0.0093 273 | folds=7, repeats=3, i=1, corr=0.9128474983896583, diff=0.010857055667337348 274 | folds=7, repeats=3, i=2, corr=0.9924414793688743, diff=0.009876530292868339 275 | folds=7, repeats=3, i=3, corr=0.9753841890858064, diff=0.0066518794246963175 276 | folds=7, repeats=3, i=4, corr=0.9902161753182072, diff=0.01067602652954771 277 | folds=7, repeats=3, i=5, corr=0.9875467068521149, diff=0.014647294880393522 278 | folds=7, repeats=3, i=6, corr=0.9824724251891493, diff=0.0053529637081750035 279 | folds=7, repeats=3, i=7, corr=0.9590765688110139, diff=0.006180702734928126 280 | folds=7, repeats=3, i=8, corr=0.9858973820022585, diff=0.009431569416499024 281 | folds=7, repeats=3, i=9, corr=0.9713070448511517, diff=0.009675288024442926 282 | folds=7, repeats=3, i=10, corr=0.9635099762903706, diff=0.006624511513525587 283 | Completed: 7 folds, 3 repeats | Avg Correlation: 0.9721, Avg Mean Diff: 0.0090 284 | folds=7, repeats=5, i=1, 
corr=0.9199378342268244, diff=0.008922989045383398 285 | folds=7, repeats=5, i=2, corr=0.9305605240628312, diff=0.009462680527610085 286 | folds=7, repeats=5, i=3, corr=0.9868414126704743, diff=0.008307379834562902 287 | folds=7, repeats=5, i=4, corr=0.9899484002488507, diff=0.008614039794321458 288 | folds=7, repeats=5, i=5, corr=0.976576733728466, diff=0.009443724569640015 289 | folds=7, repeats=5, i=6, corr=0.9693547315705672, diff=0.005900277218868722 290 | folds=7, repeats=5, i=7, corr=0.9909369574070983, diff=0.005307771070869582 291 | folds=7, repeats=5, i=8, corr=0.9892888145831531, diff=0.011010156494522676 292 | folds=7, repeats=5, i=9, corr=0.9627449539744223, diff=0.010943483120947896 293 | folds=7, repeats=5, i=10, corr=0.9658412890907523, diff=0.006972148446232913 294 | Completed: 7 folds, 5 repeats | Avg Correlation: 0.9682, Avg Mean Diff: 0.0085 295 | folds=7, repeats=10, i=1, corr=0.9930980639376968, diff=0.010301558238318903 296 | folds=7, repeats=10, i=2, corr=0.9920910968628622, diff=0.010058085177733074 297 | folds=7, repeats=10, i=3, corr=0.9874525701739706, diff=0.007801203890006742 298 | folds=7, repeats=10, i=4, corr=0.9854169402266026, diff=0.006881775095014487 299 | folds=7, repeats=10, i=5, corr=0.9771779515816831, diff=0.006837573217080281 300 | folds=7, repeats=10, i=6, corr=0.9900552860025492, diff=0.006621960652805747 301 | folds=7, repeats=10, i=7, corr=0.9578589198570598, diff=0.007519338251732596 302 | folds=7, repeats=10, i=8, corr=0.9031445620706875, diff=0.005411533646322324 303 | folds=7, repeats=10, i=9, corr=0.9938400095864596, diff=0.010214242119382887 304 | folds=7, repeats=10, i=10, corr=0.9788426400080306, diff=0.005239181757209895 305 | Completed: 7 folds, 10 repeats | Avg Correlation: 0.9759, Avg Mean Diff: 0.0077 306 | folds=10, repeats=1, i=1, corr=0.9748033758314912, diff=0.015320000000000049 307 | folds=10, repeats=1, i=2, corr=0.9187986740124284, diff=0.010219999999999991 308 | folds=10, repeats=1, i=3, corr=0.9759534155628019, diff=0.005019999999999986 309 | folds=10, repeats=1, i=4, corr=0.9404497799396768, diff=0.0062200000000000085 310 | folds=10, repeats=1, i=5, corr=0.9774883703984174, diff=0.011080000000000003 311 | folds=10, repeats=1, i=6, corr=0.9149008540654683, diff=0.016480000000000064 312 | folds=10, repeats=1, i=7, corr=0.8801191409994777, diff=0.006560000000000007 313 | folds=10, repeats=1, i=8, corr=0.9719343758671737, diff=0.012880000000000025 314 | folds=10, repeats=1, i=9, corr=0.9856023103580355, diff=0.0071200000000000195 315 | folds=10, repeats=1, i=10, corr=0.9556385726542705, diff=0.009180000000000035 316 | Completed: 10 folds, 1 repeats | Avg Correlation: 0.9496, Avg Mean Diff: 0.0100 317 | folds=10, repeats=3, i=1, corr=0.9818082957751972, diff=0.010593333333333219 318 | folds=10, repeats=3, i=2, corr=0.9314961162728049, diff=0.005919999999999988 319 | folds=10, repeats=3, i=3, corr=0.9127620228239297, diff=0.004453333333333281 320 | folds=10, repeats=3, i=4, corr=0.9810389657863505, diff=0.006766666666666591 321 | folds=10, repeats=3, i=5, corr=0.9918364430409649, diff=0.007466666666666555 322 | folds=10, repeats=3, i=6, corr=0.9395927744026121, diff=0.010773333333333244 323 | folds=10, repeats=3, i=7, corr=0.9885928388539656, diff=0.007373333333333213 324 | folds=10, repeats=3, i=8, corr=0.9036806490023385, diff=0.0070266666666665475 325 | folds=10, repeats=3, i=9, corr=0.9612958178459122, diff=0.008580000000000004 326 | folds=10, repeats=3, i=10, corr=0.9852138676111265, diff=0.006326666666666592 327 | 
Completed: 10 folds, 3 repeats | Avg Correlation: 0.9577, Avg Mean Diff: 0.0075 328 | folds=10, repeats=5, i=1, corr=0.9869203108957099, diff=0.008232000000000034 329 | folds=10, repeats=5, i=2, corr=0.989158809458989, diff=0.006472000000000011 330 | folds=10, repeats=5, i=3, corr=0.9468112866930535, diff=0.005743999999999982 331 | folds=10, repeats=5, i=4, corr=0.9555637452766133, diff=0.009019999999999985 332 | folds=10, repeats=5, i=5, corr=0.8867829384047247, diff=0.008167999999999996 333 | folds=10, repeats=5, i=6, corr=0.9388857673846408, diff=0.0054799999999999875 334 | folds=10, repeats=5, i=7, corr=0.9689970283669487, diff=0.005588000000000004 335 | folds=10, repeats=5, i=8, corr=0.980303607125597, diff=0.008103999999999974 336 | folds=10, repeats=5, i=9, corr=0.9873591803251196, diff=0.006583999999999992 337 | folds=10, repeats=5, i=10, corr=0.9543215764841885, diff=0.005707999999999978 338 | Completed: 10 folds, 5 repeats | Avg Correlation: 0.9595, Avg Mean Diff: 0.0069 339 | folds=10, repeats=10, i=1, corr=0.9895772309728519, diff=0.008010000000000012 340 | folds=10, repeats=10, i=2, corr=0.9371148265713927, diff=0.0056619999999999926 341 | folds=10, repeats=10, i=3, corr=0.9946765193420288, diff=0.007330000000000046 342 | folds=10, repeats=10, i=4, corr=0.9700972981521802, diff=0.006941999999999987 343 | folds=10, repeats=10, i=5, corr=0.9869957451350777, diff=0.009159999999999979 344 | folds=10, repeats=10, i=6, corr=0.943939573626107, diff=0.004751999999999973 345 | folds=10, repeats=10, i=7, corr=0.8887097918134683, diff=0.00497599999999995 346 | folds=10, repeats=10, i=8, corr=0.9915157230477832, diff=0.010015999999999867 347 | folds=10, repeats=10, i=9, corr=0.9869359272490404, diff=0.007609999999999974 348 | folds=10, repeats=10, i=10, corr=0.9840942879680609, diff=0.005351999999999967 349 | Completed: 10 folds, 10 repeats | Avg Correlation: 0.9674, Avg Mean Diff: 0.0070 350 | 351 | Final Results: 352 | 353 | folds repeats avg_correlation avg_mean_diff 354 | 0 3 1 0.921817 0.013012 355 | 1 3 3 0.959434 0.015273 356 | 2 3 5 0.967695 0.014267 357 | 3 3 10 0.967868 0.013540 358 | 4 5 1 0.908454 0.011474 359 | 5 5 3 0.957003 0.010035 360 | 6 5 5 0.968135 0.009392 361 | 7 5 10 0.986563 0.009226 362 | 8 7 1 0.943480 0.009331 363 | 9 7 3 0.972070 0.008997 364 | 10 7 5 0.968203 0.008488 365 | 11 7 10 0.975898 0.007689 366 | 12 10 1 0.949569 0.010008 367 | 13 10 3 0.957732 0.007528 368 | 14 10 5 0.959510 0.006910 369 | 15 10 10 0.967366 0.006981 370 | ``` 371 | 372 | ### Observations 373 | 374 | Plots of results: 375 | 376 | ![](/pics/test_harness_hacking_mitigation_study1.png) 377 | 378 | ![](/pics/test_harness_hacking_mitigation_study2.png) 379 | 380 | Here’s an analysis of the experiment results based on the provided data: 381 | 382 | #### 1. Trends in Average Correlation 383 | - **General Trend**: 384 | - As the number of repeats increases, the average correlation tends to improve for all fold values. 385 | - This indicates that more repeats lead to more stable and consistent results, likely due to better statistical reliability. 386 | 387 | - **Impact of Folds**: 388 | - For **3 folds**, the correlation starts high (0.92) and stabilizes around 0.96-0.97 with increasing repeats. 389 | - For **5 folds**, correlation is slightly lower initially (0.91) but improves significantly with more repeats, reaching a peak at 10 repeats (0.986). 390 | - For **7 folds**, the correlation starts higher (0.94), improves consistently, but peaks slightly lower than 5 folds (around 0.975-0.968). 
391 | - For **10 folds**, the correlation is generally high but improves more modestly compared to other fold values, peaking around 0.96-0.97. 392 | 393 | - **Key Observations**: 394 | - More folds combined with higher repeats generally provide better correlations. 395 | - 5 folds with 10 repeats show the highest correlation (0.986), suggesting it is an optimal balance. 396 | 397 | #### 2. Trends in Average Mean Difference 398 | - **General Trend**: 399 | - As the number of repeats increases, the average mean difference consistently decreases across all fold values. 400 | - This suggests that repeated experiments help to minimize variability and bring the mean difference closer to zero. 401 | 402 | - **Impact of Folds**: 403 | - For **3 folds**, the mean difference starts around 0.013 and gradually decreases with more repeats. 404 | - For **5 folds**, it starts lower (0.011) and decreases significantly to around 0.009 at 10 repeats. 405 | - For **7 folds**, the mean difference starts lower still (0.009) and shows the most dramatic improvement, dropping to 0.007 at 10 repeats. 406 | - For **10 folds**, the mean difference begins at 0.010 and also improves to 0.007 but shows diminishing returns with higher repeats. 407 | 408 | - **Key Observations**: 409 | - Higher fold values, such as 7 or 10 folds, generally produce lower mean differences, particularly when paired with a higher number of repeats. 410 | 411 | #### 3. Balancing Correlation and Mean Difference 412 | - **Trade-Offs**: 413 | - While 5 folds with 10 repeats yields the highest correlation (0.986), it does not produce the smallest mean difference. 414 | - 7 folds with 10 repeats achieves a slightly lower correlation (0.975) but has one of the smallest mean differences (0.007). 415 | 416 | - **Optimal Configuration**: 417 | - If correlation is prioritized, 5 folds and 10 repeats is optimal. 418 | - If minimizing mean difference is more important, 7 or 10 folds with 10 repeats might be preferable. 419 | 420 | #### 4. Recommendations for Future Experiments 421 | - **Choose Higher Repeats**: 422 | - Increasing the number of repeats provides diminishing returns beyond 10 but is generally effective at stabilizing results. 423 | 424 | - **Optimize Fold Selection**: 425 | - Depending on the metric of interest, 5 folds for correlation or 7 folds for mean difference are promising choices. 426 | 427 | - **Investigate Trade-Offs Further**: 428 | - Explore whether a compromise between correlation and mean difference exists, possibly at intermediate fold values (e.g., 6 or 8). 429 | 430 | 431 | 432 | -------------------------------------------------------------------------------- /examples/seed_hacking.md: -------------------------------------------------------------------------------- 1 | # Seed Hacking 2 | 3 | > Repeat an experiment with different random number seeds to get the best result. 4 | 5 | ## Description 6 | 7 | Recall that the **random number seed** is an integer that initializes the [pseudo random number generator](https://en.wikipedia.org/wiki/Random_number_generation) and influences the specific and repeatable sequence of random numbers that are generated. 8 | 9 | **Seed hacking** or **random seed shopping** or **seed optimization** is a problematic practice where practitioners manipulate random number seeds to artificially improve model performance metrics. 10 | 11 | The technique involves repeatedly running the same experiment (e.g. model, data split, etc.) 
with different random seeds until finding one that produces better-looking results. This is typically done during model validation or testing phases. 12 | 13 | While random seeds are important for reproducibility, exploiting them to [cherry-pick](https://en.wikipedia.org/wiki/Cherry_picking) favorable outcomes introduces severe bias. This practice masks the model's true performance and can lead to poor generalization in production. 14 | 15 | The key issue is that seed hacking violates the principle of independent validation. By selecting seeds based on outcomes, you're effectively leaking information from your test set into your model selection process. 16 | 17 | This practice is particularly dangerous for new data scientists because it can be tempting to use when under pressure to show improved metrics. However, it fundamentally undermines the scientific validity of your work. 18 | 19 | A more ethical approach is to use fixed random seeds for reproducibility, but to select them before seeing any results. This maintains experimental integrity while still allowing others to replicate your work. 20 | 21 | ## What Does a Seed-Hacked Result Mean? 22 | 23 | In a stochastic experiment, a single result is a point estimate of the unknown underlying distribution, such as the hold-out/test set prediction error. 24 | 25 | If we repeat the experiment and vary the randomness (e.g., by using different random seeds for data splits or model initialization) we obtain a distribution of estimates. Taking the mean, standard deviation, or confidence interval of this distribution gives us a more accurate and reliable understanding of the model's true performance. 26 | 27 | However, when we hack the seed to deliberately select the best possible result (e.g., lowest error or highest accuracy), we introduce [systematic bias](https://en.wikipedia.org/wiki/Observational_error). Rather than estimating the true mean of the performance distribution, we shift the estimate in a favorable direction. 28 | 29 | The result is no longer a fair or unbiased reflection of the model's performance but instead an overoptimistic artifact of the chosen randomness. This shift can be substantial and misrepresent the model's real-world generalizability. 30 | 31 | **Intentionally introducing a systematic bias by seed hacking is deceptive and misleading, perhaps fraud.** 32 | 33 | Here's a depiction of what is happening when we pick a seed hacked result: 34 | 35 | ![seed hacked result](/pics/seed_hacked_result.svg) 36 | 37 | ## Examples 38 | 39 | Below is a list of aspects of a data science project that could be subject to seed hacking: 40 | 41 | - **Data Splitting**: Splitting datasets into training, validation, and testing sets. Shuffling data during cross-validation evaluation. 42 | - **Resampling Techniques**: Bootstrapping or permutation tests. Creating synthetic datasets using resampling methods. 43 | - **Learning Algorithms**: Initializing weights in neural networks. Randomly selecting subsets of data for ensemble methods like Random Forest or Bagging. Stochastic gradient descent and related stochastic optimization methods. 44 | - **Hyperparameter Optimization**: Randomized search strategies for hyperparameter tuning. Distribution sampling search strategies like Bayesian Optimization. 45 | - **Data Augmentation**: Random transformations for data augmentation in image or text preprocessing. Generating synthetic data for privacy-preserving data sharing or experimentation. Simulating data with specific statistical properties. 
46 | - **Feature Engineering**: Randomized feature selection or subset selection algorithms. Creating stochastic embeddings, e.g., in t-SNE or UMAP. 47 | 48 | ### Worked Examples 49 | 50 | Some worked examples of seed hacking applied to specific aspects of a project: 51 | 52 | * [Seed Hacking Cross-Validation](seed_hacking_cross_validation.md): _Vary the seed for creating cross-validation folds in order to get the best result._ 53 | * [Seed Hacking the Train/Test Split](seed_hacking_train_test_split.md): _Vary the seed for creating train/test splits in order to get the best result._ 54 | * [Seed Hacking Learning Algorithm](seed_hacking_learning_algorithm.md): _Vary the seed for the model training algorithm in order to get the best result._ 55 | * [Seed Hack Bootstrap Performance](seed_hacking_bootstrap_performance.md): _Vary the seed for a bootstrap of a final chosen model on the test set to present the best performance._ 56 | 57 | More seed hacking examples for learning algorithms: [bagging](src/seed_hacking_bagging.py), [decision tree](src/seed_hacking_decision_tree.py), [gradient boosting](src/seed_hacking_gradient_boosting.py), [logistic regression](src/seed_hacking_logistic_regression.py), [multilayer perceptron](src/seed_hacking_multilayer_percepron.py), [perceptron](src/seed_hacking_perceptron.py), [random forest](src/seed_hacking_random_forest.py), [ridge classifier](src/seed_hacking_ridge_classifier.py), [sgd classifier](src/seed_hacking_sgd_classifier.py). 58 | 59 | ## Negative Seed Hacking 60 | 61 | How can we defend the choice of random number seed on a project? 62 | 63 | * Use a widely used default, e.g. 1 or 42 or 1234 or 1337. 64 | * Use the current date as an integer, e.g. DDMMYYYY. 65 | * Look at the clock and use the current minute and/or second value. 66 | * Roll die and use the number that comes up. 67 | 68 | Then record what you chose and how you chose it in your project log. 69 | 70 | ## Quantify the Variance 71 | 72 | Don't guess or ignore the variance, measure and report it. 73 | 74 | Perform a [sensitivity analysis](https://en.wikipedia.org/wiki/Sensitivity_analysis) aka stability/robustness study. 75 | 76 | This involves: 77 | 78 | 1. Hold everything in your setup (data + model) constant. 79 | 2. Pick one aspect of your setup that uses randomness. 80 | 3. Vary the randomness for that one aspect (e.g. 30+ runs each with a different seed). 81 | 4. Collect performance scores and report/analyze the distribution (best + worst, mean + stdev, median + confidence interval, etc.). 82 | 83 | For example: 84 | 85 | 1. **Hold the Model Constant and Vary the Data**: Use techniques like k-fold cross-validation (CV), repeated k-fold CV, or repeated train/test splits while keeping the model and its random initialization fixed. 86 | - Quantify how sensitive the model's performance is to variations in the training and test data splits. 87 | - This approach reveals variance caused by differences in the sampled training/test data and helps assess the model's robustness to data variability. 88 | 2. **Hold the Data Constant and Vary the Learning Algorithm**: Use a fixed dataset and vary only the random seed for the algorithm (e.g., random initialization of weights, dropout masks, or other stochastic elements). 89 | - Quantify how the inherent randomness in the learning process affects model performance. 90 | - This captures the variance caused by the stochastic nature of the optimization algorithm or training procedure. 91 | 3. 
**Vary Both the Data and the Learning Algorithm**: Randomize both the data (through k-fold CV or similar techniques) and the algorithm (through different seeds). 92 | - Assess the **total variance** in the learning process, encompassing both data variability and algorithm randomness. 93 | - This provides a holistic view of the overall variability in model performance. 94 | 95 | How much variance to expect? It really depends. 96 | 97 | - The variance due to data could be a few percent (e.g. 1-2%). 98 | - The variance due to learning algorithm could be a few tenths of a percent to a few percent (e.g. 0.2-0.4% or 1-2%). 99 | 100 | ### Reduce the Variance 101 | 102 | Variance is reduced by adding bias, we cannot escape the [Bias–variance tradeoff](https://en.wikipedia.org/wiki/Bias%E2%80%93variance_tradeoff). 103 | 104 | The techniques for reducing variance are typically specific to your setup, especially your model. 105 | 106 | Nevertheless, here are some ideas: 107 | 108 | 1. **Reducing Performance Variance Due to Data**. Variance from data arises because models are sensitive to the specific training or test samples provided. Strategies to mitigate this include: 109 | - Regularization: Penalize model complexity to prevent overfitting to specific data splits. 110 | - Use More Data: Larger datasets typically reduce variability by making training samples more representative of the underlying distribution. 111 | - Robust Models: Use algorithms known for robustness to outliers or data variability, such as tree-based methods (e.g., Random Forests, Gradient Boosting). 112 | - ... 113 | 2. **Reducing Performance Variance Due to the Learning Algorithm**. Variance from the learning algorithm stems from stochasticity in the optimization process, such as random initialization, batch sampling, or other internal randomness. Strategies to reduce this variance include: 114 | - Ensembles: Combine predictions from multiple models trained on the same data but with different initializations or configurations. 115 | - Repeated Training and Averaging: Train the model multiple times with different seeds and average the predictions for a more robust output (simplest ensemble). 116 | - Better Initialization: Use advanced initialization techniques, such as Xavier or He initialization, to reduce sensitivity to starting conditions. 117 | - Use Stable Optimizers: Certain optimizers, such as AdamW or SGD with carefully tuned learning rates, can provide more consistent convergence compared to others. 118 | - Longer Training with Early Stopping: Allow models more time to converge but use early stopping based on validation performance to avoid overfitting. 119 | - ... 120 | 3. **Reducing Overall Variance (Both Data and Algorithm)**. For a holistic reduction in variance, consider strategies that address both data and algorithm variability: 121 | - Use Cross-Validation: Perform k-fold cross-validation to average out performance over different data splits and initialization seeds. 122 | - Hybrid Ensembles: Combine models trained on different data subsets (bagging) with models using different algorithm configurations or seeds. 123 | 124 | For a best practice approach, combine strategies: 125 | 126 | - Regularize the model and preprocess data to reduce data-driven variance. 127 | - Use ensembles or repeated runs to reduce algorithm-driven variance. 128 | - Report distributions of performance metrics to transparently communicate variability. 129 | 130 | ## What About Large One-Off Models (e.g. neural nets)? 
131 | 132 | Some large deep learning neural networks can take days, weeks, or months to train, often at great expense. 133 | 134 | As such, typically only one model is trained. 135 | 136 | These models are sensitive to initial conditions e.g. initial random coefficients/weights. Additionally, the learning algorithm may be stochastic (e.g. shuffling of training samples, dropout, etc.). 137 | 138 | As such, the choice of random number seed influences the performance of the final model. 139 | 140 | In a (small) fast-to-train model, we might call this the variance in the performance of the model. In a (large) slow-to-train model that might take weeks to months to train, this could be the difference between a successful and unsuccessful project. 141 | 142 | For example: 143 | 144 | > Fine-tuning pretrained contextual word embedding models to supervised downstream tasks has become commonplace in natural language processing. This process, however, is often brittle: even with the same hyperparameter values, distinct random seeds can lead to substantially different results. 145 | 146 | -- [Fine-Tuning Pretrained Language Models: Weight Initializations, Data Orders, and Early Stopping](https://arxiv.org/abs/2002.06305), 2020. 147 | 148 | And: 149 | 150 | > However, deep neural network based models are often brittle to various sources of randomness in the training of the models. This could be attributed to several sources including, but not limited to, random parameter initialization, random sampling of examples during training and random dropping of neurons. It has been observed that these models have, more often, a set of random seeds that yield better results than others. This has also lead to research suggesting random seeds as an additional hyperparameter for tuning. 151 | 152 | -- [On Model Stability as a Function of Random Seed](https://arxiv.org/abs/1909.10447), 2019. 153 | 154 | What to do? 155 | 156 | It depends. Don't seed hack, but perhaps: 157 | 158 | * Can you ensemble a few model runs or model checkpoints together to reduce the variance? 159 | * Can you use early stopping and/or regularization during training to reduce the variance? 160 | 161 | > A common approach to creating neural network ensembles is to train the same architecture with different random seeds, and have the resulting models vote. 162 | 163 | -- [We need to talk about random seeds](https://arxiv.org/abs/2210.13393), 2022. 164 | 165 | One consolation is that a converged neural network model generally has a narrow distribution of performance across random seeds (as we might hope and expect). 166 | 167 | > What is the distribution of scores with respect to the choice of seed? The distribution of accuracy when varying seeds is relatively pointy, which means that results are fairly concentrated around the mean. Once the model converged, this distribution is relatively stable which means that some seed are intrinsically better than others. 168 | 169 | -- [Torch.manual_seed(3407) is all you need: On the influence of random seeds in deep learning architectures for computer vision](https://arxiv.org/abs/2109.08203), 2021. 170 | 171 | And: 172 | 173 | > Typically, the choice of random seed only has a slight effect on the result and can mostly be ignored in general or for most of the hyper-parameter search process. 174 | 175 | -- [Practical recommendations for gradient-based training of deep architectures](https://arxiv.org/abs/1206.5533), 2012. 
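To illustrate the ensembling idea raised above, here is a minimal sketch of a "seed ensemble": the same architecture trained several times with different random seeds, with the members voting via averaged predicted probabilities. A small scikit-learn `MLPClassifier` stands in for an expensive network, and the dataset, seed list, and model sizes are illustrative assumptions only.

```python
# Minimal sketch of a "seed ensemble": train the same architecture with
# several random seeds and average their predicted probabilities.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Illustrative dataset and split
X, y = make_classification(n_samples=500, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train one member per seed (a small MLP stands in for a large network)
seeds = [1, 2, 3, 4, 5]
members, scores = [], []
for seed in seeds:
    model = MLPClassifier(hidden_layer_sizes=(32,), max_iter=1000, random_state=seed)
    model.fit(X_train, y_train)
    members.append(model)
    scores.append(accuracy_score(y_test, model.predict(X_test)))

# Soft voting: average the member probabilities, then take the most likely class
avg_proba = np.mean([m.predict_proba(X_test) for m in members], axis=0)
ensemble_acc = accuracy_score(y_test, avg_proba.argmax(axis=1))

print(f"Per-seed accuracy: min={min(scores):.3f}, max={max(scores):.3f}")
print(f"Seed-ensemble accuracy: {ensemble_acc:.3f}")
```

The trade-off is extra training runs in exchange for lower variance: rather than hoping a single seed lands well, the ensemble averages over several.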
176 | 
177 | If you can perform multiple training runs for your neural network model, then you should, **with different random number seeds**.
178 | 
179 | This is called multiple-restart optimization; see below.
180 | 
181 | ## When is Seed Hacking Ethical?
182 | 
183 | Is there such a thing as ethical seed hacking (in machine learning/data science)?
184 | 
185 | YES!
186 | 
187 | Here are some softer rationales:
188 | 
189 | * Perhaps you want to make a point for a demonstration, presentation, course, tutorial, etc.?
190 | * Perhaps you require the best descriptive rather than predictive model?
191 | * Perhaps you want to find best/worst/distribution performance due to learning algorithm/initial condition variance (e.g. a sensitivity analysis)?
192 | 
193 | The best case for seed hacking is as a stochastic optimization strategy called "multiple-restarts":
194 | 
195 | * Some learning algorithms are solving a really hard (e.g. non-convex/discontinuous/deceptive/multimodal/etc.) optimization problem, and random restarts from different initial conditions in the search space are in fact a beneficial approach.
196 | 
197 | ### Multiple-Restart Optimization
198 | 
199 | The multiple-restart strategy is a technique used to address the challenges of solving harder optimization problems, particularly non-convex ones with multiple local minima, saddle points, or other complex structures.
200 | 
201 | By running the optimization process multiple times with different initial conditions or random seeds, this approach increases the likelihood of exploring diverse regions of the solution space and finding better optima.
202 | 
203 | > Heuristic search procedures that aspire to find global optimal solutions to hard combinatorial optimization problems usually require some type of diversification to overcome local optimality. One way to achieve diversification is to re-start the procedure from a new solution once a region has been explored.
204 | 
205 | -- [Chapter 12: Multi-Start Methods](https://link.springer.com/chapter/10.1007/0-306-48056-5_12), Handbook of Metaheuristics, 2003.
206 | 
207 | It is especially beneficial for algorithms that are sensitive to initialization, such as neural networks, clustering methods (e.g., K-Means), or stochastic optimization algorithms.
208 | 
209 | While multi-restart offers significant advantages for non-convex and multimodal problems, it provides little to no benefit for convex optimization problems, where the global minimum is guaranteed regardless of the starting point.
210 | 
211 | The strategy effectively balances computational cost with solution quality in scenarios where optimality cannot be guaranteed in a single run.
212 | 
213 | Below is a table of common machine learning algorithms, the type of optimization problem they are solving (e.g.
convex or non-convex), whether they are sensitive to initial conditions, and whether they will benefit from multiple restarts: 214 | 215 | | Algorithm | Problem Type | Sensitivity | Multi-Restart Benefit | 216 | |--------------------------|-----------------|-------------|-----------------------| 217 | | Linear Regression | Convex | None | None | 218 | | Logistic Regression | Convex | Minimal | Minimal | 219 | | K-Means | Non-convex | High | High | 220 | | t-SNE | Non-convex | High | High | 221 | | Neural Networks | Non-convex | High | High | 222 | | Random Forests | Non-convex | Low | Low to Moderate | 223 | | SVC | Convex | None | None | 224 | | PCA | Convex | None | None | 225 | 226 | 227 | As such, we may see what looks like seed hacking in the context of deep learning / reinforcement learning work, which may in fact be examples of a multiple-restart optimization. 228 | 229 | The problem is, how do you tell the difference? 230 | 231 | ### Seed Hacking vs Multiple-Restarts 232 | 233 | Differentiating between a legitimate multi-restart optimization strategy and "seed hacking" (cherry-picking the best result) requires careful scrutiny of how the results are reported and interpreted. 234 | 235 | Below are the characteristics of **legitimate multi-restart optimization**: 236 | 237 | 1. **Disclosure of Multi-Restart Process**: Clearly states that a multi-restart strategy was employed and describes the number of restarts, initialization strategy, and hyperparameters. 238 | 2. **Performance Distribution Reporting**: Reports the distribution of performance metrics across restarts, including mean, median, standard deviation, and possibly full histograms or box plots. This allows readers to assess the stability of the algorithm and whether the best result is an outlier or representative of typical performance. 239 | 3. **Procedure Replication:** If the "best result" is highlighted, it contextualizes this by repeating the entire multi-restart procedure multiple times and reporting the distribution of "best-of-restart" scores. This provides confidence that the approach is not a one-off fluke. 240 | 4. **Statistical Robustness:** Includes statistical tests to verify whether improvements from the best restart are statistically significant compared to baselines or other algorithms. 241 | 5. **Sensitivity Analysis:** Reports how sensitive the algorithm is to random initialization, demonstrating whether consistent performance can be expected or if results are highly variable. 242 | 243 | Conversely, below are the characteristics of **seed hacking with multi-restart optimization**: 244 | 245 | 1. **Single Point Estimate:** Reports only the best result without contextualizing it within the broader distribution of outcomes across restarts. This ignores variability and may cherry-pick an optimistic outlier. 246 | 2. **Non-Disclosure of Multi-Restart:** Fails to disclose that multiple restarts or seeds were used. This gives the impression that the reported result comes from a single unbiased run. 247 | 3. **Absence of Distribution Information:** Does not provide statistics (e.g., mean, standard deviation, quantiles) of performance across restarts. This lacks transparency on how consistently high-quality solutions are found. 248 | 4. **Selective Comparisons:** Compares the "best restart" of one algorithm with the "average performance" of another algorithm or baseline, creating unfair comparisons. 249 | 250 | 251 | ## FAQ 252 | 253 | I get a lot of questions about "how to pick the best seed". 
Some of the answers below may help. 254 | 255 | 256 | **Q. Is the random seed a hyperparameter?** 257 | 258 | Yes. 259 | 260 | It is a hyperparameter (to the model, to the test harness, etc.) that we should set, but not one we should optimize. 261 | 262 | **Q. What random number seed should I use?** 263 | 264 | No one cares. Use "1" or "42" or the current date in DDMMYYYY format. 265 | 266 | Even better, don't use one seed, use many and report a result distribution. 267 | 268 | **Q. What seed should I use for my final chosen model fit on all training data?** 269 | 270 | The same seed you used to evaluate candidate models on your test harness. 271 | 272 | Or, fit a suite of final models with different seeds (e.g. 30) and use them all in an ensemble to make predictions on new data. This will average out the variance in the learning algorithm. 273 | 274 | **Q. My model shows a large variance with different random number seeds, what should I do?** 275 | 276 | Add bias. 277 | 278 | * Perhaps increase training epochs, tree depth, etc. 279 | * Perhaps use regularization to reduce model variance. 280 | * Perhaps adjust hyperparameters to reduce model variance. 281 | * Perhaps use repeated evaluations (e.g. repeated k-fold cross-validation or repeated train/test splits) and report a performance distribution instead of a point estimate. 282 | 283 | **Q. What about a machine learning competition, like Kaggle?** 284 | 285 | Nope, or probably not. 286 | 287 | Your model must generalize to unseen data (e.g. the hidden test set) and optimizing for the public test set will likely (almost certainly) result in overfitting. 288 | 289 | **Q. Surely picking the model random number seed that gives the best cross-validation score is a good idea?** 290 | 291 | Nope, or probably not. 292 | 293 | It is likely that the difference in each distribution of CV scores is the same (e.g. check using a statistical hypothesis test, quantify using an effect size) and that any differences you are seeing are misleading. 294 | 295 | If there are differences, your model may have a variance that is a little too high for the given quantity of training data. Add some bias (see above). Or the model is fragile/overfit the hold out test set of your test harness and will not generalize well to changes (e.g. changes to the data, changes to the model). 296 | 297 | **Q. Okay, if I have to choose between two models, each fit with a different seed, I should choose the one with the better performance, right?** 298 | 299 | Nope, or probably not. See above. 300 | 301 | **Q. Can we seed hack (grid search the seed) for a model within nested k-fold cross-validation?** 302 | 303 | Oh man... I guess you could. 304 | 305 | Again, I suspect that in most cases, any difference between model performance distributions with a fixed vs optimized seed will not be statistically significant. 306 | 307 | If it is different, perhaps use methods to reduce model variance as discussed above. 308 | 309 | If it's for an algorithm with a non-convex optimization problem, think about what this means. It means that one initial condition performs "better" across different subsets of train/test data. Maybe it's true, it's probably not. 310 | 311 | **Q. How do I know if my seed hacked result is optimistically biased or a better solution to a hard optimization problem?** 312 | 313 | Now that is a good question! 
314 | 315 | If we know a lot about the model and its optimization procedure, we might be able to draw a logical conclusion because of the underlying optimization problem the learning algorithm is solving (e.g. convex vs non-convex and to what degree it is sensitive to initial conditions or stochastic behaviors during the search). 316 | 317 | For example: 318 | 319 | - Did the change in seed permit the optimization algorithm locate a superior solution in the search space (if so, can you confirm with statistical tests)? 320 | 321 | Empirically, you can sample results for a ton of seeds and see where you sit on the distribution. All that tells you is what result percentile you might be in, not whether the solution is brittle. 322 | 323 | This is really hard and an "it depends" is the best I can manage. 324 | 325 | See the sections above on "Multiple-Restart Optimization" and "Seed Hacking vs Multiple-Restarts". 326 | 327 | ## Further Reading 328 | 329 | Sometimes it helps to read how others are thinking through this issue: 330 | 331 | ### Papers 332 | 333 | * [Fine-Tuning Pretrained Language Models: Weight Initializations, Data Orders, and Early Stopping](https://arxiv.org/abs/2002.06305), 2020. 334 | * [Multi-Start Methods](https://link.springer.com/chapter/10.1007/0-306-48056-5_12), Handbook of Metaheuristics, 2003. 335 | * [On Model Stability as a Function of Random Seed](https://arxiv.org/abs/1909.10447), 2019. 336 | * [Practical recommendations for gradient-based training of deep architectures](https://arxiv.org/abs/1206.5533), 2012. 337 | * [Pseudo-random Number Generator Influences on Average Treatment Effect Estimates Obtained with Machine Learning](https://pubmed.ncbi.nlm.nih.gov/39150879/), 2024. 338 | * [Torch.manual_seed(3407) is all you need: On the influence of random seeds in deep learning architectures for computer vision](https://arxiv.org/abs/2109.08203), 2021. 339 | * [We need to talk about random seeds](https://arxiv.org/abs/2210.13393), 2022. 340 | 341 | ### Blog Posts 342 | 343 | * [Are random seeds hyperparameters?](https://andrewcharlesjones.github.io/journal/random-seed-hyperparameter.html) 344 | * [Manipulating machine learning results with random state](https://towardsdatascience.com/manipulating-machine-learning-results-with-random-state-2a6f49b31081) 345 | * [Optimizing the Random Seed](https://towardsdatascience.com/optimizing-the-random-seed-99a90bd272e) 346 | 347 | ### Discussion 348 | 349 | Lots of people struggling with choosing/optimizing the random seed out there in the wild. Not enough background in statistics/stochastic optimization IMHO, but that's okay. 
350 | 351 | * [Am I creating bias by using the same random seed over and over?](https://stats.stackexchange.com/questions/80407/am-i-creating-bias-by-using-the-same-random-seed-over-and-over) 352 | * [Choosing the "Correct" Seed for Reproducible Research/Results](https://stats.stackexchange.com/questions/335936/choosing-the-correct-seed-for-reproducible-research-results) 353 | * [Data folks of Reddit: How do you choose a random seed?](https://www.reddit.com/r/datascience/comments/17kxd5s/data_folks_of_reddit_how_do_you_choose_a_random/) 354 | * [Do Deep Learning/Machine Learning papers use a fixed seed to report their results?](https://www.reddit.com/r/MachineLearning/comments/fbl9ho/discussion_do_deep_learningmachine_learning/) 355 | * [How to choose the random seed?](https://datascience.stackexchange.com/questions/35869/how-to-choose-the-random-seed) 356 | * [How to deal with random parameters in MLOps](https://stats.stackexchange.com/questions/564045/how-to-deal-with-random-parameters-in-mlops) 357 | * [If so many people use set.seed(123) doesn't that affect randomness of world's reporting?](https://stats.stackexchange.com/questions/205961/if-so-many-people-use-set-seed123-doesnt-that-affect-randomness-of-worlds-re) 358 | * [Is it 'fair' to set a seed in a random forest regression to yield the highest accuracy?](https://stats.stackexchange.com/questions/341610/is-it-fair-to-set-a-seed-in-a-random-forest-regression-to-yield-the-highest-ac/) 359 | * [Is random seed a hyper-parameter to tune in training deep neural network?](https://stats.stackexchange.com/questions/478193/is-random-seed-a-hyper-parameter-to-tune-in-training-deep-neural-network) 360 | * [Is random state a parameter to tune?](https://stats.stackexchange.com/questions/263999/is-random-state-a-parameter-to-tune) 361 | * [Neural network hyperparameter tuning - is setting random seed a good idea?](https://stackoverflow.com/questions/65704588/neural-network-hyperparameter-tuning-is-setting-random-seed-a-good-idea) 362 | * [Optimization of hyperparameters and seed](https://www.reddit.com/r/reinforcementlearning/comments/ptsbvb/optimization_of_hyperparameters_and_seed/) 363 | * [Performance of Ridge and Lasso Regression depend on set.seed?](https://stats.stackexchange.com/questions/355256/performance-of-ridge-and-lasso-regression-depend-on-set-seed) 364 | * [Why is it valid to use CV to set parameters and hyperparameters but not seeds?](https://stats.stackexchange.com/questions/341619/why-is-it-valid-to-use-cv-to-set-parameters-and-hyperparameters-but-not-seeds) 365 | * [XGBoost - "Optimizing Random Seed"](https://stats.stackexchange.com/questions/273230/xgboost-optimizing-random-seed) 366 | 367 | 368 | 369 | --------------------------------------------------------------------------------