├── .gitignore
├── pics
│   ├── cover.png
│   ├── cover_cropped.png
│   ├── test_harness_hacking.png
│   ├── test_harness_hacking_mitigation_study1.png
│   ├── test_harness_hacking_mitigation_study2.png
│   ├── test_harness_hacking_hill_climbing_performance.png
│   ├── logo.svg
│   └── seed_hacked_result.svg
├── examples
│   ├── src
│   │   ├── test_set_memorization.py
│   │   ├── seed_hacking_learning_algorithm.py
│   │   ├── seed_hacking_cross_validation.py
│   │   ├── seed_hacking_train_test_split.py
│   │   ├── leaderboard_hacking.py
│   │   ├── test_set_pruning.py
│   │   ├── precision_hacking.py
│   │   ├── p_hacking_selective_sampling.py
│   │   ├── p_hacking_feature_selection.py
│   │   ├── train_test_ratio_gaming.py
│   │   ├── p_hacking_feature_transforms.py
│   │   ├── p_hacking_learning_algorithm.py
│   │   ├── seed_hacking_bootstrap_performance.py
│   │   ├── test_set_overfitting.py
│   │   ├── seed_hacking_perceptron.py
│   │   ├── seed_hacking_sgd_classifier.py
│   │   ├── seed_hacking_ridge_classifier.py
│   │   ├── seed_hacking_decision_tree.py
│   │   ├── seed_hacking_random_forest.py
│   │   ├── seed_hacking_logistic_regression.py
│   │   ├── seed_hacking_gradient_boosting.py
│   │   ├── seed_hacking_multilayer_percepron.py
│   │   ├── seed_hacking_bagging.py
│   │   ├── test_harness_hacking_hill_climbing_test_folds.py
│   │   ├── test_harness_hacking_hill_climbing_performance.py
│   │   └── test_harness_hacking_mitigation.py
│   ├── test_set_memorization.md
│   ├── seed_hacking_learning_algorithm.md
│   ├── seed_hacking_cross_validation.md
│   ├── seed_hacking_train_test_split.md
│   ├── p_hacking_selective_sampling.md
│   ├── p_hacking_feature_selection.md
│   ├── p_hacking_learning_algorithm.md
│   ├── leaderboard_hacking.md
│   ├── test_set_pruning.md
│   ├── test_set_overfitting.md
│   ├── train_test_ratio_gaming.md
│   ├── test_harness_hacking_hill_climbing_test_folds.md
│   ├── test_harness_hacking_hill_climbing_performance.md
│   ├── seed_hacking_bootstrap_performance.md
│   ├── threshold_hacking.md
│   ├── test_harness_hacking.md
│   ├── p_hacking.md
│   ├── test_harness_hacking_mitigation.md
│   └── seed_hacking.md
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | dev
2 | .DS_Store
3 |
--------------------------------------------------------------------------------
/pics/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jason2Brownlee/MachineLearningMischief/HEAD/pics/cover.png
--------------------------------------------------------------------------------
/pics/cover_cropped.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jason2Brownlee/MachineLearningMischief/HEAD/pics/cover_cropped.png
--------------------------------------------------------------------------------
/pics/test_harness_hacking.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jason2Brownlee/MachineLearningMischief/HEAD/pics/test_harness_hacking.png
--------------------------------------------------------------------------------
/pics/test_harness_hacking_mitigation_study1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jason2Brownlee/MachineLearningMischief/HEAD/pics/test_harness_hacking_mitigation_study1.png
--------------------------------------------------------------------------------
/pics/test_harness_hacking_mitigation_study2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jason2Brownlee/MachineLearningMischief/HEAD/pics/test_harness_hacking_mitigation_study2.png
--------------------------------------------------------------------------------
/pics/test_harness_hacking_hill_climbing_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jason2Brownlee/MachineLearningMischief/HEAD/pics/test_harness_hacking_hill_climbing_performance.png
--------------------------------------------------------------------------------
/examples/src/test_set_memorization.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | from sklearn.datasets import make_classification # For generating synthetic dataset
3 | from sklearn.model_selection import train_test_split # For splitting the dataset
4 | from sklearn.neighbors import KNeighborsClassifier # For K-Nearest Neighbors classifier
5 | from sklearn.metrics import accuracy_score # For evaluating model performance
6 |
7 | # Generate a synthetic classification dataset
8 | X, y = make_classification(n_samples=100, n_features=10, n_classes=2, random_state=42)
9 |
10 | # Split the dataset into train and test sets
11 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
12 |
13 | # Create a K-Nearest Neighbors (KNN) classifier with k=1
14 | knn = KNeighborsClassifier(n_neighbors=1)
15 |
16 | # Fit the model on the test set (intentional test set leakage)
17 | knn.fit(X_test, y_test)
18 |
19 | # Evaluate the model on the test set
20 | y_pred = knn.predict(X_test)
21 |
22 | # Report the perfect score
23 | print("Accuracy:", accuracy_score(y_test, y_pred))
24 |
--------------------------------------------------------------------------------
/pics/logo.svg:
--------------------------------------------------------------------------------
1 |
33 |
--------------------------------------------------------------------------------
/examples/src/seed_hacking_learning_algorithm.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.datasets import make_classification
3 | from sklearn.ensemble import RandomForestClassifier
4 | from sklearn.model_selection import cross_val_score, KFold
5 |
6 | # Define the number of trials
7 | num_trials = 100
8 |
9 | # Define variables to track the best seed and best performance
10 | best_seed = None
11 | best_performance = -np.inf
12 |
13 | # Create a synthetic classification dataset
14 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
15 |
16 | # Fix the cross-validation folds for all evaluations
17 | kf = KFold(n_splits=5, shuffle=True, random_state=42)
18 |
19 | # Iterate over multiple seeds for the model's randomness
20 | for trial in range(num_trials):
21 | # Set the seed for the random forest model
22 | seed = trial
23 |
24 | # Initialize the model with the current seed
25 | model = RandomForestClassifier(n_estimators=50, random_state=seed)
26 |
27 | # Evaluate the model using cross-validation
28 | scores = cross_val_score(model, X, y, cv=kf)
29 |
30 | # Calculate the mean performance
31 | mean_performance = scores.mean()
32 |
33 | # Print the seed and performance if there is an improvement
34 | if mean_performance > best_performance:
35 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}")
36 | best_performance = mean_performance
37 | best_seed = seed
38 |
39 | # Report the best seed and its performance
40 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}")
41 |
--------------------------------------------------------------------------------
/examples/src/seed_hacking_cross_validation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.datasets import make_classification
3 | from sklearn.ensemble import RandomForestClassifier
4 | from sklearn.model_selection import cross_val_score, KFold
5 |
6 | # Define the number of trials
7 | num_trials = 100
8 |
9 | # Define variables to track the best fold configuration and best performance
10 | best_fold_seed = None
11 | best_performance = -np.inf
12 |
13 | # Create a synthetic classification dataset
14 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
15 |
16 | # Initialize the model with a fixed seed
17 | model = RandomForestClassifier(n_estimators=50, random_state=42)
18 |
19 | # Iterate over multiple seeds to vary the k-fold cross-validation splits
20 | for trial in range(num_trials):
21 | # Set the seed for the k-fold shuffle
22 | fold_seed = trial
23 |
24 | # Initialize k-fold cross-validation with the current seed
25 | kf = KFold(n_splits=5, shuffle=True, random_state=fold_seed)
26 |
27 | # Evaluate the model using cross-validation
28 | scores = cross_val_score(model, X, y, cv=kf)
29 |
30 | # Calculate the mean performance
31 | mean_performance = scores.mean()
32 |
33 | # Print the fold seed and performance if there is an improvement
34 | if mean_performance > best_performance:
35 | print(f"Fold Seed: {fold_seed}, Performance: {mean_performance:.4f}")
36 | best_performance = mean_performance
37 | best_fold_seed = fold_seed
38 |
39 | # Report the best fold seed and its performance
40 | print(f"\nBest Fold Seed: {best_fold_seed}, Best Performance: {best_performance:.4f}")
41 |
--------------------------------------------------------------------------------
/examples/src/seed_hacking_train_test_split.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.datasets import make_classification
3 | from sklearn.ensemble import RandomForestClassifier
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.metrics import accuracy_score
6 |
7 | # Define the number of trials
8 | num_trials = 100
9 |
10 | # Define variables to track the best seed and best performance
11 | best_seed = None
12 | best_performance = -np.inf
13 |
14 | # Create a synthetic classification dataset
15 | X, y = make_classification(n_samples=100, n_features=20, n_informative=15, n_redundant=5, random_state=42)
16 |
17 | # Initialize the model with a fixed seed
18 | model = RandomForestClassifier(random_state=42)
19 |
20 | # Iterate over multiple seeds to vary the train/test split
21 | for trial in range(num_trials):
22 | # Set the seed for train/test split
23 | split_seed = trial
24 |
25 | # Create a train/test split with the current seed
26 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=split_seed)
27 |
28 | # Train the model on the training data
29 | model.fit(X_train, y_train)
30 |
31 | # Evaluate the model on the test data
32 | y_pred = model.predict(X_test)
33 | performance = accuracy_score(y_test, y_pred)
34 |
35 | # Print the split seed and performance if there is an improvement
36 | if performance > best_performance:
37 | print(f"Split Seed: {split_seed}, Performance: {performance:.4f}")
38 | best_performance = performance
39 | best_seed = split_seed
40 |
41 | # Report the best split seed and its performance
42 | print(f"\nBest Split Seed: {best_seed}, Best Performance: {best_performance:.4f}")
--------------------------------------------------------------------------------
/examples/src/leaderboard_hacking.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.datasets import make_classification
3 | from sklearn.model_selection import train_test_split
4 | from sklearn.metrics import accuracy_score
5 |
6 | # Generate a synthetic classification dataset
7 | X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
8 |
9 | # Split into training and test sets
10 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
11 |
12 | # Artificial "leaderboard" to evaluate predictions on the test set
13 | def leaderboard_evaluation(predictions, true_labels):
14 | """Calculate the leaderboard score (accuracy in this case)."""
15 | return accuracy_score(true_labels, predictions)
16 |
17 | # Initialize random predictions for the test set
18 | best_predictions = np.random.randint(0, 2, size=len(y_test))
19 | best_score = leaderboard_evaluation(best_predictions, y_test)
20 |
21 | # Stochastic hill climber: adjust predictions iteratively
22 | max_trials = 10000 # Maximum number of trials
23 | for trial in range(max_trials):
24 | # Copy the best predictions and randomly flip one value
25 | new_predictions = best_predictions.copy()
26 | index_to_flip = np.random.randint(len(new_predictions))
27 | new_predictions[index_to_flip] = 1 - new_predictions[index_to_flip] # Flip the prediction
28 |
29 | # Evaluate the new predictions
30 | new_score = leaderboard_evaluation(new_predictions, y_test)
31 |
32 | # If the new score is better, adopt the new predictions
33 | if new_score > best_score:
34 | best_predictions = new_predictions
35 | best_score = new_score
36 |
37 | # Report progress
38 | print(f"Trial {trial + 1}/{max_trials}: Leaderboard Score = {new_score:.4f}, Best Score = {best_score:.4f}")
39 |
40 | # Stop if perfect score is achieved
41 | if best_score == 1.0:
42 | print("Perfect score achieved!")
43 | break
--------------------------------------------------------------------------------
/examples/src/test_set_pruning.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | from sklearn.datasets import make_classification
3 | from sklearn.model_selection import train_test_split
4 | from sklearn.ensemble import RandomForestClassifier
5 | from sklearn.metrics import accuracy_score
6 | import numpy as np
7 |
8 | # Generate a synthetic classification dataset
9 | X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42)
10 |
11 | # Split the dataset into training and testing sets
12 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
13 |
14 | # Initialize a Random Forest classifier
15 | model = RandomForestClassifier(random_state=42)
16 |
17 | # Train the model on the training set
18 | model.fit(X_train, y_train)
19 |
20 | # Predict on the test set
21 | y_pred = model.predict(X_test)
22 |
23 | # Calculate the initial accuracy
24 | initial_accuracy = accuracy_score(y_test, y_pred)
25 | print(f"Initial Test Accuracy: {initial_accuracy}")
26 |
27 | # Iteratively remove one misclassified example per iteration
28 | X_test_pruned = X_test
29 | y_test_pruned = y_test
30 | while True:
31 | # Predict on the pruned test set
32 | y_pred_pruned = model.predict(X_test_pruned)
33 |
34 | # Identify indices of misclassified samples
35 | misclassified_indices = np.where(y_pred_pruned != y_test_pruned)[0]
36 |
37 | # Break if no misclassified samples remain
38 | if len(misclassified_indices) == 0:
39 | break
40 |
41 | # Remove one misclassified sample
42 | index_to_remove = misclassified_indices[0] # Select the first misclassified sample
43 | X_test_pruned = np.delete(X_test_pruned, index_to_remove, axis=0)
44 | y_test_pruned = np.delete(y_test_pruned, index_to_remove, axis=0)
45 |
46 | # Recalculate accuracy on the pruned test set
47 | pruned_accuracy = accuracy_score(y_test_pruned, model.predict(X_test_pruned))
48 | print(f"Pruned Test Accuracy: {pruned_accuracy}")
49 |
--------------------------------------------------------------------------------
/examples/src/precision_hacking.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | import numpy as np
3 | from sklearn.datasets import make_classification
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.linear_model import LogisticRegression
6 | from sklearn.metrics import precision_score
7 |
8 | # Generate a synthetic classification dataset
9 | X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
10 | n_redundant=5, random_state=42)
11 |
12 | # Split the dataset into train and test sets
13 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
14 |
15 | # Initialize the logistic regression model
16 | model = LogisticRegression(random_state=42, max_iter=1000)
17 |
18 | # Train the model on the training set
19 | model.fit(X_train, y_train)
20 |
21 | # Get raw predicted probabilities for the positive class
22 | y_proba = model.predict_proba(X_test)[:, 1]
23 |
24 | # Define a range of thresholds to evaluate
25 | thresholds = np.linspace(0.1, 0.9, 81)
26 |
27 | # Track best precision score and corresponding threshold
28 | best_precision = 0
29 | best_threshold = 0
30 |
31 | # Iterate over each threshold
32 | print("Threshold Tuning Progress:")
33 | print(f"{'Threshold':<10}{'Precision':<10}{'Best Precision':<15}{'Best Threshold':<15}")
34 | for threshold in thresholds:
35 | # Convert probabilities to binary predictions based on the current threshold
36 | y_pred = (y_proba >= threshold).astype(int)
37 |
38 | # Calculate precision score
39 | precision = precision_score(y_test, y_pred)
40 |
41 | # Check if this is the best precision score so far
42 | if precision > best_precision:
43 | best_precision = precision
44 | best_threshold = threshold
45 |
46 | # Report progress
47 | print(f"{threshold:<10.2f}{precision:<10.2f}{best_precision:<15.2f}{best_threshold:<15.2f}")
48 |
49 | # Final best score and threshold
50 | print("\nFinal Results:")
51 | print(f"Best Precision: {best_precision:.2f}")
52 | print(f"Best Threshold: {best_threshold:.2f}")
--------------------------------------------------------------------------------
/examples/src/p_hacking_selective_sampling.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | import numpy as np
3 | from sklearn.datasets import make_classification
4 | from sklearn.model_selection import cross_val_score, StratifiedKFold
5 | from sklearn.linear_model import LogisticRegression
6 | from scipy.stats import ttest_ind
7 |
8 | # Generate a synthetic classification dataset
9 | X, y = make_classification(n_samples=500, n_features=10, n_informative=5, n_redundant=5, random_state=42)
10 |
11 | # Define a classifier
12 | model = LogisticRegression(random_state=42, max_iter=1000)
13 |
14 | # Define a k-fold cross-validation strategy with a fixed seed
15 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
16 |
17 | # Evaluate the model on the full dataset using k-fold cross-validation
18 | baseline_scores = cross_val_score(model, X, y, cv=kfold)
19 | baseline_mean = np.mean(baseline_scores)
20 | print(f'Base result: {baseline_mean:.3f}')
21 |
22 | # Set up parameters for p-hacking
23 | p_threshold = 0.05 # Threshold for statistical significance
24 | max_trials = 1000 # Maximum number of sampling strategies to test
25 | sample_size = int(0.5 * X.shape[0])
26 |
27 | # Perform selective sampling and evaluate subsets
28 | for trial in range(max_trials):
29 | # Randomly select a subset of samples
30 | np.random.seed(trial + 1)
31 | sample_indices = np.random.choice(range(X.shape[0]), size=sample_size, replace=False)
32 | X_subset, y_subset = X[sample_indices], y[sample_indices]
33 |
34 | # Evaluate the model on the sampled subset using cross-validation
35 | trial_scores = cross_val_score(model, X_subset, y_subset, cv=kfold)
36 | trial_mean = np.mean(trial_scores)
37 | better = trial_mean > baseline_mean
38 |
39 | # Perform a t-test to compare means
40 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores)
41 | significant = p_value < p_threshold
42 |
43 | # Report progress
44 | print(f'{trial+1}, Result: {trial_mean:.3f}, Better: {better}, p-value: {p_value:.3f} Significant: {significant}')
45 |
46 | # Stop if better and significant
47 | if better and significant:
48 | break
49 |
--------------------------------------------------------------------------------
/pics/seed_hacked_result.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/src/p_hacking_feature_selection.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | import numpy as np
3 | from sklearn.datasets import make_classification
4 | from sklearn.model_selection import cross_val_score, StratifiedKFold
5 | from sklearn.ensemble import RandomForestClassifier
6 | from scipy.stats import ttest_ind
7 |
8 | # Generate a synthetic classification dataset
9 | X, y = make_classification(n_samples=500, n_features=10, n_informative=2, n_redundant=8, random_state=42)
10 |
11 | # Define a classifier
12 | model = RandomForestClassifier(n_estimators=10, random_state=42)
13 |
14 | # Define a k-fold cross-validation strategy with a fixed seed
15 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
16 |
17 | # Evaluate the model on the full dataset using k-fold cross-validation
18 | baseline_scores = cross_val_score(model, X, y, cv=kfold)
19 | baseline_mean = np.mean(baseline_scores)
20 | print(f'Base result: {baseline_mean:.3f}')
21 |
22 | # Set up parameters for p-hacking
23 | p_threshold = 0.05 # Threshold for statistical significance
24 | max_trials = 1000 # Maximum number of feature subsets to test
25 | num_features = X.shape[1]
26 |
27 | # Perform selective feature subset selection and evaluation
28 | for trial in range(max_trials):
29 | # Randomly select a subset of features
30 | np.random.seed(trial + 1)
31 | selected_features = np.random.choice(range(num_features), size=np.random.randint(1, num_features + 1), replace=False)
32 | X_subset = X[:, selected_features]
33 |
34 | # Evaluate the model on the selected feature subset using cross-validation
35 | trial_scores = cross_val_score(model, X_subset, y, cv=kfold)
36 | trial_mean = np.mean(trial_scores)
37 | better = trial_mean > baseline_mean
38 |
39 | # Perform a t-test to compare means
40 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores)
41 | significant = p_value < p_threshold
42 |
43 | # Report progress
44 | print(f'{trial+1}, Features: {selected_features}, Result: {trial_mean:.3f}, Better: {better}, p-value: {p_value:.3f}, Significant: {significant}')
45 |
46 | # Stop if better and significant
47 | if better and significant:
48 | print("P-hacked subset identified!")
49 | break
50 |
--------------------------------------------------------------------------------
/examples/src/train_test_ratio_gaming.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | import numpy as np
3 | import pandas as pd
4 | from sklearn.datasets import make_classification
5 | from sklearn.ensemble import RandomForestClassifier
6 | from sklearn.model_selection import train_test_split
7 | from sklearn.metrics import accuracy_score
8 |
9 | # Generate a synthetic classification dataset
10 | X, y = make_classification(
11 | n_samples=1000, # Number of samples
12 | n_features=20, # Number of features
13 | n_informative=15, # Number of informative features
14 | n_redundant=5, # Number of redundant features
15 | random_state=42 # Fixing random state for reproducibility
16 | )
17 |
18 | # Fix random seed for consistent train/test splits
19 | random_seed = 42
20 |
21 | # Initialize a variable to track the best test performance and associated split ratio
22 | best_accuracy = 0
23 | best_ratio = 0
24 |
25 | # Iterate over train/test split ratios from 50% to 99% in 1% increments
26 | for train_size in range(50, 100): # Split ratios vary from 50% to 99%
27 | test_size = 100 - train_size # Calculate corresponding test size
28 |
29 | # Split the dataset into train and test sets
30 | X_train, X_test, y_train, y_test = train_test_split(
31 | X, y,
32 | train_size=train_size / 100.0, # Convert train_size to percentage
33 | random_state=random_seed # Fix the random seed
34 | )
35 |
36 | # Initialize a Random Forest Classifier
37 | model = RandomForestClassifier(random_state=random_seed)
38 |
39 | # Train the model on the training data
40 | model.fit(X_train, y_train)
41 |
42 | # Predict on the test set
43 | y_pred = model.predict(X_test)
44 |
45 | # Evaluate test performance using accuracy
46 | accuracy = accuracy_score(y_test, y_pred)
47 |
48 | # Report progress
49 | print(f'> {train_size}/{test_size}: {accuracy}')
50 |
51 | # Update the best accuracy and split ratio if current accuracy is better
52 | if accuracy > best_accuracy:
53 | best_accuracy = accuracy
54 | best_ratio = train_size
55 |
56 | # Print the best train/test split ratio and corresponding accuracy
57 | print(f"Best train/test split ratio: {best_ratio}/{100 - best_ratio}")
58 | print(f"Best test accuracy: {best_accuracy}")
--------------------------------------------------------------------------------
/examples/src/p_hacking_feature_transforms.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | import numpy as np
3 | import pandas as pd
4 | from sklearn.datasets import make_classification
5 | from sklearn.model_selection import cross_val_score, StratifiedKFold
6 | from sklearn.linear_model import LogisticRegression
7 | from scipy.stats import ttest_ind
8 |
9 | # Generate a synthetic classification dataset
10 | X, y = make_classification(n_samples=350, n_features=10, n_informative=2, n_redundant=8, random_state=42)
11 |
12 | # Define a high-capacity machine learning model
13 | model = LogisticRegression(max_iter=1000, random_state=42)
14 |
15 | # Define a k-fold cross-validation strategy with a fixed seed
16 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
17 |
18 | # Evaluate the model on the dataset using k-fold cross-validation (baseline without transformations)
19 | baseline_scores = cross_val_score(model, X, y, cv=kfold)
20 | baseline_mean = np.mean(baseline_scores)
21 |
22 | # Set up parameters for p-hacking
23 | p_threshold = 0.05 # Threshold for statistical significance
24 | transformations = ["none", "log", "sqrt", "square"] # Possible transformations to test
25 | significant_result_found = False
26 |
27 | # Loop through trials with different feature transformations
28 | for transform in transformations:
29 | # Apply the selected transformation to the features
30 | if transform == "log":
31 | X_transformed = np.log(np.abs(X) + 1) # Avoid log(0) or negative numbers
32 | elif transform == "sqrt":
33 | X_transformed = np.sqrt(np.abs(X)) # Avoid sqrt of negative numbers
34 | elif transform == "square":
35 | X_transformed = X ** 2
36 | else: # "none"
37 | X_transformed = X
38 |
39 | # Evaluate the model with k-fold cross-validation on transformed features
40 | trial_scores = cross_val_score(model, X_transformed, y, cv=kfold)
41 | trial_mean = np.mean(trial_scores)
42 |
43 | # Perform a t-test to compare means
44 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores)
45 | significant = p_value < p_threshold
46 |
47 | # Report progress
48 | print(f'Transform: {transform} mean: {trial_mean:.3f} (base: {baseline_mean:.3f}), p-value: {p_value:.3f}')
49 | if significant:
50 | print('\tSignificant difference')
51 |
52 |
--------------------------------------------------------------------------------
/examples/src/p_hacking_learning_algorithm.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | import numpy as np
3 | import pandas as pd
4 | from sklearn.datasets import make_classification
5 | from sklearn.model_selection import cross_val_score, StratifiedKFold
6 | from sklearn.ensemble import RandomForestClassifier
7 | from scipy.stats import ttest_ind
8 |
9 | # Generate a synthetic classification dataset
10 | X, y = make_classification(n_samples=350, n_features=10, n_informative=2, n_redundant=8, random_state=42)
11 |
12 | # Define a high-capacity machine learning model
13 | model = RandomForestClassifier(n_estimators=10, random_state=42)
14 |
15 | # Define a k-fold cross-validation strategy with a fixed seed
16 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
17 |
18 | # Evaluate the model on the dataset using k-fold cross-validation
19 | baseline_scores = cross_val_score(model, X, y, cv=kfold)
20 | baseline_mean = np.mean(baseline_scores)
21 |
22 | # Set up parameters for p-hacking
23 | p_threshold = 0.05 # Threshold for statistical significance
24 | max_trials = 1000 # Maximum number of trials to perform
25 | significant_result_found = False
26 |
27 | # Loop through trials with different random seeds
28 | for trial in range(max_trials):
29 | # Use a new random seed for the model
30 | seed = trial + 100
31 | model = RandomForestClassifier(n_estimators=10, random_state=seed)
32 |
33 | # Evaluate the model with k-fold cross-validation
34 | trial_scores = cross_val_score(model, X, y, cv=kfold)
35 | trial_mean = np.mean(trial_scores)
36 |
37 | # Perform a t-test to compare means
38 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores)
39 |
40 | # Check if the p-value is below the significance threshold
41 | if p_value < p_threshold:
42 | significant_result_found = True
43 | print(f"Significant difference found on trial {trial+1}")
44 | print(f"Baseline mean: {baseline_mean:.4f}, Trial mean: {trial_mean:.4f}, p-value: {p_value:.4f}")
45 | break
46 | else:
47 | print(f"No significant difference found yet, trial {trial+1}, p-value: {p_value:.4f}")
48 |
49 | # Report if no significant result was found within the maximum trials
50 | if not significant_result_found:
51 | print("No significant result found after maximum trials.")
--------------------------------------------------------------------------------
/examples/src/seed_hacking_bootstrap_performance.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | from sklearn.datasets import make_classification
3 | from sklearn.model_selection import train_test_split
4 | from sklearn.ensemble import RandomForestClassifier
5 | from sklearn.utils import resample
6 | from sklearn.metrics import accuracy_score
7 | import numpy as np
8 |
9 | # Generate a synthetic classification dataset
10 | X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
11 |
12 | # Split the dataset into a training set and a test set
13 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
14 |
15 | # Initialize the random forest classifier
16 | model = RandomForestClassifier(random_state=42)
17 |
18 | # Train the model on the training set
19 | model.fit(X_train, y_train)
20 |
21 | # Number of bootstrap iterations
22 | num_bootstrap_iterations = 50
23 |
24 | # Number of repetitions for each bootstrap sample
25 | num_repeats_per_sample = 10
26 |
27 | # Variable to track the best accuracy and corresponding seed
28 | best_accuracy = 0
29 | best_seed = None
30 |
31 | # Iterate through multiple random seeds for bootstrap sampling
32 | for seed in range(num_bootstrap_iterations):
33 | # List to store accuracy scores for each repeat
34 | repeat_accuracies = []
35 |
36 | # Evaluate the model on the same bootstrap sample multiple times
37 | for _ in range(num_repeats_per_sample):
38 | # Generate a bootstrap sample of the test set
39 | X_test_bootstrap, y_test_bootstrap = resample(X_test, y_test, random_state=seed)
40 | y_pred = model.predict(X_test_bootstrap)
41 | accuracy = accuracy_score(y_test_bootstrap, y_pred)
42 | repeat_accuracies.append(accuracy)
43 |
44 | # Compute the median accuracy for the current bootstrap sample
45 | median_accuracy = np.median(repeat_accuracies)
46 |
47 | # Report progress
48 | print(f'> Seed={seed}, Median Accuracy: {median_accuracy}')
49 |
50 | # Keep track of the best performance and its corresponding seed
51 | if median_accuracy > best_accuracy:
52 | best_accuracy = median_accuracy
53 | best_seed = seed
54 |
55 | # Print the selected seed with the best accuracy (artificially chosen for presentation)
56 | print(f"Best Seed: {best_seed}, Best Median Accuracy: {best_accuracy}")
--------------------------------------------------------------------------------
/examples/src/test_set_overfitting.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | from sklearn.datasets import make_classification # For generating a synthetic classification dataset
3 | from sklearn.model_selection import train_test_split # For splitting the dataset
4 | from sklearn.ensemble import RandomForestClassifier # High-capacity model
5 | from sklearn.metrics import accuracy_score # For model evaluation
6 | from itertools import product # For generating all combinations of hyperparameters
7 |
8 | # Generate a synthetic classification dataset
9 | X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42)
10 |
11 | # Split the dataset into training and testing sets
12 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
13 |
14 | # Define possible values for hyperparameters
15 | n_estimators_options = [10, 50, 100, 200]
16 | max_depth_options = [5, 10, 15, 20]
17 |
18 | # Generate all combinations of hyperparameters
19 | configurations = list(product(n_estimators_options, max_depth_options))
20 |
21 | # Dictionary to store test set performance for each configuration
22 | test_set_performance = {}
23 |
24 | # Variable to track the best configuration so far
25 | best_config_so_far = None
26 | best_accuracy_so_far = 0
27 |
28 | # Loop through each configuration
29 | for n_estimators, max_depth in configurations:
30 | # Create the model with the current configuration
31 | model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
32 |
33 | # Fit the model on the training set
34 | model.fit(X_train, y_train)
35 |
36 | # Evaluate the model on the test set
37 | y_pred = model.predict(X_test)
38 | accuracy = accuracy_score(y_test, y_pred)
39 |
40 | # Store the performance on the test set
41 | test_set_performance[f"n_estimators={n_estimators}, max_depth={max_depth}"] = accuracy
42 |
43 | # Update and display progress
44 | if accuracy > best_accuracy_so_far:
45 | best_config_so_far = (n_estimators, max_depth)
46 | best_accuracy_so_far = accuracy
47 | print(f"cfg: n_estimators={n_estimators}, max_depth={max_depth}, Accuracy: {accuracy:.4f} " + f"(Best: {best_accuracy_so_far:.4f})")
48 |
49 | # Print the final best configuration and its test set accuracy
50 | print(f"Final Best Configuration: n_estimators={best_config_so_far[0]}, max_depth={best_config_so_far[1]}, Test Set Accuracy: {best_accuracy_so_far:.4f}")
51 |
--------------------------------------------------------------------------------
/examples/test_set_memorization.md:
--------------------------------------------------------------------------------
1 | # Test Set Memorization
2 |
3 | > Allow the model to memorize the test set and get a perfect score.
4 |
5 | ## Description
6 |
7 | Test set memorization is one of the most dangerous and deceptive mistakes in machine learning model development.
8 |
9 | This problem occurs when a model is accidentally or intentionally allowed to train on data that should be reserved for testing. The result appears amazing at first - the model achieves near-perfect accuracy scores. But these scores are completely meaningless.
10 |
11 | In reality, the model hasn't learned to generalize at all. It has simply memorized the correct answers for your specific test cases. When deployed to production with real-world data, this model will perform terribly because it never actually learned the underlying patterns.
12 |
13 | This issue commonly arises through data leakage, where test data inadvertently bleeds into the training process through improper data handling or preprocessing steps.
14 |
15 | For new data scientists, this can be especially problematic because the impressive metrics can mask fundamental problems with the model's ability to generalize.
16 |
17 | To avoid this problem, always maintain strict separation between training and test data throughout the entire machine learning pipeline.
18 |
19 |
20 |
21 |
22 | ## Example
23 |
24 | ```python
25 | # Import necessary libraries
26 | from sklearn.datasets import make_classification # For generating synthetic dataset
27 | from sklearn.model_selection import train_test_split # For splitting the dataset
28 | from sklearn.neighbors import KNeighborsClassifier # For K-Nearest Neighbors classifier
29 | from sklearn.metrics import accuracy_score # For evaluating model performance
30 |
31 | # Generate a synthetic classification dataset
32 | X, y = make_classification(n_samples=100, n_features=10, n_classes=2, random_state=42)
33 |
34 | # Split the dataset into train and test sets
35 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
36 |
37 | # Create a K-Nearest Neighbors (KNN) classifier with k=1
38 | knn = KNeighborsClassifier(n_neighbors=1)
39 |
40 | # Fit the model on the test set (intentional test set leakage)
41 | knn.fit(X_test, y_test)
42 |
43 | # Evaluate the model on the test set
44 | y_pred = knn.predict(X_test)
45 |
46 | # Report the perfect score
47 | print("Accuracy:", accuracy_score(y_test, y_pred))
48 | ```
49 |
50 | Example Output:
51 |
52 | ```text
53 | Accuracy: 1.0
54 | ```
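55 |
56 | For contrast, a minimal sketch of the correct, leak-free workflow on the same data: fit the model on the training set only and evaluate it on the held-out test set.
57 |
58 | ```python
59 | # Import necessary libraries
60 | from sklearn.datasets import make_classification
61 | from sklearn.model_selection import train_test_split
62 | from sklearn.neighbors import KNeighborsClassifier
63 | from sklearn.metrics import accuracy_score
64 |
65 | # Generate the same synthetic classification dataset
66 | X, y = make_classification(n_samples=100, n_features=10, n_classes=2, random_state=42)
67 |
68 | # Split the dataset into train and test sets
69 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
70 |
71 | # Fit the model on the training set only
72 | knn = KNeighborsClassifier(n_neighbors=1)
73 | knn.fit(X_train, y_train)
74 |
75 | # Evaluate on the untouched test set
76 | y_pred = knn.predict(X_test)
77 | print("Accuracy:", accuracy_score(y_test, y_pred))
78 | ```
79 |
80 | With this separation, the reported accuracy reflects generalization to unseen examples rather than memorized answers.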
--------------------------------------------------------------------------------
/examples/src/seed_hacking_perceptron.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.datasets import make_classification
4 | from sklearn.linear_model import Perceptron
5 | from sklearn.model_selection import cross_val_score, KFold
6 | from statistics import mean, median, stdev
7 |
8 | # Define the number of trials
9 | num_trials = 100
10 |
11 | # Define variables to track the best seed and best performance
12 | best_seed = None
13 | best_performance = -np.inf
14 | performance_scores = [] # List to store performance scores
15 |
16 | # Create a synthetic classification dataset
17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
18 |
19 | # Fix the cross-validation folds for all evaluations
20 | kf = KFold(n_splits=5, shuffle=True, random_state=42)
21 |
22 | # Iterate over multiple seeds for the model's randomness
23 | for trial in range(num_trials):
24 |     # Set the seed for the perceptron model
25 | seed = trial
26 |
27 | # Initialize the model with the current seed
28 | model = Perceptron(random_state=seed)
29 |
30 | # Evaluate the model using cross-validation
31 | scores = cross_val_score(model, X, y, cv=kf)
32 |
33 | # Calculate the mean performance
34 | mean_performance = scores.mean()
35 | performance_scores.append(mean_performance)
36 |
37 | # Print the seed and performance if there is an improvement
38 | if mean_performance > best_performance:
39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}")
40 | best_performance = mean_performance
41 | best_seed = seed
42 |
43 | # Report the best seed and its performance
44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}")
45 |
46 | # Calculate statistics
47 | min_score = min(performance_scores)
48 | max_score = max(performance_scores)
49 | median_score = median(performance_scores)
50 | mean_score = mean(performance_scores)
51 | std_dev_score = stdev(performance_scores)
52 |
53 | print("\nPerformance Statistics:")
54 | print(f"Minimum: {min_score:.4f}")
55 | print(f"Median: {median_score:.4f}")
56 | print(f"Maximum: {max_score:.4f}")
57 | print(f"Mean: {mean_score:.4f}")
58 | print(f"Standard Deviation: {std_dev_score:.4f}")
59 |
60 | # Plot the distribution of performance scores
61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7)
62 | plt.title('Distribution of Performance Scores')
63 | plt.xlabel('Performance Score')
64 | plt.ylabel('Frequency')
65 | plt.grid(axis='y', linestyle='--', alpha=0.7)
66 | plt.show()
67 |
--------------------------------------------------------------------------------
/examples/src/seed_hacking_sgd_classifier.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.datasets import make_classification
4 | from sklearn.linear_model import SGDClassifier
5 | from sklearn.model_selection import cross_val_score, KFold
6 | from statistics import mean, median, stdev
7 |
8 | # Define the number of trials
9 | num_trials = 100
10 |
11 | # Define variables to track the best seed and best performance
12 | best_seed = None
13 | best_performance = -np.inf
14 | performance_scores = [] # List to store performance scores
15 |
16 | # Create a synthetic classification dataset
17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
18 |
19 | # Fix the cross-validation folds for all evaluations
20 | kf = KFold(n_splits=5, shuffle=True, random_state=42)
21 |
22 | # Iterate over multiple seeds for the model's randomness
23 | for trial in range(num_trials):
24 |     # Set the seed for the SGD classifier
25 | seed = trial
26 |
27 | # Initialize the model with the current seed
28 | model = SGDClassifier(random_state=seed)
29 |
30 | # Evaluate the model using cross-validation
31 | scores = cross_val_score(model, X, y, cv=kf)
32 |
33 | # Calculate the mean performance
34 | mean_performance = scores.mean()
35 | performance_scores.append(mean_performance)
36 |
37 | # Print the seed and performance if there is an improvement
38 | if mean_performance > best_performance:
39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}")
40 | best_performance = mean_performance
41 | best_seed = seed
42 |
43 | # Report the best seed and its performance
44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}")
45 |
46 | # Calculate statistics
47 | min_score = min(performance_scores)
48 | max_score = max(performance_scores)
49 | median_score = median(performance_scores)
50 | mean_score = mean(performance_scores)
51 | std_dev_score = stdev(performance_scores)
52 |
53 | print("\nPerformance Statistics:")
54 | print(f"Minimum: {min_score:.4f}")
55 | print(f"Median: {median_score:.4f}")
56 | print(f"Maximum: {max_score:.4f}")
57 | print(f"Mean: {mean_score:.4f}")
58 | print(f"Standard Deviation: {std_dev_score:.4f}")
59 |
60 | # Plot the distribution of performance scores
61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7)
62 | plt.title('Distribution of Performance Scores')
63 | plt.xlabel('Performance Score')
64 | plt.ylabel('Frequency')
65 | plt.grid(axis='y', linestyle='--', alpha=0.7)
66 | plt.show()
67 |
--------------------------------------------------------------------------------
/examples/src/seed_hacking_ridge_classifier.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.datasets import make_classification
4 | from sklearn.linear_model import RidgeClassifier
5 | from sklearn.model_selection import cross_val_score, KFold
6 | from statistics import mean, median, stdev
7 |
8 | # Define the number of trials
9 | num_trials = 100
10 |
11 | # Define variables to track the best seed and best performance
12 | best_seed = None
13 | best_performance = -np.inf
14 | performance_scores = [] # List to store performance scores
15 |
16 | # Create a synthetic classification dataset
17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
18 |
19 | # Fix the cross-validation folds for all evaluations
20 | kf = KFold(n_splits=5, shuffle=True, random_state=42)
21 |
22 | # Iterate over multiple seeds for the model's randomness
23 | for trial in range(num_trials):
24 |     # Set the seed for the ridge classifier
25 | seed = trial
26 |
27 | # Initialize the model with the current seed
28 | model = RidgeClassifier(random_state=seed)
29 |
30 | # Evaluate the model using cross-validation
31 | scores = cross_val_score(model, X, y, cv=kf)
32 |
33 | # Calculate the mean performance
34 | mean_performance = scores.mean()
35 | performance_scores.append(mean_performance)
36 |
37 | # Print the seed and performance if there is an improvement
38 | if mean_performance > best_performance:
39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}")
40 | best_performance = mean_performance
41 | best_seed = seed
42 |
43 | # Report the best seed and its performance
44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}")
45 |
46 | # Calculate statistics
47 | min_score = min(performance_scores)
48 | max_score = max(performance_scores)
49 | median_score = median(performance_scores)
50 | mean_score = mean(performance_scores)
51 | std_dev_score = stdev(performance_scores)
52 |
53 | print("\nPerformance Statistics:")
54 | print(f"Minimum: {min_score:.4f}")
55 | print(f"Median: {median_score:.4f}")
56 | print(f"Maximum: {max_score:.4f}")
57 | print(f"Mean: {mean_score:.4f}")
58 | print(f"Standard Deviation: {std_dev_score:.4f}")
59 |
60 | # Plot the distribution of performance scores
61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7)
62 | plt.title('Distribution of Performance Scores')
63 | plt.xlabel('Performance Score')
64 | plt.ylabel('Frequency')
65 | plt.grid(axis='y', linestyle='--', alpha=0.7)
66 | plt.show()
67 |
--------------------------------------------------------------------------------
/examples/src/seed_hacking_decision_tree.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.datasets import make_classification
4 | from sklearn.tree import DecisionTreeClassifier
5 | from sklearn.model_selection import cross_val_score, KFold
6 | from statistics import mean, median, stdev
7 |
8 | # Define the number of trials
9 | num_trials = 100
10 |
11 | # Define variables to track the best seed and best performance
12 | best_seed = None
13 | best_performance = -np.inf
14 | performance_scores = [] # List to store performance scores
15 |
16 | # Create a synthetic classification dataset
17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
18 |
19 | # Fix the cross-validation folds for all evaluations
20 | kf = KFold(n_splits=5, shuffle=True, random_state=42)
21 |
22 | # Iterate over multiple seeds for the model's randomness
23 | for trial in range(num_trials):
24 | # Set the seed for the decision tree classifier
25 | seed = trial
26 |
27 | # Initialize the model with the current seed
28 | model = DecisionTreeClassifier(random_state=seed)
29 |
30 | # Evaluate the model using cross-validation
31 | scores = cross_val_score(model, X, y, cv=kf)
32 |
33 | # Calculate the mean performance
34 | mean_performance = scores.mean()
35 | performance_scores.append(mean_performance)
36 |
37 | # Print the seed and performance if there is an improvement
38 | if mean_performance > best_performance:
39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}")
40 | best_performance = mean_performance
41 | best_seed = seed
42 |
43 | # Report the best seed and its performance
44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}")
45 |
46 | # Calculate statistics
47 | min_score = min(performance_scores)
48 | max_score = max(performance_scores)
49 | median_score = median(performance_scores)
50 | mean_score = mean(performance_scores)
51 | std_dev_score = stdev(performance_scores)
52 |
53 | print("\nPerformance Statistics:")
54 | print(f"Minimum: {min_score:.4f}")
55 | print(f"Median: {median_score:.4f}")
56 | print(f"Maximum: {max_score:.4f}")
57 | print(f"Mean: {mean_score:.4f}")
58 | print(f"Standard Deviation: {std_dev_score:.4f}")
59 |
60 | # Plot the distribution of performance scores
61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7)
62 | plt.title('Distribution of Performance Scores')
63 | plt.xlabel('Performance Score')
64 | plt.ylabel('Frequency')
65 | plt.grid(axis='y', linestyle='--', alpha=0.7)
66 | plt.show()
67 |
--------------------------------------------------------------------------------
/examples/src/seed_hacking_random_forest.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.datasets import make_classification
4 | from sklearn.ensemble import RandomForestClassifier
5 | from sklearn.model_selection import cross_val_score, KFold
6 | from statistics import mean, median, stdev
7 |
8 | # Define the number of trials
9 | num_trials = 100
10 |
11 | # Define variables to track the best seed and best performance
12 | best_seed = None
13 | best_performance = -np.inf
14 | performance_scores = [] # List to store performance scores
15 |
16 | # Create a synthetic classification dataset
17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
18 |
19 | # Fix the cross-validation folds for all evaluations
20 | kf = KFold(n_splits=5, shuffle=True, random_state=42)
21 |
22 | # Iterate over multiple seeds for the model's randomness
23 | for trial in range(num_trials):
24 | # Set the seed for the random forest model
25 | seed = trial
26 |
27 | # Initialize the model with the current seed
28 | model = RandomForestClassifier(n_estimators=50, random_state=seed)
29 |
30 | # Evaluate the model using cross-validation
31 | scores = cross_val_score(model, X, y, cv=kf)
32 |
33 | # Calculate the mean performance
34 | mean_performance = scores.mean()
35 | performance_scores.append(mean_performance)
36 |
37 | # Print the seed and performance if there is an improvement
38 | if mean_performance > best_performance:
39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}")
40 | best_performance = mean_performance
41 | best_seed = seed
42 |
43 | # Report the best seed and its performance
44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}")
45 |
46 | # Calculate statistics
47 | min_score = min(performance_scores)
48 | max_score = max(performance_scores)
49 | median_score = median(performance_scores)
50 | mean_score = mean(performance_scores)
51 | std_dev_score = stdev(performance_scores)
52 |
53 | print("\nPerformance Statistics:")
54 | print(f"Minimum: {min_score:.4f}")
55 | print(f"Median: {median_score:.4f}")
56 | print(f"Maximum: {max_score:.4f}")
57 | print(f"Mean: {mean_score:.4f}")
58 | print(f"Standard Deviation: {std_dev_score:.4f}")
59 |
60 | # Plot the distribution of performance scores
61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7)
62 | plt.title('Distribution of Performance Scores')
63 | plt.xlabel('Performance Score')
64 | plt.ylabel('Frequency')
65 | plt.grid(axis='y', linestyle='--', alpha=0.7)
66 | plt.show()
67 |
--------------------------------------------------------------------------------
/examples/src/seed_hacking_logistic_regression.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.datasets import make_classification
4 | from sklearn.linear_model import LogisticRegression
5 | from sklearn.model_selection import cross_val_score, KFold
6 | from statistics import mean, median, stdev
7 |
8 | # Define the number of trials
9 | num_trials = 100
10 |
11 | # Define variables to track the best seed and best performance
12 | best_seed = None
13 | best_performance = -np.inf
14 | performance_scores = [] # List to store performance scores
15 |
16 | # Create a synthetic classification dataset
17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
18 |
19 | # Fix the cross-validation folds for all evaluations
20 | kf = KFold(n_splits=5, shuffle=True, random_state=42)
21 |
22 | # Iterate over multiple seeds for the model's randomness
23 | for trial in range(num_trials):
24 | # Set the seed for the logistic regression model
25 | seed = trial
26 |
27 | # Initialize the model with the current seed
28 | model = LogisticRegression(random_state=seed, max_iter=1000)
29 |
30 | # Evaluate the model using cross-validation
31 | scores = cross_val_score(model, X, y, cv=kf)
32 |
33 | # Calculate the mean performance
34 | mean_performance = scores.mean()
35 | performance_scores.append(mean_performance)
36 |
37 | # Print the seed and performance if there is an improvement
38 | if mean_performance > best_performance:
39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}")
40 | best_performance = mean_performance
41 | best_seed = seed
42 |
43 | # Report the best seed and its performance
44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}")
45 |
46 | # Calculate statistics
47 | min_score = min(performance_scores)
48 | max_score = max(performance_scores)
49 | median_score = median(performance_scores)
50 | mean_score = mean(performance_scores)
51 | std_dev_score = stdev(performance_scores)
52 |
53 | print("\nPerformance Statistics:")
54 | print(f"Minimum: {min_score:.4f}")
55 | print(f"Median: {median_score:.4f}")
56 | print(f"Maximum: {max_score:.4f}")
57 | print(f"Mean: {mean_score:.4f}")
58 | print(f"Standard Deviation: {std_dev_score:.4f}")
59 |
60 | # Plot the distribution of performance scores
61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7)
62 | plt.title('Distribution of Performance Scores')
63 | plt.xlabel('Performance Score')
64 | plt.ylabel('Frequency')
65 | plt.grid(axis='y', linestyle='--', alpha=0.7)
66 | plt.show()
67 |
--------------------------------------------------------------------------------
/examples/src/seed_hacking_gradient_boosting.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.datasets import make_classification
4 | from sklearn.ensemble import GradientBoostingClassifier
5 | from sklearn.model_selection import cross_val_score, KFold
6 | from statistics import mean, median, stdev
7 |
8 | # Define the number of trials
9 | num_trials = 100
10 |
11 | # Define variables to track the best seed and best performance
12 | best_seed = None
13 | best_performance = -np.inf
14 | performance_scores = [] # List to store performance scores
15 |
16 | # Create a synthetic classification dataset
17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
18 |
19 | # Fix the cross-validation folds for all evaluations
20 | kf = KFold(n_splits=5, shuffle=True, random_state=42)
21 |
22 | # Iterate over multiple seeds for the model's randomness
23 | for trial in range(num_trials):
24 |     # Set the seed for the gradient boosting classifier
25 | seed = trial
26 |
27 | # Initialize the model with the current seed
28 | model = GradientBoostingClassifier(n_estimators=50, random_state=seed)
29 |
30 | # Evaluate the model using cross-validation
31 | scores = cross_val_score(model, X, y, cv=kf)
32 |
33 | # Calculate the mean performance
34 | mean_performance = scores.mean()
35 | performance_scores.append(mean_performance)
36 |
37 | # Print the seed and performance if there is an improvement
38 | if mean_performance > best_performance:
39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}")
40 | best_performance = mean_performance
41 | best_seed = seed
42 |
43 | # Report the best seed and its performance
44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}")
45 |
46 | # Calculate statistics
47 | min_score = min(performance_scores)
48 | max_score = max(performance_scores)
49 | median_score = median(performance_scores)
50 | mean_score = mean(performance_scores)
51 | std_dev_score = stdev(performance_scores)
52 |
53 | print("\nPerformance Statistics:")
54 | print(f"Minimum: {min_score:.4f}")
55 | print(f"Median: {median_score:.4f}")
56 | print(f"Maximum: {max_score:.4f}")
57 | print(f"Mean: {mean_score:.4f}")
58 | print(f"Standard Deviation: {std_dev_score:.4f}")
59 |
60 | # Plot the distribution of performance scores
61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7)
62 | plt.title('Distribution of Performance Scores')
63 | plt.xlabel('Performance Score')
64 | plt.ylabel('Frequency')
65 | plt.grid(axis='y', linestyle='--', alpha=0.7)
66 | plt.show()
67 |
--------------------------------------------------------------------------------
/examples/src/seed_hacking_multilayer_percepron.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.datasets import make_classification
4 | from sklearn.neural_network import MLPClassifier
5 | from sklearn.model_selection import cross_val_score, KFold
6 | from statistics import mean, median, stdev
7 |
8 | # Define the number of trials
9 | num_trials = 100
10 |
11 | # Define variables to track the best seed and best performance
12 | best_seed = None
13 | best_performance = -np.inf
14 | performance_scores = [] # List to store performance scores
15 |
16 | # Create a synthetic classification dataset
17 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
18 |
19 | # Fix the cross-validation folds for all evaluations
20 | kf = KFold(n_splits=5, shuffle=True, random_state=42)
21 |
22 | # Iterate over multiple seeds for the model's randomness
23 | for trial in range(num_trials):
24 |     # Set the seed for the multilayer perceptron model
25 | seed = trial
26 |
27 | # Initialize the model with the current seed
28 | model = MLPClassifier(hidden_layer_sizes=(50,10), max_iter=100, random_state=seed)
29 |
30 | # Evaluate the model using cross-validation
31 | scores = cross_val_score(model, X, y, cv=kf)
32 |
33 | # Calculate the mean performance
34 | mean_performance = scores.mean()
35 | performance_scores.append(mean_performance)
36 |
37 | # Print the seed and performance if there is an improvement
38 | if mean_performance > best_performance:
39 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}")
40 | best_performance = mean_performance
41 | best_seed = seed
42 |
43 | # Report the best seed and its performance
44 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}")
45 |
46 | # Calculate statistics
47 | min_score = min(performance_scores)
48 | max_score = max(performance_scores)
49 | median_score = median(performance_scores)
50 | mean_score = mean(performance_scores)
51 | std_dev_score = stdev(performance_scores)
52 |
53 | print("\nPerformance Statistics:")
54 | print(f"Minimum: {min_score:.4f}")
55 | print(f"Median: {median_score:.4f}")
56 | print(f"Maximum: {max_score:.4f}")
57 | print(f"Mean: {mean_score:.4f}")
58 | print(f"Standard Deviation: {std_dev_score:.4f}")
59 |
60 | # Plot the distribution of performance scores
61 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7)
62 | plt.title('Distribution of Performance Scores')
63 | plt.xlabel('Performance Score')
64 | plt.ylabel('Frequency')
65 | plt.grid(axis='y', linestyle='--', alpha=0.7)
66 | plt.show()
67 |
--------------------------------------------------------------------------------
/examples/src/seed_hacking_bagging.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.datasets import make_classification
4 | from sklearn.ensemble import BaggingClassifier
5 | from sklearn.tree import DecisionTreeClassifier
6 | from sklearn.model_selection import cross_val_score, KFold
7 | from statistics import mean, median, stdev
8 |
9 | # Define the number of trials
10 | num_trials = 100
11 |
12 | # Define variables to track the best seed and best performance
13 | best_seed = None
14 | best_performance = -np.inf
15 | performance_scores = [] # List to store performance scores
16 |
17 | # Create a synthetic classification dataset
18 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
19 |
20 | # Fix the cross-validation folds for all evaluations
21 | kf = KFold(n_splits=5, shuffle=True, random_state=42)
22 |
23 | # Iterate over multiple seeds for the model's randomness
24 | for trial in range(num_trials):
25 | # Set the seed for the bagging classifier
26 | seed = trial
27 |
28 | # Initialize the model with the current seed
29 | model = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=seed)
30 |
31 | # Evaluate the model using cross-validation
32 | scores = cross_val_score(model, X, y, cv=kf)
33 |
34 | # Calculate the mean performance
35 | mean_performance = scores.mean()
36 | performance_scores.append(mean_performance)
37 |
38 | # Print the seed and performance if there is an improvement
39 | if mean_performance > best_performance:
40 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}")
41 | best_performance = mean_performance
42 | best_seed = seed
43 |
44 | # Report the best seed and its performance
45 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}")
46 |
47 | # Calculate statistics
48 | min_score = min(performance_scores)
49 | max_score = max(performance_scores)
50 | median_score = median(performance_scores)
51 | mean_score = mean(performance_scores)
52 | std_dev_score = stdev(performance_scores)
53 |
54 | print("\nPerformance Statistics:")
55 | print(f"Minimum: {min_score:.4f}")
56 | print(f"Median: {median_score:.4f}")
57 | print(f"Maximum: {max_score:.4f}")
58 | print(f"Mean: {mean_score:.4f}")
59 | print(f"Standard Deviation: {std_dev_score:.4f}")
60 |
61 | # Plot the distribution of performance scores
62 | plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7)
63 | plt.title('Distribution of Performance Scores')
64 | plt.xlabel('Performance Score')
65 | plt.ylabel('Frequency')
66 | plt.grid(axis='y', linestyle='--', alpha=0.7)
67 | plt.show()
68 |
--------------------------------------------------------------------------------
/examples/src/test_harness_hacking_hill_climbing_test_folds.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | import numpy as np
3 | from sklearn.datasets import make_classification
4 | from sklearn.model_selection import KFold, train_test_split
5 | from sklearn.metrics import accuracy_score
6 |
7 | # Generate a synthetic classification dataset
8 | X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
9 |
10 | # Split the dataset into a train and test set
11 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
12 |
13 | # Define k-fold cross-validation
14 | kfold = KFold(n_splits=5, shuffle=True, random_state=42)
15 |
16 | # Initialize random predictions across all data points in the training set
17 | predictions = np.random.choice(np.unique(y_train), size=len(X_train))
18 |
19 | # Maximum number of trials
20 | n_trials = 100
21 |
22 | # Begin hill-climbing meta-algorithm
23 | for trial in range(n_trials):
24 | print(f"Trial {trial + 1}/{n_trials}")
25 |
26 | # Initialize variables to track progress across folds
27 | fold_accuracies = []
28 |
29 | # Perform k-fold cross-validation
30 | for train_idx, test_idx in kfold.split(X_train):
31 |         # Get the true labels and current predictions for the test fold
32 | y_test_fold = y_train[test_idx]
33 | fold_predictions = predictions[test_idx]
34 |
35 | # Evaluate the current predictions on the test fold
36 | current_accuracy = accuracy_score(y_test_fold, fold_predictions)
37 |
38 | # Adapt predictions based on test fold performance (hill climbing)
39 | if current_accuracy < 1.0: # If not perfect
40 | for i in range(len(test_idx)):
41 | idx = test_idx[i]
42 | if predictions[idx] != y_train[idx]: # Fix one wrong prediction
43 | predictions[idx] = y_train[idx]
44 | break # Stop after a single modification
45 |
46 | # Recalculate fold accuracy after adaptation
47 | updated_fold_predictions = predictions[test_idx]
48 | updated_accuracy = accuracy_score(y_test_fold, updated_fold_predictions)
49 | fold_accuracies.append(updated_accuracy)
50 |
51 | # Calculate and report average accuracy across all folds for this trial
52 | avg_accuracy = np.mean(fold_accuracies)
53 | print(f"Average Accuracy Across Folds: {avg_accuracy:.4f}")
54 |
55 | # Stop trials if all folds achieve perfect accuracy
56 | if avg_accuracy == 1.0:
57 | print("All folds reached perfect accuracy. Stopping trials.")
58 | break
59 |
60 | # Evaluate the "model" on the holdout test set
61 | # Use random predictions for the holdout test set to simulate lack of generalization
62 | test_predictions = np.random.choice(np.unique(y_train), size=len(y_test))
63 | holdout_accuracy = accuracy_score(y_test, test_predictions)
64 |
65 | # Report final results
66 | print("\nFinal Results:")
67 | print(f"Accuracy on holdout test set: {holdout_accuracy:.4f}")
68 |
--------------------------------------------------------------------------------
/examples/seed_hacking_learning_algorithm.md:
--------------------------------------------------------------------------------
1 | # Seed Hacking Learning Algorithm
2 |
3 | > Vary the random number seed for the model training algorithm in order to get the best result.
4 |
5 | ## Description
6 |
7 | Random seed manipulation is a deceptive practice where data scientists repeatedly change the random seed during model training to artificially improve performance metrics.
8 |
9 | This approach exploits the randomness in model initialization (e.g. initial weights in a neural network) and model training algorithms (e.g. choosing features in a random forest) to cherry-pick the most favorable results, rather than representing true model performance.
10 |
11 | While it might seem like a clever optimization trick, it actually creates unreliable models that won't generalize well to real-world data. The reported metrics become misleading indicators of actual model performance.
12 |
13 | This practice is particularly tempting for new data scientists who are eager to demonstrate strong results or meet aggressive performance targets. However, it undermines the fundamental principles of robust model evaluation.
14 |
15 | Instead of random seed manipulation, focus on proper cross-validation, careful feature engineering, and thorough hyperparameter tuning. These practices will lead to more reliable and trustworthy models.
16 |
17 | The right way to handle random seeds is to fix them at the start of your project and maintain consistency throughout. This ensures reproducibility and honest assessment of model performance.
18 |
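
For contrast, here is a minimal sketch of the honest workflow, assuming the same kind of synthetic dataset as the example below: fix one seed up front and report the spread of cross-validation scores rather than a best-of-many maximum.

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold

# Fix a single seed once, up front
SEED = 42

# Synthetic dataset mirroring the example below
X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=SEED)

# One model and one fold configuration, both tied to the fixed seed
model = RandomForestClassifier(n_estimators=50, random_state=SEED)
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Report the distribution of scores, not a cherry-picked maximum
scores = cross_val_score(model, X, y, cv=kf)
print(f"Accuracy: {scores.mean():.4f} +/- {scores.std():.4f}")
```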
19 |
20 | ## Example
21 |
22 | ```python
23 | import numpy as np
24 | from sklearn.datasets import make_classification
25 | from sklearn.ensemble import RandomForestClassifier
26 | from sklearn.model_selection import cross_val_score, KFold
27 |
28 | # Define the number of trials
29 | num_trials = 100
30 |
31 | # Define variables to track the best seed and best performance
32 | best_seed = None
33 | best_performance = -np.inf
34 |
35 | # Create a synthetic classification dataset
36 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
37 |
38 | # Fix the cross-validation folds for all evaluations
39 | kf = KFold(n_splits=5, shuffle=True, random_state=42)
40 |
41 | # Iterate over multiple seeds for the model's randomness
42 | for trial in range(num_trials):
43 | # Set the seed for the random forest model
44 | seed = trial
45 |
46 | # Initialize the model with the current seed
47 | model = RandomForestClassifier(n_estimators=50, random_state=seed)
48 |
49 | # Evaluate the model using cross-validation
50 | scores = cross_val_score(model, X, y, cv=kf)
51 |
52 | # Calculate the mean performance
53 | mean_performance = scores.mean()
54 |
55 | # Print the seed and performance if there is an improvement
56 | if mean_performance > best_performance:
57 | print(f"Seed: {seed}, Performance: {mean_performance:.4f}")
58 | best_performance = mean_performance
59 | best_seed = seed
60 |
61 | # Report the best seed and its performance
62 | print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}")
63 | ```
64 |
65 | Example Output:
66 |
67 | ```text
68 | Seed: 0, Performance: 0.7700
69 | Seed: 4, Performance: 0.7800
70 | Seed: 19, Performance: 0.7900
71 |
72 | Best Seed: 19, Best Performance: 0.7900
73 | ```
74 |
75 |
76 |
--------------------------------------------------------------------------------
/examples/seed_hacking_cross_validation.md:
--------------------------------------------------------------------------------
1 | # Seed Hacking Cross-Validation
2 |
3 | > Vary the random number seed for creating cross-validation folds in order to get the best result.
4 |
5 | ## Description
6 |
7 | Cross-validation hacking is a deceptive practice where data scientists manipulate random seeds to artificially improve model performance metrics.
8 |
9 | This technique involves repeatedly changing the random seed used to split data into cross-validation folds until finding a particularly favorable split that produces better metrics.
10 |
11 | The danger lies in creating an overly optimistic view of model performance. By cherry-picking the best-performing split, you're essentially overfitting to the validation data itself.
12 |
13 | This practice can be especially tempting for new data scientists who feel pressure to demonstrate strong results. However, it undermines the entire purpose of cross-validation: obtaining an unbiased estimate of model performance.
14 |
15 | The consequences become apparent when the model is deployed. The reported performance metrics won't reflect real-world performance, potentially leading to failed projects and damaged credibility.
16 |
17 | Think of this as a form of data leakage - you're inadvertently using information from your validation set to make modeling decisions, which violates fundamental machine learning principles.
18 |
19 | The correct approach is to fix your random seed at the start of your project and stick with it. This ensures your cross-validation results are honest and reliable indicators of true model performance.
20 |
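
If a single fold configuration feels unrepresentative, the honest remedy is to average over many configurations rather than search for a lucky one. A minimal sketch using RepeatedKFold (the dataset parameters mirror the example below; `n_repeats=10` is an arbitrary choice):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RepeatedKFold

# Synthetic dataset mirroring the example below
X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)

# Model with a fixed seed
model = RandomForestClassifier(n_estimators=50, random_state=42)

# Average over many different fold splits instead of hunting for a favorable one
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)
scores = cross_val_score(model, X, y, cv=cv)
print(f"Accuracy over {len(scores)} folds: {scores.mean():.4f} +/- {scores.std():.4f}")
```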
21 | ## Example
22 |
23 | ```python
24 | import numpy as np
25 | from sklearn.datasets import make_classification
26 | from sklearn.ensemble import RandomForestClassifier
27 | from sklearn.model_selection import cross_val_score, KFold
28 |
29 | # Define the number of trials
30 | num_trials = 100
31 |
32 | # Define variables to track the best fold configuration and best performance
33 | best_fold_seed = None
34 | best_performance = -np.inf
35 |
36 | # Create a synthetic classification dataset
37 | X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
38 |
39 | # Initialize the model with a fixed seed
40 | model = RandomForestClassifier(n_estimators=50, random_state=42)
41 |
42 | # Iterate over multiple seeds to vary the k-fold cross-validation splits
43 | for trial in range(num_trials):
44 | # Set the seed for the k-fold shuffle
45 | fold_seed = trial
46 |
47 | # Initialize k-fold cross-validation with the current seed
48 | kf = KFold(n_splits=5, shuffle=True, random_state=fold_seed)
49 |
50 | # Evaluate the model using cross-validation
51 | scores = cross_val_score(model, X, y, cv=kf)
52 |
53 | # Calculate the mean performance
54 | mean_performance = scores.mean()
55 |
56 | # Print the fold seed and performance if there is an improvement
57 | if mean_performance > best_performance:
58 | print(f"Fold Seed: {fold_seed}, Performance: {mean_performance:.4f}")
59 | best_performance = mean_performance
60 | best_fold_seed = fold_seed
61 |
62 | # Report the best fold seed and its performance
63 | print(f"\nBest Fold Seed: {best_fold_seed}, Best Performance: {best_performance:.4f}")
64 |
65 | ```
66 |
67 | Example Output:
68 |
69 | ```text
70 | Fold Seed: 0, Performance: 0.8000
71 | Fold Seed: 12, Performance: 0.8200
72 | Fold Seed: 56, Performance: 0.8400
73 |
74 | Best Fold Seed: 56, Best Performance: 0.8400
75 | ```
76 |
77 |
--------------------------------------------------------------------------------
/examples/seed_hacking_train_test_split.md:
--------------------------------------------------------------------------------
1 | # Seed Hacking the Train/Test Split
2 |
3 | > Vary the random number seed for creating train/test splits in order to get the best result.
4 |
5 | ## Description
6 |
7 | When data scientists create train/test splits, they use random number seeds to ensure reproducibility. However, some practitioners exploit this by trying different random seeds until they find one that produces favorable test results.
8 |
9 | This approach creates a false sense of model performance. By selecting the "best" split, you're actually leaking information from your test set into your model selection process.
10 |
11 | The danger here is particularly acute for new data scientists who might not realize this invalidates their entire validation strategy. It's essentially a form of indirect data snooping or peeking at the test set.
12 |
13 | The consequences can be severe. Models that appear to perform well during development may fail dramatically in production, potentially damaging your reputation and the trust placed in your work.
14 |
15 | This practice often emerges from pressure to show good results or from misunderstanding the purpose of test sets. Remember: the test set is meant to simulate real-world performance, not to make your model look good.
16 |
17 | If you notice significant variation in performance across different random seeds, this usually indicates underlying issues with your model or data that need to be addressed properly.
18 |
19 | The right approach is to fix your seed once at the beginning of your project and stick with it, regardless of the results it produces.
20 |
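
To diagnose (rather than exploit) split sensitivity, measure how much the score varies across many splits and report the spread. A minimal sketch, assuming the same kind of synthetic dataset as the example below; 30 splits is an arbitrary choice:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Synthetic dataset mirroring the example below
X, y = make_classification(n_samples=100, n_features=20, n_informative=15, n_redundant=5, random_state=42)

# Measure score variability across many splits instead of picking the best one
scores = []
for split_seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=split_seed)
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    scores.append(accuracy_score(y_test, model.predict(X_test)))

# A wide spread indicates the dataset is too small or the model too unstable to trust a single split
print(f"Accuracy across splits: {np.mean(scores):.4f} +/- {np.std(scores):.4f}")
```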
21 | ## Example
22 |
23 | ```python
24 | import numpy as np
25 | from sklearn.datasets import make_classification
26 | from sklearn.ensemble import RandomForestClassifier
27 | from sklearn.model_selection import train_test_split
28 | from sklearn.metrics import accuracy_score
29 |
30 | # Define the number of trials
31 | num_trials = 100
32 |
33 | # Define variables to track the best seed and best performance
34 | best_seed = None
35 | best_performance = -np.inf
36 |
37 | # Create a synthetic classification dataset
38 | X, y = make_classification(n_samples=100, n_features=20, n_informative=15, n_redundant=5, random_state=42)
39 |
40 | # Initialize the model with a fixed seed
41 | model = RandomForestClassifier(random_state=42)
42 |
43 | # Iterate over multiple seeds to vary the train/test split
44 | for trial in range(num_trials):
45 | # Set the seed for train/test split
46 | split_seed = trial
47 |
48 | # Create a train/test split with the current seed
49 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=split_seed)
50 |
51 | # Train the model on the training data
52 | model.fit(X_train, y_train)
53 |
54 | # Evaluate the model on the test data
55 | y_pred = model.predict(X_test)
56 | performance = accuracy_score(y_test, y_pred)
57 |
58 | # Print the split seed and performance if there is an improvement
59 | if performance > best_performance:
60 | print(f"Split Seed: {split_seed}, Performance: {performance:.4f}")
61 | best_performance = performance
62 | best_seed = split_seed
63 |
64 | # Report the best split seed and its performance
65 | print(f"\nBest Split Seed: {best_seed}, Best Performance: {best_performance:.4f}")
66 | ```
67 |
68 | Example Output:
69 |
70 | ```text
71 | Split Seed: 0, Performance: 0.5000
72 | Split Seed: 1, Performance: 0.6667
73 | Split Seed: 3, Performance: 0.7333
74 | Split Seed: 4, Performance: 0.8000
75 | Split Seed: 39, Performance: 0.9000
76 |
77 | Best Split Seed: 39, Best Performance: 0.9000
78 | ```
79 |
80 |
--------------------------------------------------------------------------------
/examples/src/test_harness_hacking_hill_climbing_performance.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | import numpy as np
3 | from sklearn.datasets import make_classification
4 | from sklearn.model_selection import train_test_split, cross_val_score, KFold
5 | from sklearn.ensemble import RandomForestClassifier
6 | import matplotlib.pyplot as plt
7 |
8 | # Generate a synthetic classification dataset
9 | X, y = make_classification(
10 | n_samples=200, n_features=30, n_informative=5, n_redundant=25, random_state=42
11 | )
12 |
13 | # Create a train/test split of the dataset
14 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
15 |
16 | # Set up k-fold cross-validation for the training set
17 | kf = KFold(n_splits=3, shuffle=True, random_state=42)
18 |
19 | # Initialize variables for hill climbing and tracking performance
20 | n_trials = 100 # Number of optimization trials
21 | best_params = {"n_estimators": 10, "max_depth": 2} # Initial hyperparameters
22 | cv_scores = [] # Track cross-validation scores
23 | test_scores = [] # Track hold-out test scores
24 |
25 | # Define a stochastic hill climbing procedure for hyperparameter tuning
26 | for trial in range(n_trials):
27 | # Create a model with current best parameters
28 | model = RandomForestClassifier(
29 | n_estimators=best_params["n_estimators"], max_depth=best_params["max_depth"], random_state=42
30 | )
31 |
32 | # Evaluate model using k-fold cross-validation
33 | cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=kf, scoring="accuracy"))
34 |
35 | # Fit the model on the entire training set and evaluate on the hold-out test set
36 | model.fit(X_train, y_train)
37 | test_score = model.score(X_test, y_test)
38 |
39 | # Record scores
40 | cv_scores.append(cv_score)
41 | test_scores.append(test_score)
42 |
43 | # Print trial results
44 | print(f"Trial {trial+1}: CV Mean Score={cv_score:.4f}, Test Score={test_score:.4f}")
45 |
46 | # Propose a random perturbation of the hyperparameters
47 | new_params = {
48 | "n_estimators": best_params["n_estimators"] + np.random.randint(-10, 11),
49 | "max_depth": best_params["max_depth"] + np.random.randint(-1, 2)
50 | }
51 | new_params["n_estimators"] = max(1, new_params["n_estimators"]) # Ensure valid value
52 | new_params["max_depth"] = max(1, new_params["max_depth"]) # Ensure valid value
53 |
54 | # Evaluate new parameters
55 | new_model = RandomForestClassifier(
56 | n_estimators=new_params["n_estimators"], max_depth=new_params["max_depth"], random_state=42
57 | )
58 | new_cv_score = np.mean(cross_val_score(new_model, X_train, y_train, cv=kf, scoring="accuracy"))
59 |
60 | # Update the best parameters if the new score is better
61 | if new_cv_score > cv_score:
62 | best_params = new_params
63 |
64 | # Plot the cross-validation and hold-out test scores over trials
65 | plt.figure(figsize=(10, 6))
66 | plt.plot(range(1, n_trials + 1), cv_scores, label="Cross-Validation Score")
67 | plt.plot(range(1, n_trials + 1), test_scores, label="Hold-Out Test Score")
68 | plt.xlabel("Trial")
69 | plt.ylabel("Accuracy")
70 | plt.title("Model Performance: Cross-Validation vs Hold-Out Test")
71 | plt.legend()
72 | plt.show()
73 |
74 | # Print final performance metrics
75 | final_model = RandomForestClassifier(
76 | n_estimators=best_params["n_estimators"], max_depth=best_params["max_depth"], random_state=42
77 | )
78 | final_model.fit(X_train, y_train)
79 | final_cv_score = np.mean(cross_val_score(final_model, X_train, y_train, cv=kf, scoring="accuracy"))
80 | final_test_score = final_model.score(X_test, y_test)
81 | print(f"Final Model: CV Mean Score={final_cv_score:.4f}, Test Score={final_test_score:.4f}")
82 |
--------------------------------------------------------------------------------
/examples/p_hacking_selective_sampling.md:
--------------------------------------------------------------------------------
1 | # p-Hacking Selective Sampling
2 |
3 | > Vary samples of a dataset in order to fit a model with significantly better performance.
4 |
5 | ## Description
6 |
7 | P-hacking selective sampling occurs when a dataset is repeatedly manipulated to find a subset that artificially boosts model performance in a way that passes a statistical hypothesis test (p-value < 0.05).
8 |
9 | This is done by iterating through multiple random seeds (i.e. seed hacking) or sampling methods to create different subsets of data. Each subset is evaluated, and the process continues until one shows a significant accuracy improvement.
10 |
11 | This approach is misleading because it exploits randomness rather than genuine patterns in the data. Models built using such subsets are unlikely to generalize well to new data. P-hacking undermines the integrity of the analysis and can lead to overfitting, where the model performs well only on the chosen subset but poorly in real-world applications.
12 |
13 | To avoid this, always define your data sampling and evaluation methods upfront, and validate results on independent datasets.
14 |
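
One concrete safeguard is to account for the number of comparisons made: at a 0.05 threshold, roughly 1 in 20 random subsets is expected to look significant by chance. A minimal sketch of a Bonferroni-style correction (one simple option among several):

```python
# Decide the number of comparisons before running any of them,
# then shrink the significance threshold accordingly (Bonferroni correction)
p_threshold = 0.05
max_trials = 1000
corrected_threshold = p_threshold / max_trials

# Compare each trial's p-value against the corrected threshold instead of 0.05;
# a chance "win" now needs p < 0.00005
print(f"Corrected significance threshold: {corrected_threshold:.5f}")
```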
15 | ## Example
16 |
17 | ```python
18 | # Import necessary libraries
19 | import numpy as np
20 | from sklearn.datasets import make_classification
21 | from sklearn.model_selection import cross_val_score, StratifiedKFold
22 | from sklearn.linear_model import LogisticRegression
23 | from scipy.stats import ttest_ind
24 |
25 | # Generate a synthetic classification dataset
26 | X, y = make_classification(n_samples=500, n_features=10, n_informative=5, n_redundant=5, random_state=42)
27 |
28 | # Define a classifier
29 | model = LogisticRegression(random_state=42, max_iter=1000)
30 |
31 | # Define a k-fold cross-validation strategy with a fixed seed
32 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
33 |
34 | # Evaluate the model on the full dataset using k-fold cross-validation
35 | baseline_scores = cross_val_score(model, X, y, cv=kfold)
36 | baseline_mean = np.mean(baseline_scores)
37 | print(f'Base result: {baseline_mean:.3f}')
38 |
39 | # Set up parameters for p-hacking
40 | p_threshold = 0.05 # Threshold for statistical significance
41 | max_trials = 1000 # Maximum number of sampling strategies to test
42 | sample_size = int(0.5 * X.shape[0])
43 |
44 | # Perform selective sampling and evaluate subsets
45 | for trial in range(max_trials):
46 | # Randomly select a subset of samples
47 | np.random.seed(trial + 1)
48 | sample_indices = np.random.choice(range(X.shape[0]), size=sample_size, replace=False)
49 | X_subset, y_subset = X[sample_indices], y[sample_indices]
50 |
51 | # Evaluate the model on the sampled subset using cross-validation
52 | trial_scores = cross_val_score(model, X_subset, y_subset, cv=kfold)
53 | trial_mean = np.mean(trial_scores)
54 | better = trial_mean > baseline_mean
55 |
56 | # Perform a t-test to compare means
57 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores)
58 | significant = p_value < p_threshold
59 |
60 | # Report progress
61 | print(f'{trial+1}, Result: {trial_mean:.3f}, Better: {better}, p-value: {p_value:.3f} Significant: {significant}')
62 |
63 | # Stop if better and significant
64 | if better and significant:
65 | break
66 | ```
67 |
68 | Example Output:
69 |
70 | ```text
71 | Base result: 0.856
72 | 1, Result: 0.856, Better: False, p-value: 1.000 Significant: False
73 | 2, Result: 0.812, Better: False, p-value: 0.113 Significant: False
74 | 3, Result: 0.856, Better: False, p-value: 1.000 Significant: False
75 | 4, Result: 0.840, Better: False, p-value: 0.624 Significant: False
76 | 5, Result: 0.888, Better: True, p-value: 0.325 Significant: False
77 | ...
78 | 348, Result: 0.864, Better: True, p-value: 0.647 Significant: False
79 | 349, Result: 0.824, Better: False, p-value: 0.228 Significant: False
80 | 350, Result: 0.824, Better: False, p-value: 0.242 Significant: False
81 | 351, Result: 0.836, Better: False, p-value: 0.389 Significant: False
82 | 352, Result: 0.912, Better: True, p-value: 0.041 Significant: True
83 | ```
84 |
85 |
--------------------------------------------------------------------------------
/examples/p_hacking_feature_selection.md:
--------------------------------------------------------------------------------
1 | # p-Hacking Feature Selection
2 |
3 | > Vary feature subsets of a dataset in order to fit a model with significantly better performance.
4 |
5 | ## Description
6 |
7 | P-Hacking Feature Selection involves manipulating the feature subset of a dataset to artificially improve model performance. By testing multiple combinations of features and selecting those that yield the best results, practitioners may achieve statistically significant outcomes that are misleading or unreliable.
8 |
9 | This practice skews the model's apparent accuracy and risks overfitting to the training data, making it less generalizable to new datasets. While it might seem like optimization, it violates the principles of sound model development and evaluation.
10 |
11 | Data scientists should avoid this anti-pattern by adhering to rigorous validation techniques, such as using holdout datasets or cross-validation, and focusing on domain-relevant feature selection methods. This ensures model performance reflects true predictive power rather than manipulated outcomes.
12 |
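
A sounder pattern is to make feature selection part of the model pipeline so it is re-fit on each training fold and never sees the validation folds. A minimal sketch using a Pipeline with SelectKBest (the selector and `k=5` are illustrative choices; the dataset mirrors the example below):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline

# Synthetic dataset mirroring the example below
X, y = make_classification(n_samples=500, n_features=10, n_informative=2, n_redundant=8, random_state=42)

# Feature selection lives inside the pipeline, so it is fit on training folds only
pipeline = Pipeline([
    ("select", SelectKBest(score_func=f_classif, k=5)),
    ("model", RandomForestClassifier(n_estimators=10, random_state=42)),
])

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=kfold)
print(f"Pipeline accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f}")
```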
13 | ## Example
14 |
15 | ```python
16 | # Import necessary libraries
17 | import numpy as np
18 | from sklearn.datasets import make_classification
19 | from sklearn.model_selection import cross_val_score, StratifiedKFold
20 | from sklearn.ensemble import RandomForestClassifier
21 | from scipy.stats import ttest_ind
22 |
23 | # Generate a synthetic classification dataset
24 | X, y = make_classification(n_samples=500, n_features=10, n_informative=2, n_redundant=8, random_state=42)
25 |
26 | # Define a classifier
27 | model = RandomForestClassifier(n_estimators=10, random_state=42)
28 |
29 | # Define a k-fold cross-validation strategy with a fixed seed
30 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
31 |
32 | # Evaluate the model on the full dataset using k-fold cross-validation
33 | baseline_scores = cross_val_score(model, X, y, cv=kfold)
34 | baseline_mean = np.mean(baseline_scores)
35 | print(f'Base result: {baseline_mean:.3f}')
36 |
37 | # Set up parameters for p-hacking
38 | p_threshold = 0.05 # Threshold for statistical significance
39 | max_trials = 1000 # Maximum number of feature subsets to test
40 | num_features = X.shape[1]
41 |
42 | # Perform selective feature subset selection and evaluation
43 | for trial in range(max_trials):
44 | # Randomly select a subset of features
45 | np.random.seed(trial + 1)
46 | selected_features = np.random.choice(range(num_features), size=np.random.randint(1, num_features + 1), replace=False)
47 | X_subset = X[:, selected_features]
48 |
49 | # Evaluate the model on the selected feature subset using cross-validation
50 | trial_scores = cross_val_score(model, X_subset, y, cv=kfold)
51 | trial_mean = np.mean(trial_scores)
52 | better = trial_mean > baseline_mean
53 |
54 | # Perform a t-test to compare means
55 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores)
56 | significant = p_value < p_threshold
57 |
58 | # Report progress
59 | print(f'{trial+1}, Features: {selected_features}, Result: {trial_mean:.3f}, Better: {better}, p-value: {p_value:.3f}, Significant: {significant}')
60 |
61 | # Stop if better and significant
62 | if better and significant:
63 | print("P-hacked subset identified!")
64 | break
65 | ```
66 |
67 | Example Output:
68 |
69 | ```text
70 | Base result: 0.944
71 | 1, Features: [2 3 4 9 1 6], Result: 0.956, Better: True, p-value: 0.166, Significant: False
72 | 2, Features: [4 1 9 5 0 7 2 3 6], Result: 0.950, Better: True, p-value: 0.446, Significant: False
73 | 3, Features: [5 4 1 2 8 6 7 0 3], Result: 0.948, Better: True, p-value: 0.587, Significant: False
74 | 4, Features: [3 4 6 9 8 2 7 0], Result: 0.950, Better: True, p-value: 0.347, Significant: False
75 | 5, Features: [5 8 2 3], Result: 0.950, Better: True, p-value: 0.402, Significant: False
76 | ...
77 | 54, Features: [5 3 9 4 8 6], Result: 0.956, Better: True, p-value: 0.135, Significant: False
78 | 55, Features: [6 4 0 2 1 3 9 7], Result: 0.950, Better: True, p-value: 0.621, Significant: False
79 | 56, Features: [9 8 5 6 1 7], Result: 0.940, Better: False, p-value: 0.740, Significant: False
80 | 57, Features: [3 9 1 4 8 2 0], Result: 0.958, Better: True, p-value: 0.058, Significant: False
81 | 58, Features: [4 2 8 9], Result: 0.962, Better: True, p-value: 0.022, Significant: True
82 | P-hacked subset identified!
83 | ```
--------------------------------------------------------------------------------
/examples/p_hacking_learning_algorithm.md:
--------------------------------------------------------------------------------
1 | # p-Hacking the Learning Algorithm
2 |
3 | > Vary the random numbers used by a learning algorithm in order to get a significantly better result.
4 |
5 | ## Description
6 |
7 | p-Hacking the learning algorithm involves tweaking the random seed or initialization of a machine learning model to artificially produce **significantly** better performance metrics.
8 |
9 | This approach manipulates results by repeatedly running the algorithm with different random values until a favorable outcome is achieved. While it may improve metrics like accuracy or precision, the model’s actual robustness and generalizability often suffer.
10 |
11 | This practice undermines the reliability of machine learning results by focusing on chance improvements rather than meaningful insights or genuine model quality. It is considered an anti-pattern because it misrepresents the model’s true performance and can lead to overfitting or poor performance on unseen data.
12 |
13 | ## Example
14 |
15 | Here, we evaluate the "same" model on the same data, varying only the random number seed (i.e. perturbing the learning algorithm slightly).
16 | 
17 | There should (generally) be no statistically significant difference between runs, but we keep running trials until a difference is found by chance: with a 0.05 threshold, roughly 1 in 20 such comparisons is expected to appear significant due to randomness alone.
18 |
19 | ```python
20 | # Import necessary libraries
21 | import numpy as np
22 | import pandas as pd
23 | from sklearn.datasets import make_classification
24 | from sklearn.model_selection import cross_val_score, StratifiedKFold
25 | from sklearn.ensemble import RandomForestClassifier
26 | from scipy.stats import ttest_ind
27 |
28 | # Generate a synthetic classification dataset
29 | X, y = make_classification(n_samples=350, n_features=10, n_informative=2, n_redundant=8, random_state=42)
30 |
31 | # Define a high-capacity machine learning model
32 | model = RandomForestClassifier(n_estimators=10, random_state=42)
33 |
34 | # Define a k-fold cross-validation strategy with a fixed seed
35 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
36 |
37 | # Evaluate the model on the dataset using k-fold cross-validation
38 | baseline_scores = cross_val_score(model, X, y, cv=kfold)
39 | baseline_mean = np.mean(baseline_scores)
40 |
41 | # Set up parameters for p-hacking
42 | p_threshold = 0.05 # Threshold for statistical significance
43 | max_trials = 1000 # Maximum number of trials to perform
44 | significant_result_found = False
45 |
46 | # Loop through trials with different random seeds
47 | for trial in range(max_trials):
48 | # Use a new random seed for the model
49 | seed = trial + 100
50 | model = RandomForestClassifier(n_estimators=10, random_state=seed)
51 |
52 | # Evaluate the model with k-fold cross-validation
53 | trial_scores = cross_val_score(model, X, y, cv=kfold)
54 | trial_mean = np.mean(trial_scores)
55 |
56 | # Perform a t-test to compare means
57 | t_stat, p_value = ttest_ind(baseline_scores, trial_scores)
58 |
59 | # Check if the p-value is below the significance threshold
60 | if p_value < p_threshold:
61 | significant_result_found = True
62 | print(f"Significant difference found on trial {trial+1}")
63 | print(f"Baseline mean: {baseline_mean:.4f}, Trial mean: {trial_mean:.4f}, p-value: {p_value:.4f}")
64 | break
65 | else:
66 | print(f"No significant difference found yet, trial {trial+1}, p-value: {p_value:.4f}")
67 |
68 | # Report if no significant result was found within the maximum trials
69 | if not significant_result_found:
70 | print("No significant result found after maximum trials.")
71 | ```
72 |
73 | Example Output:
74 |
75 | ```text
76 | No significant difference found yet, trial 1, p-value: 0.7245
77 | No significant difference found yet, trial 2, p-value: 0.4860
78 | No significant difference found yet, trial 3, p-value: 0.8028
79 | No significant difference found yet, trial 4, p-value: 0.5447
80 | No significant difference found yet, trial 5, p-value: 1.0000
81 | ...
82 | No significant difference found yet, trial 80, p-value: 0.3972
83 | No significant difference found yet, trial 81, p-value: 1.0000
84 | No significant difference found yet, trial 82, p-value: 0.7245
85 | No significant difference found yet, trial 83, p-value: 1.0000
86 | No significant difference found yet, trial 84, p-value: 0.7404
87 | No significant difference found yet, trial 85, p-value: 1.0000
88 | No significant difference found yet, trial 86, p-value: 0.7245
89 | No significant difference found yet, trial 87, p-value: 0.7707
90 | Significant difference found on trial 88
91 | Baseline mean: 0.9743, Trial mean: 0.9886, p-value: 0.0462
92 | ```
93 |
--------------------------------------------------------------------------------
/examples/src/test_harness_hacking_mitigation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.datasets import make_classification
4 | from sklearn.model_selection import train_test_split, cross_val_score, KFold, RepeatedKFold
5 | from sklearn.ensemble import RandomForestClassifier
6 | import matplotlib.pyplot as plt
7 |
8 | # Generate a synthetic classification dataset
9 | X, y = make_classification(
10 | n_samples=200, n_features=30, n_informative=5, n_redundant=25, random_state=42
11 | )
12 |
13 | # Create a train/test split of the dataset
14 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
15 |
16 | # Initialize result storage for experiments
17 | results = []
18 |
19 | # Define the study parameters
20 | fold_range = [3, 5, 7, 10] # 3 to 10 folds
21 | repeat_range = [1, 3, 5]  # 1 to 5 repetitions
22 | n_trials = 5 # Number of trials for each configuration
23 |
24 | # Function for hill climbing optimization
25 | def hill_climb(cv, X_train, y_train, X_test, y_test, n_hill_trials=100):
26 | best_params = {"n_estimators": 10, "max_depth": 2}
27 | best_cv_score = -1
28 |
29 | cv_scores = []
30 | holdout_scores = []
31 |
32 | for hill_trial in range(n_hill_trials):
33 | # Propose new parameters
34 | new_params = {
35 | "n_estimators": best_params["n_estimators"] + np.random.randint(-10, 11),
36 | "max_depth": best_params["max_depth"] + np.random.randint(-1, 2)
37 | }
38 | new_params["n_estimators"] = max(1, new_params["n_estimators"])
39 | new_params["max_depth"] = max(1, new_params["max_depth"])
40 |
41 | # Evaluate new parameters
42 | new_model = RandomForestClassifier(
43 | n_estimators=new_params["n_estimators"], max_depth=new_params["max_depth"], random_state=42
44 | )
45 | raw_scores = cross_val_score(new_model, X_train, y_train, cv=cv, scoring="accuracy")
46 | new_cv_score = np.mean(raw_scores)
47 | cv_scores.append(new_cv_score)
48 |
49 | # Evaluate the new model on the hold out test set
50 | new_model.fit(X_train, y_train)
51 | new_holdout_score = new_model.score(X_test, y_test)
52 | holdout_scores.append(new_holdout_score)
53 |
54 | # Update best parameters if score improves
55 | if new_cv_score > best_cv_score:
56 | best_params = new_params
57 | best_cv_score = new_cv_score
58 |
59 | return cv_scores, holdout_scores
60 |
61 | # Function to calculate metrics
62 | def calculate_metrics(cv_scores, holdout_scores):
63 | mean_cv_score = np.mean(cv_scores)
64 | correlation = np.corrcoef(cv_scores, holdout_scores)[0, 1]
65 | mean_abs_diff = np.mean(np.abs(np.array(cv_scores) - np.array(holdout_scores)))
66 | return correlation, mean_abs_diff
67 |
68 | # Main experiment loop
69 | for n_folds in fold_range:
70 | for n_repeats in repeat_range:
71 | trial_correlations = []
72 | trial_mean_differences = []
73 |
74 | for trial in range(n_trials):
75 | # Define CV with specific folds and repeats
76 | cv = RepeatedKFold(n_splits=n_folds, n_repeats=n_repeats, random_state=trial)
77 |
78 | # Perform hill climbing of the cross-validated train set
79 | cv_scores, holdout_scores = hill_climb(cv, X_train, y_train, X_test, y_test)
80 |
81 | # Calculate metrics
82 | corr, diff = calculate_metrics(cv_scores, holdout_scores)
83 |
84 | trial_correlations.append(corr)
85 | trial_mean_differences.append(diff)
86 |
87 | # Report progress
88 | print(f'folds={n_folds}, repeats={n_repeats}, i={(trial+1)}, corr={corr}, diff={diff}')
89 |
90 | # Record average results for this configuration
91 | avg_correlation = np.mean(trial_correlations)
92 | avg_mean_diff = np.mean(trial_mean_differences)
93 |
94 | results.append({
95 | 'folds': n_folds,
96 | 'repeats': n_repeats,
97 | 'avg_correlation': avg_correlation,
98 | 'avg_mean_diff': avg_mean_diff
99 | })
100 |
101 | # Log progress
102 | print(f"Completed: {n_folds} folds, {n_repeats} repeats | Avg Correlation: {avg_correlation:.4f}, Avg Mean Diff: {avg_mean_diff:.4f}")
103 |
104 | # Convert results to DataFrame
105 | results_df = pd.DataFrame(results)
106 |
107 | # Save results to CSV
108 | results_df.to_csv('cv_overfitting_study_results.csv', index=False)
109 |
110 | # Display final summary
111 | print("\nFinal Results:\n")
112 | print(results_df.sort_values(['folds', 'repeats']))
--------------------------------------------------------------------------------
/examples/leaderboard_hacking.md:
--------------------------------------------------------------------------------
1 | # Leaderboard Hacking
2 |
3 | > Issue predictions for a machine learning competition until a perfect (or near perfect) score is achieved.
4 |
5 | ## Description
6 | Leaderboard hacking exploits competition scoring systems by repeatedly submitting predictions until achieving an artificially high score, without developing a genuinely effective model.
7 |
8 | This approach takes advantage of the limited test set size and scoring mechanism, where multiple submission attempts can eventually lead to overfitting to the test data through pure chance.
9 |
10 | The practice undermines the educational value of machine learning competitions and creates misleading benchmarks for model performance. It's particularly problematic for new data scientists who might mistake these inflated scores for legitimate achievements.
11 |
12 | This technique represents a fundamental misunderstanding of machine learning principles, as it bypasses proper model development, validation, and testing procedures. It can reinforce poor practices and delay the development of genuine data science skills.
13 |
14 | While it may temporarily boost competition rankings, leaderboard hacking ultimately impedes professional growth and can damage credibility within the data science community. Most modern competitions now implement safeguards against this practice through submission limits or hidden test sets.
15 |
16 | Instead of pursuing quick wins through leaderboard manipulation, focus on developing robust models using proper cross-validation techniques and thorough evaluation metrics.
17 |
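
One organizer-side safeguard mentioned above is a submission limit. A minimal sketch of a hypothetical submission-limited leaderboard (the `LimitedLeaderboard` class is illustrative and not part of any real platform):

```python
import numpy as np
from sklearn.metrics import accuracy_score

class LimitedLeaderboard:
    """Toy leaderboard that scores at most a fixed number of submissions."""

    def __init__(self, true_labels, max_submissions=5):
        self.true_labels = true_labels
        self.max_submissions = max_submissions
        self.submissions_used = 0

    def submit(self, predictions):
        if self.submissions_used >= self.max_submissions:
            raise RuntimeError("Submission limit reached")
        self.submissions_used += 1
        return accuracy_score(self.true_labels, predictions)

# With only a handful of scored submissions, repeatedly probing the hidden
# labels (as in the example below) stops being a viable strategy
hidden_labels = np.random.randint(0, 2, size=200)
leaderboard = LimitedLeaderboard(hidden_labels, max_submissions=5)
print(f"Submission score: {leaderboard.submit(np.random.randint(0, 2, size=200)):.4f}")
```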
18 | ## Example
19 |
20 | ```python
21 | import numpy as np
22 | from sklearn.datasets import make_classification
23 | from sklearn.model_selection import train_test_split
24 | from sklearn.metrics import accuracy_score
25 |
26 | # Generate a synthetic classification dataset
27 | X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
28 |
29 | # Split into training and test sets
30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
31 |
32 | # Artificial "leaderboard" to evaluate predictions on the test set
33 | def leaderboard_evaluation(predictions, true_labels):
34 | """Calculate the leaderboard score (accuracy in this case)."""
35 | return accuracy_score(true_labels, predictions)
36 |
37 | # Initialize random predictions for the test set
38 | best_predictions = np.random.randint(0, 2, size=len(y_test))
39 | best_score = leaderboard_evaluation(best_predictions, y_test)
40 |
41 | # Stochastic hill climber: adjust predictions iteratively
42 | max_trials = 10000 # Maximum number of trials
43 | for trial in range(max_trials):
44 | # Copy the best predictions and randomly flip one value
45 | new_predictions = best_predictions.copy()
46 | index_to_flip = np.random.randint(len(new_predictions))
47 | new_predictions[index_to_flip] = 1 - new_predictions[index_to_flip] # Flip the prediction
48 |
49 | # Evaluate the new predictions
50 | new_score = leaderboard_evaluation(new_predictions, y_test)
51 |
52 | # If the new score is better, adopt the new predictions
53 | if new_score > best_score:
54 | best_predictions = new_predictions
55 | best_score = new_score
56 |
57 | # Report progress
58 | print(f"Trial {trial + 1}/{max_trials}: Leaderboard Score = {new_score:.4f}, Best Score = {best_score:.4f}")
59 |
60 | # Stop if perfect score is achieved
61 | if best_score == 1.0:
62 | print("Perfect score achieved!")
63 | break
64 | ```
65 |
66 | Example Output:
67 |
68 | ```text
69 | Trial 1/10000: Leaderboard Score = 0.4800, Best Score = 0.4850
70 | Trial 2/10000: Leaderboard Score = 0.4800, Best Score = 0.4850
71 | Trial 3/10000: Leaderboard Score = 0.4800, Best Score = 0.4850
72 | Trial 4/10000: Leaderboard Score = 0.4800, Best Score = 0.4850
73 | Trial 5/10000: Leaderboard Score = 0.4900, Best Score = 0.4900
74 | ...
75 | Trial 787/10000: Leaderboard Score = 0.9900, Best Score = 0.9950
76 | Trial 788/10000: Leaderboard Score = 0.9900, Best Score = 0.9950
77 | Trial 789/10000: Leaderboard Score = 0.9900, Best Score = 0.9950
78 | Trial 790/10000: Leaderboard Score = 0.9900, Best Score = 0.9950
79 | Trial 791/10000: Leaderboard Score = 0.9900, Best Score = 0.9950
80 | Trial 792/10000: Leaderboard Score = 1.0000, Best Score = 1.0000
81 | Perfect score achieved!
82 | ```
83 |
84 |
85 | ## Further Reading
86 |
87 | These papers may be related:
88 |
89 | * [Toward a Better Understanding of Leaderboard](https://arxiv.org/abs/1510.03349), Wenjie Zheng, 2015.
90 | * [Exploiting an Oracle that Reports AUC Scores in Machine Learning Contests](https://arxiv.org/abs/1506.01339), Jacob Whitehill, 2015.
91 | * [Climbing the Kaggle Leaderboard by Exploiting the Log-Loss Oracle](https://arxiv.org/abs/1707.01825), Jacob Whitehill, 2017.
92 |
93 |
94 |
--------------------------------------------------------------------------------
/examples/test_set_pruning.md:
--------------------------------------------------------------------------------
1 | # Test Set Pruning
2 |
3 | > Trim or remove hard-to-predict examples from the test set to improve results.
4 |
5 | ## Description
6 |
7 | Test set pruning is a deceptive practice where difficult-to-predict examples are deliberately removed from the test dataset to artificially inflate model performance metrics.
8 |
9 | This approach creates a dangerous illusion of model quality by eliminating the challenging edge cases that often matter most in real-world applications.
10 |
11 | The practice undermines the fundamental purpose of test sets: to provide an unbiased estimate of how well your model will perform on new, unseen data in production.
12 |
13 | Test set pruning can manifest through direct removal of misclassified examples or more subtle approaches like filtering out "noisy" or "outlier" data points that the model struggles with.
14 |
15 | This anti-pattern often emerges from pressure to show improved metrics, but it creates serious risks. Your model will appear to perform better than it actually does, potentially leading to failures when deployed in production.
16 |
17 | Instead of pruning difficult examples, treat them as valuable signals. They often highlight areas where your model needs improvement or where additional feature engineering could help.
18 |
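
A healthier habit is to keep the hard examples and study them. A minimal sketch of that kind of error analysis (the dataset and model mirror the example below):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Dataset and model mirroring the example below
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Keep the misclassified test examples and inspect them instead of deleting them
y_pred = model.predict(X_test)
misclassified = np.where(y_pred != y_test)[0]
print(f"{len(misclassified)} of {len(y_test)} test examples are misclassified")

# Prediction confidence on the hard cases can guide feature engineering or data collection
probabilities = model.predict_proba(X_test)[misclassified]
print(np.round(probabilities[:5], 3))
```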
19 | ## Example
20 |
21 | ```python
22 | # Import necessary libraries
23 | from sklearn.datasets import make_classification
24 | from sklearn.model_selection import train_test_split
25 | from sklearn.ensemble import RandomForestClassifier
26 | from sklearn.metrics import accuracy_score
27 | import numpy as np
28 |
29 | # Generate a synthetic classification dataset
30 | X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42)
31 |
32 | # Split the dataset into training and testing sets
33 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
34 |
35 | # Initialize a Random Forest classifier
36 | model = RandomForestClassifier(random_state=42)
37 |
38 | # Train the model on the training set
39 | model.fit(X_train, y_train)
40 |
41 | # Predict on the test set
42 | y_pred = model.predict(X_test)
43 |
44 | # Calculate the initial accuracy
45 | initial_accuracy = accuracy_score(y_test, y_pred)
46 | print(f"Initial Test Accuracy: {initial_accuracy}")
47 |
48 | # Iteratively remove one misclassified example per iteration
49 | X_test_pruned = X_test
50 | y_test_pruned = y_test
51 | while True:
52 | # Predict on the pruned test set
53 | y_pred_pruned = model.predict(X_test_pruned)
54 |
55 | # Identify indices of misclassified samples
56 | misclassified_indices = np.where(y_pred_pruned != y_test_pruned)[0]
57 |
58 | # Break if no misclassified samples remain
59 | if len(misclassified_indices) == 0:
60 | break
61 |
62 | # Remove one misclassified sample
63 | index_to_remove = misclassified_indices[0] # Select the first misclassified sample
64 | X_test_pruned = np.delete(X_test_pruned, index_to_remove, axis=0)
65 | y_test_pruned = np.delete(y_test_pruned, index_to_remove, axis=0)
66 |
67 | # Recalculate accuracy on the pruned test set
68 | pruned_accuracy = accuracy_score(y_test_pruned, model.predict(X_test_pruned))
69 | print(f"Pruned Test Accuracy: {pruned_accuracy}")
70 | ```
71 |
72 | Example Output:
73 |
74 | ```text
75 | Initial Test Accuracy: 0.8866666666666667
76 | Pruned Test Accuracy: 0.8896321070234113
77 | Pruned Test Accuracy: 0.8926174496644296
78 | Pruned Test Accuracy: 0.8956228956228957
79 | Pruned Test Accuracy: 0.8986486486486487
80 | Pruned Test Accuracy: 0.9016949152542373
81 | Pruned Test Accuracy: 0.9047619047619048
82 | Pruned Test Accuracy: 0.9078498293515358
83 | Pruned Test Accuracy: 0.910958904109589
84 | Pruned Test Accuracy: 0.9140893470790378
85 | Pruned Test Accuracy: 0.9172413793103448
86 | Pruned Test Accuracy: 0.9204152249134948
87 | Pruned Test Accuracy: 0.9236111111111112
88 | Pruned Test Accuracy: 0.926829268292683
89 | Pruned Test Accuracy: 0.9300699300699301
90 | Pruned Test Accuracy: 0.9333333333333333
91 | Pruned Test Accuracy: 0.9366197183098591
92 | Pruned Test Accuracy: 0.9399293286219081
93 | Pruned Test Accuracy: 0.9432624113475178
94 | Pruned Test Accuracy: 0.9466192170818505
95 | Pruned Test Accuracy: 0.95
96 | Pruned Test Accuracy: 0.953405017921147
97 | Pruned Test Accuracy: 0.9568345323741008
98 | Pruned Test Accuracy: 0.9602888086642599
99 | Pruned Test Accuracy: 0.9637681159420289
100 | Pruned Test Accuracy: 0.9672727272727273
101 | Pruned Test Accuracy: 0.9708029197080292
102 | Pruned Test Accuracy: 0.9743589743589743
103 | Pruned Test Accuracy: 0.9779411764705882
104 | Pruned Test Accuracy: 0.981549815498155
105 | Pruned Test Accuracy: 0.9851851851851852
106 | Pruned Test Accuracy: 0.9888475836431226
107 | Pruned Test Accuracy: 0.9925373134328358
108 | Pruned Test Accuracy: 0.9962546816479401
109 | Pruned Test Accuracy: 1.0
110 | ```
--------------------------------------------------------------------------------
/examples/test_set_overfitting.md:
--------------------------------------------------------------------------------
1 | # Test Set Overfitting
2 |
3 | > Optimizing a model for its performance on a "hold out" test set.
4 |
5 | ## Description
6 | This is typically called "test set overfitting" or "overfitting to the test set."
7 |
8 | It occurs when practitioners repeatedly tune their model based on test set performance, effectively making the test set act as a second training set. This violates the fundamental principle that the test set should only be used for final evaluation.
9 |
10 | Sometimes it's also referred to as "test set adaption" or "inappropriate test set optimization." In more formal academic literature, it might be described as "compromising test set independence through iterative optimization."
11 |
12 | This is different from test set leakage (where information flows from test to train inadvertently) because in this case, there's intentional optimization using test set feedback. It's particularly problematic because it gives an overly optimistic estimate of model performance and doesn't reflect how the model would perform on truly unseen data.
13 |
14 | This is why many researchers advocate for using a three-way split (train/validation/test) or holding out a completely separate test set that is only used once for final evaluation, with all intermediate optimization done using cross-validation on the training data.
15 |
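
Below is a minimal sketch of that three-way split, assuming the same kind of synthetic dataset as the example that follows: hyperparameters are chosen on the validation set and the test set is scored exactly once.

```python
from itertools import product
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Synthetic dataset mirroring the example below
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42)

# Carve out train/validation/test sets (60/20/20 is a common, arbitrary choice)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Tune hyperparameters using the validation set only
best_config, best_val_accuracy = None, 0.0
for n_estimators, max_depth in product([10, 50, 100], [5, 10, 15]):
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)
    val_accuracy = accuracy_score(y_val, model.predict(X_val))
    if val_accuracy > best_val_accuracy:
        best_config, best_val_accuracy = (n_estimators, max_depth), val_accuracy

# Touch the test set once, with the chosen configuration
final_model = RandomForestClassifier(n_estimators=best_config[0], max_depth=best_config[1], random_state=42)
final_model.fit(X_train, y_train)
print(f"Chosen config: {best_config}, validation accuracy: {best_val_accuracy:.4f}")
print(f"Test accuracy (single final evaluation): {accuracy_score(y_test, final_model.predict(X_test)):.4f}")
```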
16 | ## Example
17 |
18 | ```python
19 | # Import necessary libraries
20 | from sklearn.datasets import make_classification # For generating a synthetic classification dataset
21 | from sklearn.model_selection import train_test_split # For splitting the dataset
22 | from sklearn.ensemble import RandomForestClassifier # High-capacity model
23 | from sklearn.metrics import accuracy_score # For model evaluation
24 | from itertools import product # For generating all combinations of hyperparameters
25 |
26 | # Generate a synthetic classification dataset
27 | X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42)
28 |
29 | # Split the dataset into training and testing sets
30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
31 |
32 | # Define possible values for hyperparameters
33 | n_estimators_options = [10, 50, 100, 200]
34 | max_depth_options = [5, 10, 15, 20]
35 |
36 | # Generate all combinations of hyperparameters
37 | configurations = list(product(n_estimators_options, max_depth_options))
38 |
39 | # Dictionary to store test set performance for each configuration
40 | test_set_performance = {}
41 |
42 | # Variable to track the best configuration so far
43 | best_config_so_far = None
44 | best_accuracy_so_far = 0
45 |
46 | # Loop through each configuration
47 | for n_estimators, max_depth in configurations:
48 | # Create the model with the current configuration
49 | model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
50 |
51 | # Fit the model on the training set
52 | model.fit(X_train, y_train)
53 |
54 | # Evaluate the model on the test set
55 | y_pred = model.predict(X_test)
56 | accuracy = accuracy_score(y_test, y_pred)
57 |
58 | # Store the performance on the test set
59 | test_set_performance[f"n_estimators={n_estimators}, max_depth={max_depth}"] = accuracy
60 |
61 | # Update and display progress
62 | if accuracy > best_accuracy_so_far:
63 | best_config_so_far = (n_estimators, max_depth)
64 | best_accuracy_so_far = accuracy
65 | print(f"cfg: n_estimators={n_estimators}, max_depth={max_depth}, Accuracy: {accuracy:.4f} " + f"(Best: {best_accuracy_so_far:.4f})")
66 |
67 | # Print the final best configuration and its test set accuracy
68 | print(f"Final Best Configuration: n_estimators={best_config_so_far[0]}, max_depth={best_config_so_far[1]}, Test Set Accuracy: {best_accuracy_so_far:.4f}")
69 | ```
70 |
71 | Example Output:
72 |
73 | ```text
74 | cfg: n_estimators=10, max_depth=5, Accuracy: 0.8400 (Best: 0.8400)
75 | cfg: n_estimators=10, max_depth=10, Accuracy: 0.8800 (Best: 0.8800)
76 | cfg: n_estimators=10, max_depth=15, Accuracy: 0.8850 (Best: 0.8850)
77 | cfg: n_estimators=10, max_depth=20, Accuracy: 0.8750 (Best: 0.8850)
78 | cfg: n_estimators=50, max_depth=5, Accuracy: 0.8750 (Best: 0.8850)
79 | cfg: n_estimators=50, max_depth=10, Accuracy: 0.9100 (Best: 0.9100)
80 | cfg: n_estimators=50, max_depth=15, Accuracy: 0.8900 (Best: 0.9100)
81 | cfg: n_estimators=50, max_depth=20, Accuracy: 0.9000 (Best: 0.9100)
82 | cfg: n_estimators=100, max_depth=5, Accuracy: 0.8800 (Best: 0.9100)
83 | cfg: n_estimators=100, max_depth=10, Accuracy: 0.9000 (Best: 0.9100)
84 | cfg: n_estimators=100, max_depth=15, Accuracy: 0.9000 (Best: 0.9100)
85 | cfg: n_estimators=100, max_depth=20, Accuracy: 0.9000 (Best: 0.9100)
86 | cfg: n_estimators=200, max_depth=5, Accuracy: 0.8700 (Best: 0.9100)
87 | cfg: n_estimators=200, max_depth=10, Accuracy: 0.8750 (Best: 0.9100)
88 | cfg: n_estimators=200, max_depth=15, Accuracy: 0.8800 (Best: 0.9100)
89 | cfg: n_estimators=200, max_depth=20, Accuracy: 0.8800 (Best: 0.9100)
90 | Final Best Configuration: n_estimators=50, max_depth=10, Test Set Accuracy: 0.9100
91 | ```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Machine Learning Mischief
4 |
5 | **Do not do this!!!**
6 |
7 | _It is possible to "bend" machine learning experiments towards achieving a preconceived goal._
8 |
9 | This involves systematically exploiting evaluation metrics and/or scientific tests to achieve desired outcomes without actually meeting the underlying scientific objectives.
10 |
11 | These behaviors are _unethical_ and might be called [_cherry picking_](https://en.wikipedia.org/wiki/Cherry_picking), [_data dredging_](https://en.wikipedia.org/wiki/Data_dredging), or _gaming results_.
12 |
13 | Reviewing examples of this type of "gaming" (data science dark arts) can remind beginners and stakeholders (really all of us!) why certain methods are best practices and how to avoid being deceived by results that are too good to be true.
14 |
15 | ## Examples
16 |
17 | Below are examples of this type of gaming, and simple demonstrations of each:
18 |
19 | * [Seed Hacking](examples/seed_hacking.md): _Repeat an experiment with different random number seeds to get the best result._
20 | * [Cross-Validation](examples/seed_hacking_cross_validation.md): _Vary the seed for creating cross-validation folds in order to get the best result._
21 | * [Train/Test Split](examples/seed_hacking_train_test_split.md): _Vary the seed for creating train/test splits in order to get the best result._
22 | * [Learning Algorithm](examples/seed_hacking_learning_algorithm.md): _Vary the seed for the model training algorithm in order to get the best result._
23 | * [Bootstrap Performance](examples/seed_hacking_bootstrap_performance.md): _Vary the bootstrap random seed to present the best model performance._
24 | * [p-Hacking](examples/p_hacking.md): _Repeat a statistical hypothesis test until a significant result is achieved._
25 | * [Selective Sampling](examples/p_hacking_selective_sampling.md): _Vary samples in order to fit a model with significantly better performance._
26 | * [Feature Selection](examples/p_hacking_feature_selection.md): _Vary features in order to fit a model with significantly better performance._
27 |   * [Learning Algorithm](examples/p_hacking_learning_algorithm.md): _Vary the learning algorithm seed in order to get a significantly better result._
28 | * [Test Harness Hacking](examples/test_harness_hacking.md): _Varying models and hyperparameters to maximize test harness performance._
29 |   * [Hill Climb CV Test Folds](examples/test_harness_hacking_hill_climbing_test_folds.md): _Adapt predictions for each cross-validation test fold over repeated trials._
30 |   * [Hill Climb CV Performance](examples/test_harness_hacking_hill_climbing_performance.md): _Excessively adapt a model for cross-validation performance._
31 |   * [Test Harness Hacking Mitigation](examples/test_harness_hacking_mitigation.md): _Modern practices can mitigate the risk of test harness hacking._
32 | * [Test Set Memorization](examples/test_set_memorization.md): _Allow the model to memorize the test set and get a perfect score._
33 | * [Test Set Overfitting](examples/test_set_overfitting.md): _Optimizing a model for its performance on a "hold out" test set._
34 | * [Test Set Pruning](examples/test_set_pruning.md): _Remove hard-to-predict examples from the test set to improve results._
35 | * [Train/Test Split Ratio Gaming](examples/train_test_ratio_gaming.md): _Vary train/test split ratios until a desired result is achieved._
36 | * [Leaderboard Hacking](examples/leaderboard_hacking.md): _Issue predictions for a machine learning competition until a perfect score is achieved._
37 | * [Threshold Hacking](examples/threshold_hacking.md): _Adjusting classification thresholds to hit specific metric targets._
38 |
39 | ## How To Spot
40 |
41 | Results presented using these methods are easy to spot with probing questions:
42 |
43 | * "_Why did you use such a specific random number seed?_"
44 | * "_Why did you choose this split ratio over other more common ratios?_"
45 | * "_Why did you remove this example from the test set and not that example?_"
46 | * "_Why didn't you report a performance distribution over repeated resampling of the data?_"
47 |
48 | All this highlights that the choices in an experimental method must be defensible, especially those that deviate from widely adopted heuristics!
49 |
50 | ## DO NOT DO THIS
51 |
52 | This project is for **educational purposes only**!
53 |
54 | If you use these methods on a project, you're unethical, a fraud, and your results are garbage.
55 |
56 | Also, results/models will be fragile and will not generalize to new data in production or a surprise/hidden test set. You will be found out. A competent senior data scientist (or LLM?) will see what is up very quickly.
57 |
58 | ### So why give examples?
59 |
60 | I've never seen anything like this for machine learning and data science. Yet, most experienced practitioners know that these practices are a _real thing_.
61 |
62 | Knowing what-to-look-for can help stakeholders, managers, teachers, paper reviewers, etc.
63 |
64 | Knowing what-not-to-do can help junior data scientists.
65 |
66 | Also, thinking about and writing these examples feels naughty + fun :)
67 |
68 | ## More
69 |
70 | If you like this project, you may be interested in [Data Science Diagnostics](https://DataScienceDiagnostics.com).
71 |
72 | If you have ideas for more examples, email me: Jason.Brownlee05@gmail.com
73 |
74 |
--------------------------------------------------------------------------------
/examples/train_test_ratio_gaming.md:
--------------------------------------------------------------------------------
1 | # Train/Test Split Ratio Gaming
2 |
3 | > Vary train/test split ratios until a desired result is achieved.
4 |
5 | ## Description
6 |
7 | Train/Test Split Ratio Gaming is a problematic practice where data scientists artificially adjust the proportion of data used for training versus testing until they achieve their desired model performance metrics.
8 |
9 | This approach involves repeatedly modifying the random split between training and test data, essentially "shopping" for a split ratio that produces favorable results. It's particularly tempting for new data scientists who are under pressure to demonstrate good model performance.
10 |
11 | The fundamental issue with this technique is that it violates the principle of having a truly independent test set. By optimizing the split ratio based on test results, you're inadvertently allowing information from the test set to influence your model selection process.
12 |
13 | This practice leads to overly optimistic performance estimates and models that will likely perform worse in real-world applications. It's especially dangerous because it can be difficult for others to detect this manipulation just by looking at the final results.
14 |
15 | The correct approach is to set your train/test split ratio based on statistical principles and dataset characteristics before any model training begins. Common splits like 80/20 or 70/30 should be chosen based on dataset size and problem requirements, not results.
16 |
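For contrast, the short sketch below (an illustration only, not part of the gaming example that follows) shows the defensible alternative described above: an 80/20 split is assumed, fixed before any model training, and evaluated exactly once.

```python
# Sketch: the split ratio is fixed up front (80/20 assumed) and used once,
# rather than being varied until a favourable accuracy appears.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Same kind of synthetic dataset as the example below
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                           n_redundant=5, random_state=42)

# Split ratio chosen before any model training, stratified to preserve class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# A single, pre-registered evaluation on the held-out test set
print(f"Test accuracy (fixed 80/20 split): {accuracy_score(y_test, model.predict(X_test)):.3f}")
```
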
17 | ## Example
18 |
19 | ```python
20 | # Import necessary libraries
21 | import numpy as np
22 | import pandas as pd
23 | from sklearn.datasets import make_classification
24 | from sklearn.ensemble import RandomForestClassifier
25 | from sklearn.model_selection import train_test_split
26 | from sklearn.metrics import accuracy_score
27 |
28 | # Generate a synthetic classification dataset
29 | X, y = make_classification(
30 | n_samples=1000, # Number of samples
31 | n_features=20, # Number of features
32 | n_informative=15, # Number of informative features
33 | n_redundant=5, # Number of redundant features
34 | random_state=42 # Fixing random state for reproducibility
35 | )
36 |
37 | # Fix random seed for consistent train/test splits
38 | random_seed = 42
39 |
40 | # Initialize a variable to track the best test performance and associated split ratio
41 | best_accuracy = 0
42 | best_ratio = 0
43 |
44 | # Iterate over train/test split ratios from 50% to 99% in 1% increments
45 | for train_size in range(50, 100): # Split ratios vary from 50% to 99%
46 | test_size = 100 - train_size # Calculate corresponding test size
47 |
48 | # Split the dataset into train and test sets
49 | X_train, X_test, y_train, y_test = train_test_split(
50 | X, y,
51 |         train_size=train_size / 100.0, # Convert the percentage to a proportion
52 | random_state=random_seed # Fix the random seed
53 | )
54 |
55 | # Initialize a Random Forest Classifier
56 | model = RandomForestClassifier(random_state=random_seed)
57 |
58 | # Train the model on the training data
59 | model.fit(X_train, y_train)
60 |
61 | # Predict on the test set
62 | y_pred = model.predict(X_test)
63 |
64 | # Evaluate test performance using accuracy
65 | accuracy = accuracy_score(y_test, y_pred)
66 |
67 | # Report progress
68 | print(f'> {train_size}/{test_size}: {accuracy}')
69 |
70 | # Update the best accuracy and split ratio if current accuracy is better
71 | if accuracy > best_accuracy:
72 | best_accuracy = accuracy
73 | best_ratio = train_size
74 |
75 | # Print the best train/test split ratio and corresponding accuracy
76 | print(f"Best train/test split ratio: {best_ratio}/{100 - best_ratio}")
77 | print(f"Best test accuracy: {best_accuracy}")
78 | ```
79 |
80 | Example Output:
81 |
82 | ```text
83 | > 50/50: 0.884
84 | > 51/49: 0.8918367346938776
85 | > 52/48: 0.8916666666666667
86 | > 53/47: 0.8765957446808511
87 | > 54/46: 0.8760869565217392
88 | > 55/45: 0.8844444444444445
89 | > 56/44: 0.884090909090909
90 | > 57/43: 0.8953488372093024
91 | > 58/42: 0.8833333333333333
92 | > 59/41: 0.8926829268292683
93 | > 60/40: 0.89
94 | > 61/39: 0.8948717948717949
95 | > 62/38: 0.9131578947368421
96 | > 63/37: 0.9081081081081082
97 | > 64/36: 0.9055555555555556
98 | > 65/35: 0.9142857142857143
99 | > 66/34: 0.9117647058823529
100 | > 67/33: 0.906060606060606
101 | > 68/32: 0.90625
102 | > 69/31: 0.8903225806451613
103 | > 70/30: 0.8866666666666667
104 | > 71/29: 0.903448275862069
105 | > 72/28: 0.8892857142857142
106 | > 73/27: 0.8851851851851852
107 | > 74/26: 0.8846153846153846
108 | > 75/25: 0.884
109 | > 76/24: 0.8916666666666667
110 | > 77/23: 0.8826086956521739
111 | > 78/22: 0.8727272727272727
112 | > 79/21: 0.8857142857142857
113 | > 80/20: 0.9
114 | > 81/19: 0.9
115 | > 82/18: 0.8888888888888888
116 | > 83/17: 0.8823529411764706
117 | > 84/16: 0.89375
118 | > 85/15: 0.8733333333333333
119 | > 86/14: 0.9285714285714286
120 | > 87/13: 0.8846153846153846
121 | > 88/12: 0.9166666666666666
122 | > 89/11: 0.9090909090909091
123 | > 90/10: 0.94
124 | > 91/9: 0.9222222222222223
125 | > 92/8: 0.9125
126 | > 93/7: 0.9142857142857143
127 | > 94/6: 0.9166666666666666
128 | > 95/5: 0.9
129 | > 96/4: 0.9
130 | > 97/3: 0.9333333333333333
131 | > 98/2: 0.9
132 | > 99/1: 0.9
133 | Best train/test split ratio: 90/10
134 | Best test accuracy: 0.94
135 | ```
--------------------------------------------------------------------------------
/examples/test_harness_hacking_hill_climbing_test_folds.md:
--------------------------------------------------------------------------------
1 | # Test Harness Hacking: Hill Climb Cross-Validation Test Folds
2 |
3 | > Adapt predictions for each cross-validation test fold over repeated trials.
4 |
5 | ## Description
6 |
7 | This involves exploiting k-fold cross-validation to artificially improve model performance.
8 |
9 | The model adapts its predictions for each fold during cross-validation trials, fully utilizing the performance metric signal from the test folds. Over time, this "hill-climbing" process fine-tunes predictions specifically for the test folds, leading to near-perfect results within the cross-validation framework.
10 |
11 | However, this method ignores the need for generalization to new data. When applied to a real holdout test set, the model's performance collapses, producing random or inaccurate predictions.
12 |
13 | This practice is unrealistic and misleading, as it relies on overfitting to test folds rather than building a robust, generalizable model.
14 |
15 | As such, it provides an idealized worst-case scenario: a data scientist overfitting the training dataset in the face of a robust test harness that uses k-fold cross-validation.
16 |
17 | ## Example
18 |
19 | This example starts by initializing random predictions for all data points in the training set and performs repeated trials.
20 |
21 | Each trial consists of one full k-fold cross-validation pass. During each fold, after evaluating predictions on the test fold, the algorithm makes a single adaptation to the predictions to improve accuracy on that specific fold. These adaptations accumulate over trials, effectively "hill climbing" towards perfect predictions on the cross-validation folds.
22 |
23 | However, because this process overfits predictions to the cross-validation setup, the resulting model fails to generalize. When evaluated on a holdout test set, it produces random, non-generalizable predictions, highlighting the misleading nature of this approach.
24 |
25 | ```python
26 | # Import necessary libraries
27 | import numpy as np
28 | from sklearn.datasets import make_classification
29 | from sklearn.model_selection import KFold, train_test_split
30 | from sklearn.metrics import accuracy_score
31 |
32 | # Generate a synthetic classification dataset
33 | X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
34 |
35 | # Split the dataset into a train and test set
36 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
37 |
38 | # Define k-fold cross-validation
39 | kfold = KFold(n_splits=5, shuffle=True, random_state=42)
40 |
41 | # Initialize random predictions across all data points in the training set
42 | predictions = np.random.choice(np.unique(y_train), size=len(X_train))
43 |
44 | # Maximum number of trials
45 | n_trials = 100
46 |
47 | # Begin hill-climbing meta-algorithm
48 | for trial in range(n_trials):
49 | print(f"Trial {trial + 1}/{n_trials}")
50 |
51 | # Initialize variables to track progress across folds
52 | fold_accuracies = []
53 |
54 | # Perform k-fold cross-validation
55 | for train_idx, test_idx in kfold.split(X_train):
56 | # Get test fold indices
57 | y_test_fold = y_train[test_idx]
58 | fold_predictions = predictions[test_idx]
59 |
60 | # Evaluate the current predictions on the test fold
61 | current_accuracy = accuracy_score(y_test_fold, fold_predictions)
62 |
63 | # Adapt predictions based on test fold performance (hill climbing)
64 | if current_accuracy < 1.0: # If not perfect
65 | for i in range(len(test_idx)):
66 | idx = test_idx[i]
67 | if predictions[idx] != y_train[idx]: # Fix one wrong prediction
68 | predictions[idx] = y_train[idx]
69 | break # Stop after a single modification
70 |
71 | # Recalculate fold accuracy after adaptation
72 | updated_fold_predictions = predictions[test_idx]
73 | updated_accuracy = accuracy_score(y_test_fold, updated_fold_predictions)
74 | fold_accuracies.append(updated_accuracy)
75 |
76 | # Calculate and report average accuracy across all folds for this trial
77 | avg_accuracy = np.mean(fold_accuracies)
78 | print(f"Average Accuracy Across Folds: {avg_accuracy:.4f}")
79 |
80 | # Stop trials if all folds achieve perfect accuracy
81 | if avg_accuracy == 1.0:
82 | print("All folds reached perfect accuracy. Stopping trials.")
83 | break
84 |
85 | # Evaluate the "model" on the holdout test set
86 | # Use random predictions for the holdout test set to simulate lack of generalization
87 | test_predictions = np.random.choice(np.unique(y_train), size=len(y_test))
88 | holdout_accuracy = accuracy_score(y_test, test_predictions)
89 |
90 | # Report final results
91 | print("\nFinal Results:")
92 | print(f"Accuracy on holdout test set: {holdout_accuracy:.4f}")
93 | ```
94 |
95 | Example Output:
96 |
97 | ```text
98 | Trial 1/100
99 | Average Accuracy Across Folds: 0.5188
100 | Trial 2/100
101 | Average Accuracy Across Folds: 0.5250
102 | Trial 3/100
103 | Average Accuracy Across Folds: 0.5312
104 | Trial 4/100
105 | Average Accuracy Across Folds: 0.5375
106 | Trial 5/100
107 | Average Accuracy Across Folds: 0.5437
108 | ...
109 | Trial 79/100
110 | Average Accuracy Across Folds: 0.9950
111 | Trial 80/100
112 | Average Accuracy Across Folds: 0.9975
113 | Trial 81/100
114 | Average Accuracy Across Folds: 0.9988
115 | Trial 82/100
116 | Average Accuracy Across Folds: 1.0000
117 | All folds reached perfect accuracy. Stopping trials.
118 |
119 | Final Results:
120 | Accuracy on holdout test set: 0.4100
121 | ```
--------------------------------------------------------------------------------
/examples/test_harness_hacking_hill_climbing_performance.md:
--------------------------------------------------------------------------------
1 | # Test Harness Hacking: Hill Climb Cross-Validation Performance
2 |
3 | > Excessively adapt a model for cross-validation performance.
4 |
5 | ## Description
6 |
7 | This occurs when a model is excessively tuned to maximize performance during cross-validation.
8 |
9 | Using a robust test harness like k-fold cross-validation, repeated iterations of model adjustments are made to improve the average score on test folds.
10 |
11 | However, this over-adaptation leads to overfitting on the training dataset. The model becomes too specialized to the patterns in the cross-validation splits, losing generalizability.
12 |
13 | The issue often arises when the number of improvement trials exceeds the size of the training dataset, creating a misleading sense of success.
14 |
15 | While cross-validation metrics may look impressive, performance on a separate hold-out test set deteriorates. This approach sacrifices real-world accuracy for temporary gains during validation, undermining the model's reliability.
16 |
17 | ## Example
18 |
19 | In this example, a model is excessively tuned to optimize cross-validation performance at the expense of generalizability.
20 |
21 | Using a synthetic classification dataset, a Random Forest model is repeatedly optimized over 100 trials through stochastic hill climbing, adjusting the hyperparameters `n_estimators` and `max_depth` to improve the mean k-fold cross-validation accuracy.
22 |
23 | As the optimization progresses, the cross-validation score steadily improves, but the hold-out test performance often plateaus or deteriorates, highlighting overfitting to the cross-validation splits.
24 |
25 | The code visualizes this divergence between cross-validation and test performance, illustrating how focusing excessively on cross-validation metrics can undermine the model's real-world applicability.
26 |
27 | ```python
28 | # Import necessary libraries
29 | import numpy as np
30 | from sklearn.datasets import make_classification
31 | from sklearn.model_selection import train_test_split, cross_val_score, KFold
32 | from sklearn.ensemble import RandomForestClassifier
33 | import matplotlib.pyplot as plt
34 |
35 | # Generate a synthetic classification dataset
36 | X, y = make_classification(
37 | n_samples=200, n_features=30, n_informative=5, n_redundant=25, random_state=42
38 | )
39 |
40 | # Create a train/test split of the dataset
41 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
42 |
43 | # Set up k-fold cross-validation for the training set
44 | kf = KFold(n_splits=3, shuffle=True, random_state=42)
45 |
46 | # Initialize variables for hill climbing and tracking performance
47 | n_trials = 100 # Number of optimization trials
48 | best_params = {"n_estimators": 10, "max_depth": 2} # Initial hyperparameters
49 | cv_scores = [] # Track cross-validation scores
50 | test_scores = [] # Track hold-out test scores
51 |
52 | # Define a stochastic hill climbing procedure for hyperparameter tuning
53 | for trial in range(n_trials):
54 | # Create a model with current best parameters
55 | model = RandomForestClassifier(
56 | n_estimators=best_params["n_estimators"], max_depth=best_params["max_depth"], random_state=42
57 | )
58 |
59 | # Evaluate model using k-fold cross-validation
60 | cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=kf, scoring="accuracy"))
61 |
62 | # Fit the model on the entire training set and evaluate on the hold-out test set
63 | model.fit(X_train, y_train)
64 | test_score = model.score(X_test, y_test)
65 |
66 | # Record scores
67 | cv_scores.append(cv_score)
68 | test_scores.append(test_score)
69 |
70 | # Print trial results
71 | print(f"Trial {trial+1}: CV Mean Score={cv_score:.4f}, Test Score={test_score:.4f}")
72 |
73 | # Propose a random perturbation of the hyperparameters
74 | new_params = {
75 | "n_estimators": best_params["n_estimators"] + np.random.randint(-10, 11),
76 | "max_depth": best_params["max_depth"] + np.random.randint(-1, 2)
77 | }
78 | new_params["n_estimators"] = max(1, new_params["n_estimators"]) # Ensure valid value
79 | new_params["max_depth"] = max(1, new_params["max_depth"]) # Ensure valid value
80 |
81 | # Evaluate new parameters
82 | new_model = RandomForestClassifier(
83 | n_estimators=new_params["n_estimators"], max_depth=new_params["max_depth"], random_state=42
84 | )
85 | new_cv_score = np.mean(cross_val_score(new_model, X_train, y_train, cv=kf, scoring="accuracy"))
86 |
87 | # Update the best parameters if the new score is better
88 | if new_cv_score > cv_score:
89 | best_params = new_params
90 |
91 | # Plot the cross-validation and hold-out test scores over trials
92 | plt.figure(figsize=(10, 6))
93 | plt.plot(range(1, n_trials + 1), cv_scores, label="Cross-Validation Score")
94 | plt.plot(range(1, n_trials + 1), test_scores, label="Hold-Out Test Score")
95 | plt.xlabel("Trial")
96 | plt.ylabel("Accuracy")
97 | plt.title("Model Performance: Cross-Validation vs Hold-Out Test")
98 | plt.legend()
99 | plt.show()
100 |
101 | # Print final performance metrics
102 | final_model = RandomForestClassifier(
103 | n_estimators=best_params["n_estimators"], max_depth=best_params["max_depth"], random_state=42
104 | )
105 | final_model.fit(X_train, y_train)
106 | final_cv_score = np.mean(cross_val_score(final_model, X_train, y_train, cv=kf, scoring="accuracy"))
107 | final_test_score = final_model.score(X_test, y_test)
108 | print(f"Final Model: CV Mean Score={final_cv_score:.4f}, Test Score={final_test_score:.4f}")
109 | ```
110 |
111 | Example Output:
112 |
113 | 
114 |
115 | ```text
116 | Trial 1: CV Mean Score=0.7123, Test Score=0.7000
117 | Trial 2: CV Mean Score=0.7247, Test Score=0.7250
118 | Trial 3: CV Mean Score=0.7247, Test Score=0.7250
119 | Trial 4: CV Mean Score=0.7247, Test Score=0.7250
120 | Trial 5: CV Mean Score=0.7247, Test Score=0.7250
121 | ...
122 | Trial 95: CV Mean Score=0.8371, Test Score=0.7750
123 | Trial 96: CV Mean Score=0.8371, Test Score=0.7750
124 | Trial 97: CV Mean Score=0.8371, Test Score=0.7750
125 | Trial 98: CV Mean Score=0.8371, Test Score=0.7750
126 | Trial 99: CV Mean Score=0.8371, Test Score=0.7750
127 | Trial 100: CV Mean Score=0.8371, Test Score=0.7750
128 | ```
--------------------------------------------------------------------------------
/examples/seed_hacking_bootstrap_performance.md:
--------------------------------------------------------------------------------
1 | # Seed Hacking: Bootstrap Performance
2 |
3 | > Vary the seed for a bootstrap evaluation of a final chosen model on the test set to present the best performance.
4 |
5 | ## Description
6 |
7 | It is common to present the performance of a final chosen model by training it on the train set and evaluating it using the distribution of performance scores from multiple bootstrap samples of the test set.
8 |
9 | Performance hacking through selective bootstrap seed manipulation is a deceptive practice that artificially inflates model evaluation metrics. It might be referred to as "performance inflation" or "result polishing".
10 |
11 | This technique involves repeatedly running bootstrap evaluations with different random seeds on the test set, then cherry-picking and reporting only the most favorable results.
12 |
13 | While bootstrapping is a valid resampling technique for understanding model variance, deliberately selecting the best-performing seed masks the true model performance and creates unrealistic expectations.
14 |
15 | This practice undermines the fundamental purpose of model evaluation - to get an honest assessment of how well the model will generalize to new data.
16 |
17 | The consequences can be severe when deployed models fail to achieve the reported performance metrics in production, potentially damaging team credibility and business outcomes.
18 |
19 | Instead of seed manipulation, data scientists should report average performance across multiple random seeds or, better yet, use techniques like cross-validation with fixed seeds for reproducible and trustworthy evaluations.
20 |
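As a point of contrast, the sketch below (an illustration, separate from the example that follows) reports the full distribution of scores over many bootstrap samples of the test set, rather than the single most favourable seed.

```python
# Sketch of the honest alternative: aggregate over many bootstrap samples of the
# test set instead of reporting one favourable seed.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# One accuracy score per bootstrap sample; seeds are used systematically, not selectively
scores = []
for seed in range(100):
    X_bs, y_bs = resample(X_test, y_test, random_state=seed)
    scores.append(accuracy_score(y_bs, model.predict(X_bs)))

# Report the whole distribution: mean, standard deviation, and a 95% interval
low, high = np.percentile(scores, [2.5, 97.5])
print(f"Bootstrap accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f} (95% interval {low:.3f}-{high:.3f})")
```
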
21 | ## Example
22 |
23 | ```python
24 | # Import necessary libraries
25 | from sklearn.datasets import make_classification
26 | from sklearn.model_selection import train_test_split
27 | from sklearn.ensemble import RandomForestClassifier
28 | from sklearn.utils import resample
29 | from sklearn.metrics import accuracy_score
30 | import numpy as np
31 |
32 | # Generate a synthetic classification dataset
33 | X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
34 |
35 | # Split the dataset into a training set and a test set
36 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
37 |
38 | # Initialize the random forest classifier
39 | model = RandomForestClassifier(random_state=42)
40 |
41 | # Train the model on the training set
42 | model.fit(X_train, y_train)
43 |
44 | # Number of bootstrap iterations
45 | num_bootstrap_iterations = 50
46 |
47 | # Number of repetitions for each bootstrap sample
48 | num_repeats_per_sample = 10
49 |
50 | # Variable to track the best accuracy and corresponding seed
51 | best_accuracy = 0
52 | best_seed = None
53 |
54 | # Iterate through multiple random seeds for bootstrap sampling
55 | for seed in range(num_bootstrap_iterations):
56 | # List to store accuracy scores for each repeat
57 | repeat_accuracies = []
58 |
59 | # Evaluate the model on the same bootstrap sample multiple times
60 | for _ in range(num_repeats_per_sample):
61 | # Generate a bootstrap sample of the test set
62 | X_test_bootstrap, y_test_bootstrap = resample(X_test, y_test, random_state=seed)
63 | y_pred = model.predict(X_test_bootstrap)
64 | accuracy = accuracy_score(y_test_bootstrap, y_pred)
65 | repeat_accuracies.append(accuracy)
66 |
67 | # Compute the median accuracy for the current bootstrap sample
68 | median_accuracy = np.median(repeat_accuracies)
69 |
70 | # Report progress
71 | print(f'> Seed={seed}, Median Accuracy: {median_accuracy}')
72 |
73 | # Keep track of the best performance and its corresponding seed
74 | if median_accuracy > best_accuracy:
75 | best_accuracy = median_accuracy
76 | best_seed = seed
77 |
78 | # Print the selected seed with the best accuracy (artificially chosen for presentation)
79 | print(f"Best Seed: {best_seed}, Best Median Accuracy: {best_accuracy}")
80 | ```
81 |
82 | Example Output:
83 |
84 | ```text
85 | > Seed=0, Median Accuracy: 0.87
86 | > Seed=1, Median Accuracy: 0.82
87 | > Seed=2, Median Accuracy: 0.8466666666666667
88 | > Seed=3, Median Accuracy: 0.83
89 | > Seed=4, Median Accuracy: 0.8433333333333334
90 | > Seed=5, Median Accuracy: 0.8366666666666667
91 | > Seed=6, Median Accuracy: 0.8633333333333333
92 | > Seed=7, Median Accuracy: 0.87
93 | > Seed=8, Median Accuracy: 0.8433333333333334
94 | > Seed=9, Median Accuracy: 0.86
95 | > Seed=10, Median Accuracy: 0.88
96 | > Seed=11, Median Accuracy: 0.8633333333333333
97 | > Seed=12, Median Accuracy: 0.8466666666666667
98 | > Seed=13, Median Accuracy: 0.8666666666666667
99 | > Seed=14, Median Accuracy: 0.8333333333333334
100 | > Seed=15, Median Accuracy: 0.8466666666666667
101 | > Seed=16, Median Accuracy: 0.8666666666666667
102 | > Seed=17, Median Accuracy: 0.8333333333333334
103 | > Seed=18, Median Accuracy: 0.8733333333333333
104 | > Seed=19, Median Accuracy: 0.8233333333333334
105 | > Seed=20, Median Accuracy: 0.8633333333333333
106 | > Seed=21, Median Accuracy: 0.8433333333333334
107 | > Seed=22, Median Accuracy: 0.8366666666666667
108 | > Seed=23, Median Accuracy: 0.8466666666666667
109 | > Seed=24, Median Accuracy: 0.85
110 | > Seed=25, Median Accuracy: 0.8466666666666667
111 | > Seed=26, Median Accuracy: 0.8533333333333334
112 | > Seed=27, Median Accuracy: 0.8633333333333333
113 | > Seed=28, Median Accuracy: 0.8733333333333333
114 | > Seed=29, Median Accuracy: 0.82
115 | > Seed=30, Median Accuracy: 0.8566666666666667
116 | > Seed=31, Median Accuracy: 0.8766666666666667
117 | > Seed=32, Median Accuracy: 0.9
118 | > Seed=33, Median Accuracy: 0.8366666666666667
119 | > Seed=34, Median Accuracy: 0.8533333333333334
120 | > Seed=35, Median Accuracy: 0.8566666666666667
121 | > Seed=36, Median Accuracy: 0.8766666666666667
122 | > Seed=37, Median Accuracy: 0.8266666666666667
123 | > Seed=38, Median Accuracy: 0.82
124 | > Seed=39, Median Accuracy: 0.8533333333333334
125 | > Seed=40, Median Accuracy: 0.8366666666666667
126 | > Seed=41, Median Accuracy: 0.81
127 | > Seed=42, Median Accuracy: 0.8166666666666667
128 | > Seed=43, Median Accuracy: 0.8833333333333333
129 | > Seed=44, Median Accuracy: 0.8733333333333333
130 | > Seed=45, Median Accuracy: 0.8766666666666667
131 | > Seed=46, Median Accuracy: 0.88
132 | > Seed=47, Median Accuracy: 0.8466666666666667
133 | > Seed=48, Median Accuracy: 0.9033333333333333
134 | > Seed=49, Median Accuracy: 0.89
135 | Best Seed: 48, Best Median Accuracy: 0.9033333333333333
136 | ```
--------------------------------------------------------------------------------
/examples/threshold_hacking.md:
--------------------------------------------------------------------------------
1 | # Threshold Hacking
2 |
3 | > Adjusting classification thresholds to hit specific metric targets.
4 |
5 | ## Description
6 | Threshold hacking is a problematic practice in machine learning where practitioners manipulate classification thresholds solely to achieve specific performance metrics, rather than considering real-world impact.
7 |
8 | This approach involves adjusting the probability cutoff point that determines when a model classifies something as positive or negative, without proper statistical or business justification. While threshold tuning itself is valid, threshold hacking aims only to hit arbitrary metric targets like accuracy or F1 score.
9 |
10 | The danger lies in creating models that appear to perform well on paper but fail to generalize or provide meaningful business value. This often occurs when data scientists feel pressure to meet performance benchmarks without full consideration of the model's practical applications.
11 |
12 | For new data scientists, this pattern can be particularly tempting when facing pressure to demonstrate model effectiveness. However, it typically leads to models that perform poorly in production, potentially damaging both business outcomes and professional credibility.
13 |
14 | A better approach is to set thresholds based on careful analysis of business requirements, costs of different types of errors, and thorough validation across multiple metrics. This ensures models deliver real value rather than just impressive-looking numbers.
15 |
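For contrast, here is a minimal sketch of the better approach described above. The error costs are hypothetical (chosen only for illustration), and the threshold is selected on a validation split against that explicit cost function, leaving the test set for a single final evaluation.

```python
# Sketch of cost-based threshold selection (illustrative, hypothetical costs):
# the threshold is chosen on a validation split by minimizing expected error cost,
# and the untouched test set is used only once at the end.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                           n_redundant=5, random_state=42)

# Train / validation / test split: the test set plays no role in choosing the threshold
X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=42)

model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Hypothetical business costs: a false negative is assumed to cost 5x a false positive
COST_FP, COST_FN = 1.0, 5.0

def expected_cost(y_true, proba, threshold):
    # Total cost of the errors made at this threshold
    tn, fp, fn, tp = confusion_matrix(y_true, (proba >= threshold).astype(int), labels=[0, 1]).ravel()
    return COST_FP * fp + COST_FN * fn

val_proba = model.predict_proba(X_val)[:, 1]
thresholds = np.linspace(0.1, 0.9, 81)
best_threshold = min(thresholds, key=lambda t: expected_cost(y_val, val_proba, t))

# Final, single evaluation on the held-out test set with the chosen threshold
test_pred = (model.predict_proba(X_test)[:, 1] >= best_threshold).astype(int)
print(f"Chosen threshold (from validation set): {best_threshold:.2f}")
print(f"Test precision at that threshold: {precision_score(y_test, test_pred):.2f}")
```
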
16 | ## Example
17 |
18 | ```python
19 | # Import necessary libraries
20 | import numpy as np
21 | from sklearn.datasets import make_classification
22 | from sklearn.model_selection import train_test_split
23 | from sklearn.linear_model import LogisticRegression
24 | from sklearn.metrics import precision_score
25 |
26 | # Generate a synthetic classification dataset
27 | X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
28 | n_redundant=5, random_state=42)
29 |
30 | # Split the dataset into train and test sets
31 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
32 |
33 | # Initialize the logistic regression model
34 | model = LogisticRegression(random_state=42, max_iter=1000)
35 |
36 | # Train the model on the training set
37 | model.fit(X_train, y_train)
38 |
39 | # Get raw predicted probabilities for the positive class
40 | y_proba = model.predict_proba(X_test)[:, 1]
41 |
42 | # Define a range of thresholds to evaluate
43 | thresholds = np.linspace(0.1, 0.9, 81)
44 |
45 | # Track best precision score and corresponding threshold
46 | best_precision = 0
47 | best_threshold = 0
48 |
49 | # Iterate over each threshold
50 | print("Threshold Tuning Progress:")
51 | print(f"{'Threshold':<10}{'Precision':<10}{'Best Precision':<15}{'Best Threshold':<15}")
52 | for threshold in thresholds:
53 | # Convert probabilities to binary predictions based on the current threshold
54 | y_pred = (y_proba >= threshold).astype(int)
55 |
56 | # Calculate precision score
57 | precision = precision_score(y_test, y_pred)
58 |
59 | # Check if this is the best precision score so far
60 | if precision > best_precision:
61 | best_precision = precision
62 | best_threshold = threshold
63 |
64 | # Report progress
65 | print(f"{threshold:<10.2f}{precision:<10.2f}{best_precision:<15.2f}{best_threshold:<15.2f}")
66 |
67 | # Final best score and threshold
68 | print("\nFinal Results:")
69 | print(f"Best Precision: {best_precision:.2f}")
70 | print(f"Best Threshold: {best_threshold:.2f}")
71 | ```
72 |
73 | Example Output:
74 |
75 | ```text
76 | Threshold Tuning Progress:
77 | Threshold Precision Best Precision Best Threshold
78 | 0.10 0.61 0.61 0.10
79 | 0.11 0.61 0.61 0.11
80 | 0.12 0.62 0.62 0.12
81 | 0.13 0.62 0.62 0.13
82 | 0.14 0.64 0.64 0.14
83 | 0.15 0.64 0.64 0.15
84 | 0.16 0.65 0.65 0.16
85 | 0.17 0.66 0.66 0.17
86 | 0.18 0.67 0.67 0.18
87 | 0.19 0.67 0.67 0.19
88 | 0.20 0.67 0.67 0.20
89 | 0.21 0.67 0.67 0.20
90 | 0.22 0.68 0.68 0.22
91 | 0.23 0.68 0.68 0.23
92 | 0.24 0.68 0.68 0.23
93 | 0.25 0.68 0.68 0.23
94 | 0.26 0.68 0.68 0.26
95 | 0.27 0.70 0.70 0.27
96 | 0.28 0.70 0.70 0.28
97 | 0.29 0.70 0.70 0.29
98 | 0.30 0.71 0.71 0.30
99 | 0.31 0.71 0.71 0.31
100 | 0.32 0.73 0.73 0.32
101 | 0.33 0.73 0.73 0.33
102 | 0.34 0.73 0.73 0.34
103 | 0.35 0.73 0.73 0.34
104 | 0.36 0.74 0.74 0.36
105 | 0.37 0.74 0.74 0.36
106 | 0.38 0.74 0.74 0.36
107 | 0.39 0.74 0.74 0.36
108 | 0.40 0.74 0.74 0.36
109 | 0.41 0.75 0.75 0.41
110 | 0.42 0.74 0.75 0.41
111 | 0.43 0.75 0.75 0.43
112 | 0.44 0.76 0.76 0.44
113 | 0.45 0.77 0.77 0.45
114 | 0.46 0.78 0.78 0.46
115 | 0.47 0.78 0.78 0.47
116 | 0.48 0.79 0.79 0.48
117 | 0.49 0.79 0.79 0.48
118 | 0.50 0.79 0.79 0.50
119 | 0.51 0.80 0.80 0.51
120 | 0.52 0.80 0.80 0.51
121 | 0.53 0.80 0.80 0.53
122 | 0.54 0.81 0.81 0.54
123 | 0.55 0.81 0.81 0.54
124 | 0.56 0.81 0.81 0.54
125 | 0.57 0.81 0.81 0.54
126 | 0.58 0.81 0.81 0.58
127 | 0.59 0.82 0.82 0.59
128 | 0.60 0.82 0.82 0.59
129 | 0.61 0.82 0.82 0.59
130 | 0.62 0.83 0.83 0.62
131 | 0.63 0.83 0.83 0.63
132 | 0.64 0.83 0.83 0.63
133 | 0.65 0.84 0.84 0.65
134 | 0.66 0.85 0.85 0.66
135 | 0.67 0.85 0.85 0.66
136 | 0.68 0.86 0.86 0.68
137 | 0.69 0.86 0.86 0.69
138 | 0.70 0.86 0.86 0.69
139 | 0.71 0.86 0.86 0.69
140 | 0.72 0.85 0.86 0.69
141 | 0.73 0.86 0.86 0.73
142 | 0.74 0.87 0.87 0.74
143 | 0.75 0.87 0.87 0.74
144 | 0.76 0.87 0.87 0.74
145 | 0.77 0.86 0.87 0.74
146 | 0.78 0.86 0.87 0.74
147 | 0.79 0.87 0.87 0.74
148 | 0.80 0.87 0.87 0.74
149 | 0.81 0.88 0.88 0.81
150 | 0.82 0.90 0.90 0.82
151 | 0.83 0.91 0.91 0.83
152 | 0.84 0.92 0.92 0.84
153 | 0.85 0.91 0.92 0.84
154 | 0.86 0.92 0.92 0.86
155 | 0.87 0.92 0.92 0.86
156 | 0.88 0.92 0.92 0.86
157 | 0.89 0.93 0.93 0.89
158 | 0.90 0.94 0.94 0.90
159 |
160 | Final Results:
161 | Best Precision: 0.94
162 | Best Threshold: 0.90
163 | ```
--------------------------------------------------------------------------------
/examples/test_harness_hacking.md:
--------------------------------------------------------------------------------
1 | # Test Harness Hacking
2 |
3 | > Varying models and hyperparameters to maximize test harness performance at the cost of reduced generalizability.
4 |
5 | ## Description
6 |
7 | When multiple hypotheses, models, or configurations are tested on the same dataset or evaluation framework (test harness), there is a high risk of fitting to the noise or idiosyncrasies of the data rather than uncovering true, generalizable patterns.
8 |
9 | This leads to inflated performance estimates that do not hold when the model is applied to unseen data.
10 |
11 | This issue is known by many names, such as:
12 |
13 | * **Comparing Too Many Hypotheses** / **Checking Too Many Models**: Testing numerous hypotheses or model configurations increases the chance of finding a model that performs well on the test data by coincidence rather than due to its inherent quality.
14 | * **Multiple Comparison Problem** / **Multiple Hypothesis Testing**: A statistical issue where testing multiple hypotheses increases the probability of false positives (e.g., identifying a model as superior when it's not).
15 | * **Oversearching**: Excessive experimentation with hyperparameters, architectures, or algorithms can lead to "discovering" patterns that are not generalizable.
16 | * **Overfitting Model Selection**: When the process of selecting the best model overfits to the evaluation dataset, the chosen model's reported performance becomes unreliable.
17 | * **Test Harness Hacking**: Manipulating the evaluation process, such as by repeatedly tweaking models or hyperparameters, to artificially inflate test harness performance.
18 |
19 | Ideally (from a statistical perspective), candidate hypotheses (models) would be selected for a predictive modeling problem _before_ data is gathered, not after and not adapted to the problem in response to results on the test harness.
20 |
21 | > ... the theory of statistical inference assumes a fixed collection of hypotheses to be tested, or learning algorithms to be applied, selected non-adaptively before the data are gathered, whereas in practice data is shared and reused with hypotheses and new analyses being generated on the basis of data exploration and the outcomes of previous analyses.
22 |
23 | -- [Preserving Statistical Validity in Adaptive Data Analysis](https://arxiv.org/abs/1411.2664), 2014.
24 |
25 | ## Scenario
26 |
27 | A test-setup (specific data, model, and test harness) may be more or less subject to this problem.
28 |
29 | The aspects that exacerbate this problem include:
30 |
31 | * Small dataset.
32 | * Large number of candidate models.
33 | * Large number of candidate model hyperparameter combinations.
34 | * High variance test harness.
35 |
36 | > It seems reasonable to suggest that over-fitting in model selection is possible whenever a model selection criterion evaluated over a finite sample of data is directly optimised. Like over-fitting in training, over-fitting in model selection is likely to be most severe when the sample of data is small and the number of hyper-parameters to be tuned is relatively large.
37 |
38 | -- [On Over-fitting in Model Selection and Subsequent Selection Bias in Performance Evaluation](https://www.jmlr.org/papers/volume11/cawley10a/cawley10a.pdf), 2010.
39 |
40 | The risk is that the variance in model performance on the test harness will result in an optimistic bias (i.e. models look better than they are).
41 |
42 | This bias may be larger than the difference between the performance estimates of different models on the test harness, resulting in Type I errors (false positives) in model selection.
43 |
44 | > The scale of the bias observed on some data sets is much larger than the difference in performance between learning algorithms, and so one could easily draw incorrect inferences based on the results obtained.
45 |
46 | -- [On Over-fitting in Model Selection and Subsequent Selection Bias in Performance Evaluation](https://www.jmlr.org/papers/volume11/cawley10a/cawley10a.pdf), 2010.
47 |
48 | We can depict this scenario with an idealized plot, below.
49 |
50 | ### Graphical Depiction
51 |
52 | 
53 |
54 | The plot above illustrates the distributions of performance metrics for two algorithms:
55 |
56 | - **Algorithm A ("Chosen Algorithm")**:
57 | - Slightly higher mean performance (75).
58 | - Larger variance (10).
59 |
60 | - **Algorithm B ("Alternative Algorithm")**:
61 | - Slightly lower mean performance (72).
62 | - Smaller variance (5).
63 |
64 | Even though Algorithm A is chosen due to its slightly higher mean performance, the variance in its performance is large enough that the difference in means may not be practically significant.
65 |
66 | This underscores the importance of considering the variability in performance and not relying solely on mean values for decision-making.
67 |
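This intuition is easy to simulate. The sketch below assumes the spreads in the plot are standard deviations (an assumption made only for illustration), draws a handful of scores for each algorithm, and checks whether the gap in means stands out from the noise.

```python
# Sketch: simulate scores for the two algorithms above (means 75 and 72; the
# spreads 10 and 5 are assumed to be standard deviations) and test the gap.
import numpy as np
from scipy.stats import ttest_ind

rng = np.random.default_rng(42)
scores_a = rng.normal(loc=75, scale=10, size=10)  # e.g. 10 repeated evaluations
scores_b = rng.normal(loc=72, scale=5, size=10)

t_stat, p_value = ttest_ind(scores_a, scores_b, equal_var=False)  # Welch's t-test
print(f"Algorithm A: {scores_a.mean():.1f} +/- {scores_a.std():.1f}")
print(f"Algorithm B: {scores_b.mean():.1f} +/- {scores_b.std():.1f}")
print(f"Welch's t-test p-value: {p_value:.3f} (a large p-value suggests the gap may be noise)")
```
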
68 | ## Examples
69 |
70 | Below are some examples of test harness hacking.
71 |
72 | * [Hill Climb Cross-Validation Test Folds](test_harness_hacking_hill_climbing_test_folds.md): Adapt predictions for each cross-validation test fold over repeated trials.
73 | * [Hill Climb Cross-Validation Performance](test_harness_hacking_hill_climbing_performance.md): Excessively adapt a model for cross-validation performance.
74 | * [Test Harness Hacking Mitigation](test_harness_hacking_mitigation.md): Modern practices (e.g. repeated k-fold cross-validation) mitigate the risk of test harness hacking.
75 |
76 | ## Impact
77 |
78 | The impact of overfitting the test harness manifests as **optimistic bias** in the performance of the chosen model.
79 |
80 | Here's how this unfolds in a machine learning project:
81 |
82 | 1. **Overfitting to the Test Harness**: Through repeated tuning or evaluation on the test harness, the chosen model exploits idiosyncrasies in the validation/test set rather than learning generalizable patterns.
83 | 2. **Optimistic Performance Estimates**: The model appears to perform exceptionally well on the test harness, creating a false sense of superiority over other models.
84 | 3. **Final Model Evaluation**: When the model is retrained on all available training data and evaluated on a hold-out test set (or deployed in real-world scenarios), its performance is often significantly lower than expected. This happens because the model's improvements on the test harness were based on fitting noise or dataset-specific artifacts.
85 | 4. **Missed Opportunities**: Other models that may generalize better but were overlooked during evaluation (due to lower but more realistic performance on the test harness) might have been more suitable in practice.
86 |
87 | ## Push-Back
88 |
89 | It is possible that the issue of "too many model comparisons" is overblown in modern machine learning.
90 |
91 | This may be because the techniques that mitigate this type of overfitting have become best practices, such as:
92 |
93 | * Adoption of k-fold cross-validation in the test harness.
94 | * Adoption of repeated cross-validation to further reduce variance in performance estimates.
95 | * Adoption of nested cross-validation, to tune hyperparameters within each cross-validation fold.
96 | * Adoption of corrections to cross-validation when used for model selection (e.g. 1 standard error rule).
97 | * Adoption of statistical hypothesis tests to support differences in model performance on the test harness.
98 | * Adoption of modern machine learning algorithms that use regularization, early stopping, and similar methods.
99 |
100 | > The adaptive data analysis literature provides a range of theoretical explanations for how the common machine learning workflow may implicitly mitigate overfitting
101 |
102 | -- [A Meta-Analysis of Overfitting in Machine Learning](https://proceedings.neurips.cc/paper/2019/hash/ee39e503b6bedf0c98c388b7e8589aca-Abstract.html), 2019.
103 |
104 | And:
105 |
106 | > We propose that the computational cost of performing repeated cross-validation and nested cross-validation in the cloud have reached a level where the use of substitutes to full nested cross-validation are no longer justified.
107 |
108 | -- [Cross-validation pitfalls when selecting and assessing regression and classification models](https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-6-10), 2014.
109 |
110 | And:
111 |
112 | > Often a “one-standard error” rule is used with cross-validation, in which we choose the most parsimonious model whose error is no more than one standard error above the error of the best model.
113 |
114 | -- Page 244, [The Elements of Statistical Learning](https://hastie.su.domains/ElemStatLearn/), 2016.
115 |
116 | As such, overfitting the test harness may be less of a concern in applied machine learning than it was one or two decades ago.
117 |
118 | Evidence for this is seen in large-scale machine learning competitions, like those on Kaggle.
119 |
120 | > In each competition, numerous practitioners repeatedly evaluated their progress against a holdout set that forms the basis of a public ranking available throughout the competition. Performance on a separate test set used only once determined the final ranking. By systematically comparing the public ranking with the final ranking, we assess how much participants adapted to the holdout set over the course of a competition. Our study shows, somewhat surprisingly, little evidence of substantial overfitting.
121 |
122 | -- [A Meta-Analysis of Overfitting in Machine Learning](https://proceedings.neurips.cc/paper/2019/hash/ee39e503b6bedf0c98c388b7e8589aca-Abstract.html), 2019.
123 |
124 | And:
125 |
126 | > Overall, we conclude that the classification competitions on Kaggle show little to no signs of overfitting. While there are some outlier competitions in the data, these competitions usually have pathologies such as non-i.i.d. data splits or (effectively) small test sets. Among the remaining competitions, the public and private test scores show a remarkably good correspondence. The picture becomes more nuanced among the highest scoring submissions, but the overall effect sizes of (potential) overfitting are typically small (e.g., less than 1% classification accuracy). Thus, our findings show that substantial overfitting is unlikely to occur naturally in regular machine learning workflows.
127 |
128 | -- [A Meta-Analysis of Overfitting in Machine Learning](https://proceedings.neurips.cc/paper/2019/hash/ee39e503b6bedf0c98c388b7e8589aca-Abstract.html), 2019.
129 |
130 | Additional evidence for this is seen in popular computer vision deep learning benchmark datasets, on which continued progress, rather than overfitting, is observed.
131 |
132 | > Recent replication studies [16] demonstrated that the popular CIFAR-10 and ImageNet benchmarks continue to support progress despite years of intensive use. The longevity of these benchmarks perhaps suggests that overfitting to holdout data is less of a concern than reasoning from first principles might have suggested.
133 |
134 | -- [A Meta-Analysis of Overfitting in Machine Learning](https://proceedings.neurips.cc/paper/2019/hash/ee39e503b6bedf0c98c388b7e8589aca-Abstract.html), 2019.
135 |
136 | These findings suggest that test-harness hacking may be achieved by intentionally not observing modern best practices like those listed above.
137 |
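To make the practices listed above concrete, here is a minimal sketch (one possible setup, not a prescription) that combines two of them: hyperparameters are tuned inside each outer fold via nested cross-validation, and the outer evaluation is repeated to reduce variance.

```python
# Sketch of a harder-to-hack harness: nested cross-validation (tuning inside each
# outer fold) with a repeated outer loop to reduce the variance of the estimate.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, cross_val_score

X, y = make_classification(n_samples=500, n_features=20, n_informative=10, random_state=42)

# Inner loop: model selection (hyperparameter tuning) on the training folds only
param_grid = {"n_estimators": [50, 100], "max_depth": [5, 10, None]}
search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3)

# Outer loop: model assessment, repeated to smooth out fold-to-fold variance
outer_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
scores = cross_val_score(search, X, y, cv=outer_cv, scoring="accuracy")

print(f"Nested, repeated CV accuracy: {scores.mean():.3f} +/- {scores.std():.3f} "
      f"over {len(scores)} outer folds")
```
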
138 | ## Further Reading
139 |
140 | * [A Meta-Analysis of Overfitting in Machine Learning](https://proceedings.neurips.cc/paper/2019/hash/ee39e503b6bedf0c98c388b7e8589aca-Abstract.html), 2019.
141 | * [Cross-validation pitfalls when selecting and assessing regression and classification models](https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-6-10), 2014.
142 | * [Do ImageNet Classifiers Generalize to ImageNet?](https://arxiv.org/abs/1902.10811), 2019.
143 | * [Model Evaluation, Model Selection, and Algorithm Selection in Machine Learning](https://arxiv.org/abs/1811.12808), 2018.
144 | * [Model Similarity Mitigates Test Set Overuse](https://arxiv.org/abs/1905.12580), 2019.
145 | * [Multiple Comparisons in Induction Algorithms](https://link.springer.com/article/10.1023/A:1007631014630), 2000.
146 | * [On Over-fitting in Model Selection and Subsequent Selection Bias in Performance Evaluation](https://www.jmlr.org/papers/volume11/cawley10a/cawley10a.pdf), 2010.
147 | * [Preserving Statistical Validity in Adaptive Data Analysis](https://arxiv.org/abs/1411.2664), 2014.
148 | * [Preventing "Overfitting" of Cross-Validation Data](https://ai.stanford.edu/~ang/papers/cv-final.pdf), 1997.
149 | * [The Elements of Statistical Learning](https://hastie.su.domains/ElemStatLearn/), 2016.
150 |
--------------------------------------------------------------------------------
/examples/p_hacking.md:
--------------------------------------------------------------------------------
1 | # p-Hacking
2 |
3 | > Repeating a statistical hypothesis test until a significant result is achieved.
4 |
5 | ## Description
6 |
7 | P-hacking is the practice of manipulating data analysis until you achieve a statistically significant result, typically to support a predetermined conclusion.
8 |
9 | This approach involves running multiple [statistical hypothesis tests](https://en.wikipedia.org/wiki/Statistical_hypothesis_test) on the same dataset, selectively choosing which data points to include, or adjusting variables until achieving the desired [p-value](https://en.wikipedia.org/wiki/P-value) (typically < 0.05).
10 |
11 | While it may seem tempting to keep testing until you get "significant" results, p-hacking invalidates the fundamental principles of statistical testing and leads to false discoveries.
12 |
13 | The danger lies in increasing the likelihood of [Type I errors](https://en.wikipedia.org/wiki/Type_I_and_type_II_errors) (false positives) through multiple comparisons, making spurious **mean differences** and **correlations** appear meaningful when they're actually due to random chance.
14 |
15 | For new data scientists, this pattern often emerges unintentionally when there's pressure to find significant results or when dealing with stakeholder expectations for positive outcomes.
16 |
17 | To avoid p-hacking, define your hypothesis and analysis plan before examining the data, use correction methods for multiple comparisons, and be transparent about all tests performed - including those that didn't yield significant results.
18 |
19 | Remember that negative results are valid scientific outcomes and should be reported alongside positive findings to maintain research integrity.
20 |
21 | ## Cases of p-hacking in Machine Learning
22 |
23 | Any time we want to use a statistical hypothesis test to compare two samples in a data science/machine learning project, there is an opportunity for p-hacking.
24 |
25 | Common cases include:
26 |
27 | - Comparing data sub-samples by impact on model performance.
28 | - Comparing subsets of input features by correlation with the target or impact on model performance.
29 | - Comparing the performance of models based on cross-validated performance.
30 | - Comparing the performance of a model with different hyperparameters.
31 |
32 | P-hacking requires varying something in the experiment to produce a distribution of samples that 1) gives a result (such as the sample mean) that is "better", and 2) has a p-value (as calculated by a statistical test) below the significance threshold (i.e. is significant).
33 |
34 | The aspect varied is often the seed for the pseudorandom number generator, such as when varying a sampling procedure or learning algorithm. As such, many cases of p-hacking also require [seed hacking](seed_hacking.md).
35 |
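The sketch below is a toy illustration of this mechanism (separate from the worked examples linked next): both groups of scores are drawn from the same distribution, and the seed of one group is varied until a t-test happens to declare the difference significant.

```python
# Toy illustration: both groups come from the SAME distribution, so any
# "significant improvement" found by shopping over seeds is a false positive.
import numpy as np
from scipy.stats import ttest_ind

baseline = np.random.default_rng(0).normal(loc=0.80, scale=0.05, size=30)  # e.g. 30 accuracy scores

for seed in range(1000):
    # "Re-run the experiment" with a different seed
    candidate = np.random.default_rng(seed).normal(loc=0.80, scale=0.05, size=30)
    t_stat, p_value = ttest_ind(candidate, baseline)
    # Stop as soon as the result is both "better" and "significant"
    if candidate.mean() > baseline.mean() and p_value < 0.05:
        print(f"Seed {seed}: mean {candidate.mean():.3f} vs baseline {baseline.mean():.3f}, p={p_value:.4f}")
        print("A 'significant' improvement was found in pure noise.")
        break
```
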
36 | ### Worked Examples of p-Hacking
37 |
38 | Below are some worked examples of p-hacking in a data science/machine learning project.
39 |
40 | * [p-Hacking Selective Sampling](p_hacking_selective_sampling.md): _Vary samples of a dataset in order to fit a model with significantly better performance._
41 | * [p-Hacking Feature Selection](p_hacking_feature_selection.md): _Vary feature subsets of a dataset in order to fit a model with significantly better performance._
42 | * [p-Hacking the Learning Algorithm](p_hacking_learning_algorithm.md): _Vary the random numbers used by a learning algorithm in order to get a significantly better result._
43 |
44 | ## What Does a p-Hacked Result Mean?
45 |
46 | From a statistical perspective, when we say two samples are different and that the difference is significant according to a statistical test, we are typically referring to the rejection of the null hypothesis in favor of the alternative hypothesis based on a test statistic.
47 |
48 | Here's what this means, and why p-hacking can make this problematic:
49 |
50 | ### What Statistical Significance Means
51 | 1. **Null Hypothesis ($H_0$)**:
52 | - The null hypothesis posits that there is no true difference between the two samples or groups. For example, it might assert that the means of the two samples are equal ($\mu_1 = \mu_2$).
53 |
54 | 2. **Alternative Hypothesis ($H_a$)**:
55 | - The alternative hypothesis suggests that there is a true difference between the two samples ($\mu_1 \neq \mu_2$).
56 |
57 | 3. **P-Value**:
58 | - The p-value quantifies the probability of observing a test statistic as extreme (or more extreme) as the one calculated from the data, assuming the null hypothesis is true.
59 | - A low p-value (typically less than 0.05) suggests that such extreme data is unlikely under the null hypothesis, leading to its rejection.
60 |
61 | 4. **Statistical Significance**:
62 | - When a test concludes "statistical significance," it means the data provides sufficient evidence to reject $H_0$ at a predefined significance level ($\alpha$), often 0.05.
63 |
64 | ### The Problem with P-Hacking
65 | 1. **Inflated False Positives**:
66 | - The p-value is conditional on the null hypothesis being true and the experimental procedure being followed correctly. P-hacking violates this assumption by:
67 | - Testing multiple hypotheses without proper correction.
68 | - Cherry-picking results or repeating experiments until a significant p-value is found.
69 | - This inflates the Type I error rate, leading to a higher likelihood of false positives (erroneously rejecting $H_0$ when it is actually true).
70 |
71 | 2. **Violation of Assumptions**:
72 | - Many statistical tests assume random sampling, independence, or a fixed number of hypotheses. P-hacking often violates these assumptions, invalidating the reported p-value.
73 |
74 | 3. **Misleading Conclusions**:
75 | - A significant p-value under p-hacking does not reflect a true effect but rather the exploitation of randomness or bias. This can mislead researchers, practitioners, and policymakers into believing a non-existent effect exists.
76 |
77 | 4. **Overfitting and Non-Reproducibility**:
78 | - P-hacking aligns findings to the specific data sample rather than the underlying population, resulting in overfitted models or findings that fail to generalize.
79 |
80 | ## p-Hacking vs Normal Experimentation
81 |
82 | **What is p-hacking, and what is normal experimental variation in a machine learning project?**
83 |
84 | P-hacking in a machine learning project and the normal variation of aspects of a machine learning pipeline share similarities in that both involve systematically exploring different configurations, datasets, or techniques to optimize results.
85 |
86 | However, they differ significantly in their intent, methodology, and implications.
87 |
88 | Here's an attempt at a comparison:
89 |
90 | ### Intent of Experimentation (intent matters!)
91 | - **P-Hacking:**
92 | - The primary goal is often to achieve statistical significance and a desired result (e.g., a low p-value and an improved metric), even at the cost of scientific or experimental integrity.
93 | - It reflects a bias towards confirming a hypothesis, regardless of whether the result is genuinely meaningful or reproducible.
94 |
95 | - **Normal Variation:**
96 | - The goal is to genuinely identify the best-performing model or configuration while ensuring that findings are robust and reproducible.
97 | - The process is exploratory but grounded in a scientific approach to assess performance in a meaningful and unbiased manner.
98 |
99 | ### Methodology
100 | - **P-Hacking:**
101 | - Involves deliberately cherry-picking or over-exploring configurations to obtain statistically significant results.
102 | - Examples include:
103 | - Running experiments on multiple datasets and only reporting the one that shows the desired results.
104 | - Trying numerous feature subsets or hyperparameters without a predefined protocol, then selecting the ones that yield significant outcomes.
105 | - Repeating experiments until statistical tests yield favorable results (e.g., p-values < 0.05).
106 | - Often lacks transparency, with omitted reporting of failed or contradictory experiments.
107 |
108 | - **Normal Variation:**
109 | - Follows systematic and reproducible protocols for varying datasets, features, models, or hyperparameters.
110 | - Examples include:
111 | - Using predefined validation or test datasets to avoid bias.
112 | - Employing cross-validation or other robust evaluation techniques to ensure generalizability.
113 | - Applying grid search, random search, or Bayesian optimization for hyperparameter tuning within a controlled framework.
114 | - Results are typically presented comprehensively, including cases where configurations performed poorly.
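
For contrast, here is a minimal sketch of the controlled protocol described above, under assumed choices of dataset, model, and parameter grid (none of which come from the worked examples): the search space is fixed in advance, evaluation happens inside cross-validation, all results are reported, and the held-out test set is used exactly once.

```python
# Sketch of a pre-specified, fully reported hyperparameter search.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split

X, y = make_classification(n_samples=500, n_features=20, random_state=7)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

# The grid is decided before looking at any results
param_grid = {"C": [0.01, 0.1, 1.0, 10.0]}
search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring="accuracy")
search.fit(X_train, y_train)

# Report every configuration, not just the winner, then touch the test set once
for params, mean_score in zip(search.cv_results_["params"], search.cv_results_["mean_test_score"]):
    print(params, round(mean_score, 3))
print("Held-out accuracy:", round(search.score(X_test, y_test), 3))
```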
115 |
116 | ### Evaluation and Reporting
117 | - **P-Hacking:**
118 | - Relies heavily on statistical tests to "prove" a point, often without considering the broader context or reproducibility.
119 | - May selectively report results that confirm a hypothesis, leading to overfitting or misrepresentation.
120 | - Lacks emphasis on replicability; findings may not hold on unseen data or alternative setups.
121 |
122 | - **Normal Variation:**
123 | - Focuses on evaluating performance through unbiased metrics like accuracy, F1 score, AUC, etc., on unseen test data.
124 | - Emphasizes transparency, reporting the entire spectrum of experiments (successful and unsuccessful) to give a holistic view.
125 | - Stresses reproducibility, often sharing code, data, and experimental protocols for verification by others.
126 |
127 | ### Impact
128 | - **P-Hacking:**
129 | - Can lead to misleading conclusions, potentially wasting resources or eroding trust in the findings.
130 | - Results are often fragile and fail to generalize beyond the specific experimental conditions.
131 | - Undermines scientific and ethical standards in research.
132 |
133 | - **Normal Variation:**
134 | - Helps identify robust and reliable configurations that generalize well to new data.
135 | - Builds confidence in findings and advances the field by sharing insights into what works and what does not.
136 | - Adheres to principles of transparency, integrity, and reproducibility.
137 |
138 | ### Key Distinction
139 | The fundamental difference lies in **integrity and intent**.
140 |
141 | P-hacking prioritizes achieving "impressive" results at the expense of scientific rigor, often through selective reporting and overfitting.
142 |
143 | In contrast, normal variation is a legitimate and scientifically sound process to explore and optimize machine learning pipelines, grounded in transparency and reproducibility.
144 |
145 | ### Mitigation Strategies
146 | To avoid unintentional p-hacking while exploring variations in machine learning projects (a minimal code sketch follows this list):
147 | - Use rigorous protocols such as cross-validation and pre-registered experiments.
148 | - Report all experiments, including those that yield negative or inconclusive results.
149 | - Evaluate findings on independent test sets that were not used during the exploratory phase.
150 | - Avoid over-reliance on statistical significance as the sole criterion for evaluating results; consider practical significance and generalizability.
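
Here is a minimal sketch of two of these mitigations, assuming scikit-learn and an illustrative synthetic dataset: hold back an untouched test set for a single confirmatory evaluation, and tighten the significance threshold (Bonferroni-style) in proportion to the number of configurations explored.

```python
# Sketch: exploratory work stays on the working data; the final check is a single
# evaluation on data that played no part in the exploration.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split

X, y = make_classification(n_samples=500, n_features=20, random_state=1)
X_work, X_final, y_work, y_final = train_test_split(X, y, test_size=0.3, random_state=1)

# Exploratory phase: several configurations, evaluated only on the working data
candidate_depths = [2, 3, 5, 8, None]
cv_means = {d: cross_val_score(RandomForestClassifier(max_depth=d, random_state=1),
                               X_work, y_work, cv=5).mean() for d in candidate_depths}
best_depth = max(cv_means, key=cv_means.get)

# Bonferroni-style adjustment: any follow-up significance test should use a
# threshold divided by the number of configurations that were tried
alpha = 0.05
corrected_alpha = alpha / len(candidate_depths)
print(f"Best max_depth={best_depth}, corrected alpha={corrected_alpha:.3f}")

# Confirmatory phase: one evaluation on the untouched test set
final_model = RandomForestClassifier(max_depth=best_depth, random_state=1).fit(X_work, y_work)
print(f"Held-out accuracy: {final_model.score(X_final, y_final):.3f}")
```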
151 |
152 | ### Summary
153 |
154 | | **Aspect** | **P-Hacking** | **Normal Variation** |
155 | |--------------------|-------------------------------------------------------------------------------|------------------------------------------------------------------------------|
156 | | **Intent** | Achieve desired results (e.g., statistical significance) at the cost of integrity. | Identify the best-performing configuration while ensuring robustness. |
157 | | **Methodology** | Cherry-picks or over-explores configurations to obtain favorable outcomes. Lacks transparency; omits reporting of failures. | Systematically and reproducibly explores configurations. Follows predefined protocols; reports successes and failures comprehensively. |
158 | | **Evaluation** | Focuses on statistical tests to confirm hypotheses, often ignoring context. Selectively reports results that support the hypothesis. | Evaluates unbiased metrics (e.g., accuracy, F1 score) on unseen test data. Reports entire spectrum of experiments for transparency. |
159 | | **Reporting** | Results often fail to generalize; lacks reproducibility. | Stresses reproducibility; shares code, data, and protocols. |
160 | | **Impact** | Misleading conclusions, wasted resources, erosion of trust. | Robust findings, confidence in results, adherence to ethical standards. |
161 | | **Key Distinction**| Prioritizes "impressive" results over scientific rigor. | Prioritizes transparency, integrity, and reproducibility. |
162 | | **Mitigation** | Lacks pre-defined protocols; over-relies on statistical tests. | Uses cross-validation and independent test sets; reports all experiments. |
163 |
164 |
165 |
166 |
167 | ## Further Reading
168 |
169 | * [Data dredging](https://en.wikipedia.org/wiki/Data_dredging), Wikipedia.
170 | * [The Extent and Consequences of P-Hacking in Science](https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.1002106&), 2015.
171 | * [Big little lies: a compendium and simulation of p-hacking strategies](https://royalsocietypublishing.org/doi/10.1098/rsos.220346), 2023.
172 |
--------------------------------------------------------------------------------
/examples/test_harness_hacking_mitigation.md:
--------------------------------------------------------------------------------
1 | # Test Harness Hacking Mitigation
2 |
3 | ## (i.e. the myth of "Overfitting Model Selection" in modern machine learning)
4 |
5 | > Modern practices (e.g. repeated k-fold cross-validation) mitigate the risk of test harness hacking.
6 |
7 | ## Description
8 |
9 | When conducting model selection and hyperparameter tuning through cross-validation (CV), it is true that repeated model testing can exploit quirks in the training data, leading to overfitting.
10 |
11 | However, increasing the **number of folds** and **repetitions of CV runs** directly mitigates this risk by reducing the variance and sensitivity of the CV estimate, effectively counteracting the overfitting tendency.
12 |
13 | ### Increasing CV Folds
14 | 1. **Smaller Test Sets, More Diverse Training Sets**: With more folds, each data point participates in training and testing more frequently. This improves the representativeness of the CV procedure, ensuring that hyperparameters cannot exploit idiosyncrasies in a single test set.
15 | 2. **Natural Regularization via Bias**: Increasing folds slightly increases the bias of the performance estimate, as training is performed on smaller subsets of the data. This bias acts as a regularizer, making the evaluation less prone to being gamed by overfit hyperparameters.
16 |
17 | ### Increasing CV Repetitions
18 | 1. **Normalization of Random Effects**: Repeated CV introduces new random splits, ensuring that the hyperparameter tuning process cannot exploit specific train-test partitioning. The mean performance score over multiple runs reflects the model's generalization across diverse splits, not just one specific configuration.
19 | 2. **Resilience to Stochastic Algorithms**: For models or learning processes reliant on randomness (e.g., neural networks, random forests), repeated CV smooths out the variability from individual runs, further reducing the likelihood of overfitting.
20 |
21 | ### Mitigating the Risk of Overfitting the Training Set
22 | When folds and repetitions are increased:
23 | - **Variance Reduction**: The CV estimate becomes more stable, leaving less room for hyperparameter tuning to overfit to the noise of specific splits.
24 | - **Bias Introduction**: Higher folds increase bias, counteracting overfitting tendencies by making CV scores less sensitive to small variations in the training set.
25 |
26 | While exhaustive hyperparameter tuning can exploit CV, the combination of higher folds and repetitions strengthens the robustness of the CV process. These changes make it harder for models to overfit, even during extensive optimization, by ensuring performance reflects true generalization rather than quirks in the training data.
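
As a quick illustration of this argument (separate from the study below; the dataset and model are assumptions), the spread of the cross-validation estimate across different random partitionings shrinks as folds and repeats increase:

```python
# Sketch: estimate the same accuracy 20 times under different partitionings and
# watch the standard deviation of the estimate fall as folds/repeats grow.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedKFold, cross_val_score

X, y = make_classification(n_samples=300, n_features=20, random_state=0)
model = RandomForestClassifier(n_estimators=50, random_state=0)

for n_splits, n_repeats in [(3, 1), (5, 1), (5, 5), (10, 10)]:
    estimates = []
    for seed in range(20):  # 20 different partitionings of the same data
        cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)
        estimates.append(cross_val_score(model, X, y, cv=cv).mean())
    print(f"{n_splits} folds x {n_repeats} repeats: "
          f"mean={np.mean(estimates):.3f}, std across partitionings={np.std(estimates):.4f}")
```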
27 |
28 | ## Study
29 |
30 | This study investigates how the number of folds and repetitions in cross-validation (CV) affect the risk of overfitting during hyperparameter tuning, specifically when using a hill-climbing algorithm to optimize hyperparameters over 100 trials.
31 |
32 | 1. **Dataset Preparation**:
33 |    - A synthetic classification dataset is created with 1,000 samples and 30 features, split 50/50 into train and test sets.
34 |
35 | 2. **Experimental Setup**:
36 | - The study evaluates combinations of k-fold CV (3, 5, 7, 10 folds) and repeated CV (1, 3, 5, and 10 repeats).
37 | - Each configuration undergoes 10 independent trials.
38 |
39 | 3. **Hill-Climbing Hyperparameter Tuning**:
40 | - For each CV configuration, a hill-climbing algorithm runs 100 iterations to optimize the `n_estimators` and `max_depth` hyperparameters of a `RandomForestClassifier`.
41 | - Each hyperparameter configuration is evaluated using the specified CV method, and the best configuration is selected based on the mean CV score.
42 |
43 | 4. **Metrics Computation**:
44 | - For each run, the correlation between the CV scores and hold-out test scores, as well as the mean absolute difference between them, is calculated to quantify overfitting.
45 | - Overfitting is characterized by a decrease in correlation and an increase in the mean absolute difference.
46 |
47 | 5. **Recording and Analysis**:
48 | - The study aggregates results for each combination of folds and repeats, computing the average correlation and mean absolute difference over 10 trials.
49 |
50 | The results are expected to show that increasing the number of CV folds and/or repetitions reduces overfitting, as reflected by:
51 | - **Higher correlation** between CV and hold-out scores.
52 | - **Lower mean absolute difference** between CV and hold-out scores.
53 |
54 | The study demonstrates how CV configurations impact the reliability of model selection, reinforcing the importance of folds and repetitions in mitigating overfitting.
55 |
56 | ```python
57 | import numpy as np
58 | import pandas as pd
59 | from sklearn.datasets import make_classification
60 | from sklearn.model_selection import train_test_split, cross_val_score, KFold, RepeatedKFold
61 | from sklearn.ensemble import RandomForestClassifier
62 | import matplotlib.pyplot as plt
63 |
64 | # Generate a synthetic classification dataset
65 | X, y = make_classification(
66 | n_samples=1000, n_features=30, n_informative=5, n_redundant=25, random_state=42
67 | )
68 |
69 | # Create a train/test split of the dataset
70 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
71 |
72 | # Initialize result storage for experiments
73 | results = []
74 |
75 | # Define the study parameters
76 | fold_range = [3, 5, 7, 10]  # numbers of folds to evaluate
77 | repeat_range = [1, 3, 5, 10]  # numbers of CV repetitions to evaluate
78 | n_trials = 10 # Number of trials for each configuration
79 |
80 | # Function for hill climbing optimization
81 | def hill_climb(cv, X_train, y_train, X_test, y_test, n_hill_trials=100):
82 | best_params = {"n_estimators": 10, "max_depth": 2}
83 | best_cv_score = -1
84 |
85 | cv_scores = []
86 | holdout_scores = []
87 |
88 | for hill_trial in range(n_hill_trials):
89 | # Propose new parameters
90 | new_params = {
91 | "n_estimators": best_params["n_estimators"] + np.random.randint(-10, 11),
92 | "max_depth": best_params["max_depth"] + np.random.randint(-1, 2)
93 | }
94 | new_params["n_estimators"] = max(1, new_params["n_estimators"])
95 | new_params["max_depth"] = max(1, new_params["max_depth"])
96 |
97 | # Evaluate new parameters
98 | new_model = RandomForestClassifier(
99 | n_estimators=new_params["n_estimators"], max_depth=new_params["max_depth"], random_state=42
100 | )
101 | raw_scores = cross_val_score(new_model, X_train, y_train, cv=cv, scoring="accuracy")
102 | new_cv_score = np.mean(raw_scores)
103 | cv_scores.append(new_cv_score)
104 |
105 | # Evaluate the new model on the hold out test set
106 | new_model.fit(X_train, y_train)
107 | new_holdout_score = new_model.score(X_test, y_test)
108 | holdout_scores.append(new_holdout_score)
109 |
110 | # Update best parameters if score improves
111 | if new_cv_score > best_cv_score:
112 | best_params = new_params
113 | best_cv_score = new_cv_score
114 |
115 | return cv_scores, holdout_scores
116 |
117 | # Calculate the correlation and mean absolute difference between CV and hold-out scores
118 | def calculate_metrics(cv_scores, holdout_scores):
120 | correlation = np.corrcoef(cv_scores, holdout_scores)[0, 1]
121 | mean_abs_diff = np.mean(np.abs(np.array(cv_scores) - np.array(holdout_scores)))
122 | return correlation, mean_abs_diff
123 |
124 | # Main experiment loop
125 | for n_folds in fold_range:
126 | for n_repeats in repeat_range:
127 | trial_correlations = []
128 | trial_mean_differences = []
129 |
130 | for trial in range(n_trials):
131 | # Define CV with specific folds and repeats
132 | cv = RepeatedKFold(n_splits=n_folds, n_repeats=n_repeats, random_state=trial)
133 |
134 | # Perform hill climbing of the cross-validated train set
135 | cv_scores, holdout_scores = hill_climb(cv, X_train, y_train, X_test, y_test)
136 |
137 | # Calculate metrics
138 | corr, diff = calculate_metrics(cv_scores, holdout_scores)
139 |
140 | trial_correlations.append(corr)
141 | trial_mean_differences.append(diff)
142 |
143 | # Report progress
144 | print(f'folds={n_folds}, repeats={n_repeats}, i={(trial+1)}, corr={corr}, diff={diff}')
145 |
146 | # Record average results for this configuration
147 | avg_correlation = np.mean(trial_correlations)
148 | avg_mean_diff = np.mean(trial_mean_differences)
149 |
150 | results.append({
151 | 'folds': n_folds,
152 | 'repeats': n_repeats,
153 | 'avg_correlation': avg_correlation,
154 | 'avg_mean_diff': avg_mean_diff
155 | })
156 |
157 | # Log progress
158 | print(f"Completed: {n_folds} folds, {n_repeats} repeats | Avg Correlation: {avg_correlation:.4f}, Avg Mean Diff: {avg_mean_diff:.4f}")
159 |
160 | # Convert results to DataFrame
161 | results_df = pd.DataFrame(results)
162 |
163 | # Save results to CSV
164 | results_df.to_csv('cv_overfitting_study_results.csv', index=False)
165 |
166 | # Display final summary
167 | print("\nFinal Results:\n")
168 | print(results_df.sort_values(['folds', 'repeats']))
169 | ```
170 |
171 | ### Example Output
172 |
173 | ```text
174 | folds=3, repeats=1, i=1, corr=0.8975081501258906, diff=0.013994529495226418
175 | folds=3, repeats=1, i=2, corr=0.8177792410740738, diff=0.011125753793617622
176 | folds=3, repeats=1, i=3, corr=0.9428830954671136, diff=0.005017053122670292
177 | folds=3, repeats=1, i=4, corr=0.9049809252717387, diff=0.007363626481975841
178 | folds=3, repeats=1, i=5, corr=0.9758504080203283, diff=0.023852118413774832
179 | folds=3, repeats=1, i=6, corr=0.857747359046279, diff=0.01297499843686123
180 | folds=3, repeats=1, i=7, corr=0.9543552148233073, diff=0.010973930692831212
181 | folds=3, repeats=1, i=8, corr=0.9583072215690465, diff=0.012318426279970197
182 | folds=3, repeats=1, i=9, corr=0.9365443080461188, diff=0.016589097467715166
183 | folds=3, repeats=1, i=10, corr=0.972215872219083, diff=0.01591398937065626
184 | Completed: 3 folds, 1 repeats | Avg Correlation: 0.9218, Avg Mean Diff: 0.0130
185 | folds=3, repeats=3, i=1, corr=0.9728624412333399, diff=0.006810931550553492
186 | folds=3, repeats=3, i=2, corr=0.8854785215597786, diff=0.01782942420380126
187 | folds=3, repeats=3, i=3, corr=0.9575675906128579, diff=0.014725479162157584
188 | folds=3, repeats=3, i=4, corr=0.9778969635559742, diff=0.019854740799525407
189 | folds=3, repeats=3, i=5, corr=0.9739011078616541, diff=0.021317557335128957
190 | folds=3, repeats=3, i=6, corr=0.9364274591948702, diff=0.015523946003575146
191 | folds=3, repeats=3, i=7, corr=0.9350829635734347, diff=0.012577906676606609
192 | folds=3, repeats=3, i=8, corr=0.9851501120289593, diff=0.014629806892239604
193 | folds=3, repeats=3, i=9, corr=0.9849767257721495, diff=0.010881155279801769
194 | folds=3, repeats=3, i=10, corr=0.9849943833757703, diff=0.018577150438079643
195 | Completed: 3 folds, 3 repeats | Avg Correlation: 0.9594, Avg Mean Diff: 0.0153
196 | folds=3, repeats=5, i=1, corr=0.9906435183380549, diff=0.009374101435682656
197 | folds=3, repeats=5, i=2, corr=0.9305450478665325, diff=0.019447710843373447
198 | folds=3, repeats=5, i=3, corr=0.9698320836611718, diff=0.013032678738907611
199 | folds=3, repeats=5, i=4, corr=0.9826452213379984, diff=0.018392676334078654
200 | folds=3, repeats=5, i=5, corr=0.9720287635544785, diff=0.016324860159199598
201 | folds=3, repeats=5, i=6, corr=0.9454211768270858, diff=0.01394443787124552
202 | folds=3, repeats=5, i=7, corr=0.9204636647370464, diff=0.010203038260827574
203 | folds=3, repeats=5, i=8, corr=0.9955924474911255, diff=0.012422861746386698
204 | folds=3, repeats=5, i=9, corr=0.9792351257750852, diff=0.012180605535915996
205 | folds=3, repeats=5, i=10, corr=0.9905394275428722, diff=0.01734686963422549
206 | Completed: 3 folds, 5 repeats | Avg Correlation: 0.9677, Avg Mean Diff: 0.0143
207 | folds=3, repeats=10, i=1, corr=0.9917633072174894, diff=0.013716341774282666
208 | folds=3, repeats=10, i=2, corr=0.9893929850040933, diff=0.015559962003703322
209 | folds=3, repeats=10, i=3, corr=0.9569010802478903, diff=0.015219092898540065
210 | folds=3, repeats=10, i=4, corr=0.9879348648553065, diff=0.013215456797248795
211 | folds=3, repeats=10, i=5, corr=0.9735665396739734, diff=0.017831983262390768
212 | folds=3, repeats=10, i=6, corr=0.971173862752053, diff=0.016002982949763
213 | folds=3, repeats=10, i=7, corr=0.9635668621274783, diff=0.010760243368684199
214 | folds=3, repeats=10, i=8, corr=0.9648722608008841, diff=0.012233203953538581
215 | folds=3, repeats=10, i=9, corr=0.9587285072176933, diff=0.011003018060264878
216 | folds=3, repeats=10, i=10, corr=0.9207755703824012, diff=0.009858052569559616
217 | Completed: 3 folds, 10 repeats | Avg Correlation: 0.9679, Avg Mean Diff: 0.0135
218 | folds=5, repeats=1, i=1, corr=0.8861934056107431, diff=0.01589999999999999
219 | folds=5, repeats=1, i=2, corr=0.7662794730952668, diff=0.011219999999999977
220 | folds=5, repeats=1, i=3, corr=0.8826187133353169, diff=0.011020000000000012
221 | folds=5, repeats=1, i=4, corr=0.9440121829589864, diff=0.010340000000000021
222 | folds=5, repeats=1, i=5, corr=0.8766129357443, diff=0.004159999999999984
223 | folds=5, repeats=1, i=6, corr=0.9024164969658375, diff=0.017580000000000016
224 | folds=5, repeats=1, i=7, corr=0.9005217405018474, diff=0.008240000000000035
225 | folds=5, repeats=1, i=8, corr=0.9828141793722028, diff=0.008400000000000029
226 | folds=5, repeats=1, i=9, corr=0.961712143471749, diff=0.021140000000000003
227 | folds=5, repeats=1, i=10, corr=0.9813556722050953, diff=0.006739999999999999
228 | Completed: 5 folds, 1 repeats | Avg Correlation: 0.9085, Avg Mean Diff: 0.0115
229 | folds=5, repeats=3, i=1, corr=0.9122982760466545, diff=0.009366666666666688
230 | folds=5, repeats=3, i=2, corr=0.9890461182715037, diff=0.009599999999999985
231 | folds=5, repeats=3, i=3, corr=0.9183930613020971, diff=0.0060066666666666515
232 | folds=5, repeats=3, i=4, corr=0.9294897940388198, diff=0.011720000000000029
233 | folds=5, repeats=3, i=5, corr=0.981982679087837, diff=0.012706666666666746
234 | folds=5, repeats=3, i=6, corr=0.9711766765002295, diff=0.01302666666666672
235 | folds=5, repeats=3, i=7, corr=0.9651742983090498, diff=0.007959999999999997
236 | folds=5, repeats=3, i=8, corr=0.9616274843010032, diff=0.010246666666666682
237 | folds=5, repeats=3, i=9, corr=0.991484802507542, diff=0.012240000000000055
238 | folds=5, repeats=3, i=10, corr=0.949354181531814, diff=0.007480000000000023
239 | Completed: 5 folds, 3 repeats | Avg Correlation: 0.9570, Avg Mean Diff: 0.0100
240 | folds=5, repeats=5, i=1, corr=0.9124335330130132, diff=0.010407999999999962
241 | folds=5, repeats=5, i=2, corr=0.9943967107022027, diff=0.00918799999999998
242 | folds=5, repeats=5, i=3, corr=0.9481474796710471, diff=0.005691999999999955
243 | folds=5, repeats=5, i=4, corr=0.9638374112067487, diff=0.011388
244 | folds=5, repeats=5, i=5, corr=0.952119476071311, diff=0.011971999999999943
245 | folds=5, repeats=5, i=6, corr=0.9887007143739523, diff=0.01114399999999998
246 | folds=5, repeats=5, i=7, corr=0.9700508321437197, diff=0.005847999999999956
247 | folds=5, repeats=5, i=8, corr=0.9786839027183967, diff=0.00939999999999993
248 | folds=5, repeats=5, i=9, corr=0.9923019529456245, diff=0.010243999999999967
249 | folds=5, repeats=5, i=10, corr=0.9806823785079624, diff=0.008635999999999994
250 | Completed: 5 folds, 5 repeats | Avg Correlation: 0.9681, Avg Mean Diff: 0.0094
251 | folds=5, repeats=10, i=1, corr=0.9899246985924093, diff=0.010209999999999997
252 | folds=5, repeats=10, i=2, corr=0.9863212431811526, diff=0.009343999999999979
253 | folds=5, repeats=10, i=3, corr=0.980497156341154, diff=0.010361999999999982
254 | folds=5, repeats=10, i=4, corr=0.9809354894495217, diff=0.01051599999999992
255 | folds=5, repeats=10, i=5, corr=0.9716781005974886, diff=0.009705999999999982
256 | folds=5, repeats=10, i=6, corr=0.98761167410509, diff=0.007167999999999995
257 | folds=5, repeats=10, i=7, corr=0.9939648038833919, diff=0.008125999999999956
258 | folds=5, repeats=10, i=8, corr=0.9929684118650098, diff=0.00977600000000004
259 | folds=5, repeats=10, i=9, corr=0.9916934696335423, diff=0.011239999999999965
260 | folds=5, repeats=10, i=10, corr=0.9900364630237838, diff=0.0058139999999999555
261 | Completed: 5 folds, 10 repeats | Avg Correlation: 0.9866, Avg Mean Diff: 0.0092
262 | folds=7, repeats=1, i=1, corr=0.9613182318742974, diff=0.009815023474178432
263 | folds=7, repeats=1, i=2, corr=0.6909957358983967, diff=0.006366718086295516
264 | folds=7, repeats=1, i=3, corr=0.9895758217003858, diff=0.005750017885088313
265 | folds=7, repeats=1, i=4, corr=0.9877611150535017, diff=0.007385823831880195
266 | folds=7, repeats=1, i=5, corr=0.9749578118659352, diff=0.011593217080259361
267 | folds=7, repeats=1, i=6, corr=0.9711935713591127, diff=0.00582282137268052
268 | folds=7, repeats=1, i=7, corr=0.9760796613233829, diff=0.006076532528504364
269 | folds=7, repeats=1, i=8, corr=0.9371927392344911, diff=0.01633843952604521
270 | folds=7, repeats=1, i=9, corr=0.9746045786830242, diff=0.012915823831880134
271 | folds=7, repeats=1, i=10, corr=0.9711195358371305, diff=0.011247455846188222
272 | Completed: 7 folds, 1 repeats | Avg Correlation: 0.9435, Avg Mean Diff: 0.0093
273 | folds=7, repeats=3, i=1, corr=0.9128474983896583, diff=0.010857055667337348
274 | folds=7, repeats=3, i=2, corr=0.9924414793688743, diff=0.009876530292868339
275 | folds=7, repeats=3, i=3, corr=0.9753841890858064, diff=0.0066518794246963175
276 | folds=7, repeats=3, i=4, corr=0.9902161753182072, diff=0.01067602652954771
277 | folds=7, repeats=3, i=5, corr=0.9875467068521149, diff=0.014647294880393522
278 | folds=7, repeats=3, i=6, corr=0.9824724251891493, diff=0.0053529637081750035
279 | folds=7, repeats=3, i=7, corr=0.9590765688110139, diff=0.006180702734928126
280 | folds=7, repeats=3, i=8, corr=0.9858973820022585, diff=0.009431569416499024
281 | folds=7, repeats=3, i=9, corr=0.9713070448511517, diff=0.009675288024442926
282 | folds=7, repeats=3, i=10, corr=0.9635099762903706, diff=0.006624511513525587
283 | Completed: 7 folds, 3 repeats | Avg Correlation: 0.9721, Avg Mean Diff: 0.0090
284 | folds=7, repeats=5, i=1, corr=0.9199378342268244, diff=0.008922989045383398
285 | folds=7, repeats=5, i=2, corr=0.9305605240628312, diff=0.009462680527610085
286 | folds=7, repeats=5, i=3, corr=0.9868414126704743, diff=0.008307379834562902
287 | folds=7, repeats=5, i=4, corr=0.9899484002488507, diff=0.008614039794321458
288 | folds=7, repeats=5, i=5, corr=0.976576733728466, diff=0.009443724569640015
289 | folds=7, repeats=5, i=6, corr=0.9693547315705672, diff=0.005900277218868722
290 | folds=7, repeats=5, i=7, corr=0.9909369574070983, diff=0.005307771070869582
291 | folds=7, repeats=5, i=8, corr=0.9892888145831531, diff=0.011010156494522676
292 | folds=7, repeats=5, i=9, corr=0.9627449539744223, diff=0.010943483120947896
293 | folds=7, repeats=5, i=10, corr=0.9658412890907523, diff=0.006972148446232913
294 | Completed: 7 folds, 5 repeats | Avg Correlation: 0.9682, Avg Mean Diff: 0.0085
295 | folds=7, repeats=10, i=1, corr=0.9930980639376968, diff=0.010301558238318903
296 | folds=7, repeats=10, i=2, corr=0.9920910968628622, diff=0.010058085177733074
297 | folds=7, repeats=10, i=3, corr=0.9874525701739706, diff=0.007801203890006742
298 | folds=7, repeats=10, i=4, corr=0.9854169402266026, diff=0.006881775095014487
299 | folds=7, repeats=10, i=5, corr=0.9771779515816831, diff=0.006837573217080281
300 | folds=7, repeats=10, i=6, corr=0.9900552860025492, diff=0.006621960652805747
301 | folds=7, repeats=10, i=7, corr=0.9578589198570598, diff=0.007519338251732596
302 | folds=7, repeats=10, i=8, corr=0.9031445620706875, diff=0.005411533646322324
303 | folds=7, repeats=10, i=9, corr=0.9938400095864596, diff=0.010214242119382887
304 | folds=7, repeats=10, i=10, corr=0.9788426400080306, diff=0.005239181757209895
305 | Completed: 7 folds, 10 repeats | Avg Correlation: 0.9759, Avg Mean Diff: 0.0077
306 | folds=10, repeats=1, i=1, corr=0.9748033758314912, diff=0.015320000000000049
307 | folds=10, repeats=1, i=2, corr=0.9187986740124284, diff=0.010219999999999991
308 | folds=10, repeats=1, i=3, corr=0.9759534155628019, diff=0.005019999999999986
309 | folds=10, repeats=1, i=4, corr=0.9404497799396768, diff=0.0062200000000000085
310 | folds=10, repeats=1, i=5, corr=0.9774883703984174, diff=0.011080000000000003
311 | folds=10, repeats=1, i=6, corr=0.9149008540654683, diff=0.016480000000000064
312 | folds=10, repeats=1, i=7, corr=0.8801191409994777, diff=0.006560000000000007
313 | folds=10, repeats=1, i=8, corr=0.9719343758671737, diff=0.012880000000000025
314 | folds=10, repeats=1, i=9, corr=0.9856023103580355, diff=0.0071200000000000195
315 | folds=10, repeats=1, i=10, corr=0.9556385726542705, diff=0.009180000000000035
316 | Completed: 10 folds, 1 repeats | Avg Correlation: 0.9496, Avg Mean Diff: 0.0100
317 | folds=10, repeats=3, i=1, corr=0.9818082957751972, diff=0.010593333333333219
318 | folds=10, repeats=3, i=2, corr=0.9314961162728049, diff=0.005919999999999988
319 | folds=10, repeats=3, i=3, corr=0.9127620228239297, diff=0.004453333333333281
320 | folds=10, repeats=3, i=4, corr=0.9810389657863505, diff=0.006766666666666591
321 | folds=10, repeats=3, i=5, corr=0.9918364430409649, diff=0.007466666666666555
322 | folds=10, repeats=3, i=6, corr=0.9395927744026121, diff=0.010773333333333244
323 | folds=10, repeats=3, i=7, corr=0.9885928388539656, diff=0.007373333333333213
324 | folds=10, repeats=3, i=8, corr=0.9036806490023385, diff=0.0070266666666665475
325 | folds=10, repeats=3, i=9, corr=0.9612958178459122, diff=0.008580000000000004
326 | folds=10, repeats=3, i=10, corr=0.9852138676111265, diff=0.006326666666666592
327 | Completed: 10 folds, 3 repeats | Avg Correlation: 0.9577, Avg Mean Diff: 0.0075
328 | folds=10, repeats=5, i=1, corr=0.9869203108957099, diff=0.008232000000000034
329 | folds=10, repeats=5, i=2, corr=0.989158809458989, diff=0.006472000000000011
330 | folds=10, repeats=5, i=3, corr=0.9468112866930535, diff=0.005743999999999982
331 | folds=10, repeats=5, i=4, corr=0.9555637452766133, diff=0.009019999999999985
332 | folds=10, repeats=5, i=5, corr=0.8867829384047247, diff=0.008167999999999996
333 | folds=10, repeats=5, i=6, corr=0.9388857673846408, diff=0.0054799999999999875
334 | folds=10, repeats=5, i=7, corr=0.9689970283669487, diff=0.005588000000000004
335 | folds=10, repeats=5, i=8, corr=0.980303607125597, diff=0.008103999999999974
336 | folds=10, repeats=5, i=9, corr=0.9873591803251196, diff=0.006583999999999992
337 | folds=10, repeats=5, i=10, corr=0.9543215764841885, diff=0.005707999999999978
338 | Completed: 10 folds, 5 repeats | Avg Correlation: 0.9595, Avg Mean Diff: 0.0069
339 | folds=10, repeats=10, i=1, corr=0.9895772309728519, diff=0.008010000000000012
340 | folds=10, repeats=10, i=2, corr=0.9371148265713927, diff=0.0056619999999999926
341 | folds=10, repeats=10, i=3, corr=0.9946765193420288, diff=0.007330000000000046
342 | folds=10, repeats=10, i=4, corr=0.9700972981521802, diff=0.006941999999999987
343 | folds=10, repeats=10, i=5, corr=0.9869957451350777, diff=0.009159999999999979
344 | folds=10, repeats=10, i=6, corr=0.943939573626107, diff=0.004751999999999973
345 | folds=10, repeats=10, i=7, corr=0.8887097918134683, diff=0.00497599999999995
346 | folds=10, repeats=10, i=8, corr=0.9915157230477832, diff=0.010015999999999867
347 | folds=10, repeats=10, i=9, corr=0.9869359272490404, diff=0.007609999999999974
348 | folds=10, repeats=10, i=10, corr=0.9840942879680609, diff=0.005351999999999967
349 | Completed: 10 folds, 10 repeats | Avg Correlation: 0.9674, Avg Mean Diff: 0.0070
350 |
351 | Final Results:
352 |
353 | folds repeats avg_correlation avg_mean_diff
354 | 0 3 1 0.921817 0.013012
355 | 1 3 3 0.959434 0.015273
356 | 2 3 5 0.967695 0.014267
357 | 3 3 10 0.967868 0.013540
358 | 4 5 1 0.908454 0.011474
359 | 5 5 3 0.957003 0.010035
360 | 6 5 5 0.968135 0.009392
361 | 7 5 10 0.986563 0.009226
362 | 8 7 1 0.943480 0.009331
363 | 9 7 3 0.972070 0.008997
364 | 10 7 5 0.968203 0.008488
365 | 11 7 10 0.975898 0.007689
366 | 12 10 1 0.949569 0.010008
367 | 13 10 3 0.957732 0.007528
368 | 14 10 5 0.959510 0.006910
369 | 15 10 10 0.967366 0.006981
370 | ```
371 |
372 | ### Observations
373 |
374 | Plots of results:
375 |
376 | 
377 |
378 | 
379 |
380 | Here’s an analysis of the experiment results based on the provided data:
381 |
382 | #### 1. Trends in Average Correlation
383 | - **General Trend**:
384 | - As the number of repeats increases, the average correlation tends to improve for all fold values.
385 | - This indicates that more repeats lead to more stable and consistent results, likely due to better statistical reliability.
386 |
387 | - **Impact of Folds**:
388 | - For **3 folds**, the correlation starts high (0.92) and stabilizes around 0.96-0.97 with increasing repeats.
389 | - For **5 folds**, correlation is slightly lower initially (0.91) but improves significantly with more repeats, reaching a peak at 10 repeats (0.986).
390 | - For **7 folds**, the correlation starts higher (0.94), improves consistently, but peaks slightly lower than 5 folds (around 0.975-0.968).
391 | - For **10 folds**, the correlation is generally high but improves more modestly compared to other fold values, peaking around 0.96-0.97.
392 |
393 | - **Key Observations**:
394 | - More folds combined with higher repeats generally provide better correlations.
395 | - 5 folds with 10 repeats show the highest correlation (0.986), suggesting it is an optimal balance.
396 |
397 | #### 2. Trends in Average Mean Difference
398 | - **General Trend**:
399 | - As the number of repeats increases, the average mean difference consistently decreases across all fold values.
400 | - This suggests that repeated experiments help to minimize variability and bring the mean difference closer to zero.
401 |
402 | - **Impact of Folds**:
403 | - For **3 folds**, the mean difference starts around 0.013 and gradually decreases with more repeats.
404 | - For **5 folds**, it starts lower (0.011) and decreases significantly to around 0.009 at 10 repeats.
405 | - For **7 folds**, the mean difference starts lower still (0.009) and shows the most dramatic improvement, dropping to 0.007 at 10 repeats.
406 | - For **10 folds**, the mean difference begins at 0.010 and also improves to 0.007 but shows diminishing returns with higher repeats.
407 |
408 | - **Key Observations**:
409 | - Higher fold values, such as 7 or 10 folds, generally produce lower mean differences, particularly when paired with a higher number of repeats.
410 |
411 | #### 3. Balancing Correlation and Mean Difference
412 | - **Trade-Offs**:
413 | - While 5 folds with 10 repeats yields the highest correlation (0.986), it does not produce the smallest mean difference.
414 | - 7 folds with 10 repeats achieves a slightly lower correlation (0.975) but has one of the smallest mean differences (0.007).
415 |
416 | - **Optimal Configuration**:
417 | - If correlation is prioritized, 5 folds and 10 repeats is optimal.
418 | - If minimizing mean difference is more important, 7 or 10 folds with 10 repeats might be preferable.
419 |
420 | #### 4. Recommendations for Future Experiments
421 | - **Choose Higher Repeats**:
422 |   - Increasing the number of repeats is generally effective at stabilizing results, though the gains diminish at higher repeat counts.
423 |
424 | - **Optimize Fold Selection**:
425 | - Depending on the metric of interest, 5 folds for correlation or 7 folds for mean difference are promising choices.
426 |
427 | - **Investigate Trade-Offs Further**:
428 | - Explore whether a compromise between correlation and mean difference exists, possibly at intermediate fold values (e.g., 6 or 8).
429 |
430 |
431 |
432 |
--------------------------------------------------------------------------------
/examples/seed_hacking.md:
--------------------------------------------------------------------------------
1 | # Seed Hacking
2 |
3 | > Repeat an experiment with different random number seeds to get the best result.
4 |
5 | ## Description
6 |
7 | Recall that the **random number seed** is an integer that initializes the [pseudo random number generator](https://en.wikipedia.org/wiki/Random_number_generation) and influences the specific and repeatable sequence of random numbers that are generated.
8 |
9 | **Seed hacking** or **random seed shopping** or **seed optimization** is a problematic practice where practitioners manipulate random number seeds to artificially improve model performance metrics.
10 |
11 | The technique involves repeatedly running the same experiment (e.g. model, data split, etc.) with different random seeds until finding one that produces better-looking results. This is typically done during model validation or testing phases.
12 |
13 | While random seeds are important for reproducibility, exploiting them to [cherry-pick](https://en.wikipedia.org/wiki/Cherry_picking) favorable outcomes introduces severe bias. This practice masks the model's true performance and can lead to poor generalization in production.
14 |
15 | The key issue is that seed hacking violates the principle of independent validation. By selecting seeds based on outcomes, you're effectively leaking information from your test set into your model selection process.
16 |
17 | This practice is particularly dangerous for new data scientists because it can be tempting to use when under pressure to show improved metrics. However, it fundamentally undermines the scientific validity of your work.
18 |
19 | A more ethical approach is to use fixed random seeds for reproducibility, but to select them before seeing any results. This maintains experimental integrity while still allowing others to replicate your work.
20 |
21 | ## What Does a Seed-Hacked Result Mean?
22 |
23 | In a stochastic experiment, a single result is a point estimate of the unknown underlying distribution, such as the hold-out/test set prediction error.
24 |
25 | If we repeat the experiment and vary the randomness (e.g., by using different random seeds for data splits or model initialization) we obtain a distribution of estimates. Taking the mean, standard deviation, or confidence interval of this distribution gives us a more accurate and reliable understanding of the model's true performance.
26 |
27 | However, when we hack the seed to deliberately select the best possible result (e.g., lowest error or highest accuracy), we introduce [systematic bias](https://en.wikipedia.org/wiki/Observational_error). Rather than estimating the true mean of the performance distribution, we shift the estimate in a favorable direction.
28 |
29 | The result is no longer a fair or unbiased reflection of the model's performance but instead an overoptimistic artifact of the chosen randomness. This shift can be substantial and misrepresent the model's real-world generalizability.
30 |
31 | **Intentionally introducing a systematic bias by seed hacking is deceptive and misleading, perhaps even fraudulent.**
32 |
33 | Here's a depiction of what is happening when we pick a seed hacked result:
34 |
35 | 
36 |
37 | ## Examples
38 |
39 | Below is a list of aspects of a data science project that could be subject to seed hacking:
40 |
41 | - **Data Splitting**: Splitting datasets into training, validation, and testing sets. Shuffling data during cross-validation evaluation.
42 | - **Resampling Techniques**: Bootstrapping or permutation tests. Creating synthetic datasets using resampling methods.
43 | - **Learning Algorithms**: Initializing weights in neural networks. Randomly selecting subsets of data for ensemble methods like Random Forest or Bagging. Stochastic gradient descent and related stochastic optimization methods.
44 | - **Hyperparameter Optimization**: Randomized search strategies for hyperparameter tuning. Distribution sampling search strategies like Bayesian Optimization.
45 | - **Data Augmentation**: Random transformations for data augmentation in image or text preprocessing. Generating synthetic data for privacy-preserving data sharing or experimentation. Simulating data with specific statistical properties.
46 | - **Feature Engineering**: Randomized feature selection or subset selection algorithms. Creating stochastic embeddings, e.g., in t-SNE or UMAP.
47 |
48 | ### Worked Examples
49 |
50 | Some worked examples of seed hacking applied to specific aspects of a project:
51 |
52 | * [Seed Hacking Cross-Validation](seed_hacking_cross_validation.md): _Vary the seed for creating cross-validation folds in order to get the best result._
53 | * [Seed Hacking the Train/Test Split](seed_hacking_train_test_split.md): _Vary the seed for creating train/test splits in order to get the best result._
54 | * [Seed Hacking Learning Algorithm](seed_hacking_learning_algorithm.md): _Vary the seed for the model training algorithm in order to get the best result._
55 | * [Seed Hack Bootstrap Performance](seed_hacking_bootstrap_performance.md): _Vary the seed for a bootstrap of a final chosen model on the test set to present the best performance._
56 |
57 | More seed hacking examples for learning algorithms: [bagging](src/seed_hacking_bagging.py), [decision tree](src/seed_hacking_decision_tree.py), [gradient boosting](src/seed_hacking_gradient_boosting.py), [logistic regression](src/seed_hacking_logistic_regression.py), [multilayer perceptron](src/seed_hacking_multilayer_percepron.py), [perceptron](src/seed_hacking_perceptron.py), [random forest](src/seed_hacking_random_forest.py), [ridge classifier](src/seed_hacking_ridge_classifier.py), [sgd classifier](src/seed_hacking_sgd_classifier.py).
58 |
59 | ## Negative Seed Hacking
60 |
61 | How can we defend the choice of random number seed on a project?
62 |
63 | * Use a widely used default, e.g. 1 or 42 or 1234 or 1337.
64 | * Use the current date as an integer, e.g. DDMMYYYY.
65 | * Look at the clock and use the current minute and/or second value.
66 | * Roll a die and use the number that comes up.
67 |
68 | Then record what you chose and how you chose it in your project log.
69 |
70 | ## Quantify the Variance
71 |
72 | Don't guess or ignore the variance, measure and report it.
73 |
74 | Perform a [sensitivity analysis](https://en.wikipedia.org/wiki/Sensitivity_analysis), aka a stability/robustness study (a minimal code sketch is given below).
75 |
76 | This involves:
77 |
78 | 1. Hold everything in your setup (data + model) constant.
79 | 2. Pick one aspect of your setup that uses randomness.
80 | 3. Vary the randomness for that one aspect (e.g. 30+ runs each with a different seed).
81 | 4. Collect performance scores and report/analyze the distribution (best + worst, mean + stdev, median + confidence interval, etc.).
82 |
83 | For example:
84 |
85 | 1. **Hold the Model Constant and Vary the Data**: Use techniques like k-fold cross-validation (CV), repeated k-fold CV, or repeated train/test splits while keeping the model and its random initialization fixed.
86 | - Quantify how sensitive the model's performance is to variations in the training and test data splits.
87 | - This approach reveals variance caused by differences in the sampled training/test data and helps assess the model's robustness to data variability.
88 | 2. **Hold the Data Constant and Vary the Learning Algorithm**: Use a fixed dataset and vary only the random seed for the algorithm (e.g., random initialization of weights, dropout masks, or other stochastic elements).
89 | - Quantify how the inherent randomness in the learning process affects model performance.
90 | - This captures the variance caused by the stochastic nature of the optimization algorithm or training procedure.
91 | 3. **Vary Both the Data and the Learning Algorithm**: Randomize both the data (through k-fold CV or similar techniques) and the algorithm (through different seeds).
92 | - Assess the **total variance** in the learning process, encompassing both data variability and algorithm randomness.
93 | - This provides a holistic view of the overall variability in model performance.
94 |
95 | How much variance to expect? It really depends.
96 |
97 | - The variance due to data could be a few percent (e.g. 1-2%).
98 | - The variance due to learning algorithm could be a few tenths of a percent to a few percent (e.g. 0.2-0.4% or 1-2%).
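
Here is a minimal sketch of option 2 above (hold the data constant, vary only the learning algorithm's seed), assuming scikit-learn and an illustrative synthetic dataset:

```python
# Sketch: one fixed train/test split, 30 runs that differ only in the model's seed.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=20, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

scores = []
for seed in range(30):
    model = RandomForestClassifier(n_estimators=100, random_state=seed)
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

scores = np.array(scores)
print(f"min={scores.min():.3f} max={scores.max():.3f} "
      f"mean={scores.mean():.3f} stdev={scores.std():.4f}")
```

Report the whole distribution, not just the best score.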
99 |
100 | ### Reduce the Variance
101 |
102 | Variance is reduced by adding bias, we cannot escape the [Bias–variance tradeoff](https://en.wikipedia.org/wiki/Bias%E2%80%93variance_tradeoff).
103 |
104 | The techniques for reducing variance are typically specific to your setup, especially your model.
105 |
106 | Nevertheless, here are some ideas:
107 |
108 | 1. **Reducing Performance Variance Due to Data**. Variance from data arises because models are sensitive to the specific training or test samples provided. Strategies to mitigate this include:
109 | - Regularization: Penalize model complexity to prevent overfitting to specific data splits.
110 | - Use More Data: Larger datasets typically reduce variability by making training samples more representative of the underlying distribution.
111 | - Robust Models: Use algorithms known for robustness to outliers or data variability, such as tree-based methods (e.g., Random Forests, Gradient Boosting).
112 | - ...
113 | 2. **Reducing Performance Variance Due to the Learning Algorithm**. Variance from the learning algorithm stems from stochasticity in the optimization process, such as random initialization, batch sampling, or other internal randomness. Strategies to reduce this variance include:
114 | - Ensembles: Combine predictions from multiple models trained on the same data but with different initializations or configurations.
115 | - Repeated Training and Averaging: Train the model multiple times with different seeds and average the predictions for a more robust output (simplest ensemble).
116 | - Better Initialization: Use advanced initialization techniques, such as Xavier or He initialization, to reduce sensitivity to starting conditions.
117 | - Use Stable Optimizers: Certain optimizers, such as AdamW or SGD with carefully tuned learning rates, can provide more consistent convergence compared to others.
118 | - Longer Training with Early Stopping: Allow models more time to converge but use early stopping based on validation performance to avoid overfitting.
119 | - ...
120 | 3. **Reducing Overall Variance (Both Data and Algorithm)**. For a holistic reduction in variance, consider strategies that address both data and algorithm variability:
121 | - Use Cross-Validation: Perform k-fold cross-validation to average out performance over different data splits and initialization seeds.
122 | - Hybrid Ensembles: Combine models trained on different data subsets (bagging) with models using different algorithm configurations or seeds.
123 |
124 | For a best practice approach, combine strategies:
125 |
126 | - Regularize the model and preprocess data to reduce data-driven variance.
127 | - Use ensembles or repeated runs to reduce algorithm-driven variance (see the sketch after this list).
128 | - Report distributions of performance metrics to transparently communicate variability.
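
As an example of the ensemble idea, here is a minimal sketch of the simplest seed ensemble (repeated training and averaging), with an assumed synthetic dataset and model: fit the same model with different seeds and average the predicted probabilities rather than picking one lucky seed.

```python
# Sketch: average predictions across seeds to smooth out algorithm-driven variance.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

X, y = make_classification(n_samples=500, n_features=20, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

probas = []
for seed in range(10):  # same architecture, same data, different seeds
    model = MLPClassifier(hidden_layer_sizes=(20,), max_iter=1000, random_state=seed)
    model.fit(X_train, y_train)
    probas.append(model.predict_proba(X_test))

ensemble_pred = np.mean(probas, axis=0).argmax(axis=1)
print(f"Seed-ensemble accuracy: {(ensemble_pred == y_test).mean():.3f}")
```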
129 |
130 | ## What About Large One-Off Models (e.g. neural nets)?
131 |
132 | Some large deep learning neural networks can take days, weeks, or months to train, often at great expense.
133 |
134 | As such, typically only one model is trained.
135 |
136 | These models are sensitive to initial conditions e.g. initial random coefficients/weights. Additionally, the learning algorithm may be stochastic (e.g. shuffling of training samples, dropout, etc.).
137 |
138 | As such, the choice of random number seed influences the performance of the final model.
139 |
140 | In a (small) fast-to-train model, we might call this the variance in the performance of the model. In a (large) slow-to-train model that might take weeks to months to train, this could be the difference between a successful and unsuccessful project.
141 |
142 | For example:
143 |
144 | > Fine-tuning pretrained contextual word embedding models to supervised downstream tasks has become commonplace in natural language processing. This process, however, is often brittle: even with the same hyperparameter values, distinct random seeds can lead to substantially different results.
145 |
146 | -- [Fine-Tuning Pretrained Language Models: Weight Initializations, Data Orders, and Early Stopping](https://arxiv.org/abs/2002.06305), 2020.
147 |
148 | And:
149 |
150 | > However, deep neural network based models are often brittle to various sources of randomness in the training of the models. This could be attributed to several sources including, but not limited to, random parameter initialization, random sampling of examples during training and random dropping of neurons. It has been observed that these models have, more often, a set of random seeds that yield better results than others. This has also lead to research suggesting random seeds as an additional hyperparameter for tuning.
151 |
152 | -- [On Model Stability as a Function of Random Seed](https://arxiv.org/abs/1909.10447), 2019.
153 |
154 | What to do?
155 |
156 | It depends. Don't seed hack, but perhaps:
157 |
158 | * Can you ensemble a few model runs or model checkpoints together to reduce the variance?
159 | * Can you use early stopping and/or regularization during training to reduce the variance?
160 |
161 | > A common approach to creating neural network ensembles is to train the same architecture with different random seeds, and have the resulting models vote.
162 |
163 | -- [We need to talk about random seeds](https://arxiv.org/abs/2210.13393), 2022.
164 |
165 | One consolation is that a converged neural network model generally has a narrow distribution of performance across random seeds (as we might hope and expect).
166 |
167 | > What is the distribution of scores with respect to the choice of seed? The distribution of accuracy when varying seeds is relatively pointy, which means that results are fairly concentrated around the mean. Once the model converged, this distribution is relatively stable which means that some seed are intrinsically better than others.
168 |
169 | -- [Torch.manual_seed(3407) is all you need: On the influence of random seeds in deep learning architectures for computer vision](https://arxiv.org/abs/2109.08203), 2021.
170 |
171 | And:
172 |
173 | > Typically, the choice of random seed only has a slight effect on the result and can mostly be ignored in general or for most of the hyper-parameter search process.
174 |
175 | -- [Practical recommendations for gradient-based training of deep architectures](https://arxiv.org/abs/1206.5533), 2012.
176 |
177 | If you can perform multiple training runs for your neural network model, then you should, **with different random number seeds**.
178 |
179 | This is called multiple-restart optimization, see below.
180 |
181 | ## When is Seed Hacking Ethical?
182 |
183 | Is there such a thing as ethical seed hacking (in machine learning/data science)?
184 |
185 | YES!
186 |
187 | Here are some softer rationales:
188 |
189 | * Perhaps you want to make a point for a demonstration, presentation, course, tutorial, etc.?
190 | * Perhaps you require the best descriptive rather than predictive model?
191 | * Perhaps you want to find best/worst/distribution performance due to learning algorithm/initial condition variance (e.g. a sensitivity analysis)?
192 |
193 | The best case for seed hacking is as a stochastic optimization strategy called "multiple-restarts":
194 |
195 | * Some learning algorithms are solving a really hard (e.g. non-convex/discontinuous/deceptive/multimodal/etc.) optimization problem, and random restarts of initial conditions in the search space are in fact a beneficial approach.
196 |
197 | ### Multiple-Restart Optimization
198 |
199 | The multiple-restart strategy is a technique used to address the challenges of solving harder optimization problems, particularly non-convex ones with multiple local minima, saddle points, or other complex structures.
200 |
201 | By running the optimization process multiple times with different initial conditions or random seeds, this approach increases the likelihood of exploring diverse regions of the solution space and finding better optima.
202 |
203 | > Heuristic search procedures that aspire to find global optimal solutions to hard combinatorial optimization problems usually require some type of diversification to overcome local optimality. One way to achieve diversification is to re-start the procedure from a new solution once a region has been explored.
204 |
205 | -- [Chapter 12: Multi-Start Methods](https://link.springer.com/chapter/10.1007/0-306-48056-5_12), Handbook of Metaheuristics, 2003.
206 |
207 | It is especially beneficial for algorithms that are sensitive to initialization, such as neural networks, clustering methods (e.g., K-Means), or stochastic optimization algorithms.
208 |
209 | While multi-restart offers significant advantages for non-convex and multimodal problems, it provides little to no benefit for convex optimization problems, where the global minimum is guaranteed regardless of the starting point.
210 |
211 | The strategy effectively balances computational cost with solution quality in scenarios where optimality cannot be guaranteed in a single run.
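
A minimal sketch of a legitimate multiple-restart strategy, assuming scikit-learn's K-Means on an illustrative dataset: restart from different seeds and keep the run with the best internal training objective (inertia), never the run with the best held-out score.

```python
# Sketch: multiple restarts selected by the training objective, not by test performance.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=5, random_state=0)

best_model, best_inertia = None, float("inf")
for seed in range(20):
    model = KMeans(n_clusters=5, n_init=1, random_state=seed).fit(X)
    if model.inertia_ < best_inertia:  # selection criterion is internal to training
        best_model, best_inertia = model, model.inertia_

print(f"Best inertia over 20 restarts: {best_inertia:.1f}")
```

This is essentially what the `n_init` parameter of `KMeans` automates.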
212 |
213 | Below is a table of common machine learning algorithms, the type of optimization problem they are solving (e.g. convex or non-convex), whether they are sensitive to initial conditions, and whether they will benefit from multiple restarts:
214 |
215 | | Algorithm | Problem Type | Sensitivity | Multi-Restart Benefit |
216 | |--------------------------|-----------------|-------------|-----------------------|
217 | | Linear Regression | Convex | None | None |
218 | | Logistic Regression | Convex | Minimal | Minimal |
219 | | K-Means | Non-convex | High | High |
220 | | t-SNE | Non-convex | High | High |
221 | | Neural Networks | Non-convex | High | High |
222 | | Random Forests | Non-convex | Low | Low to Moderate |
223 | | SVC | Convex | None | None |
224 | | PCA | Convex | None | None |
225 |
226 |
227 | As such, we may see what looks like seed hacking in the context of deep learning / reinforcement learning work, which may in fact be examples of a multiple-restart optimization.
228 |
229 | The problem is, how do you tell the difference?
230 |
231 | ### Seed Hacking vs Multiple-Restarts
232 |
233 | Differentiating between a legitimate multi-restart optimization strategy and "seed hacking" (cherry-picking the best result) requires careful scrutiny of how the results are reported and interpreted.
234 |
235 | Below are the characteristics of **legitimate multi-restart optimization**:
236 |
237 | 1. **Disclosure of Multi-Restart Process**: Clearly states that a multi-restart strategy was employed and describes the number of restarts, initialization strategy, and hyperparameters.
238 | 2. **Performance Distribution Reporting**: Reports the distribution of performance metrics across restarts, including mean, median, standard deviation, and possibly full histograms or box plots. This allows readers to assess the stability of the algorithm and whether the best result is an outlier or representative of typical performance.
239 | 3. **Procedure Replication:** If the "best result" is highlighted, it contextualizes this by repeating the entire multi-restart procedure multiple times and reporting the distribution of "best-of-restart" scores. This provides confidence that the approach is not a one-off fluke.
240 | 4. **Statistical Robustness:** Includes statistical tests to verify whether improvements from the best restart are statistically significant compared to baselines or other algorithms.
241 | 5. **Sensitivity Analysis:** Reports how sensitive the algorithm is to random initialization, demonstrating whether consistent performance can be expected or if results are highly variable.
242 |
243 | Conversely, below are the characteristics of **seed hacking with multi-restart optimization**:
244 |
245 | 1. **Single Point Estimate:** Reports only the best result without contextualizing it within the broader distribution of outcomes across restarts. This ignores variability and may cherry-pick an optimistic outlier.
246 | 2. **Non-Disclosure of Multi-Restart:** Fails to disclose that multiple restarts or seeds were used. This gives the impression that the reported result comes from a single unbiased run.
247 | 3. **Absence of Distribution Information:** Does not provide statistics (e.g., mean, standard deviation, quantiles) of performance across restarts. This lacks transparency on how consistently high-quality solutions are found.
248 | 4. **Selective Comparisons:** Compares the "best restart" of one algorithm with the "average performance" of another algorithm or baseline, creating unfair comparisons.
249 |
250 |
251 | ## FAQ
252 |
253 | I get a lot of questions about "how to pick the best seed". Some of the answers below may help.
254 |
255 |
256 | **Q. Is the random seed a hyperparameter?**
257 |
258 | Yes.
259 |
260 | It is a hyperparameter (to the model, to the test harness, etc.) that we should set, but not one we should optimize.
261 |
262 | **Q. What random number seed should I use?**
263 |
264 | No one cares. Use "1" or "42" or the current date in DDMMYYYY format.
265 |
266 | Even better, don't use one seed, use many and report a result distribution.
267 |
268 | **Q. What seed should I use for my final chosen model fit on all training data?**
269 |
270 | The same seed you used to evaluate candidate models on your test harness.
271 |
272 | Or, fit a suite of final models with different seeds (e.g. 30) and use them all in an ensemble to make predictions on new data. This will average out the variance in the learning algorithm.
273 |
274 | **Q. My model shows a large variance with different random number seeds, what should I do?**
275 |
276 | Add bias.
277 |
278 | * Perhaps increase training epochs, tree depth, etc.
279 | * Perhaps use regularization to reduce model variance.
280 | * Perhaps adjust hyperparameters to reduce model variance.
281 | * Perhaps use repeated evaluations (e.g. repeated k-fold cross-validation or repeated train/test splits) and report a performance distribution instead of a point estimate.
282 |
283 | **Q. What about a machine learning competition, like Kaggle?**
284 |
285 | Nope, or probably not.
286 |
287 | Your model must generalize to unseen data (e.g. the hidden test set) and optimizing for the public test set will likely (almost certainly) result in overfitting.
288 |
289 | **Q. Surely picking the model random number seed that gives the best cross-validation score is a good idea?**
290 |
291 | Nope, or probably not.
292 |
293 | It is likely that the distributions of CV scores under the different seeds are effectively the same (e.g. check using a statistical hypothesis test, quantify using an effect size) and that any differences you are seeing are misleading noise.
294 |
295 | If there are differences, your model may have a variance that is a little too high for the given quantity of training data. Add some bias (see above). Or the model is fragile and has overfit the hold-out test set of your test harness, and it will not generalize well to changes (e.g. changes to the data, changes to the model).
296 |
297 | **Q. Okay, if I have to choose between two models, each fit with a different seed, I should choose the one with the better performance, right?**
298 |
299 | Nope, or probably not. See above.
300 |
301 | **Q. Can we seed hack (grid search the seed) for a model within nested k-fold cross-validation?**
302 |
303 | Oh man... I guess you could.
304 |
305 | Again, I suspect that in most cases, any difference between model performance distributions with a fixed vs optimized seed will not be statistically significant.
306 |
307 | If it is different, perhaps use methods to reduce model variance as discussed above.
308 |
309 | If it's for an algorithm with a non-convex optimization problem, think about what this means. It means that one initial condition performs "better" across different subsets of train/test data. Maybe that is true; it probably is not.
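
For completeness, here is a minimal sketch of what "grid searching the seed inside nested CV" would even look like, purely to illustrate the mechanics. The randomized decision tree, seed range, and fold counts are illustrative assumptions, and this is not a recommended practice.

```python
# Sketch: treating random_state as a tunable "hyperparameter" inside nested CV.
# Shown only to illustrate the mechanics; not a recommended practice.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, random_state=0)

# inner loop "optimizes" the seed, outer loop estimates generalization
inner = GridSearchCV(
    DecisionTreeClassifier(splitter="random"),  # random splitter makes the tree seed-sensitive
    param_grid={"random_state": list(range(20))},
    cv=KFold(n_splits=3, shuffle=True, random_state=0),
)
outer_scores = cross_val_score(
    inner, X, y, cv=KFold(n_splits=5, shuffle=True, random_state=0)
)

print(f"nested CV accuracy: {outer_scores.mean():.3f} +/- {outer_scores.std():.3f}")
```

If the outer estimate is no better than simply fixing the seed, the "optimized" seed was noise.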
310 |
311 | **Q. How do I know if my seed hacked result is optimistically biased or a better solution to a hard optimization problem?**
312 |
313 | Now that is a good question!
314 |
315 | If we know a lot about the model and its optimization procedure, we might be able to reason it out from the underlying optimization problem the learning algorithm is solving (e.g. convex vs. non-convex, and to what degree it is sensitive to initial conditions or to stochastic behavior during the search).
316 |
317 | For example:
318 |
319 | - Did the change in seed permit the optimization algorithm to locate a superior solution in the search space (if so, can you confirm this with statistical tests)?
320 |
321 | Empirically, you can sample results for a ton of seeds and see where your result sits in the distribution. All that tells you is which percentile you might be in, not whether the solution is brittle.
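
A minimal sketch of that empirical check, assuming scikit-learn and SciPy; the Perceptron, the 100-seed sample, and the "chosen" seed of 42 are illustrative assumptions:

```python
# Sketch: where does a chosen seed sit in the distribution over many seeds?
# Model, seed range, and "chosen" seed are illustrative assumptions.
import numpy as np
from scipy.stats import percentileofscore
from sklearn.datasets import make_classification
from sklearn.linear_model import Perceptron
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=300, random_state=0)

# sample the score distribution over many seeds
scores = np.array([
    cross_val_score(Perceptron(random_state=s), X, y, cv=5).mean()
    for s in range(100)
])
chosen = cross_val_score(Perceptron(random_state=42), X, y, cv=5).mean()

# this reports only how "lucky" the seed was, not whether the solution is brittle
print(f"seed 42 sits at the {percentileofscore(scores, chosen):.0f}th percentile")
```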
322 |
323 | This is really hard and an "it depends" is the best I can manage.
324 |
325 | See the sections above on "Multiple-Restart Optimization" and "Seed Hacking vs Multiple-Restarts".
326 |
327 | ## Further Reading
328 |
329 | Sometimes it helps to read how others are thinking through this issue:
330 |
331 | ### Papers
332 |
333 | * [Fine-Tuning Pretrained Language Models: Weight Initializations, Data Orders, and Early Stopping](https://arxiv.org/abs/2002.06305), 2020.
334 | * [Multi-Start Methods](https://link.springer.com/chapter/10.1007/0-306-48056-5_12), Handbook of Metaheuristics, 2003.
335 | * [On Model Stability as a Function of Random Seed](https://arxiv.org/abs/1909.10447), 2019.
336 | * [Practical recommendations for gradient-based training of deep architectures](https://arxiv.org/abs/1206.5533), 2012.
337 | * [Pseudo-random Number Generator Influences on Average Treatment Effect Estimates Obtained with Machine Learning](https://pubmed.ncbi.nlm.nih.gov/39150879/), 2024.
338 | * [Torch.manual_seed(3407) is all you need: On the influence of random seeds in deep learning architectures for computer vision](https://arxiv.org/abs/2109.08203), 2021.
339 | * [We need to talk about random seeds](https://arxiv.org/abs/2210.13393), 2022.
340 |
341 | ### Blog Posts
342 |
343 | * [Are random seeds hyperparameters?](https://andrewcharlesjones.github.io/journal/random-seed-hyperparameter.html)
344 | * [Manipulating machine learning results with random state](https://towardsdatascience.com/manipulating-machine-learning-results-with-random-state-2a6f49b31081)
345 | * [Optimizing the Random Seed](https://towardsdatascience.com/optimizing-the-random-seed-99a90bd272e)
346 |
347 | ### Discussion
348 |
349 | Lots of people out there in the wild are struggling with choosing/optimizing the random seed. Often without enough background in statistics/stochastic optimization IMHO, but that's okay.
350 |
351 | * [Am I creating bias by using the same random seed over and over?](https://stats.stackexchange.com/questions/80407/am-i-creating-bias-by-using-the-same-random-seed-over-and-over)
352 | * [Choosing the "Correct" Seed for Reproducible Research/Results](https://stats.stackexchange.com/questions/335936/choosing-the-correct-seed-for-reproducible-research-results)
353 | * [Data folks of Reddit: How do you choose a random seed?](https://www.reddit.com/r/datascience/comments/17kxd5s/data_folks_of_reddit_how_do_you_choose_a_random/)
354 | * [Do Deep Learning/Machine Learning papers use a fixed seed to report their results?](https://www.reddit.com/r/MachineLearning/comments/fbl9ho/discussion_do_deep_learningmachine_learning/)
355 | * [How to choose the random seed?](https://datascience.stackexchange.com/questions/35869/how-to-choose-the-random-seed)
356 | * [How to deal with random parameters in MLOps](https://stats.stackexchange.com/questions/564045/how-to-deal-with-random-parameters-in-mlops)
357 | * [If so many people use set.seed(123) doesn't that affect randomness of world's reporting?](https://stats.stackexchange.com/questions/205961/if-so-many-people-use-set-seed123-doesnt-that-affect-randomness-of-worlds-re)
358 | * [Is it 'fair' to set a seed in a random forest regression to yield the highest accuracy?](https://stats.stackexchange.com/questions/341610/is-it-fair-to-set-a-seed-in-a-random-forest-regression-to-yield-the-highest-ac/)
359 | * [Is random seed a hyper-parameter to tune in training deep neural network?](https://stats.stackexchange.com/questions/478193/is-random-seed-a-hyper-parameter-to-tune-in-training-deep-neural-network)
360 | * [Is random state a parameter to tune?](https://stats.stackexchange.com/questions/263999/is-random-state-a-parameter-to-tune)
361 | * [Neural network hyperparameter tuning - is setting random seed a good idea?](https://stackoverflow.com/questions/65704588/neural-network-hyperparameter-tuning-is-setting-random-seed-a-good-idea)
362 | * [Optimization of hyperparameters and seed](https://www.reddit.com/r/reinforcementlearning/comments/ptsbvb/optimization_of_hyperparameters_and_seed/)
363 | * [Performance of Ridge and Lasso Regression depend on set.seed?](https://stats.stackexchange.com/questions/355256/performance-of-ridge-and-lasso-regression-depend-on-set-seed)
364 | * [Why is it valid to use CV to set parameters and hyperparameters but not seeds?](https://stats.stackexchange.com/questions/341619/why-is-it-valid-to-use-cv-to-set-parameters-and-hyperparameters-but-not-seeds)
365 | * [XGBoost - "Optimizing Random Seed"](https://stats.stackexchange.com/questions/273230/xgboost-optimizing-random-seed)
366 |
367 |
368 |
369 |
--------------------------------------------------------------------------------