├── README.md
├── deep_learning_with_python_code
├── 02_theano_example.py
├── 03_tensorflow_example.py
├── 07_first_mlp.py
├── 08_automatic_split.py
├── 08_manual_cross_validation.py
├── 08_manual_split.py
├── 09_sklearn_cross_validation.py
├── 09_sklearn_grid_search_params.py
├── 10_iris_example.py
├── 11_sonar_baseline.py
├── 11_sonar_standardized.py
├── 11_sonar_standardized_larger.py
├── 11_sonar_standardized_smaller.py
├── 12_boston_baseline.py
├── 12_boston_standardized.py
├── 12_boston_standardized_larger.py
├── 12_boston_standardized_wider.py
├── 13_serialize_json.py
├── 13_serialize_yaml.py
├── 14_checkpoint_best_model.py
├── 14_checkpoint_load.py
├── 14_checkpoint_model_improvements.py
├── 15_plot_history.py
├── 16_baseline.py
├── 16_dropout_hidden.py
├── 16_dropout_visible.py
├── 17_decay_drop_based.py
├── 17_decay_time_based.py
├── 19_mnist_cnn.py
├── 19_mnist_cnn_large.py
├── 19_mnist_mlp_baseline.py
├── 19_mnist_plot.py
├── 20_augment_baseline.py
├── 20_augment_feature_standardize.py
├── 20_augment_flips.py
├── 20_augment_rotations.py
├── 20_augment_save_to_file.py
├── 20_augment_shifts.py
├── 20_augment_zca.py
├── 21_cifar10_cnn.py
├── 21_cifar10_cnn_large.py
├── 21_cifar10_plot.py
├── 22_imdb_cnn.py
├── 22_imdb_mlp.py
├── 22_imdb_plot.py
├── 24_mlp_simple.py
├── 24_mlp_window.py
├── 25_lstm_simple.py
├── 25_lstm_stacked.py
├── 25_lstm_stateful.py
├── 25_lstm_time_steps.py
├── 25_lstm_window.py
├── 26_lstm_cnn.py
├── 26_lstm_dropout_gates.py
├── 26_lstm_dropout_layers.py
├── 26_lstm_simple.py
├── 27_lstm_char_seq_batch.py
├── 27_lstm_char_seq_features.py
├── 27_lstm_char_seq_timesteps.py
├── 27_lstm_one_char.py
├── 27_lstm_one_char_stateful.py
├── 27_lstm_var_length.py
├── 28_lstm_gen_text.py
├── 28_lstm_larger.py
├── 28_lstm_larger_gen_text.py
├── 28_lstm_small.py
├── housing.csv
├── international-airline-passengers.csv
├── ionosphere.csv
├── iris.csv
├── pima-indians-diabetes.csv
├── sonar.csv
├── weights-improvement-19-1.9435.hdf5
├── weights-improvement-47-1.2219-bigger.hdf5
└── wonderland.txt
├── machine_learning_mastery_with_r_code
├── .DS_Store
├── 1-AnalyzeData
│   ├── .DS_Store
│   ├── 1-LoadData
│   │   ├── datasets-mlbench.R
│   │   ├── datasets_appliedpredictivemodeling.R
│   │   ├── datasets_datasets.R
│   │   ├── iris.csv
│   │   ├── load_csv_file.R
│   │   └── load_csv_url.R
│   ├── 2-DataSummarization
│   │   ├── class_distribution.R
│   │   ├── correction_spearman.R
│   │   ├── correlation_pearson.R
│   │   ├── data_types.R
│   │   ├── dimensions.R
│   │   ├── peek.R
│   │   ├── skewness.R
│   │   ├── standard_deviation.R
│   │   └── summary.R
│   └── 3-DataVisualization
│   │   ├── 1-Univariate
│   │   ├── barplot.R
│   │   ├── boxplot.R
│   │   ├── density_plot.R
│   │   ├── histogram.R
│   │   └── missing_plot.R
│   │   ├── 2-Multivaraite
│   │   ├── boxplot_by_class.R
│   │   ├── correlation_plot.R
│   │   ├── density_plot_by_class.R
│   │   ├── scatterplot_matrix.R
│   │   └── scatterplot_matrix_by_class.R
│   │   └── 3-Projection
│   │   ├── andrews_curves.R
│   │   ├── parallel_coordinates.R
│   │   ├── pca.R
│   │   ├── sammons.R
│   │   └── som.R
├── 2-PrepareData
│   ├── .DS_Store
│   ├── 1-DataCleaning
│   │   ├── impute_missing_values.R
│   │   ├── mark_missing_values.R
│   │   ├── rebalance_SMOTE.R
│   │   ├── remove_duplicates.R
│   │   ├── remove_na.R
│   │   └── remove_outliers.R
│   ├── 2-FeatureSelection
│   │   ├── rank_features_by_importance_caret.R
│   │   ├── recursive_feature_elimination_caret.R
│   │   └── remove_highly_correlated_features_caret.R
│   └── 3-TransformData
│   │   ├── boxcox_transform.R
│   │   ├── center.R
│   │   ├── ica_transform.R
│   │   ├── normalize.R
│   │   ├── pca_transform.R
│   │   ├── scale.R
│   │   ├── standardize.R
│   │   └── yeojohnson_transform.R
├── 3-Algorithms
│   ├── .DS_Store
│   ├── 1-Algorithms
│   │   ├── .DS_Store
│   │   ├── 1-LinearRegression
│   │   │   ├── ordinary_least_squares_regression.R
│   │   │   ├── partial_least_squares_regression.R
│   │   │   ├── principal_component_regression.R
│   │   │   └── stepwise_linear_regression.R
│   │   ├── 2-PenalizedLinearRegression
│   │   │   ├── LASSO.R
│   │   │   ├── elastic_net.R
│   │   │   └── ridge_regression.R
│   │   ├── 3-NonLinearRegression
│   │   │   ├── M5P.R
│   │   │   ├── M5Rules.R
│   │   │   ├── bagging_CART.R
│   │   │   ├── classification_and_regression_trees.R
│   │   │   ├── conditional_decision_trees.R
│   │   │   ├── cubist.R
│   │   │   ├── feed_forward_neural_network.R
│   │   │   ├── gradient_boosted_machine.R
│   │   │   ├── k-nearest_neighbor.R
│   │   │   ├── multivariate_adaptive_regression_splines.R
│   │   │   ├── random_forest.R
│   │   │   └── support_vector_machine.R
│   │   ├── 4-LinearClassification
│   │   │   ├── linear_discriminant_analysis.R
│   │   │   ├── logistic_regression.R
│   │   │   ├── logistic_regression_multiclass.R
│   │   │   └── partial_least_squares_discriminant_analysis.R
│   │   ├── 5-NonLinearClassiication
│   │   │   ├── C4.5.R
│   │   │   ├── C5.0.R
│   │   │   ├── PART.R
│   │   │   ├── bagging_CART.R
│   │   │   ├── classification_and_regression_trees.R
│   │   │   ├── feed_forward_neural_network.R
│   │   │   ├── flexible_discriminant_analysis.R
│   │   │   ├── gradient_boosted_machine.R
│   │   │   ├── k-nearest_neighbors.R
│   │   │   ├── mixture_discriminant_analysis.R
│   │   │   ├── naive_bayes.R
│   │   │   ├── quadratic_discriminant_analysis.R
│   │   │   ├── random_forest.R
│   │   │   ├── regularized_discriminant_analysis.R
│   │   │   └── support_vector_machine.R
│   │   └── 6-Optimization
│   │   │   ├── bfgs.R
│   │   │   ├── conjugate_gradient.R
│   │   │   ├── golden_section_search.R
│   │   │   ├── gradient_descent.R
│   │   │   └── nelder_mead.R
│   ├── 2-CaretAlgorithms
│   │   ├── binary_classification_algorithms.R
│   │   └── regression_algorithms.R
│   └── algorithm_spot_check.R
├── 4-EvaluateAlgorithms
│   ├── .DS_Store
│   ├── 1-ResamplingMethods
│   │   ├── bootstrap.R
│   │   ├── data_split.R
│   │   ├── kfold_cross_validation.R
│   │   ├── leave_one_out_cross_validation.R
│   │   └── repeated_kfold_cross_validation.R
│   ├── 2-Metrics
│   │   ├── Accuracy.R
│   │   ├── Kappa.R
│   │   ├── LogLoss.R
│   │   ├── RMSE.R
│   │   ├── ROC.R
│   │   └── RSquared.R
│   └── 3-ModelSelection
│   │   ├── .DS_Store
│   │   ├── compare_boxplots.R
│   │   ├── compare_densityplot.R
│   │   ├── compare_dotplot.R
│   │   ├── compare_parallelplot.R
│   │   ├── compare_scatterplot_matrix.R
│   │   ├── compare_summary.R
│   │   ├── compare_xyplot.R
│   │   └── significant_difference.R
├── 5-ImproveResults
│   ├── .DS_Store
│   ├── 1-TuneAlgorithms
│   │   ├── automatic_grid_search.R
│   │   ├── custom_search.R
│   │   ├── manual_grid_search.R
│   │   ├── manual_search.R
│   │   ├── optimal_parameters.R
│   │   └── random_search.R
│   └── 2-Ensembles
│   │   ├── bagging.R
│   │   ├── blending.R
│   │   └── stacking.R
├── 6-FinalizeModel
│   ├── .DS_Store
│   ├── 1-Predict
│   │   ├── predict_caret.R
│   │   └── train_all_dataset.R
│   ├── 2-FinalModel
│   │   └── standalone_model.R
│   └── 3-SaveLoadModel
│   │   └── save_load_model.R
├── 7-Other
│   ├── install_list_of_packages.R
│   ├── install_package_with_dependencies.R
│   └── r_crash_course.R
├── 8-CaseStudies
│   ├── .DS_Store
│   ├── BinaryClassification
│   │   ├── breast_cancer.R
│   │   ├── diabetes.R
│   │   ├── diabetes_spot_check.R
│   │   ├── ionosphere.R
│   │   ├── ionosphere_ensemble.R
│   │   ├── sonar.R
│   │   └── sonar_tuning.R
│   ├── MultiClassClassification
│   │   ├── .DS_Store
│   │   ├── glass.R
│   │   ├── iris.R
│   │   └── soybean.R
│   ├── Regression
│   │   ├── abalone.R
│   │   ├── boston.R
│   │   └── longley.R
│   └── project_template.R
└── README.txt
├── ml_with_python_code
├── 02_scipy_versions.py
├── 02_sklearn_version.py
├── 03_matplotlib_crash_course.py
├── 03_numpy_crash_course.py
├── 03_pandas_crash_course.py
├── 03_python_crash_course.py
├── 04_load_csv.py
├── 04_load_csv_np.py
├── 04_load_csv_np_url.py
├── 04_load_csv_pandas.py
├── 04_load_csv_pandas_url.py
├── 05_class_distribution.py
├── 05_data_types.py
├── 05_describe.py
├── 05_dimensions.py
├── 05_head.py
├── 05_pearson_correlation.py
├── 05_skew.py
├── 06_boxplot.py
├── 06_correlation_matrix.py
├── 06_correlation_matrix_generic.py
├── 06_density_plots.py
├── 06_histograms.py
├── 06_scatterplot_matrix.py
├── 07_binarization.py
├── 07_normalize_data.py
├── 07_rescale_data.py
├── 07_standardize_data.py
├── 08_feature_importance.py
├── 08_pca.py
├── 08_recursive_feature_elimination.py
├── 08_univariate_selection.py
├── 09_cross_validation.py
├── 09_loocv.py
├── 09_shuffle_split.py
├── 09_train_test.py
├── 10_classification_accuracy.py
├── 10_classification_auc.py
├── 10_classification_confusion_matrix.py
├── 10_classification_logloss.py
├── 10_classification_report.py
├── 10_regression_mae.py
├── 10_regression_mse.py
├── 10_regression_rsquared.py
├── 11_classification_and_regression_trees_classification.py
├── 11_gaussian_naive_bayes.py
├── 11_k_nearest_neighbors_classification.py
├── 11_linear_discriminant_analysis.py
├── 11_logistic_regression.py
├── 11_support_vector_machines_classification.py
├── 12_classification_and_regression_trees_regression.py
├── 12_elastic_net.py
├── 12_k_nearest_neighbors_regression.py
├── 12_lasso_regression.py
├── 12_linear_regression.py
├── 12_ridge_regression.py
├── 12_support_vector_machines_regression.py
├── 13_race_algorithms.py
├── 14_feature_union_model_pipeline.py
├── 14_standardize_model_pipeline.py
├── 15_adaboost_classification.py
├── 15_bagged_cart_classification.py
├── 15_extra_trees_classification.py
├── 15_gradient_boosting_classification.py
├── 15_random_forest_classification.py
├── 15_voting_ensemble_classification.py
├── 16_grid_search.py
├── 16_random_search.py
├── 17_save_model_joblib.py
├── 17_save_model_pickel.py
├── 18_project_template.py
├── 19_project_classification_iris.py
├── 20_project_regression_boston.py
├── 21_project_classification_sonar.py
├── housing.csv
├── iris.data.csv
├── pima-indians-diabetes.data.csv
└── sonar.all-data.csv
└── xgboost_with_python_code
├── 04_first_model.py
├── 05_breast_one_hot.py
├── 05_horse_colic_missing.py
├── 05_horse_colic_missing_imputer.py
├── 05_iris_label_encode.py
├── 06_cross_validation.py
├── 06_stratified_cross_validation.py
├── 06_train_test_split.py
├── 07_plot_tree-left-to-right.py
├── 07_plot_tree.py
├── 08_serialize_with_joblib.py
├── 08_serialize_with_pickle.py
├── 09_automatic_feature_importance.py
├── 09_feature_selection.py
├── 09_manual_feature_importance.py
├── 10_early_stopping.py
├── 10_evaluate_validation_set.py
├── 10_learning_curves.py
├── 11_eval_num_threads.py
├── 11_eval_parallel_cv_and_xgboost.py
├── 12_check_num_threads.py
├── 14_tune_depth.py
├── 14_tune_num_trees_and_depth.py
├── 14_tune_trees.py
├── 15_plot_performance.py
├── 15_tune_learning_rate.py
├── 15_tune_learning_rate_and_num_trees.py
├── 16_tune_column_sample_rate_bytree.py
├── 16_tune_column_sample_rate_split.py
├── 16_tune_row_sample_rate.py
├── datasets-uci-breast-cancer.csv
├── horse-colic.csv
├── iris.csv
└── pima-indians-diabetes.csv

/README.md:
--------------------------------------------------------------------------------
1 | # ML-mastery
2 | Code from Jason Brownlee's course on mastering machine learning
3 | 
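The code is organised into four top-level directories: `deep_learning_with_python_code` (Keras), `machine_learning_mastery_with_r_code` (R and caret), `ml_with_python_code` (scikit-learn and pandas), and `xgboost_with_python_code` (XGBoost). Most scripts are self-contained and load their dataset (for example `pima-indians-diabetes.csv`) from the same directory, so run each script from inside its own folder. The Keras scripts were written against the Keras 1.x argument names (`init=`, `nb_epoch=`); below is a minimal, untested sketch of the first MLP (`07_first_mlp.py`) with the equivalent Keras 2-style names (`kernel_initializer=`, `epochs=`), assuming Keras and NumPy are installed and the bundled CSV is in the working directory:

```python
# Keras 2-style sketch of deep_learning_with_python_code/07_first_mlp.py
# (the script in this repository uses the older init=/nb_epoch= names).
from keras.models import Sequential
from keras.layers import Dense
import numpy

# fix random seed for reproducibility
numpy.random.seed(7)
# load the pima indians dataset bundled with the repository
dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",")
X = dataset[:, 0:8]
Y = dataset[:, 8]
# define and compile the MLP
model = Sequential()
model.add(Dense(12, input_dim=8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# train and report accuracy on the training data
model.fit(X, Y, epochs=150, batch_size=10)
scores = model.evaluate(X, Y)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
```

The same two renames apply to the other Keras scripts as well.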
-------------------------------------------------------------------------------- /deep_learning_with_python_code/02_theano_example.py: -------------------------------------------------------------------------------- 1 | # Example of Theano library 2 | import theano 3 | from theano import tensor 4 | # declare two symbolic floating-point scalars 5 | a = tensor.dscalar() 6 | b = tensor.dscalar() 7 | # create a simple symbolic expression 8 | c = a + b 9 | # convert the expression into a callable object that takes (a,b) and computes c 10 | f = theano.function([a,b], c) 11 | # bind 1.5 to 'a', 2.5 to 'b', and evaluate 'c' 12 | result = f(1.5, 2.5) 13 | print(result) -------------------------------------------------------------------------------- /deep_learning_with_python_code/03_tensorflow_example.py: -------------------------------------------------------------------------------- 1 | # Example of TensorFlow library 2 | import tensorflow as tf 3 | # declare two symbolic floating-point scalars 4 | a = tf.placeholder(tf.float32) 5 | b = tf.placeholder(tf.float32) 6 | # create a simple symbolic expression using the add function 7 | add = tf.add(a, b) 8 | # bind 1.5 to 'a', 2.5 to 'b', and evaluate 'c' 9 | sess = tf.Session() 10 | binding = {a: 1.5, b: 2.5} 11 | c = sess.run(add, feed_dict=binding) 12 | print(c) -------------------------------------------------------------------------------- /deep_learning_with_python_code/07_first_mlp.py: -------------------------------------------------------------------------------- 1 | # Create your first MLP in Keras 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | import numpy 5 | # fix random seed for reproducibility 6 | seed = 7 7 | numpy.random.seed(seed) 8 | # load pima indians dataset 9 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 10 | # split into input (X) and output (Y) variables 11 | X = dataset[:,0:8] 12 | Y = dataset[:,8] 13 | # create model 14 | model = Sequential() 15 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 16 | model.add(Dense(8, init='uniform', activation='relu')) 17 | model.add(Dense(1, init='uniform', activation='sigmoid')) 18 | # Compile model 19 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 20 | # Fit the model 21 | model.fit(X, Y, nb_epoch=150, batch_size=10) 22 | # evaluate the model 23 | scores = model.evaluate(X, Y) 24 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) -------------------------------------------------------------------------------- /deep_learning_with_python_code/08_automatic_split.py: -------------------------------------------------------------------------------- 1 | # MLP with automatic validation set 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | import numpy 5 | # fix random seed for reproducibility 6 | seed = 7 7 | numpy.random.seed(seed) 8 | # load pima indians dataset 9 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 10 | # split into input (X) and output (Y) variables 11 | X = dataset[:,0:8] 12 | Y = dataset[:,8] 13 | # create model 14 | model = Sequential() 15 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 16 | model.add(Dense(8, init='uniform', activation='relu')) 17 | model.add(Dense(1, init='uniform', activation='sigmoid')) 18 | # Compile model 19 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 20 | # Fit the model 21 | model.fit(X, Y, validation_split=0.33, 
nb_epoch=150, batch_size=10) -------------------------------------------------------------------------------- /deep_learning_with_python_code/08_manual_cross_validation.py: -------------------------------------------------------------------------------- 1 | # MLP for Pima Indians Dataset with 10-fold cross validation 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | from sklearn.model_selection import StratifiedKFold 5 | import numpy 6 | # fix random seed for reproducibility 7 | seed = 7 8 | numpy.random.seed(seed) 9 | # load pima indians dataset 10 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 11 | # split into input (X) and output (Y) variables 12 | X = dataset[:,0:8] 13 | Y = dataset[:,8] 14 | # define 10-fold cross validation test harness 15 | kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed) 16 | cvscores = [] 17 | for train, test in kfold.split(X, Y): 18 | # create model 19 | model = Sequential() 20 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 21 | model.add(Dense(8, init='uniform', activation='relu')) 22 | model.add(Dense(1, init='uniform', activation='sigmoid')) 23 | # Compile model 24 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 25 | # Fit the model 26 | model.fit(X[train], Y[train], nb_epoch=150, batch_size=10, verbose=0) 27 | # evaluate the model 28 | scores = model.evaluate(X[test], Y[test], verbose=0) 29 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 30 | cvscores.append(scores[1] * 100) 31 | 32 | print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores))) 33 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/08_manual_split.py: -------------------------------------------------------------------------------- 1 | # MLP with manual validation set 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | from sklearn.model_selection import train_test_split 5 | import numpy 6 | # fix random seed for reproducibility 7 | seed = 7 8 | numpy.random.seed(seed) 9 | # load pima indians dataset 10 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 11 | # split into input (X) and output (Y) variables 12 | X = dataset[:,0:8] 13 | Y = dataset[:,8] 14 | # split into 67% for train and 33% for test 15 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=seed) 16 | # create model 17 | model = Sequential() 18 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 19 | model.add(Dense(8, init='uniform', activation='relu')) 20 | model.add(Dense(1, init='uniform', activation='sigmoid')) 21 | # Compile model 22 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 23 | # Fit the model 24 | model.fit(X_train, y_train, validation_data=(X_test,y_test), nb_epoch=150, batch_size=10) -------------------------------------------------------------------------------- /deep_learning_with_python_code/09_sklearn_cross_validation.py: -------------------------------------------------------------------------------- 1 | # MLP for Pima Indians Dataset with 10-fold cross validation via sklearn 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | from keras.wrappers.scikit_learn import KerasClassifier 5 | from sklearn.model_selection import StratifiedKFold 6 | from sklearn.model_selection import cross_val_score 7 | import numpy 8 | 9 | # 
Function to create model, required for KerasClassifier 10 | def create_model(): 11 | # create model 12 | model = Sequential() 13 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 14 | model.add(Dense(8, init='uniform', activation='relu')) 15 | model.add(Dense(1, init='uniform', activation='sigmoid')) 16 | # Compile model 17 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 18 | return model 19 | 20 | # fix random seed for reproducibility 21 | seed = 7 22 | numpy.random.seed(seed) 23 | # load pima indians dataset 24 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 25 | # split into input (X) and output (Y) variables 26 | X = dataset[:,0:8] 27 | Y = dataset[:,8] 28 | # create model 29 | model = KerasClassifier(build_fn=create_model, nb_epoch=150, batch_size=10, verbose=0) 30 | # evaluate using 10-fold cross validation 31 | kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed) 32 | results = cross_val_score(model, X, Y, cv=kfold) 33 | print(results.mean()) 34 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/10_iris_example.py: -------------------------------------------------------------------------------- 1 | # Multiclass Classification with the Iris Flowers Dataset 2 | import numpy 3 | import pandas 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.wrappers.scikit_learn import KerasClassifier 7 | from keras.utils import np_utils 8 | from sklearn.model_selection import cross_val_score 9 | from sklearn.model_selection import KFold 10 | from sklearn.preprocessing import LabelEncoder 11 | from sklearn.pipeline import Pipeline 12 | # fix random seed for reproducibility 13 | seed = 7 14 | numpy.random.seed(seed) 15 | # load dataset 16 | dataframe = pandas.read_csv("iris.csv", header=None) 17 | dataset = dataframe.values 18 | X = dataset[:,0:4].astype(float) 19 | Y = dataset[:,4] 20 | # encode class values as integers 21 | encoder = LabelEncoder() 22 | encoder.fit(Y) 23 | encoded_Y = encoder.transform(Y) 24 | # convert integers to dummy variables (i.e. 
one hot encoded) 25 | dummy_y = np_utils.to_categorical(encoded_Y) 26 | # define baseline model 27 | def baseline_model(): 28 | # create model 29 | model = Sequential() 30 | model.add(Dense(4, input_dim=4, init='normal', activation='relu')) 31 | model.add(Dense(3, init='normal', activation='sigmoid')) 32 | # Compile model 33 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 34 | return model 35 | estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=200, batch_size=5, verbose=0) 36 | kfold = KFold(n_splits=10, shuffle=True, random_state=seed) 37 | results = cross_val_score(estimator, X, dummy_y, cv=kfold) 38 | print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100)) 39 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/11_sonar_baseline.py: -------------------------------------------------------------------------------- 1 | # Binary Classification with Sonar Dataset: Baseline 2 | import numpy 3 | import pandas 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.wrappers.scikit_learn import KerasClassifier 7 | from sklearn.model_selection import cross_val_score 8 | from sklearn.preprocessing import LabelEncoder 9 | from sklearn.model_selection import StratifiedKFold 10 | from sklearn.preprocessing import StandardScaler 11 | from sklearn.pipeline import Pipeline 12 | # fix random seed for reproducibility 13 | seed = 7 14 | numpy.random.seed(seed) 15 | # load dataset 16 | dataframe = pandas.read_csv("sonar.csv", header=None) 17 | dataset = dataframe.values 18 | # split into input (X) and output (Y) variables 19 | X = dataset[:,0:60].astype(float) 20 | Y = dataset[:,60] 21 | # encode class values as integers 22 | encoder = LabelEncoder() 23 | encoder.fit(Y) 24 | encoded_Y = encoder.transform(Y) 25 | # baseline model 26 | def create_baseline(): 27 | # create model 28 | model = Sequential() 29 | model.add(Dense(60, input_dim=60, init='normal', activation='relu')) 30 | model.add(Dense(1, init='normal', activation='sigmoid')) 31 | # Compile model 32 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 33 | return model 34 | # evaluate model with standardized dataset 35 | estimator = KerasClassifier(build_fn=create_baseline, nb_epoch=100, batch_size=5, verbose=0) 36 | kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed) 37 | results = cross_val_score(estimator, X, encoded_Y, cv=kfold) 38 | print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100)) 39 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/12_boston_baseline.py: -------------------------------------------------------------------------------- 1 | # Regression Example With Boston Dataset: Baseline 2 | import numpy 3 | import pandas 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.wrappers.scikit_learn import KerasRegressor 7 | from sklearn.model_selection import cross_val_score 8 | from sklearn.model_selection import KFold 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.pipeline import Pipeline 11 | # load dataset 12 | dataframe = pandas.read_csv("housing.csv", delim_whitespace=True, header=None) 13 | dataset = dataframe.values 14 | # split into input (X) and output (Y) variables 15 | X = dataset[:,0:13] 16 | Y = dataset[:,13] 17 | # define base model 18 | def baseline_model(): 19 | 
# create model 20 | model = Sequential() 21 | model.add(Dense(13, input_dim=13, init='normal', activation='relu')) 22 | model.add(Dense(1, init='normal')) 23 | # Compile model 24 | model.compile(loss='mean_squared_error', optimizer='adam') 25 | return model 26 | # fix random seed for reproducibility 27 | seed = 7 28 | numpy.random.seed(seed) 29 | # evaluate model with standardized dataset 30 | estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=100, batch_size=5, verbose=0) 31 | kfold = KFold(n_splits=10, random_state=seed) 32 | results = cross_val_score(estimator, X, Y, cv=kfold) 33 | print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std())) 34 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/12_boston_standardized.py: -------------------------------------------------------------------------------- 1 | # Regression Example With Boston Dataset: Standardized 2 | import numpy 3 | import pandas 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.wrappers.scikit_learn import KerasRegressor 7 | from sklearn.model_selection import cross_val_score 8 | from sklearn.model_selection import KFold 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.pipeline import Pipeline 11 | # load dataset 12 | dataframe = pandas.read_csv("housing.csv", delim_whitespace=True, header=None) 13 | dataset = dataframe.values 14 | # split into input (X) and output (Y) variables 15 | X = dataset[:,0:13] 16 | Y = dataset[:,13] 17 | # define base model 18 | def baseline_model(): 19 | # create model 20 | model = Sequential() 21 | model.add(Dense(13, input_dim=13, init='normal', activation='relu')) 22 | model.add(Dense(1, init='normal')) 23 | # Compile model 24 | model.compile(loss='mean_squared_error', optimizer='adam') 25 | return model 26 | # fix random seed for reproducibility 27 | seed = 7 28 | numpy.random.seed(seed) 29 | # evaluate model with standardized dataset 30 | estimators = [] 31 | estimators.append(('standardize', StandardScaler())) 32 | estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, nb_epoch=50, batch_size=5, verbose=0))) 33 | pipeline = Pipeline(estimators) 34 | kfold = KFold(n_splits=10, random_state=seed) 35 | results = cross_val_score(pipeline, X, Y, cv=kfold) 36 | print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std())) 37 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/12_boston_standardized_larger.py: -------------------------------------------------------------------------------- 1 | # Regression Example With Boston Dataset: Standardized and Larger 2 | import numpy 3 | import pandas 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.wrappers.scikit_learn import KerasRegressor 7 | from sklearn.model_selection import cross_val_score 8 | from sklearn.model_selection import KFold 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.pipeline import Pipeline 11 | # load dataset 12 | dataframe = pandas.read_csv("housing.csv", delim_whitespace=True, header=None) 13 | dataset = dataframe.values 14 | # split into input (X) and output (Y) variables 15 | X = dataset[:,0:13] 16 | Y = dataset[:,13] 17 | # define the model 18 | def larger_model(): 19 | # create model 20 | model = Sequential() 21 | model.add(Dense(13, input_dim=13, init='normal', activation='relu')) 22 | model.add(Dense(6, init='normal', 
activation='relu')) 23 | model.add(Dense(1, init='normal')) 24 | # Compile model 25 | model.compile(loss='mean_squared_error', optimizer='adam') 26 | return model 27 | # fix random seed for reproducibility 28 | seed = 7 29 | numpy.random.seed(seed) 30 | # evaluate model with standardized dataset 31 | estimators = [] 32 | estimators.append(('standardize', StandardScaler())) 33 | estimators.append(('mlp', KerasRegressor(build_fn=larger_model, nb_epoch=50, batch_size=5, verbose=0))) 34 | pipeline = Pipeline(estimators) 35 | kfold = KFold(n_splits=10, random_state=seed) 36 | results = cross_val_score(pipeline, X, Y, cv=kfold) 37 | print("Larger: %.2f (%.2f) MSE" % (results.mean(), results.std())) 38 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/12_boston_standardized_wider.py: -------------------------------------------------------------------------------- 1 | # Regression Example With Boston Dataset: Standardized and Wider 2 | import numpy 3 | import pandas 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.wrappers.scikit_learn import KerasRegressor 7 | from sklearn.model_selection import cross_val_score 8 | from sklearn.model_selection import KFold 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.pipeline import Pipeline 11 | # load dataset 12 | dataframe = pandas.read_csv("housing.csv", delim_whitespace=True, header=None) 13 | dataset = dataframe.values 14 | # split into input (X) and output (Y) variables 15 | X = dataset[:,0:13] 16 | Y = dataset[:,13] 17 | # define wider model 18 | def wider_model(): 19 | # create model 20 | model = Sequential() 21 | model.add(Dense(20, input_dim=13, init='normal', activation='relu')) 22 | model.add(Dense(1, init='normal')) 23 | # Compile model 24 | model.compile(loss='mean_squared_error', optimizer='adam') 25 | return model 26 | # fix random seed for reproducibility 27 | seed = 7 28 | numpy.random.seed(seed) 29 | # evaluate model with standardized dataset 30 | estimators = [] 31 | estimators.append(('standardize', StandardScaler())) 32 | estimators.append(('mlp', KerasRegressor(build_fn=wider_model, nb_epoch=100, batch_size=5, verbose=0))) 33 | pipeline = Pipeline(estimators) 34 | kfold = KFold(n_splits=10, random_state=seed) 35 | results = cross_val_score(pipeline, X, Y, cv=kfold) 36 | print("Wider: %.2f (%.2f) MSE" % (results.mean(), results.std())) 37 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/14_checkpoint_best_model.py: -------------------------------------------------------------------------------- 1 | # Checkpoint the weights for best model on validation accuracy 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | from keras.callbacks import ModelCheckpoint 5 | import matplotlib.pyplot as plt 6 | import numpy 7 | # fix random seed for reproducibility 8 | seed = 7 9 | numpy.random.seed(seed) 10 | # load pima indians dataset 11 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 12 | # split into input (X) and output (Y) variables 13 | X = dataset[:,0:8] 14 | Y = dataset[:,8] 15 | # create model 16 | model = Sequential() 17 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 18 | model.add(Dense(8, init='uniform', activation='relu')) 19 | model.add(Dense(1, init='uniform', activation='sigmoid')) 20 | # Compile model 21 | model.compile(loss='binary_crossentropy', optimizer='adam', 
metrics=['accuracy']) 22 | # checkpoint 23 | filepath="weights.best.hdf5" 24 | checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max') 25 | callbacks_list = [checkpoint] 26 | # Fit the model 27 | model.fit(X, Y, validation_split=0.33, nb_epoch=150, batch_size=10, callbacks=callbacks_list, verbose=0) 28 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/14_checkpoint_load.py: -------------------------------------------------------------------------------- 1 | # How to load and use weights from a checkpoint 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | from keras.callbacks import ModelCheckpoint 5 | import matplotlib.pyplot as plt 6 | import numpy 7 | # fix random seed for reproducibility 8 | seed = 7 9 | numpy.random.seed(seed) 10 | # create model 11 | model = Sequential() 12 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 13 | model.add(Dense(8, init='uniform', activation='relu')) 14 | model.add(Dense(1, init='uniform', activation='sigmoid')) 15 | # load weights 16 | model.load_weights("weights.best.hdf5") 17 | # Compile model (required to make predictions) 18 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 19 | print("Created model and loaded weights from file") 20 | # load pima indians dataset 21 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 22 | # split into input (X) and output (Y) variables 23 | X = dataset[:,0:8] 24 | Y = dataset[:,8] 25 | # estimate accuracy on whole dataset using loaded weights 26 | scores = model.evaluate(X, Y, verbose=0) 27 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 28 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/14_checkpoint_model_improvements.py: -------------------------------------------------------------------------------- 1 | # Checkpoint the weights when validation accuracy improves 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | from keras.callbacks import ModelCheckpoint 5 | import matplotlib.pyplot as plt 6 | import numpy 7 | # fix random seed for reproducibility 8 | seed = 7 9 | numpy.random.seed(seed) 10 | # load pima indians dataset 11 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 12 | # split into input (X) and output (Y) variables 13 | X = dataset[:,0:8] 14 | Y = dataset[:,8] 15 | # create model 16 | model = Sequential() 17 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 18 | model.add(Dense(8, init='uniform', activation='relu')) 19 | model.add(Dense(1, init='uniform', activation='sigmoid')) 20 | # Compile model 21 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 22 | # checkpoint 23 | filepath="weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5" 24 | checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max') 25 | callbacks_list = [checkpoint] 26 | # Fit the model 27 | model.fit(X, Y, validation_split=0.33, nb_epoch=150, batch_size=10, callbacks=callbacks_list, verbose=0) 28 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/15_plot_history.py: -------------------------------------------------------------------------------- 1 | # Visualize training history 2 | from keras.models import Sequential 3 | from keras.layers 
import Dense 4 | import matplotlib.pyplot as plt 5 | import numpy 6 | # fix random seed for reproducibility 7 | seed = 7 8 | numpy.random.seed(seed) 9 | # load pima indians dataset 10 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 11 | # split into input (X) and output (Y) variables 12 | X = dataset[:,0:8] 13 | Y = dataset[:,8] 14 | # create model 15 | model = Sequential() 16 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 17 | model.add(Dense(8, init='uniform', activation='relu')) 18 | model.add(Dense(1, init='uniform', activation='sigmoid')) 19 | # Compile model 20 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 21 | # Fit the model 22 | history = model.fit(X, Y, validation_split=0.33, nb_epoch=150, batch_size=10, verbose=0) 23 | # list all data in history 24 | print(history.history.keys()) 25 | # summarize history for accuracy 26 | plt.plot(history.history['acc']) 27 | plt.plot(history.history['val_acc']) 28 | plt.title('model accuracy') 29 | plt.ylabel('accuracy') 30 | plt.xlabel('epoch') 31 | plt.legend(['train', 'test'], loc='upper left') 32 | plt.show() 33 | # summarize history for loss 34 | plt.plot(history.history['loss']) 35 | plt.plot(history.history['val_loss']) 36 | plt.title('model loss') 37 | plt.ylabel('loss') 38 | plt.xlabel('epoch') 39 | plt.legend(['train', 'test'], loc='upper left') 40 | plt.show() 41 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/17_decay_drop_based.py: -------------------------------------------------------------------------------- 1 | # Drop-Based Learning Rate Decay 2 | import pandas 3 | import pandas 4 | import numpy 5 | import math 6 | from keras.models import Sequential 7 | from keras.layers import Dense 8 | from keras.optimizers import SGD 9 | from sklearn.preprocessing import LabelEncoder 10 | from keras.callbacks import LearningRateScheduler 11 | 12 | # learning rate schedule 13 | def step_decay(epoch): 14 | initial_lrate = 0.1 15 | drop = 0.5 16 | epochs_drop = 10.0 17 | lrate = initial_lrate * math.pow(drop, math.floor((1+epoch)/epochs_drop)) 18 | return lrate 19 | 20 | # fix random seed for reproducibility 21 | seed = 7 22 | numpy.random.seed(seed) 23 | # load dataset 24 | dataframe = pandas.read_csv("ionosphere.csv", header=None) 25 | dataset = dataframe.values 26 | # split into input (X) and output (Y) variables 27 | X = dataset[:,0:34].astype(float) 28 | Y = dataset[:,34] 29 | # encode class values as integers 30 | encoder = LabelEncoder() 31 | encoder.fit(Y) 32 | Y = encoder.transform(Y) 33 | # create model 34 | model = Sequential() 35 | model.add(Dense(34, input_dim=34, init='normal', activation='relu')) 36 | model.add(Dense(1, init='normal', activation='sigmoid')) 37 | # Compile model 38 | sgd = SGD(lr=0.0, momentum=0.9, decay=0.0, nesterov=False) 39 | model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy']) 40 | # learning schedule callback 41 | lrate = LearningRateScheduler(step_decay) 42 | callbacks_list = [lrate] 43 | # Fit the model 44 | model.fit(X, Y, validation_split=0.33, nb_epoch=50, batch_size=28, callbacks=callbacks_list, verbose=2) 45 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/17_decay_time_based.py: -------------------------------------------------------------------------------- 1 | # Time Based Learning Rate Decay 2 | import pandas 3 | import numpy 4 | from keras.models import 
Sequential 5 | from keras.layers import Dense 6 | from keras.optimizers import SGD 7 | from sklearn.preprocessing import LabelEncoder 8 | # fix random seed for reproducibility 9 | seed = 7 10 | numpy.random.seed(seed) 11 | # load dataset 12 | dataframe = pandas.read_csv("ionosphere.csv", header=None) 13 | dataset = dataframe.values 14 | # split into input (X) and output (Y) variables 15 | X = dataset[:,0:34].astype(float) 16 | Y = dataset[:,34] 17 | # encode class values as integers 18 | encoder = LabelEncoder() 19 | encoder.fit(Y) 20 | Y = encoder.transform(Y) 21 | # create model 22 | model = Sequential() 23 | model.add(Dense(34, input_dim=34, init='normal', activation='relu')) 24 | model.add(Dense(1, init='normal', activation='sigmoid')) 25 | # Compile model 26 | epochs = 50 27 | learning_rate = 0.1 28 | decay_rate = learning_rate / epochs 29 | momentum = 0.8 30 | sgd = SGD(lr=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False) 31 | model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy']) 32 | # Fit the model 33 | model.fit(X, Y, validation_split=0.33, nb_epoch=epochs, batch_size=28, verbose=2) 34 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/19_mnist_mlp_baseline.py: -------------------------------------------------------------------------------- 1 | # Baseline MLP for MNIST dataset 2 | import numpy 3 | from keras.datasets import mnist 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import Dropout 7 | from keras.utils import np_utils 8 | # fix random seed for reproducibility 9 | seed = 7 10 | numpy.random.seed(seed) 11 | # load data 12 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 13 | # flatten 28*28 images to a 784 vector for each image 14 | num_pixels = X_train.shape[1] * X_train.shape[2] 15 | X_train = X_train.reshape(X_train.shape[0], num_pixels).astype('float32') 16 | X_test = X_test.reshape(X_test.shape[0], num_pixels).astype('float32') 17 | # normalize inputs from 0-255 to 0-1 18 | X_train = X_train / 255 19 | X_test = X_test / 255 20 | # one hot encode outputs 21 | y_train = np_utils.to_categorical(y_train) 22 | y_test = np_utils.to_categorical(y_test) 23 | num_classes = y_test.shape[1] 24 | # define baseline model 25 | def baseline_model(): 26 | # create model 27 | model = Sequential() 28 | model.add(Dense(num_pixels, input_dim=num_pixels, init='normal', activation='relu')) 29 | model.add(Dense(num_classes, init='normal', activation='softmax')) 30 | # Compile model 31 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 32 | return model 33 | # build the model 34 | model = baseline_model() 35 | # Fit the model 36 | model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=10, batch_size=200, verbose=2) 37 | # Final evaluation of the model 38 | scores = model.evaluate(X_test, y_test, verbose=0) 39 | print("Baseline Error: %.2f%%" % (100-scores[1]*100)) 40 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/19_mnist_plot.py: -------------------------------------------------------------------------------- 1 | # Plot ad hoc mnist instances 2 | from keras.datasets import mnist 3 | import matplotlib.pyplot as plt 4 | # load (downloaded if needed) the MNIST dataset 5 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 6 | # plot 4 images as gray scale 7 | plt.subplot(221) 8 | plt.imshow(X_train[0], 
cmap=plt.get_cmap('gray')) 9 | plt.subplot(222) 10 | plt.imshow(X_train[1], cmap=plt.get_cmap('gray')) 11 | plt.subplot(223) 12 | plt.imshow(X_train[2], cmap=plt.get_cmap('gray')) 13 | plt.subplot(224) 14 | plt.imshow(X_train[3], cmap=plt.get_cmap('gray')) 15 | # show the plot 16 | plt.show() 17 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_baseline.py: -------------------------------------------------------------------------------- 1 | # Plot of images as baseline for comparison 2 | from keras.datasets import mnist 3 | from matplotlib import pyplot 4 | # load data 5 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 6 | # create a grid of 3x3 images 7 | for i in range(0, 9): 8 | pyplot.subplot(330 + 1 + i) 9 | pyplot.imshow(X_train[i], cmap=pyplot.get_cmap('gray')) 10 | 11 | # show the plot 12 | pyplot.show() -------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_feature_standardize.py: -------------------------------------------------------------------------------- 1 | # Standardize images across the dataset, mean=0, stdev=1 2 | from keras.datasets import mnist 3 | from keras.preprocessing.image import ImageDataGenerator 4 | from matplotlib import pyplot 5 | # load data 6 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 7 | # reshape to be [samples][pixels][width][height] 8 | X_train = X_train.reshape(X_train.shape[0], 1, 28, 28) 9 | X_test = X_test.reshape(X_test.shape[0], 1, 28, 28) 10 | # convert from int to float 11 | X_train = X_train.astype('float32') 12 | X_test = X_test.astype('float32') 13 | # define data preparation 14 | datagen = ImageDataGenerator(featurewise_center=True, featurewise_std_normalization=True) 15 | # fit parameters from data 16 | datagen.fit(X_train) 17 | # configure batch size and retrieve one batch of images 18 | for X_batch, y_batch in datagen.flow(X_train, y_train, batch_size=9): 19 | # create a grid of 3x3 images 20 | for i in range(0, 9): 21 | pyplot.subplot(330 + 1 + i) 22 | pyplot.imshow(X_batch[i].reshape(28, 28), cmap=pyplot.get_cmap('gray')) 23 | # show the plot 24 | pyplot.show() 25 | break 26 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_flips.py: -------------------------------------------------------------------------------- 1 | # Random Flips 2 | from keras.datasets import mnist 3 | from keras.preprocessing.image import ImageDataGenerator 4 | from matplotlib import pyplot 5 | # load data 6 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 7 | # reshape to be [samples][pixels][width][height] 8 | X_train = X_train.reshape(X_train.shape[0], 1, 28, 28) 9 | X_test = X_test.reshape(X_test.shape[0], 1, 28, 28) 10 | # convert from int to float 11 | X_train = X_train.astype('float32') 12 | X_test = X_test.astype('float32') 13 | # define data preparation 14 | datagen = ImageDataGenerator(horizontal_flip=True, vertical_flip=True) 15 | # fit parameters from data 16 | datagen.fit(X_train) 17 | # configure batch size and retrieve one batch of images 18 | for X_batch, y_batch in datagen.flow(X_train, y_train, batch_size=9): 19 | # create a grid of 3x3 images 20 | for i in range(0, 9): 21 | pyplot.subplot(330 + 1 + i) 22 | pyplot.imshow(X_batch[i].reshape(28, 28), cmap=pyplot.get_cmap('gray')) 23 | # show the plot 24 | pyplot.show() 25 | break 
-------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_rotations.py: -------------------------------------------------------------------------------- 1 | # Random Rotations 2 | from keras.datasets import mnist 3 | from keras.preprocessing.image import ImageDataGenerator 4 | from matplotlib import pyplot 5 | # load data 6 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 7 | # reshape to be [samples][pixels][width][height] 8 | X_train = X_train.reshape(X_train.shape[0], 1, 28, 28) 9 | X_test = X_test.reshape(X_test.shape[0], 1, 28, 28) 10 | # convert from int to float 11 | X_train = X_train.astype('float32') 12 | X_test = X_test.astype('float32') 13 | # define data preparation 14 | datagen = ImageDataGenerator(rotation_range=90) 15 | # fit parameters from data 16 | datagen.fit(X_train) 17 | # configure batch size and retrieve one batch of images 18 | for X_batch, y_batch in datagen.flow(X_train, y_train, batch_size=9): 19 | # create a grid of 3x3 images 20 | for i in range(0, 9): 21 | pyplot.subplot(330 + 1 + i) 22 | pyplot.imshow(X_batch[i].reshape(28, 28), cmap=pyplot.get_cmap('gray')) 23 | # show the plot 24 | pyplot.show() 25 | break -------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_save_to_file.py: -------------------------------------------------------------------------------- 1 | # Save augmented images to file 2 | from keras.datasets import mnist 3 | from keras.preprocessing.image import ImageDataGenerator 4 | from matplotlib import pyplot 5 | import os 6 | from keras import backend as K 7 | K.set_image_dim_ordering('th') 8 | # load data 9 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 10 | # reshape to be [samples][pixels][width][height] 11 | X_train = X_train.reshape(X_train.shape[0], 1, 28, 28) 12 | X_test = X_test.reshape(X_test.shape[0], 1, 28, 28) 13 | # convert from int to float 14 | X_train = X_train.astype('float32') 15 | X_test = X_test.astype('float32') 16 | # define data preparation 17 | datagen = ImageDataGenerator() 18 | # fit parameters from data 19 | datagen.fit(X_train) 20 | # configure batch size and retrieve one batch of images 21 | os.makedirs('images') 22 | for X_batch, y_batch in datagen.flow(X_train, y_train, batch_size=9, save_to_dir='images', save_prefix='aug', save_format='png'): 23 | # create a grid of 3x3 images 24 | for i in range(0, 9): 25 | pyplot.subplot(330 + 1 + i) 26 | pyplot.imshow(X_batch[i].reshape(28, 28), cmap=pyplot.get_cmap('gray')) 27 | # show the plot 28 | pyplot.show() 29 | break -------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_shifts.py: -------------------------------------------------------------------------------- 1 | # Random Shifts 2 | from keras.datasets import mnist 3 | from keras.preprocessing.image import ImageDataGenerator 4 | from matplotlib import pyplot 5 | # load data 6 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 7 | # reshape to be [samples][pixels][width][height] 8 | X_train = X_train.reshape(X_train.shape[0], 1, 28, 28) 9 | X_test = X_test.reshape(X_test.shape[0], 1, 28, 28) 10 | # convert from int to float 11 | X_train = X_train.astype('float32') 12 | X_test = X_test.astype('float32') 13 | # define data preparation 14 | shift = 0.2 15 | datagen = ImageDataGenerator(width_shift_range=shift, height_shift_range=shift) 16 | # fit parameters from data 17 | 
datagen.fit(X_train) 18 | # configure batch size and retrieve one batch of images 19 | for X_batch, y_batch in datagen.flow(X_train, y_train, batch_size=9): 20 | # create a grid of 3x3 images 21 | for i in range(0, 9): 22 | pyplot.subplot(330 + 1 + i) 23 | pyplot.imshow(X_batch[i].reshape(28, 28), cmap=pyplot.get_cmap('gray')) 24 | # show the plot 25 | pyplot.show() 26 | break -------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_zca.py: -------------------------------------------------------------------------------- 1 | # ZCA whitening 2 | from keras.datasets import mnist 3 | from keras.preprocessing.image import ImageDataGenerator 4 | from matplotlib import pyplot 5 | # load data 6 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 7 | # reshape to be [samples][pixels][width][height] 8 | X_train = X_train.reshape(X_train.shape[0], 1, 28, 28) 9 | X_test = X_test.reshape(X_test.shape[0], 1, 28, 28) 10 | # convert from int to float 11 | X_train = X_train.astype('float32') 12 | X_test = X_test.astype('float32') 13 | # define data preparation 14 | datagen = ImageDataGenerator(zca_whitening=True) 15 | # fit parameters from data 16 | datagen.fit(X_train) 17 | # configure batch size and retrieve one batch of images 18 | for X_batch, y_batch in datagen.flow(X_train, y_train, batch_size=9): 19 | # create a grid of 3x3 images 20 | for i in range(0, 9): 21 | pyplot.subplot(330 + 1 + i) 22 | pyplot.imshow(X_batch[i].reshape(28, 28), cmap=pyplot.get_cmap('gray')) 23 | # show the plot 24 | pyplot.show() 25 | break 26 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/21_cifar10_plot.py: -------------------------------------------------------------------------------- 1 | # Plot ad hoc CIFAR10 instances 2 | from keras.datasets import cifar10 3 | from matplotlib import pyplot 4 | from scipy.misc import toimage 5 | # load data 6 | (X_train, y_train), (X_test, y_test) = cifar10.load_data() 7 | # create a grid of 3x3 images 8 | for i in range(0, 9): 9 | pyplot.subplot(330 + 1 + i) 10 | pyplot.imshow(toimage(X_train[i])) 11 | # show the plot 12 | pyplot.show() -------------------------------------------------------------------------------- /deep_learning_with_python_code/22_imdb_cnn.py: -------------------------------------------------------------------------------- 1 | # CNN for the IMDB problem 2 | import numpy 3 | from keras.datasets import imdb 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import Flatten 7 | from keras.layers.convolutional import Convolution1D 8 | from keras.layers.convolutional import MaxPooling1D 9 | from keras.layers.embeddings import Embedding 10 | from keras.preprocessing import sequence 11 | # fix random seed for reproducibility 12 | seed = 7 13 | numpy.random.seed(seed) 14 | # load the dataset but only keep the top n words, zero the rest 15 | top_words = 5000 16 | (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words) 17 | # pad dataset to a maximum review length in words 18 | max_words = 500 19 | X_train = sequence.pad_sequences(X_train, maxlen=max_words) 20 | X_test = sequence.pad_sequences(X_test, maxlen=max_words) 21 | # create the model 22 | model = Sequential() 23 | model.add(Embedding(top_words, 32, input_length=max_words)) 24 | model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu')) 25 | model.add(MaxPooling1D(pool_length=2)) 
26 | model.add(Flatten()) 27 | model.add(Dense(250, activation='relu')) 28 | model.add(Dense(1, activation='sigmoid')) 29 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 30 | print(model.summary()) 31 | # Fit the model 32 | model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=2, batch_size=128, verbose=1) 33 | # Final evaluation of the model 34 | scores = model.evaluate(X_test, y_test, verbose=0) 35 | print("Accuracy: %.2f%%" % (scores[1]*100)) 36 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/22_imdb_mlp.py: -------------------------------------------------------------------------------- 1 | # MLP for the IMDB problem 2 | import numpy 3 | from keras.datasets import imdb 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import Flatten 7 | from keras.layers.embeddings import Embedding 8 | from keras.preprocessing import sequence 9 | # fix random seed for reproducibility 10 | seed = 7 11 | numpy.random.seed(seed) 12 | # load the dataset but only keep the top n words, zero the rest 13 | top_words = 5000 14 | (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words) 15 | max_words = 500 16 | X_train = sequence.pad_sequences(X_train, maxlen=max_words) 17 | X_test = sequence.pad_sequences(X_test, maxlen=max_words) 18 | # create the model 19 | model = Sequential() 20 | model.add(Embedding(top_words, 32, input_length=max_words)) 21 | model.add(Flatten()) 22 | model.add(Dense(250, activation='relu')) 23 | model.add(Dense(1, activation='sigmoid')) 24 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 25 | print(model.summary()) 26 | # Fit the model 27 | model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=2, batch_size=128, verbose=1) 28 | # Final evaluation of the model 29 | scores = model.evaluate(X_test, y_test, verbose=0) 30 | print("Accuracy: %.2f%%" % (scores[1]*100)) 31 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/22_imdb_plot.py: -------------------------------------------------------------------------------- 1 | # Load and Plot the IMDB dataset 2 | import numpy 3 | from keras.datasets import imdb 4 | from matplotlib import pyplot 5 | # load the dataset 6 | (X_train, y_train), (X_test, y_test) = imdb.load_data() 7 | X = numpy.concatenate((X_train, X_test), axis=0) 8 | y = numpy.concatenate((y_train, y_test), axis=0) 9 | # summarize size 10 | print("Training data: ") 11 | print(X.shape) 12 | print(y.shape) 13 | # Summarize number of classes 14 | print("Classes: ") 15 | print(numpy.unique(y)) 16 | # Summarize number of words 17 | print("Number of words: ") 18 | print(len(numpy.unique(numpy.hstack(X)))) 19 | # Summarize review length 20 | print("Review length: ") 21 | result = map(len, X) 22 | print("Mean %.2f words (%f)" % (numpy.mean(result), numpy.std(result))) 23 | # plot review length as a boxplot and histogram 24 | pyplot.subplot(121) 25 | pyplot.boxplot(result) 26 | pyplot.subplot(122) 27 | pyplot.hist(result) 28 | pyplot.show() 29 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/26_lstm_cnn.py: -------------------------------------------------------------------------------- 1 | # LSTM and CNN for sequence classification in the IMDB dataset 2 | import numpy 3 | from keras.datasets import imdb 4 | from keras.models import 
Sequential 5 | from keras.layers import Dense 6 | from keras.layers import LSTM 7 | from keras.layers.convolutional import Convolution1D 8 | from keras.layers.convolutional import MaxPooling1D 9 | from keras.layers.embeddings import Embedding 10 | from keras.preprocessing import sequence 11 | # fix random seed for reproducibility 12 | numpy.random.seed(7) 13 | # load the dataset but only keep the top n words, zero the rest 14 | top_words = 5000 15 | (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words) 16 | # truncate and pad input sequences 17 | max_review_length = 500 18 | X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) 19 | X_test = sequence.pad_sequences(X_test, maxlen=max_review_length) 20 | # create the model 21 | embedding_vecor_length = 32 22 | model = Sequential() 23 | model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length)) 24 | model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu')) 25 | model.add(MaxPooling1D(pool_length=2)) 26 | model.add(LSTM(100)) 27 | model.add(Dense(1, activation='sigmoid')) 28 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 29 | print(model.summary()) 30 | model.fit(X_train, y_train, nb_epoch=3, batch_size=64) 31 | # Final evaluation of the model 32 | scores = model.evaluate(X_test, y_test, verbose=0) 33 | print("Accuracy: %.2f%%" % (scores[1]*100)) -------------------------------------------------------------------------------- /deep_learning_with_python_code/26_lstm_dropout_gates.py: -------------------------------------------------------------------------------- 1 | # LSTM with dropout for sequence classification in the IMDB dataset 2 | import numpy 3 | from keras.datasets import imdb 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import LSTM 7 | from keras.layers.embeddings import Embedding 8 | from keras.preprocessing import sequence 9 | # fix random seed for reproducibility 10 | numpy.random.seed(7) 11 | # load the dataset but only keep the top n words, zero the rest 12 | top_words = 5000 13 | (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words) 14 | # truncate and pad input sequences 15 | max_review_length = 500 16 | X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) 17 | X_test = sequence.pad_sequences(X_test, maxlen=max_review_length) 18 | # create the model 19 | embedding_vecor_length = 32 20 | model = Sequential() 21 | model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length, dropout=0.2)) 22 | model.add(LSTM(100, dropout_W=0.2, dropout_U=0.2)) 23 | model.add(Dense(1, activation='sigmoid')) 24 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 25 | print(model.summary()) 26 | model.fit(X_train, y_train, nb_epoch=3, batch_size=64) 27 | # Final evaluation of the model 28 | scores = model.evaluate(X_test, y_test, verbose=0) 29 | print("Accuracy: %.2f%%" % (scores[1]*100)) -------------------------------------------------------------------------------- /deep_learning_with_python_code/26_lstm_dropout_layers.py: -------------------------------------------------------------------------------- 1 | # LSTM with Dropout for sequence classification in the IMDB dataset 2 | import numpy 3 | from keras.datasets import imdb 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import LSTM 7 | from keras.layers import Dropout 8 
| from keras.layers.embeddings import Embedding 9 | from keras.preprocessing import sequence 10 | # fix random seed for reproducibility 11 | numpy.random.seed(7) 12 | # load the dataset but only keep the top n words, zero the rest 13 | top_words = 5000 14 | (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words) 15 | # truncate and pad input sequences 16 | max_review_length = 500 17 | X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) 18 | X_test = sequence.pad_sequences(X_test, maxlen=max_review_length) 19 | # create the model 20 | embedding_vecor_length = 32 21 | model = Sequential() 22 | model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length, dropout=0.2)) 23 | model.add(Dropout(0.2)) 24 | model.add(LSTM(100)) 25 | model.add(Dropout(0.2)) 26 | model.add(Dense(1, activation='sigmoid')) 27 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 28 | print(model.summary()) 29 | model.fit(X_train, y_train, nb_epoch=3, batch_size=64) 30 | # Final evaluation of the model 31 | scores = model.evaluate(X_test, y_test, verbose=0) 32 | print("Accuracy: %.2f%%" % (scores[1]*100)) -------------------------------------------------------------------------------- /deep_learning_with_python_code/26_lstm_simple.py: -------------------------------------------------------------------------------- 1 | # LSTM for sequence classification in the IMDB dataset 2 | import numpy 3 | from keras.datasets import imdb 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import LSTM 7 | from keras.layers.embeddings import Embedding 8 | from keras.preprocessing import sequence 9 | # fix random seed for reproducibility 10 | numpy.random.seed(7) 11 | # load the dataset but only keep the top n words, zero the rest 12 | top_words = 5000 13 | (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words) 14 | # truncate and pad input sequences 15 | max_review_length = 500 16 | X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) 17 | X_test = sequence.pad_sequences(X_test, maxlen=max_review_length) 18 | # create the model 19 | embedding_vecor_length = 32 20 | model = Sequential() 21 | model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length)) 22 | model.add(LSTM(100)) 23 | model.add(Dense(1, activation='sigmoid')) 24 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 25 | print(model.summary()) 26 | model.fit(X_train, y_train, nb_epoch=3, batch_size=64) 27 | # Final evaluation of the model 28 | scores = model.evaluate(X_test, y_test, verbose=0) 29 | print("Accuracy: %.2f%%" % (scores[1]*100)) -------------------------------------------------------------------------------- /deep_learning_with_python_code/weights-improvement-19-1.9435.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/deep_learning_with_python_code/weights-improvement-19-1.9435.hdf5 -------------------------------------------------------------------------------- /deep_learning_with_python_code/weights-improvement-47-1.2219-bigger.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/deep_learning_with_python_code/weights-improvement-47-1.2219-bigger.hdf5 
-------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/1-AnalyzeData/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/1-LoadData/datasets-mlbench.R: -------------------------------------------------------------------------------- 1 | # Datasets from the mlbench library 2 | 3 | # load the library 4 | library(mlbench) 5 | 6 | # list the contents of the library 7 | library(help = "mlbench") 8 | 9 | # Boston Housing dataset 10 | data(BostonHousing) 11 | dim(BostonHousing) 12 | head(BostonHousing) 13 | 14 | # Wisconsin Breast Cancer dataset 15 | data(BreastCancer) 16 | dim(BreastCancer) 17 | levels(BreastCancer$Class) 18 | head(BreastCancer) 19 | 20 | # Glass Identification dataset 21 | data(Glass) 22 | dim(Glass) 23 | levels(Glass$Type) 24 | head(Glass) 25 | 26 | # Johns Hopkins University Ionosphere dataset 27 | data(Ionosphere) 28 | dim(Ionosphere) 29 | levels(Ionosphere$Class) 30 | head(Ionosphere) 31 | 32 | # Pima Indians Diabetes dataset 33 | data(PimaIndiansDiabetes) 34 | dim(PimaIndiansDiabetes) 35 | levels(PimaIndiansDiabetes$diabetes) 36 | head(PimaIndiansDiabetes) 37 | 38 | # Sonar, Mines vs. 
Rocks dataset 39 | data(Sonar) 40 | dim(Sonar) 41 | levels(Sonar$Class) 42 | head(Sonar) 43 | 44 | # Soybean dataset 45 | data(Soybean) 46 | dim(Soybean) 47 | levels(Soybean$Class) 48 | head(Soybean) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/1-LoadData/datasets_appliedpredictivemodeling.R: -------------------------------------------------------------------------------- 1 | # Datasets from the AppliedPredictiveModeling library 2 | 3 | # load the library 4 | library(AppliedPredictiveModeling) 5 | 6 | # list the contents of the library 7 | library(help = "AppliedPredictiveModeling") 8 | 9 | # Abalone Data 10 | data(abalone) 11 | dim(abalone) 12 | head(abalone) 13 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/1-LoadData/datasets_datasets.R: -------------------------------------------------------------------------------- 1 | # Datasets from the dataset library 2 | 3 | # list the contents of the library 4 | library(help = "datasets") 5 | 6 | # list all available datasets in all loaded libraries 7 | data() 8 | 9 | # Iris flowers datasets 10 | data(iris) 11 | dim(iris) 12 | levels(iris$Species) 13 | head(iris) 14 | 15 | # Longley's Economic Regression Data 16 | data(longley) 17 | dim(longley) 18 | head(longley) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/1-LoadData/load_csv_file.R: -------------------------------------------------------------------------------- 1 | # Load data from a CSV file in the local directory 2 | 3 | # define the filename 4 | filename <- "iris.csv" 5 | # load the CSV file from the local directory 6 | dataset <- read.csv(filename, header=FALSE) 7 | # preview the first 5 rows 8 | head(dataset) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/1-LoadData/load_csv_url.R: -------------------------------------------------------------------------------- 1 | # Load CSV From a URL 2 | 3 | # load the library 4 | library(RCurl) 5 | # specify the URL for the Iris data CSV 6 | urlfile <-'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data' 7 | # download the file 8 | downloaded <- getURL(urlfile, ssl.verifypeer=FALSE) 9 | # treat the text data as a steam so we can read from it 10 | connection <- textConnection(downloaded) 11 | # parse the downloaded data as CSV 12 | dataset <- read.csv(connection, header=FALSE) 13 | # preview the first 5 rows 14 | head(dataset) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/class_distribution.R: -------------------------------------------------------------------------------- 1 | # Class Distribution 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # distribution of class variable 8 | y <- PimaIndiansDiabetes$diabetes 9 | cbind(freq=table(y), percentage=prop.table(table(y))*100) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/correction_spearman.R: -------------------------------------------------------------------------------- 1 | # Pair-wise correlations using pearson spearman coefficients 2 | 3 | # load the libraries 4 | 
library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # calculate a correlation matrix for numeric variables 8 | correlations <- cor(PimaIndiansDiabetes[,1:8], method="spearman") 9 | # display the correlation matrix 10 | print(correlations) 11 | 12 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/correlation_pearson.R: -------------------------------------------------------------------------------- 1 | # Pair-wise correlations using pearson correlation coefficients 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # calculate a correlation matrix for numeric variables 8 | correlations <- cor(PimaIndiansDiabetes[,1:8]) 9 | # display the correlation matrix 10 | print(correlations) 11 | 12 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/data_types.R: -------------------------------------------------------------------------------- 1 | # Data Types 2 | 3 | # load library 4 | library(mlbench) 5 | # load dataset 6 | data(BostonHousing) 7 | # list types for each attribute 8 | sapply(BostonHousing, class) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/dimensions.R: -------------------------------------------------------------------------------- 1 | # Dimensions of your dataset 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # display the dimensions of the dataset 8 | dim(PimaIndiansDiabetes) 9 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/peek.R: -------------------------------------------------------------------------------- 1 | # Peek at raw data 2 | 3 | # load the library 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # display first 20 rows of data 8 | head(PimaIndiansDiabetes, n=20) 9 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/skewness.R: -------------------------------------------------------------------------------- 1 | # Calculate Skewness 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(e1071) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # calculate skewness for each variable 9 | skew <- apply(PimaIndiansDiabetes[,1:8], 2, skewness) 10 | # display skewness, larger/smaller deviations from 0 show more skew 11 | print(skew) 12 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/standard_deviation.R: -------------------------------------------------------------------------------- 1 | # Standard Deviation 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # calculate standard deviation for all attributes 8 | sapply(PimaIndiansDiabetes[,1:8], sd) 9 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/summary.R: -------------------------------------------------------------------------------- 1 | # Summarize each attribute of a 
dataset using min, max, mean, 25%, 50% and 75%. 2 | 3 | 4 | # load the iris dataset 5 | data(iris) 6 | # summarize the dataset 7 | summary(iris) 8 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/1-Univariate/barplot.R: -------------------------------------------------------------------------------- 1 | # Plot Factor 2 | 3 | # load the library 4 | library(mlbench) 5 | # load the dataset 6 | data(BreastCancer) 7 | # create a bar plot of each categorical attribute 8 | par(mfrow=c(2,4)) 9 | for(i in 2:9) { 10 | counts <- table(BreastCancer[,i]) 11 | name <- names(BreastCancer)[i] 12 | barplot(counts, main=name) 13 | } 14 | 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/1-Univariate/boxplot.R: -------------------------------------------------------------------------------- 1 | # Univarate Box And Whisker Plots 2 | 3 | # load dataset 4 | data(iris) 5 | # Create separate boxplots for each attribute 6 | par(mfrow=c(1,4)) 7 | for(i in 1:4) { 8 | boxplot(iris[,i], main=names(iris)[i]) 9 | } 10 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/1-Univariate/density_plot.R: -------------------------------------------------------------------------------- 1 | # Univariate Density Plots 2 | 3 | # load libraries 4 | library(lattice) 5 | # load dataset 6 | data(iris) 7 | # create a panel of simpler density plots by attribute 8 | par(mfrow=c(1,4)) 9 | for(i in 1:4) { 10 | plot(density(iris[,i]), main=names(iris)[i]) 11 | } 12 | 13 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/1-Univariate/histogram.R: -------------------------------------------------------------------------------- 1 | # Univariate Histograms 2 | 3 | # load the data 4 | data(iris) 5 | # create histograms for each attribute 6 | par(mfrow=c(1,4)) 7 | for(i in 1:4) { 8 | hist(iris[,i], main=names(iris)[i]) 9 | } -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/1-Univariate/missing_plot.R: -------------------------------------------------------------------------------- 1 | # Plot missing data 2 | 3 | # load libraries 4 | library(Amelia) 5 | library(mlbench) 6 | # load dataset 7 | data(Soybean) 8 | # create a missing map 9 | missmap(Soybean, col=c("black", "grey"), legend=FALSE) 10 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/2-Multivaraite/boxplot_by_class.R: -------------------------------------------------------------------------------- 1 | # Create a box and whisker plots for each variable organized by class. 
2 | 3 | # load the caret library 4 | library(caret) 5 | # load the iris dataset 6 | data(iris) 7 | # box and whisker plots for each attribute by class value 8 | x <- iris[,1:4] 9 | y <- iris[,5] 10 | featurePlot(x=x, y=y, plot="box") 11 | 12 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/2-Multivaraite/correlation_plot.R: -------------------------------------------------------------------------------- 1 | # Correlation Plot 2 | 3 | # load library 4 | library(corrplot) 5 | # load the data 6 | data(iris) 7 | # calculate correlations 8 | correlations <- cor(iris[,1:4]) 9 | # create correlation plot 10 | corrplot(correlations, method="circle") -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/2-Multivaraite/density_plot_by_class.R: -------------------------------------------------------------------------------- 1 | # Create a density plot for each variable-class combination. 2 | 3 | # load the library 4 | library(caret) 5 | # load the data 6 | data(iris) 7 | # density plots for each attribute by class value 8 | x <- iris[,1:4] 9 | y <- iris[,5] 10 | scales <- list(x=list(relation="free"), y=list(relation="free")) 11 | featurePlot(x=x, y=y, plot="density", scales=scales) 12 | 13 | 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/2-Multivaraite/scatterplot_matrix.R: -------------------------------------------------------------------------------- 1 | # Multivariate Scatterplot Matrix 2 | 3 | # load the data 4 | data(iris) 5 | # pair-wise scatterplots of all 4 attributes 6 | pairs(iris) 7 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/2-Multivaraite/scatterplot_matrix_by_class.R: -------------------------------------------------------------------------------- 1 | # Multivariate Scatterplot Matrix By Class 2 | 3 | # load the data 4 | data(iris) 5 | # pair-wise scatterplots colored by class 6 | pairs(Species~., data=iris, col=iris$Species) 7 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/3-Projection/andrews_curves.R: -------------------------------------------------------------------------------- 1 | # Andrews Curves 2 | 3 | # load library 4 | library(andrews) 5 | # load dataset 6 | data(iris) 7 | # generate andres curves 8 | andrews(iris, clr=5, ymax=3) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/3-Projection/parallel_coordinates.R: -------------------------------------------------------------------------------- 1 | # Parallel Coordinates 2 | 3 | # load library 4 | library(MASS) 5 | # load dataset 6 | data(iris) 7 | # convert data frame to matrix 8 | iris_matrix <- data.matrix(iris) 9 | parcoord(iris_matrix) 10 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/3-Projection/pca.R: -------------------------------------------------------------------------------- 1 | # Principal Component Analysis 2 | 3 | # load the dataset 4 | data(iris) 5 | # separate numerical 
inputs 6 | x <- data.matrix(iris[,1:4]) 7 | y <- iris[,5] 8 | # calculate components 9 | components <- prcomp(x, center=TRUE, scale=TRUE) 10 | # display components 11 | print(components) 12 | # summarize components 13 | summary(components) 14 | # plot the components 15 | biplot(components) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/3-Projection/sammons.R: -------------------------------------------------------------------------------- 1 | # Sammons Mapping 2 | 3 | # load library 4 | library(MASS) 5 | # load dataset 6 | data(iris) 7 | # remove duplicates 8 | clean <- unique(iris) 9 | # split out numerical inputs 10 | x <- data.matrix(clean[, 1:4]) 11 | # create a sammon mapping 12 | mapping <- sammon(dist(x)) 13 | # plot mapping by class 14 | plot(mapping$points, type="n") 15 | text(mapping$points, labels=clean[,5]) 16 | 17 | # TODO colour dots by class -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/3-Projection/som.R: -------------------------------------------------------------------------------- 1 | # Self Organizing Map (Kohonen) 2 | 3 | # load the library 4 | library("kohonen") 5 | # load the dataset 6 | data(iris) 7 | # split input and output 8 | x <- data.matrix(iris[,1:4]) 9 | y <- iris[,5] 10 | # set the random seed for repeatable results 11 | set.seed(7) 12 | # create a map of the x values 13 | iris_map <- som(data=x, grid=somgrid(5, 5, "hexagonal")) 14 | # plot the map 15 | plot(iris_map) 16 | 17 | # TODO label the map by class 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/2-PrepareData/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/1-DataCleaning/impute_missing_values.R: -------------------------------------------------------------------------------- 1 | # Impute missing values 2 | 3 | # load the libraries 4 | library(mlbench) 5 | library(Hmisc) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # mark a pressure of 0 as N/A, it is impossible 9 | invalid <- 0 10 | PimaIndiansDiabetes$pressure[PimaIndiansDiabetes$pressure==invalid] <- NA 11 | # impute missing pressure values using the mean 12 | PimaIndiansDiabetes$pressure <- with(PimaIndiansDiabetes, impute(pressure, mean)) 13 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/1-DataCleaning/mark_missing_values.R: -------------------------------------------------------------------------------- 1 | # Mark Missing Values as N/A 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # mark a pressure of 0 as N/A, it is impossible 8 | invalid <- 0 9 | PimaIndiansDiabetes$pressure[PimaIndiansDiabetes$pressure==invalid] <- NA 10 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/1-DataCleaning/rebalance_SMOTE.R: -------------------------------------------------------------------------------- 1 |
# Rebalance a dataset using Synthetic Minority Over-sampling Technique (SMOTE) 2 | 3 | # load the libraries 4 | library(mlbench) 5 | library(DMwR) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # display count of instances of each class (unbalanced) 9 | table(PimaIndiansDiabetes$diabetes) 10 | # use SMOTE to created a "more balance" version of the dataset 11 | balanced <- SMOTE(diabetes~., PimaIndiansDiabetes, perc.over=300, perc.under=100) 12 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/1-DataCleaning/remove_duplicates.R: -------------------------------------------------------------------------------- 1 | # Remove Duplicate Instances 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(iris) 7 | dim(iris) 8 | # remove duplicates 9 | clean <- unique(iris) 10 | dim(clean) 11 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/1-DataCleaning/remove_na.R: -------------------------------------------------------------------------------- 1 | # Remove rows with NA 2 | 3 | # load library 4 | library(mlbench) 5 | # load dataset 6 | data(BreastCancer) 7 | # summarize dimensions of dataset 8 | dim(BreastCancer) 9 | # Remove all incomplete rows 10 | dataset <- BreastCancer[complete.cases(BreastCancer),] 11 | # summarize dimensions of resulting dataset 12 | dim(dataset) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/1-DataCleaning/remove_outliers.R: -------------------------------------------------------------------------------- 1 | # Update Data Frame to Remove Outliers 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # calculate stats for pregnant (number of times pregnant) 8 | pregnant.mean <- mean(PimaIndiansDiabetes$pregnant) 9 | pregnant.sd <- sd(PimaIndiansDiabetes$pregnant) 10 | # max reasonable value is within 99.7% of the data (if Gaussian) 11 | pregnant.max <- pregnant.mean + (3*pregnant.sd) 12 | # mark outlier pregnant values as N/A 13 | PimaIndiansDiabetes$pregnant[PimaIndiansDiabetes$pregnant>pregnant.max] <- NA 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/2-FeatureSelection/rank_features_by_importance_caret.R: -------------------------------------------------------------------------------- 1 | # Rank features by their importance. 
2 | 3 | # load the libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="cv", number=10) 10 | # train the model 11 | model <- train(diabetes~., data=PimaIndiansDiabetes, method="lvq", preProcess="scale", trControl=control) 12 | # estimate variable importance 13 | importance <- varImp(model, scale=FALSE) 14 | # summarize importance 15 | print(importance) 16 | # plot importance 17 | plot(importance) 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/2-FeatureSelection/recursive_feature_elimination_caret.R: -------------------------------------------------------------------------------- 1 | # Use RFE to select features 2 | 3 | # load the libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the data 7 | data(Sonar) 8 | # define the control using a random forest selection function 9 | control <- rfeControl(functions=rfFuncs, method="cv", number=10) 10 | # run the RFE algorithm 11 | x <- Sonar[,1:60] 12 | y <- Sonar[,61] 13 | sizes <- c(10,20,30,40,50,60) 14 | results <- rfe(x, y, sizes=sizes, rfeControl=control) 15 | # summarize the results 16 | print(results) 17 | # list the chosen features 18 | predictors(results) 19 | # plot accuracy versus the number of features 20 | plot(results, type=c("g", "o")) 21 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/2-FeatureSelection/remove_highly_correlated_features_caret.R: -------------------------------------------------------------------------------- 1 | # Identify and remove highly correlated features 2 | 3 | # load the libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the data 7 | data(PimaIndiansDiabetes) 8 | # calculate correlation matrix 9 | correlationMatrix <- cor(PimaIndiansDiabetes[,1:8]) 10 | # find attributes that are highly correlated (ideally >0.75) 11 | cutoff <- 0.50 12 | highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=cutoff) 13 | # create a new dataset without highly correlated features 14 | dataset <- PimaIndiansDiabetes[,-highlyCorrelated] 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/boxcox_transform.R: -------------------------------------------------------------------------------- 1 | # Box-Cox Transform (attributes must be numeric and >0) 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # summarize pedigree and age 9 | summary(PimaIndiansDiabetes[,7:8]) 10 | # calculate the pre-process parameters from the dataset 11 | preprocessParams <- preProcess(PimaIndiansDiabetes[,7:8], method=c("BoxCox")) 12 | # summarize transform parameters 13 | print(preprocessParams) 14 | # transform the dataset using the parameters 15 | transformed <- predict(preprocessParams, PimaIndiansDiabetes[,7:8]) 16 | # summarize the transformed dataset (note pedigree and age) 17 | summary(transformed) 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/center.R: -------------------------------------------------------------------------------- 1 | # Center attributes by subtracting the mean 2 | 3 | # load libraries 4 | library(caret) 5 | # load the dataset 6 |
data(iris) 7 | # summarize data 8 | summary(iris[,1:4]) 9 | # calculate the pre-process parameters from the dataset 10 | preprocessParams <- preProcess(iris[,1:4], method=c("center")) 11 | # summarize transform parameters 12 | print(preprocessParams) 13 | # transform the dataset using the parameters 14 | transformed <- predict(preprocessParams, iris[,1:4]) 15 | # summarize the transformed dataset 16 | summary(transformed) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/ica_transform.R: -------------------------------------------------------------------------------- 1 | # Independent Component Analysis Pre-processing 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # summarize dataset 9 | summary(PimaIndiansDiabetes[,1:8]) 10 | # calculate the pre-process parameters from the dataset 11 | preprocessParams <- preProcess(PimaIndiansDiabetes[,1:8], method=c("center", "scale", "ica"), n.comp=5) 12 | # summarize transform parameters 13 | print(preprocessParams) 14 | # transform the dataset using the parameters 15 | transformed <- predict(preprocessParams, PimaIndiansDiabetes[,1:8]) 16 | # summarize the transformed dataset 17 | summary(transformed) 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/normalize.R: -------------------------------------------------------------------------------- 1 | # Normalize numeric attributes to the range [0,1] 2 | 3 | # load libraries 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # summarize data 8 | summary(iris[,1:4]) 9 | # calculate the pre-process parameters from the dataset 10 | preprocessParams <- preProcess(iris[,1:4], method=c("range")) 11 | # summarize transform parameters 12 | print(preprocessParams) 13 | # transform the dataset using the parameters 14 | transformed <- predict(preprocessParams, iris[,1:4]) 15 | # summarize the transformed dataset 16 | summary(transformed) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/pca_transform.R: -------------------------------------------------------------------------------- 1 | # Principal Component Analysis Pre-processing 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(iris) 7 | # summarize dataset 8 | summary(iris) 9 | # calculate the pre-process parameters from the dataset 10 | preprocessParams <- preProcess(iris, method=c("center", "scale", "pca")) 11 | # summarize transform parameters 12 | print(preprocessParams) 13 | # transform the dataset using the parameters 14 | transformed <- predict(preprocessParams, iris) 15 | # summarize the transformed dataset 16 | summary(transformed) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/scale.R: -------------------------------------------------------------------------------- 1 | # Scale attributes by dividing by standard deviation 2 | 3 | # load libraries 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # summarize data 8 | summary(iris[,1:4]) 9 | # calculate the pre-process parameters from the dataset 10 | preprocessParams <- preProcess(iris[,1:4], method=c("scale")) 11 | # summarize transform parameters 12 | 
print(preprocessParams) 13 | # transform the dataset using the parameters 14 | transformed <- predict(preprocessParams, iris[,1:4]) 15 | # summarize the transformed dataset 16 | summary(transformed) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/standardize.R: -------------------------------------------------------------------------------- 1 | # Standardize numeric attributes so they have zero mean and unit variance. 2 | 3 | # load libraries 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # summarize data 8 | summary(iris[,1:4]) 9 | # calculate the pre-process parameters from the dataset 10 | preprocessParams <- preProcess(iris[,1:4], method=c("center", "scale")) 11 | # summarize transform parameters 12 | print(preprocessParams) 13 | # transform the dataset using the parameters 14 | transformed <- predict(preprocessParams, iris[,1:4]) 15 | # summarize the transformed dataset 16 | summary(transformed) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/yeojohnson_transform.R: -------------------------------------------------------------------------------- 1 | # Yeo-Johnson Transform 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # summarize pedigree and age 9 | summary(PimaIndiansDiabetes[,7:8]) 10 | # calculate the pre-process parameters from the dataset 11 | preprocessParams <- preProcess(PimaIndiansDiabetes[,7:8], method=c("YeoJohnson")) 12 | # summarize transform parameters 13 | print(preprocessParams) 14 | # transform the dataset using the parameters 15 | transformed <- predict(preprocessParams, PimaIndiansDiabetes[,7:8]) 16 | # summarize the transformed dataset (note pedigree and age) 17 | summary(transformed) 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/3-Algorithms/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/1-LinearRegression/ordinary_least_squares_regression.R: -------------------------------------------------------------------------------- 1 | # Ordinary Least Squares Regression 2 | 3 | # load data 4 | data(longley) 5 | # fit model 6 | fit <- lm(Employed~., longley) 7 | # summarize the fit 8 | print(fit) 9 | # make predictions 10 | predictions <- predict(fit, longley) 11 | # summarize accuracy 12 | mse <- mean((longley$Employed - predictions)^2) 13 | print(mse) 14 | 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/1-LinearRegression/partial_least_squares_regression.R: 
-------------------------------------------------------------------------------- 1 | # Partial Least Squares Regression 2 | 3 | # load the package 4 | library(pls) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- plsr(Employed~., data=longley, validation="CV") 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley, ncomp=6) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/1-LinearRegression/principal_component_regression.R: -------------------------------------------------------------------------------- 1 | # Principal Component Regression 2 | 3 | # load the package 4 | library(pls) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- pcr(Employed~., data=longley, validation="CV") 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley, ncomp=6) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/1-LinearRegression/stepwise_linear_regression.R: -------------------------------------------------------------------------------- 1 | # Stepwise Linear Regression 2 | 3 | # load data 4 | data(longley) 5 | # fit model 6 | base <- lm(Employed~., longley) 7 | # summarize the fit 8 | summary(base) 9 | # perform step-wise feature selection 10 | fit <- step(base) 11 | # summarize the selected model 12 | print(fit) 13 | # make predictions 14 | predictions <- predict(fit, longley) 15 | # summarize accuracy 16 | mse <- mean((longley$Employed - predictions)^2) 17 | print(mse) 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/2-PenalizedLinearRegression/LASSO.R: -------------------------------------------------------------------------------- 1 | # Least Absolute Shrinkage and Selection Operator 2 | 3 | # load the package 4 | library(lars) 5 | # load data 6 | data(longley) 7 | x <- as.matrix(longley[,1:6]) 8 | y <- as.matrix(longley[,7]) 9 | # fit model 10 | fit <- lars(x, y, type="lasso") 11 | # summarize the fit 12 | print(fit) 13 | # select a step with a minimum error 14 | best_step <- fit$df[which.min(fit$RSS)] 15 | # make predictions 16 | predictions <- predict(fit, x, s=best_step, type="fit")$fit 17 | # summarize accuracy 18 | mse <- mean((y - predictions)^2) 19 | print(mse) 20 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/2-PenalizedLinearRegression/elastic_net.R: -------------------------------------------------------------------------------- 1 | # Elastic Net 2 | 3 | # load the package 4 | library(glmnet) 5 | # load data 6 | data(longley) 7 | x <- as.matrix(longley[,1:6]) 8 | y <- as.matrix(longley[,7]) 9 | # fit model 10 | fit <- glmnet(x, y, family="gaussian", alpha=0.5, lambda=0.001) 11 | # summarize the fit 12 | print(fit) 13 | # make predictions 14 | predictions <- predict(fit, x, type="link") 15 | # summarize accuracy 16 | mse <- mean((y - predictions)^2) 17 | print(mse) 18 | -------------------------------------------------------------------------------- 
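# Editor's sketch (not one of the original repository scripts): the LASSO, elastic net
# and ridge scripts around this point fix the penalty by hand (e.g. lambda=0.001);
# cv.glmnet from the same glmnet package can instead pick lambda by cross-validation.
# Assumes the longley data used in those scripts.
library(glmnet)
data(longley)
x <- as.matrix(longley[,1:6])
y <- as.matrix(longley[,7])
# 5-fold cross-validation over glmnet's default lambda path at alpha=0.5
cvfit <- cv.glmnet(x, y, family="gaussian", alpha=0.5, nfolds=5)
# lambda with the lowest cross-validated error
print(cvfit$lambda.min)
# predictions and training MSE at the selected lambda
predictions <- predict(cvfit, x, s="lambda.min")
mse <- mean((y - predictions)^2)
print(mse)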
/machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/2-PenalizedLinearRegression/ridge_regression.R: -------------------------------------------------------------------------------- 1 | # Ridge Regression 2 | 3 | # load the package 4 | library(glmnet) 5 | # load data 6 | data(longley) 7 | x <- as.matrix(longley[,1:6]) 8 | y <- as.matrix(longley[,7]) 9 | # fit model 10 | fit <- glmnet(x, y, family="gaussian", alpha=0, lambda=0.001) 11 | # summarize the fit 12 | print(fit) 13 | # make predictions 14 | predictions <- predict(fit, x, type="link") 15 | # summarize accuracy 16 | mse <- mean((y - predictions)^2) 17 | print(mse) 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/M5P.R: -------------------------------------------------------------------------------- 1 | # Model Trees 2 | 3 | # load the package 4 | library(RWeka) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- M5P(Employed~., data=longley) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/M5Rules.R: -------------------------------------------------------------------------------- 1 | # Rule System 2 | 3 | # load the package 4 | library(RWeka) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- M5Rules(Employed~., data=longley) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/bagging_CART.R: -------------------------------------------------------------------------------- 1 | # Bagging CART 2 | 3 | # load the package 4 | library(ipred) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- bagging(Employed~., data=longley, control=rpart.control(minsplit=5)) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/classification_and_regression_trees.R: -------------------------------------------------------------------------------- 1 | # Classification and Regression Trees 2 | 3 | # load the package 4 | library(rpart) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- rpart(Employed~., data=longley, control=rpart.control(minsplit=5)) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/conditional_decision_trees.R: 
-------------------------------------------------------------------------------- 1 | # Conditional Decision Trees 2 | 3 | # load the package 4 | library(party) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- ctree(Employed~., data=longley, controls=ctree_control(minsplit=2,minbucket=2,testtype="Univariate")) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | 17 | 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/cubist.R: -------------------------------------------------------------------------------- 1 | # Cubist 2 | 3 | # load the package 4 | library(Cubist) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- cubist(longley[,1:6], longley[,7]) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/feed_forward_neural_network.R: -------------------------------------------------------------------------------- 1 | # Feed Forward Neural Network 2 | 3 | # load the package 4 | library(nnet) 5 | # load data 6 | data(longley) 7 | x <- longley[,1:6] 8 | y <- longley[,7] 9 | # fit model 10 | fit <- nnet(Employed~., longley, size=12, maxit=500, linout=T, decay=0.01) 11 | # summarize the fit 12 | print(fit) 13 | # make predictions 14 | predictions <- predict(fit, x, type="raw") 15 | # summarize accuracy 16 | mse <- mean((y - predictions)^2) 17 | print(mse) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/gradient_boosted_machine.R: -------------------------------------------------------------------------------- 1 | # Gradient Boosted Machine 2 | 3 | # load the package 4 | library(gbm) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- gbm(Employed~., data=longley, distribution="gaussian", n.minobsinnode=1) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6], n.trees=1) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/k-nearest_neighbor.R: -------------------------------------------------------------------------------- 1 | # k-Nearest Neighbor 2 | 3 | # load the package 4 | library(caret) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- knnreg(longley[,1:6], longley[,7], k=3) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/multivariate_adaptive_regression_splines.R: -------------------------------------------------------------------------------- 1 | # 
Multivariate Adaptive Regression Splines 2 | 3 | # load the package 4 | library(earth) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- earth(Employed~., longley) 9 | # summarize the fit 10 | print(fit) 11 | # summarize the importance of input variables 12 | evimp(fit) 13 | # make predictions 14 | predictions <- predict(fit, longley) 15 | # summarize accuracy 16 | mse <- mean((longley$Employed - predictions)^2) 17 | print(mse) 18 | 19 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/random_forest.R: -------------------------------------------------------------------------------- 1 | # Random Forest 2 | 3 | # load the package 4 | library(randomForest) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- randomForest(Employed~., data=longley) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/support_vector_machine.R: -------------------------------------------------------------------------------- 1 | # Support Vector Machine 2 | 3 | # load the package 4 | library(kernlab) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- ksvm(Employed~., longley) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/4-LinearClassification/linear_discriminant_analysis.R: -------------------------------------------------------------------------------- 1 | # Linear Discriminant Analysis 2 | 3 | # load the package 4 | library(MASS) 5 | data(iris) 6 | # fit model 7 | fit <- lda(Species~., data=iris) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4])$class 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/4-LinearClassification/logistic_regression.R: -------------------------------------------------------------------------------- 1 | # Logistic Regression 2 | 3 | # load the package 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # fit model 8 | fit <- glm(diabetes~., data=PimaIndiansDiabetes, family=binomial(link='logit')) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | probabilities <- predict(fit, PimaIndiansDiabetes[,1:8], type='response') 13 | predictions <- ifelse(probabilities > 0.5,'pos','neg') 14 | # summarize accuracy 15 | table(predictions, PimaIndiansDiabetes$diabetes) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/4-LinearClassification/logistic_regression_multiclass.R: -------------------------------------------------------------------------------- 1 | # Logistic Regression 2 | 3 | # load the package 4 | library(VGAM) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- vglm(Species~., family=multinomial,
data=iris) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | probabilities <- predict(fit, iris[,1:4], type="response") 13 | predictions <- apply(probabilities, 1, which.max) 14 | predictions[which(predictions=="1")] <- levels(iris$Species)[1] 15 | predictions[which(predictions=="2")] <- levels(iris$Species)[2] 16 | predictions[which(predictions=="3")] <- levels(iris$Species)[3] 17 | # summarize accuracy 18 | table(predictions, iris$Species) 19 | 20 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/4-LinearClassification/partial_least_squares_discriminant_analysis.R: -------------------------------------------------------------------------------- 1 | # Partial Least Squares Discriminant Analysis 2 | 3 | # load the package 4 | library(caret) 5 | data(iris) 6 | x <- iris[,1:4] 7 | y <- iris[,5] 8 | # fit model 9 | fit <- plsda(x, y, probMethod="Bayes") 10 | # summarize the fit 11 | print(fit) 12 | # make predictions 13 | predictions <- predict(fit, iris[,1:4]) 14 | # summarize accuracy 15 | table(predictions, iris$Species) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/C4.5.R: -------------------------------------------------------------------------------- 1 | # C4.5 2 | 3 | # load the package 4 | library(RWeka) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- J48(Species~., data=iris) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, iris[,1:4]) 13 | # summarize accuracy 14 | table(predictions, iris$Species) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/C5.0.R: -------------------------------------------------------------------------------- 1 | # C5.0 2 | 3 | # load the package 4 | library(C50) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- C5.0(Species~., data=iris, trials=10) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, iris[,1:4]) 13 | # summarize accuracy 14 | table(predictions, iris$Species) 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/PART.R: -------------------------------------------------------------------------------- 1 | # PART 2 | 3 | # load the package 4 | library(RWeka) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- PART(Species~., data=iris) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, iris[,1:4]) 13 | # summarize accuracy 14 | table(predictions, iris$Species) 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/bagging_CART.R: -------------------------------------------------------------------------------- 1 | # Bagging CART 2 | 3 | # load the package 4 | library(ipred) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- bagging(Species~., data=iris) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, iris[,1:4], type="class") 13 | # summarize accuracy 14 | table(predictions, iris$Species) 15 | 
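# Editor's sketch (not one of the original repository scripts): the classification
# scripts in this folder report a confusion table on the same data used to fit the
# model, which flatters accuracy. Assuming the caret package, out-of-sample accuracy
# can instead be estimated with 10-fold cross-validation, e.g. for a CART model:
library(caret)
data(iris)
control <- trainControl(method="cv", number=10)
fit <- train(Species~., data=iris, method="rpart", trControl=control)
# cross-validated Accuracy and Kappa for the CART model
print(fit)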
-------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/classification_and_regression_trees.R: -------------------------------------------------------------------------------- 1 | # Classification and Regression Trees 2 | 3 | # load the package 4 | library(rpart) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- rpart(Species~., data=iris) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, iris[,1:4], type="class") 13 | # summarize accuracy 14 | table(predictions, iris$Species) 15 | 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/feed_forward_neural_network.R: -------------------------------------------------------------------------------- 1 | # Feed Forward Neural Network 2 | 3 | # load the package 4 | library(nnet) 5 | data(iris) 6 | # fit model 7 | fit <- nnet(Species~., data=iris, size=4, decay=0.0001, maxit=500) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4], type="class") 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/flexible_discriminant_analysis.R: -------------------------------------------------------------------------------- 1 | # Flexible Discriminant Analysis 2 | 3 | # load the package 4 | library(mda) 5 | data(iris) 6 | # fit model 7 | fit <- fda(Species~., data=iris) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4]) 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/gradient_boosted_machine.R: -------------------------------------------------------------------------------- 1 | # Gradient Boosted Machine 2 | 3 | # load the package 4 | library(gbm) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- gbm(Species~., data=iris, distribution="multinomial") 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | probabilities <- predict(fit, iris[,1:4], n.trees=1) 13 | predictions <- colnames(probabilities)[apply(probabilities, 1, which.max)] 14 | # summarize accuracy 15 | table(predictions, iris$Species) 16 | 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/k-nearest_neighbors.R: -------------------------------------------------------------------------------- 1 | # k-Nearest Neighbors 2 | 3 | # load the package 4 | library(caret) 5 | data(iris) 6 | # fit model 7 | fit <- knn3(Species~., data=iris, k=5) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4], type="class") 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/mixture_discriminant_analysis.R: 
-------------------------------------------------------------------------------- 1 | # Mixture Discriminant Analysis 2 | 3 | # load the package 4 | library(mda) 5 | data(iris) 6 | # fit model 7 | fit <- mda(Species~., data=iris) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4]) 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/naive_bayes.R: -------------------------------------------------------------------------------- 1 | # Naive Bayes 2 | 3 | # load the package 4 | library(e1071) 5 | data(iris) 6 | # fit model 7 | fit <- naiveBayes(Species~., data=iris) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4]) 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/quadratic_discriminant_analysis.R: -------------------------------------------------------------------------------- 1 | # Quadratic Discriminant Analysis 2 | 3 | # load the package 4 | library(MASS) 5 | data(iris) 6 | # fit model 7 | fit <- qda(Species~., data=iris) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4])$class 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/random_forest.R: -------------------------------------------------------------------------------- 1 | # Random Forest 2 | 3 | # load the package 4 | library(randomForest) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- randomForest(Species~., data=iris) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, iris[,1:4]) 13 | # summarize accuracy 14 | table(predictions, iris$Species) 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/regularized_discriminant_analysis.R: -------------------------------------------------------------------------------- 1 | # Regularized Discriminant Analysis 2 | 3 | # load the package 4 | library(klaR) 5 | data(iris) 6 | # fit model 7 | fit <- rda(Species~., data=iris, gamma=0.05, lambda=0.01) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4])$class 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/support_vector_machine.R: -------------------------------------------------------------------------------- 1 | # Support Vector Machine 2 | 3 | # load the package 4 | library(kernlab) 5 | data(iris) 6 | # fit model 7 | fit <- ksvm(Species~., data=iris) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4], type="response") 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | 
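Each classification recipe above ends with a confusion matrix from table(predictions, iris$Species). As a small illustrative addition (not one of the original recipes), overall accuracy can be derived from that same table; it assumes the predictions vector produced by any of the recipes above:

# compute overall accuracy from the confusion matrix
confusion <- table(predictions, iris$Species)
accuracy <- sum(diag(confusion)) / sum(confusion)
print(accuracy)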
-------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/6-Optimization/bfgs.R: -------------------------------------------------------------------------------- 1 | # BFGS 2 | 3 | # definition of the 2D Rosenbrock function, optimum is at (1,1) 4 | rosenbrock <- function(v) { 5 | (1 - v[1])^2 + 100 * (v[2] - v[1]*v[1])^2 6 | } 7 | 8 | # definition of the gradient of the 2D Rosenbrock function 9 | derivative <- function(v) { 10 | c(-400 * v[1] * (v[2] - v[1]*v[1]) - 2 * (1 - v[1]), 11 | 200 * (v[2] - v[1]*v[1])) 12 | } 13 | 14 | # locate the minimum of the function using the BFGS method 15 | result <- optim( 16 | c(runif(1,-3,3), runif(1,-3,3)), # start at a random position 17 | rosenbrock, # the function to minimize 18 | derivative, # the gradient of the function 19 | method="BFGS", # use the BFGS method 20 | control=c( # configure BFGS 21 | maxit=100, # maximum iterations of 100 22 | reltol=1e-8)) # relative convergence tolerance per step 23 | 24 | # summarise results 25 | print(result$par) # the coordinates of the minimum 26 | print(result$value) # the function response of the minimum 27 | print(result$counts) # the number of function calls performed 28 | 29 | # display the function as a contour plot 30 | x <- seq(-3, 3, length.out=100) 31 | y <- seq(-3, 3, length.out=100) 32 | z <- rosenbrock(expand.grid(x, y)) 33 | contour(x, y, matrix(log10(z), length(x)), xlab="x", ylab="y") 34 | # draw the optimum as a point 35 | points(result$par[1], result$par[2], col="red", pch=19) 36 | # draw a square around the optimum to highlight it 37 | rect(result$par[1]-0.2, result$par[2]-0.2, result$par[1]+0.2, 38 | result$par[2]+0.2, lwd=2) 39 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/6-Optimization/conjugate_gradient.R: -------------------------------------------------------------------------------- 1 | # Conjugate Gradient method 2 | 3 | # definition of the 2D Rosenbrock function, optimum is at (1,1) 4 | rosenbrock <- function(v) { 5 | (1 - v[1])^2 + 100 * (v[2] - v[1]*v[1])^2 6 | } 7 | 8 | # definition of the gradient of the 2D Rosenbrock function 9 | derivative <- function(v) { 10 | c(-400 * v[1] * (v[2] - v[1]*v[1]) - 2 * (1 - v[1]), 11 | 200 * (v[2] - v[1]*v[1])) 12 | } 13 | 14 | # locate the minimum of the function using the Conjugate Gradient method 15 | result <- optim( 16 | c(runif(1,-3,3), runif(1,-3,3)), # start at a random position 17 | rosenbrock, # the function to minimize 18 | derivative, # the gradient of the function 19 | method="CG", # use the Conjugate Gradient method 20 | control=c( # configure Conjugate Gradient 21 | maxit=100, # maximum iterations of 100 22 | reltol=1e-8, # relative convergence tolerance per step 23 | type=2)) # use the Polak-Ribiere update method 24 | 25 | # summarise results 26 | print(result$par) # the coordinates of the minimum 27 | print(result$value) # the function response of the minimum 28 | print(result$counts) # the number of function calls performed 29 | 30 | # display the function as a contour plot 31 | x <- seq(-3, 3, length.out=100) 32 | y <- seq(-3, 3, length.out=100) 33 | z <- rosenbrock(expand.grid(x, y)) 34 | contour(x, y, matrix(log10(z), length(x)), xlab="x", ylab="y") 35 | # draw the optimum as a point 36 | points(result$par[1], result$par[2], col="red", pch=19) 37 | # draw a square around the optimum to highlight it 38 | rect(result$par[1]-0.2, result$par[2]-0.2, result$par[1]+0.2, 39 | 
result$par[2]+0.2, lwd=2) 40 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/6-Optimization/golden_section_search.R: -------------------------------------------------------------------------------- 1 | # Golden Section Line Search 2 | 3 | # define a 1D basin function, optimum at f(0)=0 4 | basin <- function(x) { 5 | x[1]^2 6 | } 7 | 8 | # locate the minimum of the function using a Golden Section Line Search 9 | result <- optimize( 10 | basin, # the function to be minimized 11 | c(-5, 5), # the bounds on the function parameter 12 | maximum=FALSE, # we are concerned with the function minimum 13 | tol=1e-8) # the size of the final bracketing 14 | 15 | # display the results 16 | print(result$minimum) # function parameter 17 | print(result$objective) # function response 18 | 19 | # plot the function 20 | x <- seq(-5, 5, length.out=100) 21 | y <- basin(expand.grid(x)) 22 | plot(x, y, xlab="x",ylab="f(x)", type="l") 23 | # plot the solution as a point 24 | points(result$minimum, result$objective, col="red", pch=19) 25 | # draw a square around the optimum to highlight it 26 | rect(result$minimum-0.3, result$objective-0.7, result$minimum+0.3, 27 | result$objective+0.7, lwd=2) 28 | 29 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/6-Optimization/nelder_mead.R: -------------------------------------------------------------------------------- 1 | # Nelder-Mead method 2 | 3 | # definition of the 2D Rosenbrock function, optimum is at (1,1) 4 | rosenbrock <- function(v) { 5 | (1 - v[1])^2 + 100 * (v[2] - v[1]*v[1])^2 6 | } 7 | 8 | # locate the minimum of the function using the Nelder-Mead method 9 | result <- optim( 10 | c(runif(1,-3,3), runif(1,-3,3)), # start at a random position 11 | rosenbrock, # the function to minimize 12 | NULL, # no function gradient 13 | method="Nelder-Mead", # use the Nelder-Mead method 14 | control=c( # configure Nelder-Mead 15 | maxit=100, # maximum iterations of 100 16 | reltol=1e-8, # relative convergence tolerance per step 17 | alpha=1.0, # reflection factor 18 | beta=0.5, # contraction factor 19 | gamma=2.0)) # expansion factor 20 | 21 | # summarise results 22 | print(result$par) # the coordinates of the minimum 23 | print(result$value) # the function response of the minimum 24 | print(result$counts) # the number of function calls performed 25 | 26 | # display the function as a contour plot 27 | x <- seq(-3, 3, length.out=100) 28 | y <- seq(-3, 3, length.out=100) 29 | z <- rosenbrock(expand.grid(x, y)) 30 | contour(x, y, matrix(log10(z), length(x)), xlab="x",ylab="y") 31 | # draw the optimum as a point 32 | points(result$par[1], result$par[2], col="red", pch=19) 33 | # draw a square around the optimum to highlight it 34 | rect(result$par[1]-0.2, result$par[2]-0.2, result$par[1]+0.2, 35 | result$par[2]+0.2, lwd=2) 36 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/1-ResamplingMethods/bootstrap.R: 
-------------------------------------------------------------------------------- 1 | # Estimate accuracy using the bootstrap. 2 | 3 | # load the library 4 | library(caret) 5 | # load the iris dataset 6 | data(iris) 7 | # define training control 8 | train_control <- trainControl(method="boot", number=100) 9 | # train the model 10 | model <- train(Species~., data=iris, trControl=train_control, method="nb") 11 | # summarize results 12 | print(model) 13 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/1-ResamplingMethods/data_split.R: -------------------------------------------------------------------------------- 1 | # Estimate accuracy using a train/test split. 2 | 3 | # load the libraries 4 | library(caret) 5 | library(klaR) 6 | # load the iris dataset 7 | data(iris) 8 | # define an 80%/20% train/test split of the dataset 9 | split=0.80 10 | trainIndex <- createDataPartition(iris$Species, p=split, list=FALSE) 11 | data_train <- iris[ trainIndex,] 12 | data_test <- iris[-trainIndex,] 13 | # train a naive bayes model 14 | model <- NaiveBayes(Species~., data=data_train) 15 | # make predictions 16 | x_test <- data_test[,1:4] 17 | y_test <- data_test[,5] 18 | predictions <- predict(model, x_test) 19 | # summarize results 20 | confusionMatrix(predictions$class, y_test) 21 | 22 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/1-ResamplingMethods/kfold_cross_validation.R: -------------------------------------------------------------------------------- 1 | # Estimate accuracy using a k-fold cross validation. 2 | 3 | # load the library 4 | library(caret) 5 | # load the iris dataset 6 | data(iris) 7 | # define training control 8 | train_control <- trainControl(method="cv", number=10) 9 | # fix the parameters of the algorithm 10 | grid <- expand.grid(.fL=c(0), .usekernel=c(FALSE)) 11 | # train the model 12 | model <- train(Species~., data=iris, trControl=train_control, method="nb", tuneGrid=grid) 13 | # summarize results 14 | print(model) 15 | 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/1-ResamplingMethods/leave_one_out_cross_validation.R: -------------------------------------------------------------------------------- 1 | # Estimate accuracy using a leave one out cross validation. 2 | 3 | # load the library 4 | library(caret) 5 | # load the iris dataset 6 | data(iris) 7 | # define training control 8 | train_control <- trainControl(method="LOOCV") 9 | # train the model 10 | model <- train(Species~., data=iris, trControl=train_control, method="nb") 11 | # summarize results 12 | print(model) 13 | 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/1-ResamplingMethods/repeated_kfold_cross_validation.R: -------------------------------------------------------------------------------- 1 | # Estimate accuracy using repeated k-fold cross validation. 
2 | 3 | # load the library 4 | library(caret) 5 | # load the iris dataset 6 | data(iris) 7 | # define training control 8 | train_control <- trainControl(method="repeatedcv", number=10, repeats=3) 9 | # train the model 10 | model <- train(Species~., data=iris, trControl=train_control, method="nb") 11 | # summarize results 12 | print(model) 13 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/2-Metrics/Accuracy.R: -------------------------------------------------------------------------------- 1 | # Accuracy metric 2 | 3 | # load libraries 4 | library(caret) 5 | library(mlbench) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare resampling method 9 | control <- trainControl(method="cv", number=5) 10 | set.seed(7) 11 | fit <- train(diabetes~., data=PimaIndiansDiabetes, method="glm", metric="Accuracy", trControl=control) 12 | # display results 13 | print(fit) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/2-Metrics/Kappa.R: -------------------------------------------------------------------------------- 1 | # Kappa metric 2 | 3 | # load libraries 4 | library(caret) 5 | library(mlbench) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare resampling method 9 | control <- trainControl(method="cv", number=5) 10 | set.seed(7) 11 | fit <- train(diabetes~., data=PimaIndiansDiabetes, method="glm", metric="Kappa", trControl=control) 12 | # display results 13 | print(fit) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/2-Metrics/LogLoss.R: -------------------------------------------------------------------------------- 1 | # MultiNomialLogLoss Metric 2 | 3 | # load libraries 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # prepare resampling method 8 | control <- trainControl(method="cv", number=5, classProbs=TRUE, summaryFunction=mnLogLoss) 9 | set.seed(7) 10 | fit <- train(Species~., data=iris, method="rpart", metric="logLoss", trControl=control) 11 | # display results 12 | print(fit) 13 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/2-Metrics/RMSE.R: -------------------------------------------------------------------------------- 1 | # RMSE metric 2 | 3 | # load libraries 4 | library(caret) 5 | # load data 6 | data(longley) 7 | # prepare resampling method 8 | control <- trainControl(method="cv", number=5) 9 | set.seed(7) 10 | fit <- train(Employed~., data=longley, method="lm", metric="RMSE", trControl=control) 11 | # display results 12 | print(fit) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/2-Metrics/ROC.R: -------------------------------------------------------------------------------- 1 | # ROC: AUC, sensitivity, specificity metrics 2 | 3 | # load libraries 4 | library(caret) 5 | library(mlbench) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare resampling method 9 | control <- trainControl(method="cv", number=5, classProbs=TRUE, summaryFunction=twoClassSummary) 10 | set.seed(7) 11 | fit <- train(diabetes~., data=PimaIndiansDiabetes, method="glm", metric="ROC", trControl=control) 12 | # display results 13 | print(fit) 
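The metric recipes above and below score models during resampling via trainControl and train. As a small illustrative addition (not one of the original recipes), caret's postResample() computes RMSE and R^2 (and MAE in recent caret versions) for a vector of predictions against observed values; this sketch reuses the longley data from the RMSE recipe above:

# score raw predictions with caret's postResample (assumes caret and longley are loaded as in the RMSE recipe)
fit <- lm(Employed~., data=longley)
predictions <- predict(fit, longley)
print(postResample(pred=predictions, obs=longley$Employed))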
-------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/2-Metrics/RSquared.R: -------------------------------------------------------------------------------- 1 | # Rsquared metric 2 | 3 | # load libraries 4 | library(caret) 5 | # load data 6 | data(longley) 7 | # prepare resampling method 8 | control <- trainControl(method="cv", number=5) 9 | set.seed(7) 10 | fit <- train(Employed~., data=longley, method="lm", metric="Rsquared", trControl=control) 11 | # display results 12 | print(fit) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_boxplots.R: -------------------------------------------------------------------------------- 1 | # Compare models using box and whisker plots 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # box and whisker plots to compare models 28 | scales <- list(x=list(relation="free"), y=list(relation="free")) 29 | bwplot(results, scales=scales) 30 | 31 | 32 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_densityplot.R: -------------------------------------------------------------------------------- 1 | # Compare models using density plots plots 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., 
data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # density plots of accuracy 28 | scales <- list(x=list(relation="free"), y=list(relation="free")) 29 | densityplot(results, scales=scales, pch = "|") 30 | 31 | 32 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_dotplot.R: -------------------------------------------------------------------------------- 1 | # Compare models using dotplots 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # dot plots of accuracy 28 | scales <- list(x=list(relation="free"), y=list(relation="free")) 29 | dotplot(results, scales=scales) 30 | 31 | 32 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_parallelplot.R: -------------------------------------------------------------------------------- 1 | # Compare models using parallel plots 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # parallel plots to compare models 28 | parallelplot(results) 29 | 30 | 31 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_scatterplot_matrix.R: -------------------------------------------------------------------------------- 1 | # Compare models using scatterplot matrix 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare 
training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # pair-wise scatterplots of predictions to compare models 28 | splom(results) 29 | # pair-wise scatterplots of accuracy measures to compare models 30 | splom(results, variables="metrics") 31 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_summary.R: -------------------------------------------------------------------------------- 1 | # Compare models using a table summary 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # summarize differences between modes 28 | summary(results) 29 | 30 | 31 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_xyplot.R: -------------------------------------------------------------------------------- 1 | # Compare models using xyplot 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- 
resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # xyplot plots to compare models 28 | xyplot(results, models=c("LDA", "SVM")) 29 | 30 | 31 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/significant_difference.R: -------------------------------------------------------------------------------- 1 | # Calculate statistical significance of difference between model predictions 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # difference in model predictions 28 | diffs <- diff(results) 29 | # summarize p-values for pair-wise comparisons 30 | summary(diffs) 31 | # plot of differences 32 | scales <- list(x=list(relation="free"), y=list(relation="free")) 33 | bwplot(diffs, scales=scales) 34 | # t-test between two models 35 | compare_models(fit.svm, fit.lda) 36 | 37 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/5-ImproveResults/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/1-TuneAlgorithms/automatic_grid_search.R: -------------------------------------------------------------------------------- 1 | # Tune algorithm parameters using an automatic grid search. 
2 | 3 | # load the library 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # prepare training scheme 8 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 9 | # train the model 10 | model <- train(Species~., data=iris, method="lvq", trControl=control, tuneLength=5) 11 | # summarize the model 12 | print(model) 13 | # plot the effect of parameters on accuracy 14 | plot(model) 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/1-TuneAlgorithms/custom_search.R: -------------------------------------------------------------------------------- 1 | # Custom Parameter Search 2 | 3 | # load the packages 4 | library(randomForest) 5 | library(mlbench) 6 | library(caret) 7 | # configure multi-core (not supported on Windows) 8 | library(doMC) 9 | registerDoMC(cores=8) 10 | 11 | # define the custom caret algorithm (wrapper for Random Forest) 12 | customRF <- list(type="Classification", library="randomForest", loop=NULL) 13 | customRF$parameters <- data.frame(parameter=c("mtry", "ntree"), class=rep("numeric", 2), label=c("mtry", "ntree")) 14 | customRF$grid <- function(x, y, len=NULL, search="grid") {} 15 | customRF$fit <- function(x, y, wts, param, lev, last, weights, classProbs, ...) { 16 | randomForest(x, y, mtry=param$mtry, ntree=param$ntree, ...) 17 | } 18 | customRF$predict <- function(modelFit, newdata, preProc=NULL, submodels=NULL) 19 | predict(modelFit, newdata) 20 | customRF$prob <- function(modelFit, newdata, preProc=NULL, submodels=NULL) 21 | predict(modelFit, newdata, type = "prob") 22 | customRF$sort <- function(x) x[order(x[,1]),] 23 | customRF$levels <- function(x) x$classes 24 | 25 | # Load Dataset 26 | data(Sonar) 27 | dataset <- Sonar 28 | seed <- 7 29 | metric <- "Accuracy" 30 | 31 | # train model 32 | trainControl <- trainControl(method="repeatedcv", number=10, repeats=3) 33 | tunegrid <- expand.grid(.mtry=c(1:15), .ntree=c(1000, 1500, 2000, 2500)) 34 | set.seed(seed) 35 | custom <- train(Class~., data=dataset, method=customRF, metric=metric, tuneGrid=tunegrid, trControl=trainControl) 36 | print(custom) 37 | plot(custom) 38 | 39 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/1-TuneAlgorithms/manual_grid_search.R: -------------------------------------------------------------------------------- 1 | # Tune algorithm parameters using a manual grid search. 
2 | 3 | # load the library 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # prepare training scheme 8 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 9 | # design the parameter tuning grid 10 | grid <- expand.grid(size=c(5,10,20,50), k=c(1,2,3,4,5)) 11 | # train the model 12 | model <- train(Species~., data=iris, method="lvq", trControl=control, tuneGrid=grid) 13 | # summarize the model 14 | print(model) 15 | # plot the effect of parameters on accuracy 16 | plot(model) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/1-TuneAlgorithms/manual_search.R: -------------------------------------------------------------------------------- 1 | # Manually search parametres 2 | 3 | # load the packages 4 | library(randomForest) 5 | library(mlbench) 6 | library(caret) 7 | # Load Dataset 8 | data(Sonar) 9 | dataset <- Sonar 10 | x <- dataset[,1:60] 11 | y <- dataset[,61] 12 | seed <- 7 13 | metric <- "Accuracy" 14 | # Manual Search 15 | trainControl <- trainControl(method="repeatedcv", number=10, repeats=3, search="grid") 16 | tunegrid <- expand.grid(.mtry=c(sqrt(ncol(x)))) 17 | modellist <- list() 18 | for (ntree in c(1000, 1500, 2000, 2500)) { 19 | set.seed(seed) 20 | fit <- train(Class~., data=dataset, method="rf", metric=metric, tuneGrid=tunegrid, trControl=trainControl, ntree=ntree) 21 | key <- toString(ntree) 22 | modellist[[key]] <- fit 23 | } 24 | # compare results 25 | results <- resamples(modellist) 26 | summary(results) 27 | dotplot(results) 28 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/1-TuneAlgorithms/optimal_parameters.R: -------------------------------------------------------------------------------- 1 | # Select the best tuning configuration 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | tunegrid <- expand.grid(.cp=seq(0,0.1,by=0.01)) 13 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", metric="Accuracy", tuneGrid=tunegrid, trControl=control) 14 | # display the best configuration 15 | print(fit.cart$bestTune) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/1-TuneAlgorithms/random_search.R: -------------------------------------------------------------------------------- 1 | # Randomly search algorithm parameters 2 | 3 | # load the library 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # prepare training scheme 8 | control <- trainControl(method="repeatedcv", number=10, repeats=3, search="random") 9 | # train the model 10 | model <- train(Species~., data=iris, method="lvq", trControl=control, tuneLength=25) 11 | # summarize the model 12 | print(model) 13 | # plot the effect of parameters on accuracy 14 | plot(model) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/2-Ensembles/bagging.R: -------------------------------------------------------------------------------- 1 | # Bagging or Bootstrap Aggregation of Decision Trees 2 | 3 | # load the libraries 4 | library(ipred) 5 | library(rpart) 6 | library(mlbench) 7 | # load the dataset 8 | 
data(PimaIndiansDiabetes) 9 | # bag the decision tree 10 | model <- bagging(diabetes~., data=PimaIndiansDiabetes, nbagg=25, coob=TRUE) 11 | # make predictions on the training dataset 12 | predictions <- predict(model, PimaIndiansDiabetes[,1:8]) 13 | # summarize accuracy 14 | table(predictions, PimaIndiansDiabetes$diabetes) 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/2-Ensembles/blending.R: -------------------------------------------------------------------------------- 1 | # Blending (linear combination of models) 2 | 3 | # load libraries 4 | library(caret) 5 | library(caretEnsemble) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # define training control 9 | train_control <- trainControl(method="cv", number=10, savePredictions=TRUE, classProbs=TRUE) 10 | # train a list of models 11 | methodList <- c('glm', 'lda', 'knn') 12 | models <- caretList(diabetes~., data=PimaIndiansDiabetes, trControl=train_control, methodList=methodList) 13 | # create ensemble of trained models 14 | ensemble <- caretEnsemble(models) 15 | # summarize ensemble 16 | summary(ensemble) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/2-Ensembles/stacking.R: -------------------------------------------------------------------------------- 1 | # Stacking (non-linear combination of models) 2 | 3 | # load libraries 4 | library(caret) 5 | library(caretEnsemble) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # define training control 9 | train_control <- trainControl(method="cv", number=10, savePredictions=TRUE, classProbs=TRUE) 10 | # train a list of models 11 | methodList <- c('glm', 'lda', 'knn') 12 | models <- caretList(diabetes~., data=PimaIndiansDiabetes, trControl=train_control, methodList=methodList) 13 | # create stacked ensemble of trained models 14 | ensemble <- caretStack(models, method='rpart') 15 | # summarize ensemble 16 | summary(ensemble) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/6-FinalizeModel/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/6-FinalizeModel/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/6-FinalizeModel/1-Predict/predict_caret.R: -------------------------------------------------------------------------------- 1 | # Make predictions using caret model 2 | 3 | # load libraries 4 | library(caret) 5 | library(mlbench) 6 | # load dataset 7 | data(PimaIndiansDiabetes) 8 | # create 80%/20% for training and validation datasets 9 | set.seed(9) 10 | validation_index <- createDataPartition(PimaIndiansDiabetes$diabetes, p=0.80, list=FALSE) 11 | validation <- PimaIndiansDiabetes[-validation_index,] 12 | training <- PimaIndiansDiabetes[validation_index,] 13 | # train a model and summarize model 14 | set.seed(9) 15 | control <- trainControl(method="cv", number=10) 16 | fit.lda <- train(diabetes~., data=training, method="lda", metric="Accuracy", trControl=control) 17 | print(fit.lda) 18 | print(fit.lda$finalModel) 19 | # estimate skill on validation dataset 20 | set.seed(9) 21 | predictions <- predict(fit.lda, newdata=validation) 22 | confusionMatrix(predictions, 
validation$diabetes) 23 | 24 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/6-FinalizeModel/1-Predict/train_all_dataset.R: -------------------------------------------------------------------------------- 1 | # Train a model on the entire training dataset 2 | 3 | # load libraries 4 | library(caret) 5 | library(mlbench) 6 | # load dataset 7 | data(PimaIndiansDiabetes) 8 | set.seed(9) 9 | control <- trainControl(method="none", number=10) 10 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", metric="Accuracy", trControl=control) 11 | print(fit.lda) 12 | print(fit.lda$finalModel) 13 | 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/6-FinalizeModel/2-FinalModel/standalone_model.R: -------------------------------------------------------------------------------- 1 | # Create a Standalone Model 2 | 3 | # load libraries 4 | library(caret) 5 | library(mlbench) 6 | library(randomForest) 7 | library(doMC) 8 | registerDoMC(cores=8) 9 | # load dataset 10 | data(Sonar) 11 | set.seed(7) 12 | # create 80%/20% for training and validation datasets 13 | validation_index <- createDataPartition(Sonar$Class, p=0.80, list=FALSE) 14 | validation <- Sonar[-validation_index,] 15 | training <- Sonar[validation_index,] 16 | # train a model and summarize model 17 | set.seed(7) 18 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 19 | fit.rf <- train(Class~., data=training, method="rf", metric="Accuracy", trControl=control, ntree=2000) 20 | print(fit.rf) 21 | print(fit.rf$finalModel) 22 | # create standalone model using all training data 23 | set.seed(7) 24 | finalModel <- randomForest(Class~., training, mtry=2, ntree=2000) 25 | # make predictions on "new data" using the final model 26 | final_predictions <- predict(finalModel, validation[,1:60]) 27 | confusionMatrix(final_predictions, validation$Class) 28 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/6-FinalizeModel/3-SaveLoadModel/save_load_model.R: -------------------------------------------------------------------------------- 1 | # Save and Load model 2 | 3 | 4 | # load libraries 5 | library(caret) 6 | library(mlbench) 7 | library(randomForest) 8 | library(doMC) 9 | registerDoMC(cores=8) 10 | # load dataset 11 | data(Sonar) 12 | set.seed(7) 13 | # create 80%/20% for training and validation datasets 14 | validation_index <- createDataPartition(Sonar$Class, p=0.80, list=FALSE) 15 | validation <- Sonar[-validation_index,] 16 | training <- Sonar[validation_index,] 17 | # create final standalone model using all training data 18 | set.seed(7) 19 | final_model <- randomForest(Class~., training, mtry=2, ntree=2000) 20 | # save the model to disk 21 | saveRDS(final_model, "./final_model.rds") 22 | 23 | # later... 
24 | 25 | # load the model 26 | super_model <- readRDS("./final_model.rds") 27 | print(super_model) 28 | # make a predictions on "new data" using the final model 29 | final_predictions <- predict(super_model, validation[,1:60]) 30 | confusionMatrix(final_predictions, validation$Class) 31 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/7-Other/install_list_of_packages.R: -------------------------------------------------------------------------------- 1 | # Install a list of packages 2 | 3 | # preferred repo 4 | repository <- "http://cran.ms.unimelb.edu.au/" 5 | # list of packages used by project 6 | packages <- c("ggplot2", "caret", "mlbench", "caretEnsemble", "ipred", "rpart", 7 | "doMC", "AppliedPredictiveModeling", "corrplot", "Hmisc", "DMwR", "lattice", 8 | "RWeka", "e1071", "C50") 9 | 10 | for (p in packages) { 11 | if(p %in% rownames(installed.packages()) == FALSE) { 12 | install.packages(p, repos=repository) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/7-Other/install_package_with_dependencies.R: -------------------------------------------------------------------------------- 1 | # install package with dependencies 2 | 3 | # Install package with dependencies 4 | install.packages("caret", dependencies = c("Depends", "Suggests")) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/8-CaseStudies/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/8-CaseStudies/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/8-CaseStudies/MultiClassClassification/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/8-CaseStudies/MultiClassClassification/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/8-CaseStudies/project_template.R: -------------------------------------------------------------------------------- 1 | # R Project Template 2 | 3 | # 1. Prepare Problem 4 | # a) Load libraries 5 | # b) Load dataset 6 | # c) Split-out validation dataset 7 | 8 | # 2. Summarize Data 9 | # a) Descriptive statistics 10 | # b) Data visualizations 11 | 12 | # 3. Prepare Data 13 | # a) Data Cleaning 14 | # b) Feature Selection 15 | # c) Data Transforms 16 | 17 | # 4. Evaluate Algorithms 18 | # a) Test options and evaluation metric 19 | # b) Spot Check Algorithms 20 | # c) Compare Algorithms 21 | 22 | # 5. Improve Accuracy 23 | # a) Algorithm Tuning 24 | # b) Ensembles 25 | 26 | # 6. 
Finalize Model 27 | # a) Predictions on validation dataset 28 | # b) Create standalone model on entire training dataset 29 | # c) Save model for later use 30 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/README.txt: -------------------------------------------------------------------------------- 1 | Machine Learning Mastery With R: Recipes 2 | ======================================== 3 | 4 | Recipes that you can use to bootstrap your machine learning project in R. 5 | 6 | 7 | About 8 | ----- 9 | 10 | - Recipes are code snippets not tutorials. 11 | - Recipes provide just enough code to work. 12 | - Recipes are demonstrative not exhaustive. 13 | - Recipes run as-is and produce a result. 14 | - Recipes assume that required packages are installed. 15 | - Recipes use built-in datasets or datasets provided in specific packages. 16 | - Recipes are limited to regression and classification predictive modeling problems. 17 | 18 | Usage 19 | ----- 20 | 21 | 1. Find a useful recipe. 22 | 2. Copy and paste it into your project. 23 | 3. Adapt it to your needs. 24 | -------------------------------------------------------------------------------- /ml_with_python_code/02_scipy_versions.py: -------------------------------------------------------------------------------- 1 | # scipy 2 | import scipy 3 | print('scipy: {}'.format(scipy.__version__)) 4 | # numpy 5 | import numpy 6 | print('numpy: {}'.format(numpy.__version__)) 7 | # matplotlib 8 | import matplotlib 9 | print('matplotlib: {}'.format(matplotlib.__version__)) 10 | # pandas 11 | import pandas 12 | print('pandas: {}'.format(pandas.__version__)) -------------------------------------------------------------------------------- /ml_with_python_code/02_sklearn_version.py: -------------------------------------------------------------------------------- 1 | # scikit-learn 2 | import sklearn 3 | print('sklearn: {}'.format(sklearn.__version__)) -------------------------------------------------------------------------------- /ml_with_python_code/03_matplotlib_crash_course.py: -------------------------------------------------------------------------------- 1 | # matplotlib crash course 2 | 3 | 4 | # basic line plot 5 | import matplotlib.pyplot as plt 6 | import numpy 7 | myarray = numpy.array([1, 2, 3]) 8 | plt.plot(myarray) 9 | plt.xlabel('some x axis') 10 | plt.ylabel('some y axis') 11 | plt.show() 12 | 13 | 14 | # basic scatter plot 15 | import matplotlib.pyplot as plt 16 | import numpy 17 | x = numpy.array([1, 2, 3]) 18 | y = numpy.array([2, 4, 6]) 19 | plt.scatter(x,y) 20 | plt.xlabel('some x axis') 21 | plt.ylabel('some y axis') 22 | plt.show() 23 | -------------------------------------------------------------------------------- /ml_with_python_code/03_numpy_crash_course.py: -------------------------------------------------------------------------------- 1 | # numpy crash course 2 | 3 | # define an array 4 | import numpy 5 | mylist = [1, 2, 3] 6 | myarray = numpy.array(mylist) 7 | print(myarray) 8 | print(myarray.shape) 9 | 10 | # access values 11 | import numpy 12 | mylist = [[1, 2, 3], [3, 4, 5]] 13 | myarray = numpy.array(mylist) 14 | print(myarray) 15 | print(myarray.shape) 16 | print("First row: %s") % myarray[0] 17 | print("Last row: %s") % myarray[-1] 18 | print("Specific row and col: %s") % myarray[0, 2] 19 | print("Whole col: %s") % myarray[:, 2] 20 | 21 | # arithmetic 22 | import numpy 23 | myarray1 = numpy.array([2, 2, 2]) 24 | myarray2 = numpy.array([3, 3, 3]) 25 | 
print("Addition: %s") % (myarray1 + myarray2) 26 | print("Multiplication: %s") % (myarray1 * myarray2) 27 | 28 | -------------------------------------------------------------------------------- /ml_with_python_code/03_pandas_crash_course.py: -------------------------------------------------------------------------------- 1 | # pandas crash course 2 | 3 | 4 | # series 5 | import numpy 6 | import pandas 7 | myarray = numpy.array([1, 2, 3]) 8 | rownames = ['a', 'b', 'c'] 9 | myseries = pandas.Series(myarray, index=rownames) 10 | print(myseries) 11 | 12 | print(myseries[0]) 13 | print(myseries['a']) 14 | 15 | 16 | # dataframe 17 | import numpy 18 | import pandas 19 | myarray = numpy.array([[1, 2, 3], [4, 5, 6]]) 20 | rownames = ['a', 'b'] 21 | colnames = ['one', 'two', 'three'] 22 | mydataframe = pandas.DataFrame(myarray, index=rownames, columns=colnames) 23 | print(mydataframe) 24 | 25 | print("one column: %s") % mydataframe['one'] 26 | print("one column: %s") % mydataframe.one 27 | -------------------------------------------------------------------------------- /ml_with_python_code/03_python_crash_course.py: -------------------------------------------------------------------------------- 1 | # Python Crash Course 2 | 3 | 4 | # Assignment 5 | # ========== 6 | 7 | # Strings 8 | data = 'hello world' 9 | print(data[0]) 10 | print(len(data)) 11 | print(data) 12 | 13 | # Numbers 14 | value = 123.1 15 | print(value) 16 | value = 10 17 | print(value) 18 | 19 | # Boolean 20 | a = True 21 | b = False 22 | print(a, b) 23 | 24 | # Multiple Assignment 25 | a, b, c = 1, 2, 3 26 | print(a, b, c) 27 | 28 | # No value 29 | a = None 30 | print(a) 31 | 32 | 33 | 34 | # Flow Control 35 | # ============ 36 | 37 | # If-Then-Else 38 | 39 | value = 99 40 | if value >= 99: 41 | print 'That is fast' 42 | elif value > 200: 43 | print 'That is too fast' 44 | else: 45 | print 'That that is safe' 46 | 47 | # For-Loop 48 | for i in range(10): 49 | print i 50 | 51 | # While-Loop 52 | i = 0 53 | while i < 10: 54 | print i 55 | i += 1 56 | 57 | 58 | # Data Structures 59 | # =============== 60 | 61 | # Tuple (cannot change) 62 | a = (1, 2, 3) 63 | print a 64 | 65 | 66 | # Lists 67 | mylist = [1, 2, 3] 68 | print("Zeroth Value: %d") % mylist[0] 69 | mylist.append(4) 70 | print("List Length: %d") % len(mylist) 71 | for value in mylist: 72 | print value 73 | 74 | 75 | 76 | # Dictionaries 77 | 78 | mydict = {'a': 1, 'b': 2, 'c': 3} 79 | print("A value: %d") % mydict['a'] 80 | mydict['a'] = 11 81 | print("A value: %d") % mydict['a'] 82 | print("Keys: %s") % mydict.keys() 83 | print("Values: %s") % mydict.values() 84 | for key in mydict.keys(): 85 | print mydict[key] 86 | 87 | 88 | # Functions 89 | # =========== 90 | 91 | # Sum function 92 | def mysum(x, y): 93 | return x + y 94 | 95 | # Test sum function 96 | mysum(1, 3) 97 | 98 | 99 | -------------------------------------------------------------------------------- /ml_with_python_code/04_load_csv.py: -------------------------------------------------------------------------------- 1 | # Load CSV Using Python Standard Library 2 | import csv 3 | import numpy 4 | filename = 'pima-indians-diabetes.data.csv' 5 | raw_data = open(filename, 'rb') 6 | reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE) 7 | x = list(reader) 8 | data = numpy.array(x).astype('float') 9 | print(data.shape) 10 | -------------------------------------------------------------------------------- /ml_with_python_code/04_load_csv_np.py: 
-------------------------------------------------------------------------------- 1 | # Load CSV using NumPy 2 | from numpy import loadtxt 3 | filename = 'pima-indians-diabetes.data.csv' 4 | raw_data = open(filename, 'rb') 5 | data = loadtxt(raw_data, delimiter=",") 6 | print(data.shape) 7 | -------------------------------------------------------------------------------- /ml_with_python_code/04_load_csv_np_url.py: -------------------------------------------------------------------------------- 1 | # Load CSV from URL using NumPy 2 | from numpy import loadtxt 3 | from urllib import urlopen 4 | url = 'https://goo.gl/vhm1eU' 5 | raw_data = urlopen(url) 6 | dataset = loadtxt(raw_data, delimiter=",") 7 | print(dataset.shape) 8 | -------------------------------------------------------------------------------- /ml_with_python_code/04_load_csv_pandas.py: -------------------------------------------------------------------------------- 1 | # Load CSV using Pandas 2 | from pandas import read_csv 3 | filename = 'pima-indians-diabetes.data.csv' 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(filename, names=names) 6 | print(data.shape) 7 | -------------------------------------------------------------------------------- /ml_with_python_code/04_load_csv_pandas_url.py: -------------------------------------------------------------------------------- 1 | # Load CSV using Pandas from URL 2 | from pandas import read_csv 3 | url = 'https://goo.gl/vhm1eU' 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(url, names=names) 6 | print(data.shape) 7 | -------------------------------------------------------------------------------- /ml_with_python_code/05_class_distribution.py: -------------------------------------------------------------------------------- 1 | # Class Distribution 2 | from pandas import read_csv 3 | filename = "pima-indians-diabetes.data.csv" 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(filename, names=names) 6 | class_counts = data.groupby('class').size() 7 | print(class_counts) 8 | -------------------------------------------------------------------------------- /ml_with_python_code/05_data_types.py: -------------------------------------------------------------------------------- 1 | # Data Types for Each Attribute 2 | from pandas import read_csv 3 | filename = "pima-indians-diabetes.data.csv" 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(filename, names=names) 6 | types = data.dtypes 7 | print(types) 8 | -------------------------------------------------------------------------------- /ml_with_python_code/05_describe.py: -------------------------------------------------------------------------------- 1 | # Statistical Summary 2 | from pandas import read_csv 3 | from pandas import set_option 4 | filename = "pima-indians-diabetes.data.csv" 5 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 6 | data = read_csv(filename, names=names) 7 | set_option('display.width', 100) 8 | set_option('precision', 3) 9 | description = data.describe() 10 | print(description) 11 | -------------------------------------------------------------------------------- /ml_with_python_code/05_dimensions.py: -------------------------------------------------------------------------------- 1 | # Dimensions of your data 2 | from pandas import read_csv 3 | filename = 
"pima-indians-diabetes.data.csv" 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(filename, names=names) 6 | shape = data.shape 7 | print(shape) 8 | -------------------------------------------------------------------------------- /ml_with_python_code/05_head.py: -------------------------------------------------------------------------------- 1 | # View first 20 rows 2 | from pandas import read_csv 3 | filename = "pima-indians-diabetes.data.csv" 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(filename, names=names) 6 | peek = data.head(20) 7 | print(peek) 8 | -------------------------------------------------------------------------------- /ml_with_python_code/05_pearson_correlation.py: -------------------------------------------------------------------------------- 1 | # Pairwise Pearson correlations 2 | from pandas import read_csv 3 | from pandas import set_option 4 | filename = "pima-indians-diabetes.data.csv" 5 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 6 | data = read_csv(filename, names=names) 7 | set_option('display.width', 100) 8 | set_option('precision', 3) 9 | correlations = data.corr(method='pearson') 10 | print(correlations) 11 | -------------------------------------------------------------------------------- /ml_with_python_code/05_skew.py: -------------------------------------------------------------------------------- 1 | # Skew for each attribute 2 | from pandas import read_csv 3 | filename = "pima-indians-diabetes.data.csv" 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(filename, names=names) 6 | skew = data.skew() 7 | print(skew) 8 | -------------------------------------------------------------------------------- /ml_with_python_code/06_boxplot.py: -------------------------------------------------------------------------------- 1 | # Box and Whisker Plots 2 | from matplotlib import pyplot 3 | from pandas import read_csv 4 | filename = "pima-indians-diabetes.data.csv" 5 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 6 | data = read_csv(filename, names=names) 7 | data.plot(kind='box', subplots=True, layout=(3,3), sharex=False, sharey=False) 8 | pyplot.show() 9 | -------------------------------------------------------------------------------- /ml_with_python_code/06_correlation_matrix.py: -------------------------------------------------------------------------------- 1 | # Correction Matrix Plot 2 | from matplotlib import pyplot 3 | from pandas import read_csv 4 | import numpy 5 | filename = 'pima-indians-diabetes.data.csv' 6 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 7 | data = read_csv(filename, names=names) 8 | correlations = data.corr() 9 | # plot correlation matrix 10 | fig = pyplot.figure() 11 | ax = fig.add_subplot(111) 12 | cax = ax.matshow(correlations, vmin=-1, vmax=1) 13 | fig.colorbar(cax) 14 | ticks = numpy.arange(0,9,1) 15 | ax.set_xticks(ticks) 16 | ax.set_yticks(ticks) 17 | ax.set_xticklabels(names) 18 | ax.set_yticklabels(names) 19 | pyplot.show() 20 | -------------------------------------------------------------------------------- /ml_with_python_code/06_correlation_matrix_generic.py: -------------------------------------------------------------------------------- 1 | # Correction Matrix Plot (generic) 2 | from matplotlib import pyplot 3 | from pandas import read_csv 4 | 
import numpy 5 | filename = 'pima-indians-diabetes.data.csv' 6 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 7 | data = read_csv(filename, names=names) 8 | correlations = data.corr() 9 | # plot correlation matrix 10 | fig = pyplot.figure() 11 | ax = fig.add_subplot(111) 12 | cax = ax.matshow(correlations, vmin=-1, vmax=1) 13 | fig.colorbar(cax) 14 | pyplot.show() 15 | -------------------------------------------------------------------------------- /ml_with_python_code/06_density_plots.py: -------------------------------------------------------------------------------- 1 | # Univariate Density Plots 2 | from matplotlib import pyplot 3 | from pandas import read_csv 4 | filename = 'pima-indians-diabetes.data.csv' 5 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 6 | data = read_csv(filename, names=names) 7 | data.plot(kind='density', subplots=True, layout=(3,3), sharex=False) 8 | pyplot.show() 9 | -------------------------------------------------------------------------------- /ml_with_python_code/06_histograms.py: -------------------------------------------------------------------------------- 1 | # Univariate Histograms 2 | from matplotlib import pyplot 3 | from pandas import read_csv 4 | filename = 'pima-indians-diabetes.data.csv' 5 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 6 | data = read_csv(filename, names=names) 7 | data.hist() 8 | pyplot.show() 9 | -------------------------------------------------------------------------------- /ml_with_python_code/06_scatterplot_matrix.py: -------------------------------------------------------------------------------- 1 | # Scatterplot Matrix 2 | from matplotlib import pyplot 3 | from pandas import read_csv 4 | from pandas.tools.plotting import scatter_matrix 5 | filename = "pima-indians-diabetes.data.csv" 6 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 7 | data = read_csv(filename, names=names) 8 | scatter_matrix(data) 9 | pyplot.show() 10 | -------------------------------------------------------------------------------- /ml_with_python_code/07_binarization.py: -------------------------------------------------------------------------------- 1 | # binarization 2 | from sklearn.preprocessing import Binarizer 3 | from pandas import read_csv 4 | from numpy import set_printoptions 5 | filename = 'pima-indians-diabetes.data.csv' 6 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 7 | dataframe = read_csv(filename, names=names) 8 | array = dataframe.values 9 | # separate array into input and output components 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | binarizer = Binarizer(threshold=0.0).fit(X) 13 | binaryX = binarizer.transform(X) 14 | # summarize transformed data 15 | set_printoptions(precision=3) 16 | print(binaryX[0:5,:]) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/07_normalize_data.py: -------------------------------------------------------------------------------- 1 | # Normalize data (length of 1) 2 | from sklearn.preprocessing import Normalizer 3 | from pandas import read_csv 4 | from numpy import set_printoptions 5 | filename = 'pima-indians-diabetes.data.csv' 6 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 7 | dataframe = read_csv(filename, names=names) 8 | array = dataframe.values 9 | # separate array into input and output components 10 | X = array[:,0:8] 11 | Y 
= array[:,8]
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
# summarize transformed data
set_printoptions(precision=3)
print(normalizedX[0:5,:])
--------------------------------------------------------------------------------
/ml_with_python_code/07_rescale_data.py:
--------------------------------------------------------------------------------
# Rescale data (between 0 and 1)
from pandas import read_csv
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
# summarize transformed data
set_printoptions(precision=3)
print(rescaledX[0:5,:])
--------------------------------------------------------------------------------
/ml_with_python_code/07_standardize_data.py:
--------------------------------------------------------------------------------
# Standardize data (0 mean, 1 stdev)
from sklearn.preprocessing import StandardScaler
from pandas import read_csv
from numpy import set_printoptions
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)
# summarize transformed data
set_printoptions(precision=3)
print(rescaledX[0:5,:])
--------------------------------------------------------------------------------
/ml_with_python_code/08_feature_importance.py:
--------------------------------------------------------------------------------
# Feature Importance with Extra Trees Classifier
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)
print(model.feature_importances_)
--------------------------------------------------------------------------------
/ml_with_python_code/08_pca.py:
--------------------------------------------------------------------------------
# Feature Extraction with PCA
from pandas import read_csv
from sklearn.decomposition import PCA
# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
pca = PCA(n_components=3)
fit = pca.fit(X)
# summarize components
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)
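Note: 08_pca.py above only fits the decomposition and prints the components; if the reduced feature matrix is also needed, the fitted object can project the data directly. A minimal sketch under the same assumptions as the listing above (same CSV, first eight columns as inputs); reduced_X is an illustrative name, not part of the original scripts.

# Sketch: project the Pima inputs onto the 3 retained principal components
from pandas import read_csv
from sklearn.decomposition import PCA
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
array = read_csv(filename, names=names).values
X = array[:, 0:8]
pca = PCA(n_components=3)
# fit_transform learns the components and returns the projected rows in one step
reduced_X = pca.fit_transform(X)
print(reduced_X.shape)  # (number of rows, 3)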
--------------------------------------------------------------------------------
/ml_with_python_code/08_recursive_feature_elimination.py:
--------------------------------------------------------------------------------
# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(X, Y)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)
--------------------------------------------------------------------------------
/ml_with_python_code/08_univariate_selection.py:
--------------------------------------------------------------------------------
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
# summarize scores
set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
# summarize selected features
print(features[0:5,:])
--------------------------------------------------------------------------------
/ml_with_python_code/09_cross_validation.py:
--------------------------------------------------------------------------------
# Evaluate using Cross Validation
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
--------------------------------------------------------------------------------
/ml_with_python_code/09_loocv.py:
--------------------------------------------------------------------------------
# Evaluate using Leave One Out Cross Validation
from pandas import read_csv
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
num_folds = 10
loocv = LeaveOneOut()
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=loocv)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
--------------------------------------------------------------------------------
/ml_with_python_code/09_shuffle_split.py:
--------------------------------------------------------------------------------
# Evaluate using Shuffle Split Cross Validation
from pandas import read_csv
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
n_splits = 10
test_size = 0.33
seed = 7
kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
--------------------------------------------------------------------------------
/ml_with_python_code/09_train_test.py:
--------------------------------------------------------------------------------
# Evaluate using a train and a test set
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result*100.0))
--------------------------------------------------------------------------------
/ml_with_python_code/10_classification_accuracy.py:
--------------------------------------------------------------------------------
# Cross Validation Classification Accuracy
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
kfold = KFold(n_splits=10, random_state=7)
model = LogisticRegression()
scoring = 'accuracy'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))
--------------------------------------------------------------------------------
/ml_with_python_code/10_classification_auc.py:
--------------------------------------------------------------------------------
# Cross Validation Classification ROC AUC
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
kfold = KFold(n_splits=10, random_state=7)
model = LogisticRegression()
scoring = 'roc_auc'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("AUC: %.3f (%.3f)" % (results.mean(), results.std()))
--------------------------------------------------------------------------------
/ml_with_python_code/10_classification_confusion_matrix.py:
--------------------------------------------------------------------------------
# Cross Validation Classification Confusion Matrix
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)
--------------------------------------------------------------------------------
/ml_with_python_code/10_classification_logloss.py:
--------------------------------------------------------------------------------
# Cross Validation Classification LogLoss
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
kfold = KFold(n_splits=10, random_state=7)
model = LogisticRegression()
scoring = 'neg_log_loss'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("Logloss: %.3f (%.3f)" % (results.mean(), results.std()))
--------------------------------------------------------------------------------
/ml_with_python_code/10_classification_report.py:
--------------------------------------------------------------------------------
# Cross Validation Classification Report
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)
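Note: the raw array printed by 10_classification_confusion_matrix.py is easy to misread. One option is to wrap it in a DataFrame so the actual/predicted axes are labelled; in scikit-learn the rows are the actual classes and the columns are the predicted classes. A hedged sketch only, reusing the same data and split as that script; the label strings are illustrative.

# Sketch: a labelled confusion matrix for the Pima logistic regression model
from pandas import read_csv, DataFrame
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
array = read_csv(filename, names=names).values
X = array[:, 0:8]
Y = array[:, 8]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
model = LogisticRegression()
model.fit(X_train, Y_train)
matrix = confusion_matrix(Y_test, model.predict(X_test))
# rows are the actual classes, columns are the predicted classes
print(DataFrame(matrix, index=['actual 0', 'actual 1'], columns=['predicted 0', 'predicted 1']))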
--------------------------------------------------------------------------------
/ml_with_python_code/10_regression_mae.py:
--------------------------------------------------------------------------------
# Cross Validation Regression MAE
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
filename = 'housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = read_csv(filename, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
kfold = KFold(n_splits=10, random_state=7)
model = LinearRegression()
scoring = 'neg_mean_absolute_error'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("MAE: %.3f (%.3f)" % (results.mean(), results.std()))
--------------------------------------------------------------------------------
/ml_with_python_code/10_regression_mse.py:
--------------------------------------------------------------------------------
# Cross Validation Regression MSE
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
filename = 'housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = read_csv(filename, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
num_folds = 10
kfold = KFold(n_splits=10, random_state=7)
model = LinearRegression()
scoring = 'neg_mean_squared_error'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("MSE: %.3f (%.3f)" % (results.mean(), results.std()))
--------------------------------------------------------------------------------
/ml_with_python_code/10_regression_rsquared.py:
--------------------------------------------------------------------------------
# Cross Validation Regression R^2
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
filename = 'housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = read_csv(filename, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
kfold = KFold(n_splits=10, random_state=7)
model = LinearRegression()
scoring = 'r2'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("R^2: %.3f (%.3f)" % (results.mean(), results.std()))
--------------------------------------------------------------------------------
/ml_with_python_code/11_classification_and_regression_trees_classification.py:
--------------------------------------------------------------------------------
# CART Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe =
read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = DecisionTreeClassifier() 14 | results = cross_val_score(model, X, Y, cv=kfold) 15 | print(results.mean()) 16 | -------------------------------------------------------------------------------- /ml_with_python_code/11_gaussian_naive_bayes.py: -------------------------------------------------------------------------------- 1 | # Gaussian Naive Bayes Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.naive_bayes import GaussianNB 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = GaussianNB() 14 | results = cross_val_score(model, X, Y, cv=kfold) 15 | print(results.mean()) 16 | -------------------------------------------------------------------------------- /ml_with_python_code/11_k_nearest_neighbors_classification.py: -------------------------------------------------------------------------------- 1 | # KNN Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.neighbors import KNeighborsClassifier 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | num_folds = 10 13 | kfold = KFold(n_splits=10, random_state=7) 14 | model = KNeighborsClassifier() 15 | results = cross_val_score(model, X, Y, cv=kfold) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/11_linear_discriminant_analysis.py: -------------------------------------------------------------------------------- 1 | # LDA Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | num_folds = 10 13 | kfold = KFold(n_splits=10, random_state=7) 14 | model = LinearDiscriminantAnalysis() 15 | results = cross_val_score(model, X, Y, cv=kfold) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/11_logistic_regression.py: -------------------------------------------------------------------------------- 1 | # Logistic Regression Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.linear_model import LogisticRegression 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 
| X = array[:,0:8] 11 | Y = array[:,8] 12 | num_folds = 10 13 | kfold = KFold(n_splits=10, random_state=7) 14 | model = LogisticRegression() 15 | results = cross_val_score(model, X, Y, cv=kfold) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/11_support_vector_machines_classification.py: -------------------------------------------------------------------------------- 1 | # SVM Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.svm import SVC 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = SVC() 14 | results = cross_val_score(model, X, Y, cv=kfold) 15 | print(results.mean()) 16 | -------------------------------------------------------------------------------- /ml_with_python_code/12_classification_and_regression_trees_regression.py: -------------------------------------------------------------------------------- 1 | # Decision Tree Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.tree import DecisionTreeRegressor 6 | filename = 'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = DecisionTreeRegressor() 14 | scoring = 'neg_mean_squared_error' 15 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/12_elastic_net.py: -------------------------------------------------------------------------------- 1 | # ElasticNet Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.linear_model import ElasticNet 6 | filename = 'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = ElasticNet() 14 | scoring = 'neg_mean_squared_error' 15 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/12_k_nearest_neighbors_regression.py: -------------------------------------------------------------------------------- 1 | # KNN Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.neighbors import KNeighborsRegressor 6 | filename = 'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = 
read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = KNeighborsRegressor() 14 | scoring = 'neg_mean_squared_error' 15 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/12_lasso_regression.py: -------------------------------------------------------------------------------- 1 | # Lasso Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.linear_model import Lasso 6 | filename = 'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = Lasso() 14 | scoring = 'neg_mean_squared_error' 15 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/12_linear_regression.py: -------------------------------------------------------------------------------- 1 | # Linear Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.linear_model import LinearRegression 6 | filename = 'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = LinearRegression() 14 | scoring = 'neg_mean_squared_error' 15 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/12_ridge_regression.py: -------------------------------------------------------------------------------- 1 | # Ridge Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.linear_model import Ridge 6 | filename = 'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | num_folds = 10 13 | kfold = KFold(n_splits=10, random_state=7) 14 | model = Ridge() 15 | scoring = 'neg_mean_squared_error' 16 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 17 | print(results.mean()) 18 | -------------------------------------------------------------------------------- /ml_with_python_code/12_support_vector_machines_regression.py: -------------------------------------------------------------------------------- 1 | # SVM Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.svm import SVR 6 | filename = 
'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | num_folds = 10 13 | kfold = KFold(n_splits=10, random_state=7) 14 | model = SVR() 15 | scoring = 'neg_mean_squared_error' 16 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 17 | print(results.mean()) 18 | -------------------------------------------------------------------------------- /ml_with_python_code/14_feature_union_model_pipeline.py: -------------------------------------------------------------------------------- 1 | # Create a pipeline that extracts features from the data then creates a model 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.pipeline import FeatureUnion 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.decomposition import PCA 9 | from sklearn.feature_selection import SelectKBest 10 | # load data 11 | filename = 'pima-indians-diabetes.data.csv' 12 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 13 | dataframe = read_csv(filename, names=names) 14 | array = dataframe.values 15 | X = array[:,0:8] 16 | Y = array[:,8] 17 | # create feature union 18 | features = [] 19 | features.append(('pca', PCA(n_components=3))) 20 | features.append(('select_best', SelectKBest(k=6))) 21 | feature_union = FeatureUnion(features) 22 | # create pipeline 23 | estimators = [] 24 | estimators.append(('feature_union', feature_union)) 25 | estimators.append(('logistic', LogisticRegression())) 26 | model = Pipeline(estimators) 27 | # evaluate pipeline 28 | kfold = KFold(n_splits=10, random_state=7) 29 | results = cross_val_score(model, X, Y, cv=kfold) 30 | print(results.mean()) 31 | -------------------------------------------------------------------------------- /ml_with_python_code/14_standardize_model_pipeline.py: -------------------------------------------------------------------------------- 1 | # Create a pipeline that standardizes the data then creates a model 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 8 | # load data 9 | filename = 'pima-indians-diabetes.data.csv' 10 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 11 | dataframe = read_csv(filename, names=names) 12 | array = dataframe.values 13 | X = array[:,0:8] 14 | Y = array[:,8] 15 | # create pipeline 16 | estimators = [] 17 | estimators.append(('standardize', StandardScaler())) 18 | estimators.append(('lda', LinearDiscriminantAnalysis())) 19 | model = Pipeline(estimators) 20 | # evaluate pipeline 21 | kfold = KFold(n_splits=10, random_state=7) 22 | results = cross_val_score(model, X, Y, cv=kfold) 23 | print(results.mean()) 24 | -------------------------------------------------------------------------------- /ml_with_python_code/15_adaboost_classification.py: -------------------------------------------------------------------------------- 1 | # AdaBoost Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from 
sklearn.model_selection import cross_val_score 5 | from sklearn.ensemble import AdaBoostClassifier 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | num_trees = 30 13 | seed=7 14 | kfold = KFold(n_splits=10, random_state=seed) 15 | model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed) 16 | results = cross_val_score(model, X, Y, cv=kfold) 17 | print(results.mean()) -------------------------------------------------------------------------------- /ml_with_python_code/15_bagged_cart_classification.py: -------------------------------------------------------------------------------- 1 | # Bagged Decision Trees for Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.ensemble import BaggingClassifier 6 | from sklearn.tree import DecisionTreeClassifier 7 | filename = 'pima-indians-diabetes.data.csv' 8 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 9 | dataframe = read_csv(filename, names=names) 10 | array = dataframe.values 11 | X = array[:,0:8] 12 | Y = array[:,8] 13 | seed = 7 14 | kfold = KFold(n_splits=10, random_state=seed) 15 | cart = DecisionTreeClassifier() 16 | num_trees = 100 17 | model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed) 18 | results = cross_val_score(model, X, Y, cv=kfold) 19 | print(results.mean()) 20 | -------------------------------------------------------------------------------- /ml_with_python_code/15_extra_trees_classification.py: -------------------------------------------------------------------------------- 1 | # Extra Trees Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.ensemble import ExtraTreesClassifier 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | num_trees = 100 13 | max_features = 7 14 | kfold = KFold(n_splits=10, random_state=7) 15 | model = ExtraTreesClassifier(n_estimators=num_trees, max_features=max_features) 16 | results = cross_val_score(model, X, Y, cv=kfold) 17 | print(results.mean()) 18 | -------------------------------------------------------------------------------- /ml_with_python_code/15_gradient_boosting_classification.py: -------------------------------------------------------------------------------- 1 | # Stochastic Gradient Boosting Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.ensemble import GradientBoostingClassifier 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | seed = 7 13 | num_trees = 100 14 | kfold = KFold(n_splits=10, random_state=seed) 15 | model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed) 16 | results = cross_val_score(model, X, Y, cv=kfold) 17 | 
print(results.mean()) 18 | -------------------------------------------------------------------------------- /ml_with_python_code/15_random_forest_classification.py: -------------------------------------------------------------------------------- 1 | # Random Forest Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.ensemble import RandomForestClassifier 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | num_trees = 100 13 | max_features = 3 14 | kfold = KFold(n_splits=10, random_state=7) 15 | model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features) 16 | results = cross_val_score(model, X, Y, cv=kfold) 17 | print(results.mean()) 18 | -------------------------------------------------------------------------------- /ml_with_python_code/15_voting_ensemble_classification.py: -------------------------------------------------------------------------------- 1 | # Voting Ensemble for Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.tree import DecisionTreeClassifier 7 | from sklearn.svm import SVC 8 | from sklearn.ensemble import VotingClassifier 9 | filename = 'pima-indians-diabetes.data.csv' 10 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 11 | dataframe = read_csv(filename, names=names) 12 | array = dataframe.values 13 | X = array[:,0:8] 14 | Y = array[:,8] 15 | kfold = KFold(n_splits=10, random_state=7) 16 | # create the sub models 17 | estimators = [] 18 | model1 = LogisticRegression() 19 | estimators.append(('logistic', model1)) 20 | model2 = DecisionTreeClassifier() 21 | estimators.append(('cart', model2)) 22 | model3 = SVC() 23 | estimators.append(('svm', model3)) 24 | # create the ensemble model 25 | ensemble = VotingClassifier(estimators) 26 | results = cross_val_score(ensemble, X, Y, cv=kfold) 27 | print(results.mean()) 28 | -------------------------------------------------------------------------------- /ml_with_python_code/16_grid_search.py: -------------------------------------------------------------------------------- 1 | # Grid Search for Algorithm Tuning 2 | import numpy 3 | from pandas import read_csv 4 | from sklearn.linear_model import Ridge 5 | from sklearn.model_selection import GridSearchCV 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | alphas = numpy.array([1,0.1,0.01,0.001,0.0001,0]) 13 | param_grid = dict(alpha=alphas) 14 | model = Ridge() 15 | grid = GridSearchCV(estimator=model, param_grid=param_grid) 16 | grid.fit(X, Y) 17 | print(grid.best_score_) 18 | print(grid.best_estimator_.alpha) 19 | -------------------------------------------------------------------------------- /ml_with_python_code/16_random_search.py: -------------------------------------------------------------------------------- 1 | # Randomized for Algorithm Tuning 2 | import numpy 3 | from pandas import read_csv 4 | from scipy.stats import uniform 5 | 
from sklearn.linear_model import Ridge 6 | from sklearn.model_selection import RandomizedSearchCV 7 | filename = 'pima-indians-diabetes.data.csv' 8 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 9 | dataframe = read_csv(filename, names=names) 10 | array = dataframe.values 11 | X = array[:,0:8] 12 | Y = array[:,8] 13 | param_grid = {'alpha': uniform()} 14 | model = Ridge() 15 | rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100, random_state=7) 16 | rsearch.fit(X, Y) 17 | print(rsearch.best_score_) 18 | print(rsearch.best_estimator_.alpha) 19 | -------------------------------------------------------------------------------- /ml_with_python_code/17_save_model_joblib.py: -------------------------------------------------------------------------------- 1 | # Save Model Using joblib 2 | from pandas import read_csv 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.externals.joblib import dump 6 | from sklearn.externals.joblib import load 7 | filename = 'pima-indians-diabetes.data.csv' 8 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 9 | dataframe = read_csv(filename, names=names) 10 | array = dataframe.values 11 | X = array[:,0:8] 12 | Y = array[:,8] 13 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7) 14 | # Fit the model on 33% 15 | model = LogisticRegression() 16 | model.fit(X_train, Y_train) 17 | # save the model to disk 18 | filename = 'finalized_model.sav' 19 | dump(model, filename) 20 | 21 | # some time later... 22 | 23 | # load the model from disk 24 | loaded_model = load(filename) 25 | result = loaded_model.score(X_test, Y_test) 26 | print(result) 27 | -------------------------------------------------------------------------------- /ml_with_python_code/17_save_model_pickel.py: -------------------------------------------------------------------------------- 1 | # Save Model Using Pickle 2 | from pandas import read_csv 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.linear_model import LogisticRegression 5 | from pickle import dump 6 | from pickle import load 7 | filename = 'pima-indians-diabetes.data.csv' 8 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 9 | dataframe = read_csv(filename, names=names) 10 | array = dataframe.values 11 | X = array[:,0:8] 12 | Y = array[:,8] 13 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7) 14 | # Fit the model on 33% 15 | model = LogisticRegression() 16 | model.fit(X_train, Y_train) 17 | # save the model to disk 18 | filename = 'finalized_model.sav' 19 | dump(model, open(filename, 'wb')) 20 | 21 | # some time later... 22 | 23 | # load the model from disk 24 | loaded_model = load(open(filename, 'rb')) 25 | result = loaded_model.score(X_test, Y_test) 26 | print(result) 27 | -------------------------------------------------------------------------------- /ml_with_python_code/18_project_template.py: -------------------------------------------------------------------------------- 1 | # Python Project Template 2 | 3 | # 1. Prepare Problem 4 | # a) Load libraries 5 | # b) Load dataset 6 | 7 | # 2. Summarize Data 8 | # a) Descriptive statistics 9 | # b) Data visualizations 10 | 11 | # 3. Prepare Data 12 | # a) Data Cleaning 13 | # b) Feature Selection 14 | # c) Data Transforms 15 | 16 | # 4. 
Evaluate Algorithms 17 | # a) Split-out validation dataset 18 | # b) Test options and evaluation metric 19 | # c) Spot Check Algorithms 20 | # d) Compare Algorithms 21 | 22 | # 5. Improve Accuracy 23 | # a) Algorithm Tuning 24 | # b) Ensembles 25 | 26 | # 6. Finalize Model 27 | # a) Predictions on validation dataset 28 | # b) Create standalone model on entire training dataset 29 | # c) Save model for later use 30 | -------------------------------------------------------------------------------- /xgboost_with_python_code/04_first_model.py: -------------------------------------------------------------------------------- 1 | # First XGBoost model for Pima Indians dataset 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from sklearn.cross_validation import train_test_split 5 | from sklearn.metrics import accuracy_score 6 | # load data 7 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 8 | # split data into X and y 9 | X = dataset[:,0:8] 10 | Y = dataset[:,8] 11 | # split data into train and test sets 12 | seed = 7 13 | test_size = 0.33 14 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 15 | # fit model no training data 16 | model = XGBClassifier() 17 | model.fit(X_train, y_train) 18 | # make predictions for test data 19 | y_pred = model.predict(X_test) 20 | predictions = [round(value) for value in y_pred] 21 | # evaluate predictions 22 | accuracy = accuracy_score(y_test, predictions) 23 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) -------------------------------------------------------------------------------- /xgboost_with_python_code/05_horse_colic_missing.py: -------------------------------------------------------------------------------- 1 | # binary classification, missing data 2 | from pandas import read_csv 3 | from xgboost import XGBClassifier 4 | from sklearn.cross_validation import train_test_split 5 | from sklearn.metrics import accuracy_score 6 | from sklearn.preprocessing import LabelEncoder 7 | # load data 8 | dataframe = read_csv("horse-colic.csv", delim_whitespace=True, header=None) 9 | dataset = dataframe.values 10 | # split data into X and y 11 | X = dataset[:,0:27] 12 | Y = dataset[:,27] 13 | # set missing values to 0 14 | X[X == '?'] = 0 15 | # convert to numeric 16 | X = X.astype('float32') 17 | # encode Y class values as integers 18 | label_encoder = LabelEncoder() 19 | label_encoder = label_encoder.fit(Y) 20 | label_encoded_y = label_encoder.transform(Y) 21 | # split data into train and test sets 22 | seed = 7 23 | test_size = 0.33 24 | X_train, X_test, y_train, y_test = train_test_split(X, label_encoded_y, test_size=test_size, random_state=seed) 25 | # fit model no training data 26 | model = XGBClassifier() 27 | model.fit(X_train, y_train) 28 | print(model) 29 | # make predictions for test data 30 | y_pred = model.predict(X_test) 31 | predictions = [round(value) for value in y_pred] 32 | # evaluate predictions 33 | accuracy = accuracy_score(y_test, predictions) 34 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) -------------------------------------------------------------------------------- /xgboost_with_python_code/05_horse_colic_missing_imputer.py: -------------------------------------------------------------------------------- 1 | # binary classification, missing data, impute with mean 2 | import numpy 3 | from pandas import read_csv 4 | from xgboost import XGBClassifier 5 | from sklearn.cross_validation import train_test_split 6 | from sklearn.metrics import accuracy_score 
7 | from sklearn.preprocessing import LabelEncoder 8 | from sklearn.preprocessing import Imputer 9 | # load data 10 | dataframe = read_csv("horse-colic.csv", delim_whitespace=True, header=None) 11 | dataset = dataframe.values 12 | # split data into X and y 13 | X = dataset[:,0:27] 14 | Y = dataset[:,27] 15 | # set missing values to NaN 16 | X[X == '?'] = numpy.nan 17 | # convert to numeric 18 | X = X.astype('float32') 19 | # impute missing values as the mean 20 | imputer = Imputer() 21 | imputed_x = imputer.fit_transform(X) 22 | # encode Y class values as integers 23 | label_encoder = LabelEncoder() 24 | label_encoder = label_encoder.fit(Y) 25 | label_encoded_y = label_encoder.transform(Y) 26 | # split data into train and test sets 27 | seed = 7 28 | test_size = 0.33 29 | X_train, X_test, y_train, y_test = train_test_split(imputed_x, label_encoded_y, test_size=test_size, random_state=seed) 30 | # fit model no training data 31 | model = XGBClassifier() 32 | model.fit(X_train, y_train) 33 | print(model) 34 | # make predictions for test data 35 | y_pred = model.predict(X_test) 36 | predictions = [round(value) for value in y_pred] 37 | # evaluate predictions 38 | accuracy = accuracy_score(y_test, predictions) 39 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) -------------------------------------------------------------------------------- /xgboost_with_python_code/05_iris_label_encode.py: -------------------------------------------------------------------------------- 1 | # multiclass classification 2 | from pandas import read_csv 3 | from xgboost import XGBClassifier 4 | from sklearn import cross_validation 5 | from sklearn.metrics import accuracy_score 6 | from sklearn.preprocessing import LabelEncoder 7 | # load data 8 | data = read_csv('iris.csv', header=None) 9 | dataset = data.values 10 | # split data into X and y 11 | X = dataset[:,0:4] 12 | Y = dataset[:,4] 13 | # encode string class values as integers 14 | label_encoder = LabelEncoder() 15 | label_encoder = label_encoder.fit(Y) 16 | label_encoded_y = label_encoder.transform(Y) 17 | seed = 7 18 | test_size = 0.33 19 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, label_encoded_y, test_size=test_size, random_state=seed) 20 | # fit model no training data 21 | model = XGBClassifier() 22 | model.fit(X_train, y_train) 23 | print(model) 24 | # make predictions for test data 25 | y_pred = model.predict(X_test) 26 | predictions = [round(value) for value in y_pred] 27 | # evaluate predictions 28 | accuracy = accuracy_score(y_test, predictions) 29 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) -------------------------------------------------------------------------------- /xgboost_with_python_code/06_cross_validation.py: -------------------------------------------------------------------------------- 1 | # k-fold cross validation evaluation of xgboost model 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from sklearn.cross_validation import KFold 5 | from sklearn.cross_validation import cross_val_score 6 | # load data 7 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 8 | # split data into X and y 9 | X = dataset[:,0:8] 10 | Y = dataset[:,8] 11 | # CV model 12 | model = XGBClassifier() 13 | kfold = KFold(n=len(X), n_folds=10, random_state=7) 14 | results = cross_val_score(model, X, Y, cv=kfold) 15 | print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100)) -------------------------------------------------------------------------------- 
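Note: the XGBoost scripts in this folder import from sklearn.cross_validation, which was removed in later scikit-learn releases. If those imports fail, the same k-fold evaluation can be written against sklearn.model_selection; the sketch below assumes that newer API and is not the original listing. The newer KFold takes n_splits rather than n/n_folds, and random_state only applies when shuffle=True.

# Sketch: k-fold evaluation of an XGBoost model with the newer scikit-learn API
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
X = dataset[:, 0:8]
Y = dataset[:, 8]
model = XGBClassifier()
# n_splits replaces the old n/n_folds arguments; shuffle makes random_state meaningful
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))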
/xgboost_with_python_code/06_stratified_cross_validation.py: -------------------------------------------------------------------------------- 1 | # stratified k-fold cross validation evaluation of xgboost model 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from sklearn.cross_validation import StratifiedKFold 5 | from sklearn.cross_validation import cross_val_score 6 | # load data 7 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 8 | # split data into X and y 9 | X = dataset[:,0:8] 10 | Y = dataset[:,8] 11 | # CV model 12 | model = XGBClassifier() 13 | kfold = StratifiedKFold(Y, n_folds=10, random_state=7) 14 | results = cross_val_score(model, X, Y, cv=kfold) 15 | print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100)) -------------------------------------------------------------------------------- /xgboost_with_python_code/06_train_test_split.py: -------------------------------------------------------------------------------- 1 | # train-test split evaluation of xgboost model 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from sklearn.cross_validation import train_test_split 5 | from sklearn.metrics import accuracy_score 6 | # load data 7 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 8 | # split data into X and y 9 | X = dataset[:,0:8] 10 | Y = dataset[:,8] 11 | # split data into train and test sets 12 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7) 13 | # fit model no training data 14 | model = XGBClassifier() 15 | model.fit(X_train, y_train) 16 | # make predictions for test data 17 | y_pred = model.predict(X_test) 18 | predictions = [round(value) for value in y_pred] 19 | # evaluate predictions 20 | accuracy = accuracy_score(y_test, predictions) 21 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) -------------------------------------------------------------------------------- /xgboost_with_python_code/07_plot_tree-left-to-right.py: -------------------------------------------------------------------------------- 1 | # plot decision tree 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from xgboost import plot_tree 5 | from matplotlib import pyplot 6 | # load data 7 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 8 | # split data into X and y 9 | X = dataset[:,0:8] 10 | y = dataset[:,8] 11 | # fit model no training data 12 | model = XGBClassifier() 13 | model.fit(X, y) 14 | # plot single tree 15 | plot_tree(model, num_trees=0, rankdir='LR') 16 | pyplot.show() -------------------------------------------------------------------------------- /xgboost_with_python_code/07_plot_tree.py: -------------------------------------------------------------------------------- 1 | # plot decision tree 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from xgboost import plot_tree 5 | from matplotlib import pyplot 6 | # load data 7 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 8 | # split data into X and y 9 | X = dataset[:,0:8] 10 | y = dataset[:,8] 11 | # fit model no training data 12 | model = XGBClassifier() 13 | model.fit(X, y) 14 | # plot single tree 15 | plot_tree(model) 16 | pyplot.show() -------------------------------------------------------------------------------- /xgboost_with_python_code/08_serialize_with_joblib.py: -------------------------------------------------------------------------------- 1 | # Train XGBoost model, save to file using joblib, load and make 
predictions 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from sklearn.externals import joblib 5 | from sklearn.cross_validation import train_test_split 6 | from sklearn.metrics import accuracy_score 7 | # load data 8 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 9 | # split data into X and y 10 | X = dataset[:,0:8] 11 | Y = dataset[:,8] 12 | # split data into train and test sets 13 | seed = 7 14 | test_size = 0.33 15 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 16 | # fit model no training data 17 | model = XGBClassifier() 18 | model.fit(X_train, y_train) 19 | # save model to file 20 | joblib.dump(model, "pima.joblib.dat") 21 | print("Saved model to: pima.joblib.dat") 22 | 23 | # some time later... 24 | 25 | # load model from file 26 | loaded_model = joblib.load("pima.joblib.dat") 27 | print("Loaded model from: pima.joblib.dat") 28 | # make predictions for test data 29 | y_pred = loaded_model.predict(X_test) 30 | predictions = [round(value) for value in y_pred] 31 | # evaluate predictions 32 | accuracy = accuracy_score(y_test, predictions) 33 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) -------------------------------------------------------------------------------- /xgboost_with_python_code/08_serialize_with_pickle.py: -------------------------------------------------------------------------------- 1 | # Train XGBoost model, save to file using pickle, load and make predictions 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | import pickle 5 | from sklearn.cross_validation import train_test_split 6 | from sklearn.metrics import accuracy_score 7 | # load data 8 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 9 | # split data into X and y 10 | X = dataset[:,0:8] 11 | Y = dataset[:,8] 12 | # split data into train and test sets 13 | seed = 7 14 | test_size = 0.33 15 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 16 | # fit model no training data 17 | model = XGBClassifier() 18 | model.fit(X_train, y_train) 19 | # save model to file 20 | pickle.dump(model, open("pima.pickle.dat", "wb")) 21 | print("Saved model to: pima.pickle.dat") 22 | 23 | # some time later... 

# load model from file
loaded_model = pickle.load(open("pima.pickle.dat", "rb"))
print("Loaded model from: pima.pickle.dat")
# make predictions for test data
y_pred = loaded_model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
--------------------------------------------------------------------------------
/xgboost_with_python_code/09_automatic_feature_importance.py:
--------------------------------------------------------------------------------
# plot feature importance using the built-in function
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
y = dataset[:,8]
# fit model on training data
model = XGBClassifier()
model.fit(X, y)
# plot feature importance
plot_importance(model)
pyplot.show()
--------------------------------------------------------------------------------
/xgboost_with_python_code/09_feature_selection.py:
--------------------------------------------------------------------------------
# use feature importance for feature selection
from numpy import loadtxt
from numpy import sort
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
# fit model on all training data
model = XGBClassifier()
model.fit(X_train, y_train)
# make predictions for test data and evaluate
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# fit a model using each importance value as a selection threshold
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # select features using threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train model
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    # eval model
    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))
--------------------------------------------------------------------------------
/xgboost_with_python_code/09_manual_feature_importance.py:
--------------------------------------------------------------------------------
# plot feature importance manually
from numpy import loadtxt
from xgboost import XGBClassifier
from matplotlib import pyplot
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
y = dataset[:,8]
# fit model on training data
model = XGBClassifier()
model.fit(X, y)
# feature importance
print(model.feature_importances_)
# plot
pyplot.bar(range(len(model.feature_importances_)), model.feature_importances_)
pyplot.show()
--------------------------------------------------------------------------------
/xgboost_with_python_code/10_early_stopping.py:
--------------------------------------------------------------------------------
# early stopping
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# fit model on training data, stopping early if test-set log loss does not improve for 10 rounds
model = XGBClassifier()
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
--------------------------------------------------------------------------------
/xgboost_with_python_code/10_evaluate_validation_set.py:
--------------------------------------------------------------------------------
# monitor training performance
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
# fit model on training data, reporting classification error on the test set each round
model = XGBClassifier()
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_metric="error", eval_set=eval_set, verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
--------------------------------------------------------------------------------
/xgboost_with_python_code/11_eval_num_threads.py:
--------------------------------------------------------------------------------
# Otto, tune number of threads
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# evaluate the effect of the number of threads
results = []
num_threads = [1, 2, 3, 4]
for n in num_threads:
    start = time.time()
    model = XGBClassifier(nthread=n)
    model.fit(X, label_encoded_y)
    elapsed = time.time() - start
    print(n, elapsed)
    results.append(elapsed)
# plot results
pyplot.plot(num_threads, results)
pyplot.ylabel('Training Time (seconds)')
pyplot.xlabel('Number of Threads')
pyplot.title('XGBoost Training Speed vs Number of Threads')
pyplot.show()
--------------------------------------------------------------------------------
/xgboost_with_python_code/11_eval_parallel_cv_and_xgboost.py:
--------------------------------------------------------------------------------
# Otto, parallel cross validation
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
import time
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# prepare cross validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
# Single Thread XGBoost, Parallel Thread CV
start = time.time()
model = XGBClassifier(nthread=1)
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='neg_log_loss', n_jobs=-1)
elapsed = time.time() - start
print("Single Thread XGBoost, Parallel Thread CV: %f" % (elapsed))
# Parallel Thread XGBoost, Single Thread CV
start = time.time()
model = XGBClassifier(nthread=-1)
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='neg_log_loss', n_jobs=1)
elapsed = time.time() - start
print("Parallel Thread XGBoost, Single Thread CV: %f" % (elapsed))
# Parallel Thread XGBoost and CV
start = time.time()
model = XGBClassifier(nthread=-1)
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='neg_log_loss', n_jobs=-1)
elapsed = time.time() - start
print("Parallel Thread XGBoost and CV: %f" % (elapsed))
--------------------------------------------------------------------------------
/xgboost_with_python_code/12_check_num_threads.py:
--------------------------------------------------------------------------------
# Otto multi-core test
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# evaluate the effect of the number of threads
results = []
num_threads = [1, 16, 32]
for n in num_threads:
    start = time.time()
    model = XGBClassifier(nthread=n)
    model.fit(X, label_encoded_y)
    elapsed = time.time() - start
    print(n, elapsed)
    results.append(elapsed)
--------------------------------------------------------------------------------
/xgboost_with_python_code/14_tune_depth.py:
--------------------------------------------------------------------------------
# XGBoost on Otto dataset, Tune max_depth
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = XGBClassifier()
max_depth = range(1, 11, 2)
print(max_depth)
param_grid = dict(max_depth=max_depth)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold, verbose=1)
result = grid_search.fit(X, label_encoded_y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stdevs = result.cv_results_['std_test_score']
params = result.cv_results_['params']
for mean, stdev, param in zip(means, stdevs, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot
pyplot.errorbar(max_depth, means, yerr=stdevs)
pyplot.title("XGBoost max_depth vs Log Loss")
pyplot.xlabel('max_depth')
pyplot.ylabel('Log Loss')
pyplot.savefig('max_depth.png')
--------------------------------------------------------------------------------
/xgboost_with_python_code/14_tune_trees.py:
--------------------------------------------------------------------------------
# XGBoost on Otto dataset, Tune n_estimators
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = XGBClassifier()
n_estimators = range(50, 400, 50)
param_grid = dict(n_estimators=n_estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
result = grid_search.fit(X, label_encoded_y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stdevs = result.cv_results_['std_test_score']
params = result.cv_results_['params']
for mean, stdev, param in zip(means, stdevs, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot
pyplot.errorbar(n_estimators, means, yerr=stdevs)
pyplot.title("XGBoost n_estimators vs Log Loss")
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.savefig('n_estimators.png')
--------------------------------------------------------------------------------
/xgboost_with_python_code/15_plot_performance.py:
--------------------------------------------------------------------------------
# Plot performance for learning_rate=0.1
from matplotlib import pyplot
n_estimators = [100, 200, 300, 400, 500]
loss = [-0.001239, -0.001153, -0.001152, -0.001153, -0.001153]
pyplot.plot(n_estimators, loss)
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost learning_rate=0.1 n_estimators vs Log Loss')
pyplot.show()
--------------------------------------------------------------------------------
/xgboost_with_python_code/15_tune_learning_rate.py:
--------------------------------------------------------------------------------
# XGBoost on Otto dataset, Tune learning_rate
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = XGBClassifier()
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
param_grid = dict(learning_rate=learning_rate)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
result = grid_search.fit(X, label_encoded_y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stdevs = result.cv_results_['std_test_score']
params = result.cv_results_['params']
for mean, stdev, param in zip(means, stdevs, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot
pyplot.errorbar(learning_rate, means, yerr=stdevs)
pyplot.title("XGBoost learning_rate vs Log Loss")
pyplot.xlabel('learning_rate')
pyplot.ylabel('Log Loss')
pyplot.savefig('learning_rate.png')
--------------------------------------------------------------------------------
/xgboost_with_python_code/16_tune_column_sample_rate_bytree.py:
--------------------------------------------------------------------------------
# XGBoost on Otto dataset, tune colsample_bytree
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = XGBClassifier()
colsample_bytree = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
param_grid = dict(colsample_bytree=colsample_bytree)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
result = grid_search.fit(X, label_encoded_y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stdevs = result.cv_results_['std_test_score']
params = result.cv_results_['params']
for mean, stdev, param in zip(means, stdevs, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot
pyplot.errorbar(colsample_bytree, means, yerr=stdevs)
pyplot.title("XGBoost colsample_bytree vs Log Loss")
pyplot.xlabel('colsample_bytree')
pyplot.ylabel('Log Loss')
pyplot.savefig('colsample_bytree.png')
--------------------------------------------------------------------------------
/xgboost_with_python_code/16_tune_column_sample_rate_split.py:
--------------------------------------------------------------------------------
# XGBoost on Otto dataset, tune colsample_bylevel
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = XGBClassifier()
colsample_bylevel = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
param_grid = dict(colsample_bylevel=colsample_bylevel)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
result = grid_search.fit(X, label_encoded_y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stdevs = result.cv_results_['std_test_score']
params = result.cv_results_['params']
for mean, stdev, param in zip(means, stdevs, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot
pyplot.errorbar(colsample_bylevel, means, yerr=stdevs)
pyplot.title("XGBoost colsample_bylevel vs Log Loss")
pyplot.xlabel('colsample_bylevel')
pyplot.ylabel('Log Loss')
pyplot.savefig('colsample_bylevel.png')
--------------------------------------------------------------------------------
/xgboost_with_python_code/16_tune_row_sample_rate.py:
--------------------------------------------------------------------------------
# XGBoost on Otto dataset, tune subsample
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = XGBClassifier()
subsample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
param_grid = dict(subsample=subsample)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
result = grid_search.fit(X, label_encoded_y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stdevs = result.cv_results_['std_test_score']
params = result.cv_results_['params']
for mean, stdev, param in zip(means, stdevs, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot
pyplot.errorbar(subsample, means, yerr=stdevs)
pyplot.title("XGBoost subsample vs Log Loss")
pyplot.xlabel('subsample')
pyplot.ylabel('Log Loss')
pyplot.savefig('subsample.png')
--------------------------------------------------------------------------------