├── README.md
├── deep_learning_with_python_code
├── 02_theano_example.py
├── 03_tensorflow_example.py
├── 07_first_mlp.py
├── 08_automatic_split.py
├── 08_manual_cross_validation.py
├── 08_manual_split.py
├── 09_sklearn_cross_validation.py
├── 09_sklearn_grid_search_params.py
├── 10_iris_example.py
├── 11_sonar_baseline.py
├── 11_sonar_standardized.py
├── 11_sonar_standardized_larger.py
├── 11_sonar_standardized_smaller.py
├── 12_boston_baseline.py
├── 12_boston_standardized.py
├── 12_boston_standardized_larger.py
├── 12_boston_standardized_wider.py
├── 13_serialize_json.py
├── 13_serialize_yaml.py
├── 14_checkpoint_best_model.py
├── 14_checkpoint_load.py
├── 14_checkpoint_model_improvements.py
├── 15_plot_history.py
├── 16_baseline.py
├── 16_dropout_hidden.py
├── 16_dropout_visible.py
├── 17_decay_drop_based.py
├── 17_decay_time_based.py
├── 19_mnist_cnn.py
├── 19_mnist_cnn_large.py
├── 19_mnist_mlp_baseline.py
├── 19_mnist_plot.py
├── 20_augment_baseline.py
├── 20_augment_feature_standardize.py
├── 20_augment_flips.py
├── 20_augment_rotations.py
├── 20_augment_save_to_file.py
├── 20_augment_shifts.py
├── 20_augment_zca.py
├── 21_cifar10_cnn.py
├── 21_cifar10_cnn_large.py
├── 21_cifar10_plot.py
├── 22_imdb_cnn.py
├── 22_imdb_mlp.py
├── 22_imdb_plot.py
├── 24_mlp_simple.py
├── 24_mlp_window.py
├── 25_lstm_simple.py
├── 25_lstm_stacked.py
├── 25_lstm_stateful.py
├── 25_lstm_time_steps.py
├── 25_lstm_window.py
├── 26_lstm_cnn.py
├── 26_lstm_dropout_gates.py
├── 26_lstm_dropout_layers.py
├── 26_lstm_simple.py
├── 27_lstm_char_seq_batch.py
├── 27_lstm_char_seq_features.py
├── 27_lstm_char_seq_timesteps.py
├── 27_lstm_one_char.py
├── 27_lstm_one_char_stateful.py
├── 27_lstm_var_length.py
├── 28_lstm_gen_text.py
├── 28_lstm_larger.py
├── 28_lstm_larger_gen_text.py
├── 28_lstm_small.py
├── housing.csv
├── international-airline-passengers.csv
├── ionosphere.csv
├── iris.csv
├── pima-indians-diabetes.csv
├── sonar.csv
├── weights-improvement-19-1.9435.hdf5
├── weights-improvement-47-1.2219-bigger.hdf5
└── wonderland.txt
├── machine_learning_mastery_with_r_code
├── .DS_Store
├── 1-AnalyzeData
│   ├── .DS_Store
│   ├── 1-LoadData
│   │   ├── datasets-mlbench.R
│   │   ├── datasets_appliedpredictivemodeling.R
│   │   ├── datasets_datasets.R
│   │   ├── iris.csv
│   │   ├── load_csv_file.R
│   │   └── load_csv_url.R
│   ├── 2-DataSummarization
│   │   ├── class_distribution.R
│   │   ├── correction_spearman.R
│   │   ├── correlation_pearson.R
│   │   ├── data_types.R
│   │   ├── dimensions.R
│   │   ├── peek.R
│   │   ├── skewness.R
│   │   ├── standard_deviation.R
│   │   └── summary.R
│   └── 3-DataVisualization
│   │   ├── 1-Univariate
│   │   ├── barplot.R
│   │   ├── boxplot.R
│   │   ├── density_plot.R
│   │   ├── histogram.R
│   │   └── missing_plot.R
│   │   ├── 2-Multivaraite
│   │   ├── boxplot_by_class.R
│   │   ├── correlation_plot.R
│   │   ├── density_plot_by_class.R
│   │   ├── scatterplot_matrix.R
│   │   └── scatterplot_matrix_by_class.R
│   │   └── 3-Projection
│   │   ├── andrews_curves.R
│   │   ├── parallel_coordinates.R
│   │   ├── pca.R
│   │   ├── sammons.R
│   │   └── som.R
├── 2-PrepareData
│   ├── .DS_Store
│   ├── 1-DataCleaning
│   │   ├── impute_missing_values.R
│   │   ├── mark_missing_values.R
│   │   ├── rebalance_SMOTE.R
│   │   ├── remove_duplicates.R
│   │   ├── remove_na.R
│   │   └── remove_outliers.R
│   ├── 2-FeatureSelection
│   │   ├── rank_features_by_importance_caret.R
│   │   ├── recursive_feature_elimination_caret.R
│   │   └── remove_highly_correlated_features_caret.R
│   └── 3-TransformData
│   │   ├── boxcox_transform.R
│   │   ├── center.R
│   │   ├── ica_transform.R
│   │   ├── normalize.R
│   │   ├── pca_transform.R
│   │   ├── scale.R
│   │   ├── standardize.R
│   │   └── yeojohnson_transform.R
├── 3-Algorithms
│   ├── .DS_Store
│   ├── 1-Algorithms
│   │   ├── .DS_Store
│   │   ├── 1-LinearRegression
│   │   │   ├── ordinary_least_squares_regression.R
│   │   │   ├── partial_least_squares_regression.R
│   │   │   ├── principal_component_regression.R
│   │   │   └── stepwise_linear_regression.R
│   │   ├── 2-PenalizedLinearRegression
│   │   │   ├── LASSO.R
│   │   │   ├── elastic_net.R
│   │   │   └── ridge_regression.R
│   │   ├── 3-NonLinearRegression
│   │   │   ├── M5P.R
│   │   │   ├── M5Rules.R
│   │   │   ├── bagging_CART.R
│   │   │   ├── classification_and_regression_trees.R
│   │   │   ├── conditional_decision_trees.R
│   │   │   ├── cubist.R
│   │   │   ├── feed_forward_neural_network.R
│   │   │   ├── gradient_boosted_machine.R
│   │   │   ├── k-nearest_neighbor.R
│   │   │   ├── multivariate_adaptive_regression_splines.R
│   │   │   ├── random_forest.R
│   │   │   └── support_vector_machine.R
│   │   ├── 4-LinearClassification
│   │   │   ├── linear_discriminant_analysis.R
│   │   │   ├── logistic_regression.R
│   │   │   ├── logistic_regression_multiclass.R
│   │   │   └── partial_least_squares_discriminant_analysis.R
│   │   ├── 5-NonLinearClassiication
│   │   │   ├── C4.5.R
│   │   │   ├── C5.0.R
│   │   │   ├── PART.R
│   │   │   ├── bagging_CART.R
│   │   │   ├── classification_and_regression_trees.R
│   │   │   ├── feed_forward_neural_network.R
│   │   │   ├── flexible_discriminant_analysis.R
│   │   │   ├── gradient_boosted_machine.R
│   │   │   ├── k-nearest_neighbors.R
│   │   │   ├── mixture_discriminant_analysis.R
│   │   │   ├── naive_bayes.R
│   │   │   ├── quadratic_discriminant_analysis.R
│   │   │   ├── random_forest.R
│   │   │   ├── regularized_discriminant_analysis.R
│   │   │   └── support_vector_machine.R
│   │   └── 6-Optimization
│   │   │   ├── bfgs.R
│   │   │   ├── conjugate_gradient.R
│   │   │   ├── golden_section_search.R
│   │   │   ├── gradient_descent.R
│   │   │   └── nelder_mead.R
│   ├── 2-CaretAlgorithms
│   │   ├── binary_classification_algorithms.R
│   │   └── regression_algorithms.R
│   └── algorithm_spot_check.R
├── 4-EvaluateAlgorithms
│   ├── .DS_Store
│   ├── 1-ResamplingMethods
│   │   ├── bootstrap.R
│   │   ├── data_split.R
│   │   ├── kfold_cross_validation.R
│   │   ├── leave_one_out_cross_validation.R
│   │   └── repeated_kfold_cross_validation.R
│   ├── 2-Metrics
│   │   ├── Accuracy.R
│   │   ├── Kappa.R
│   │   ├── LogLoss.R
│   │   ├── RMSE.R
│   │   ├── ROC.R
│   │   └── RSquared.R
│   └── 3-ModelSelection
│   │   ├── .DS_Store
│   │   ├── compare_boxplots.R
│   │   ├── compare_densityplot.R
│   │   ├── compare_dotplot.R
│   │   ├── compare_parallelplot.R
│   │   ├── compare_scatterplot_matrix.R
│   │   ├── compare_summary.R
│   │   ├── compare_xyplot.R
│   │   └── significant_difference.R
├── 5-ImproveResults
│   ├── .DS_Store
│   ├── 1-TuneAlgorithms
│   │   ├── automatic_grid_search.R
│   │   ├── custom_search.R
│   │   ├── manual_grid_search.R
│   │   ├── manual_search.R
│   │   ├── optimal_parameters.R
│   │   └── random_search.R
│   └── 2-Ensembles
│   │   ├── bagging.R
│   │   ├── blending.R
│   │   └── stacking.R
├── 6-FinalizeModel
│   ├── .DS_Store
│   ├── 1-Predict
│   │   ├── predict_caret.R
│   │   └── train_all_dataset.R
│   ├── 2-FinalModel
│   │   └── standalone_model.R
│   └── 3-SaveLoadModel
│   │   └── save_load_model.R
├── 7-Other
│   ├── install_list_of_packages.R
│   ├── install_package_with_dependencies.R
│   └── r_crash_course.R
├── 8-CaseStudies
│   ├── .DS_Store
│   ├── BinaryClassification
│   │   ├── breast_cancer.R
│   │   ├── diabetes.R
│   │   ├── diabetes_spot_check.R
│   │   ├── ionosphere.R
│   │   ├── ionosphere_ensemble.R
│   │   ├── sonar.R
│   │   └── sonar_tuning.R
│   ├── MultiClassClassification
│   │   ├── .DS_Store
│   │   ├── glass.R
│   │   ├── iris.R
│   │   └── soybean.R
│   ├── Regression
│   │   ├── abalone.R
│   │   ├── boston.R
│   │   └── longley.R
│   └── project_template.R
└── README.txt
├── ml_with_python_code
├── 02_scipy_versions.py
├── 02_sklearn_version.py
├── 03_matplotlib_crash_course.py
├── 03_numpy_crash_course.py
├── 03_pandas_crash_course.py
├── 03_python_crash_course.py
├── 04_load_csv.py
├── 04_load_csv_np.py
├── 04_load_csv_np_url.py
├── 04_load_csv_pandas.py
├── 04_load_csv_pandas_url.py
├── 05_class_distribution.py
├── 05_data_types.py
├── 05_describe.py
├── 05_dimensions.py
├── 05_head.py
├── 05_pearson_correlation.py
├── 05_skew.py
├── 06_boxplot.py
├── 06_correlation_matrix.py
├── 06_correlation_matrix_generic.py
├── 06_density_plots.py
├── 06_histograms.py
├── 06_scatterplot_matrix.py
├── 07_binarization.py
├── 07_normalize_data.py
├── 07_rescale_data.py
├── 07_standardize_data.py
├── 08_feature_importance.py
├── 08_pca.py
├── 08_recursive_feature_elimination.py
├── 08_univariate_selection.py
├── 09_cross_validation.py
├── 09_loocv.py
├── 09_shuffle_split.py
├── 09_train_test.py
├── 10_classification_accuracy.py
├── 10_classification_auc.py
├── 10_classification_confusion_matrix.py
├── 10_classification_logloss.py
├── 10_classification_report.py
├── 10_regression_mae.py
├── 10_regression_mse.py
├── 10_regression_rsquared.py
├── 11_classification_and_regression_trees_classification.py
├── 11_gaussian_naive_bayes.py
├── 11_k_nearest_neighbors_classification.py
├── 11_linear_discriminant_analysis.py
├── 11_logistic_regression.py
├── 11_support_vector_machines_classification.py
├── 12_classification_and_regression_trees_regression.py
├── 12_elastic_net.py
├── 12_k_nearest_neighbors_regression.py
├── 12_lasso_regression.py
├── 12_linear_regression.py
├── 12_ridge_regression.py
├── 12_support_vector_machines_regression.py
├── 13_race_algorithms.py
├── 14_feature_union_model_pipeline.py
├── 14_standardize_model_pipeline.py
├── 15_adaboost_classification.py
├── 15_bagged_cart_classification.py
├── 15_extra_trees_classification.py
├── 15_gradient_boosting_classification.py
├── 15_random_forest_classification.py
├── 15_voting_ensemble_classification.py
├── 16_grid_search.py
├── 16_random_search.py
├── 17_save_model_joblib.py
├── 17_save_model_pickel.py
├── 18_project_template.py
├── 19_project_classification_iris.py
├── 20_project_regression_boston.py
├── 21_project_classification_sonar.py
├── housing.csv
├── iris.data.csv
├── pima-indians-diabetes.data.csv
└── sonar.all-data.csv
└── xgboost_with_python_code
├── 04_first_model.py
├── 05_breast_one_hot.py
├── 05_horse_colic_missing.py
├── 05_horse_colic_missing_imputer.py
├── 05_iris_label_encode.py
├── 06_cross_validation.py
├── 06_stratified_cross_validation.py
├── 06_train_test_split.py
├── 07_plot_tree-left-to-right.py
├── 07_plot_tree.py
├── 08_serialize_with_joblib.py
├── 08_serialize_with_pickle.py
├── 09_automatic_feature_importance.py
├── 09_feature_selection.py
├── 09_manual_feature_importance.py
├── 10_early_stopping.py
├── 10_evaluate_validation_set.py
├── 10_learning_curves.py
├── 11_eval_num_threads.py
├── 11_eval_parallel_cv_and_xgboost.py
├── 12_check_num_threads.py
├── 14_tune_depth.py
├── 14_tune_num_trees_and_depth.py
├── 14_tune_trees.py
├── 15_plot_performance.py
├── 15_tune_learning_rate.py
├── 15_tune_learning_rate_and_num_trees.py
├── 16_tune_column_sample_rate_bytree.py
├── 16_tune_column_sample_rate_split.py
├── 16_tune_row_sample_rate.py
├── datasets-uci-breast-cancer.csv
├── horse-colic.csv
├── iris.csv
└── pima-indians-diabetes.csv

/README.md:
--------------------------------------------------------------------------------
1 | # ML-mastery
2 | Code from Jason Brownlee's course on mastering machine learning
3 | 
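The code is organised into four top-level directories: `deep_learning_with_python_code` (Keras), `machine_learning_mastery_with_r_code` (R and caret), `ml_with_python_code` (scikit-learn and pandas), and `xgboost_with_python_code` (XGBoost). Most scripts are self-contained and load their dataset (for example `pima-indians-diabetes.csv`) from the same directory, so run each script from inside its own folder. The Keras scripts were written against the Keras 1.x argument names (`init=`, `nb_epoch=`); below is a minimal, untested sketch of the first MLP (`07_first_mlp.py`) with the equivalent Keras 2-style names (`kernel_initializer=`, `epochs=`), assuming Keras and NumPy are installed and the bundled CSV is in the working directory:

```python
# Keras 2-style sketch of deep_learning_with_python_code/07_first_mlp.py
# (the script in this repository uses the older init=/nb_epoch= names).
from keras.models import Sequential
from keras.layers import Dense
import numpy

# fix random seed for reproducibility
numpy.random.seed(7)
# load the pima indians dataset bundled with the repository
dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",")
X = dataset[:, 0:8]
Y = dataset[:, 8]
# define and compile the MLP
model = Sequential()
model.add(Dense(12, input_dim=8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# train and report accuracy on the training data
model.fit(X, Y, epochs=150, batch_size=10)
scores = model.evaluate(X, Y)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
```

The same two renames apply to the other Keras scripts as well.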
-------------------------------------------------------------------------------- /deep_learning_with_python_code/02_theano_example.py: -------------------------------------------------------------------------------- 1 | # Example of Theano library 2 | import theano 3 | from theano import tensor 4 | # declare two symbolic floating-point scalars 5 | a = tensor.dscalar() 6 | b = tensor.dscalar() 7 | # create a simple symbolic expression 8 | c = a + b 9 | # convert the expression into a callable object that takes (a,b) and computes c 10 | f = theano.function([a,b], c) 11 | # bind 1.5 to 'a', 2.5 to 'b', and evaluate 'c' 12 | result = f(1.5, 2.5) 13 | print(result) -------------------------------------------------------------------------------- /deep_learning_with_python_code/03_tensorflow_example.py: -------------------------------------------------------------------------------- 1 | # Example of TensorFlow library 2 | import tensorflow as tf 3 | # declare two symbolic floating-point scalars 4 | a = tf.placeholder(tf.float32) 5 | b = tf.placeholder(tf.float32) 6 | # create a simple symbolic expression using the add function 7 | add = tf.add(a, b) 8 | # bind 1.5 to 'a', 2.5 to 'b', and evaluate 'c' 9 | sess = tf.Session() 10 | binding = {a: 1.5, b: 2.5} 11 | c = sess.run(add, feed_dict=binding) 12 | print(c) -------------------------------------------------------------------------------- /deep_learning_with_python_code/07_first_mlp.py: -------------------------------------------------------------------------------- 1 | # Create your first MLP in Keras 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | import numpy 5 | # fix random seed for reproducibility 6 | seed = 7 7 | numpy.random.seed(seed) 8 | # load pima indians dataset 9 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 10 | # split into input (X) and output (Y) variables 11 | X = dataset[:,0:8] 12 | Y = dataset[:,8] 13 | # create model 14 | model = Sequential() 15 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 16 | model.add(Dense(8, init='uniform', activation='relu')) 17 | model.add(Dense(1, init='uniform', activation='sigmoid')) 18 | # Compile model 19 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 20 | # Fit the model 21 | model.fit(X, Y, nb_epoch=150, batch_size=10) 22 | # evaluate the model 23 | scores = model.evaluate(X, Y) 24 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) -------------------------------------------------------------------------------- /deep_learning_with_python_code/08_automatic_split.py: -------------------------------------------------------------------------------- 1 | # MLP with automatic validation set 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | import numpy 5 | # fix random seed for reproducibility 6 | seed = 7 7 | numpy.random.seed(seed) 8 | # load pima indians dataset 9 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 10 | # split into input (X) and output (Y) variables 11 | X = dataset[:,0:8] 12 | Y = dataset[:,8] 13 | # create model 14 | model = Sequential() 15 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 16 | model.add(Dense(8, init='uniform', activation='relu')) 17 | model.add(Dense(1, init='uniform', activation='sigmoid')) 18 | # Compile model 19 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 20 | # Fit the model 21 | model.fit(X, Y, validation_split=0.33, 
nb_epoch=150, batch_size=10) -------------------------------------------------------------------------------- /deep_learning_with_python_code/08_manual_cross_validation.py: -------------------------------------------------------------------------------- 1 | # MLP for Pima Indians Dataset with 10-fold cross validation 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | from sklearn.model_selection import StratifiedKFold 5 | import numpy 6 | # fix random seed for reproducibility 7 | seed = 7 8 | numpy.random.seed(seed) 9 | # load pima indians dataset 10 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 11 | # split into input (X) and output (Y) variables 12 | X = dataset[:,0:8] 13 | Y = dataset[:,8] 14 | # define 10-fold cross validation test harness 15 | kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed) 16 | cvscores = [] 17 | for train, test in kfold.split(X, Y): 18 | # create model 19 | model = Sequential() 20 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 21 | model.add(Dense(8, init='uniform', activation='relu')) 22 | model.add(Dense(1, init='uniform', activation='sigmoid')) 23 | # Compile model 24 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 25 | # Fit the model 26 | model.fit(X[train], Y[train], nb_epoch=150, batch_size=10, verbose=0) 27 | # evaluate the model 28 | scores = model.evaluate(X[test], Y[test], verbose=0) 29 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 30 | cvscores.append(scores[1] * 100) 31 | 32 | print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores))) 33 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/08_manual_split.py: -------------------------------------------------------------------------------- 1 | # MLP with manual validation set 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | from sklearn.model_selection import train_test_split 5 | import numpy 6 | # fix random seed for reproducibility 7 | seed = 7 8 | numpy.random.seed(seed) 9 | # load pima indians dataset 10 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 11 | # split into input (X) and output (Y) variables 12 | X = dataset[:,0:8] 13 | Y = dataset[:,8] 14 | # split into 67% for train and 33% for test 15 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=seed) 16 | # create model 17 | model = Sequential() 18 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 19 | model.add(Dense(8, init='uniform', activation='relu')) 20 | model.add(Dense(1, init='uniform', activation='sigmoid')) 21 | # Compile model 22 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 23 | # Fit the model 24 | model.fit(X_train, y_train, validation_data=(X_test,y_test), nb_epoch=150, batch_size=10) -------------------------------------------------------------------------------- /deep_learning_with_python_code/09_sklearn_cross_validation.py: -------------------------------------------------------------------------------- 1 | # MLP for Pima Indians Dataset with 10-fold cross validation via sklearn 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | from keras.wrappers.scikit_learn import KerasClassifier 5 | from sklearn.model_selection import StratifiedKFold 6 | from sklearn.model_selection import cross_val_score 7 | import numpy 8 | 9 | # 
Function to create model, required for KerasClassifier 10 | def create_model(): 11 | # create model 12 | model = Sequential() 13 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 14 | model.add(Dense(8, init='uniform', activation='relu')) 15 | model.add(Dense(1, init='uniform', activation='sigmoid')) 16 | # Compile model 17 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 18 | return model 19 | 20 | # fix random seed for reproducibility 21 | seed = 7 22 | numpy.random.seed(seed) 23 | # load pima indians dataset 24 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 25 | # split into input (X) and output (Y) variables 26 | X = dataset[:,0:8] 27 | Y = dataset[:,8] 28 | # create model 29 | model = KerasClassifier(build_fn=create_model, nb_epoch=150, batch_size=10, verbose=0) 30 | # evaluate using 10-fold cross validation 31 | kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed) 32 | results = cross_val_score(model, X, Y, cv=kfold) 33 | print(results.mean()) 34 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/10_iris_example.py: -------------------------------------------------------------------------------- 1 | # Multiclass Classification with the Iris Flowers Dataset 2 | import numpy 3 | import pandas 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.wrappers.scikit_learn import KerasClassifier 7 | from keras.utils import np_utils 8 | from sklearn.model_selection import cross_val_score 9 | from sklearn.model_selection import KFold 10 | from sklearn.preprocessing import LabelEncoder 11 | from sklearn.pipeline import Pipeline 12 | # fix random seed for reproducibility 13 | seed = 7 14 | numpy.random.seed(seed) 15 | # load dataset 16 | dataframe = pandas.read_csv("iris.csv", header=None) 17 | dataset = dataframe.values 18 | X = dataset[:,0:4].astype(float) 19 | Y = dataset[:,4] 20 | # encode class values as integers 21 | encoder = LabelEncoder() 22 | encoder.fit(Y) 23 | encoded_Y = encoder.transform(Y) 24 | # convert integers to dummy variables (i.e. 
one hot encoded) 25 | dummy_y = np_utils.to_categorical(encoded_Y) 26 | # define baseline model 27 | def baseline_model(): 28 | # create model 29 | model = Sequential() 30 | model.add(Dense(4, input_dim=4, init='normal', activation='relu')) 31 | model.add(Dense(3, init='normal', activation='sigmoid')) 32 | # Compile model 33 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 34 | return model 35 | estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=200, batch_size=5, verbose=0) 36 | kfold = KFold(n_splits=10, shuffle=True, random_state=seed) 37 | results = cross_val_score(estimator, X, dummy_y, cv=kfold) 38 | print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100)) 39 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/11_sonar_baseline.py: -------------------------------------------------------------------------------- 1 | # Binary Classification with Sonar Dataset: Baseline 2 | import numpy 3 | import pandas 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.wrappers.scikit_learn import KerasClassifier 7 | from sklearn.model_selection import cross_val_score 8 | from sklearn.preprocessing import LabelEncoder 9 | from sklearn.model_selection import StratifiedKFold 10 | from sklearn.preprocessing import StandardScaler 11 | from sklearn.pipeline import Pipeline 12 | # fix random seed for reproducibility 13 | seed = 7 14 | numpy.random.seed(seed) 15 | # load dataset 16 | dataframe = pandas.read_csv("sonar.csv", header=None) 17 | dataset = dataframe.values 18 | # split into input (X) and output (Y) variables 19 | X = dataset[:,0:60].astype(float) 20 | Y = dataset[:,60] 21 | # encode class values as integers 22 | encoder = LabelEncoder() 23 | encoder.fit(Y) 24 | encoded_Y = encoder.transform(Y) 25 | # baseline model 26 | def create_baseline(): 27 | # create model 28 | model = Sequential() 29 | model.add(Dense(60, input_dim=60, init='normal', activation='relu')) 30 | model.add(Dense(1, init='normal', activation='sigmoid')) 31 | # Compile model 32 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 33 | return model 34 | # evaluate model with standardized dataset 35 | estimator = KerasClassifier(build_fn=create_baseline, nb_epoch=100, batch_size=5, verbose=0) 36 | kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed) 37 | results = cross_val_score(estimator, X, encoded_Y, cv=kfold) 38 | print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100)) 39 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/12_boston_baseline.py: -------------------------------------------------------------------------------- 1 | # Regression Example With Boston Dataset: Baseline 2 | import numpy 3 | import pandas 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.wrappers.scikit_learn import KerasRegressor 7 | from sklearn.model_selection import cross_val_score 8 | from sklearn.model_selection import KFold 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.pipeline import Pipeline 11 | # load dataset 12 | dataframe = pandas.read_csv("housing.csv", delim_whitespace=True, header=None) 13 | dataset = dataframe.values 14 | # split into input (X) and output (Y) variables 15 | X = dataset[:,0:13] 16 | Y = dataset[:,13] 17 | # define base model 18 | def baseline_model(): 19 | 
# create model 20 | model = Sequential() 21 | model.add(Dense(13, input_dim=13, init='normal', activation='relu')) 22 | model.add(Dense(1, init='normal')) 23 | # Compile model 24 | model.compile(loss='mean_squared_error', optimizer='adam') 25 | return model 26 | # fix random seed for reproducibility 27 | seed = 7 28 | numpy.random.seed(seed) 29 | # evaluate model with standardized dataset 30 | estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=100, batch_size=5, verbose=0) 31 | kfold = KFold(n_splits=10, random_state=seed) 32 | results = cross_val_score(estimator, X, Y, cv=kfold) 33 | print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std())) 34 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/12_boston_standardized.py: -------------------------------------------------------------------------------- 1 | # Regression Example With Boston Dataset: Standardized 2 | import numpy 3 | import pandas 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.wrappers.scikit_learn import KerasRegressor 7 | from sklearn.model_selection import cross_val_score 8 | from sklearn.model_selection import KFold 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.pipeline import Pipeline 11 | # load dataset 12 | dataframe = pandas.read_csv("housing.csv", delim_whitespace=True, header=None) 13 | dataset = dataframe.values 14 | # split into input (X) and output (Y) variables 15 | X = dataset[:,0:13] 16 | Y = dataset[:,13] 17 | # define base model 18 | def baseline_model(): 19 | # create model 20 | model = Sequential() 21 | model.add(Dense(13, input_dim=13, init='normal', activation='relu')) 22 | model.add(Dense(1, init='normal')) 23 | # Compile model 24 | model.compile(loss='mean_squared_error', optimizer='adam') 25 | return model 26 | # fix random seed for reproducibility 27 | seed = 7 28 | numpy.random.seed(seed) 29 | # evaluate model with standardized dataset 30 | estimators = [] 31 | estimators.append(('standardize', StandardScaler())) 32 | estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, nb_epoch=50, batch_size=5, verbose=0))) 33 | pipeline = Pipeline(estimators) 34 | kfold = KFold(n_splits=10, random_state=seed) 35 | results = cross_val_score(pipeline, X, Y, cv=kfold) 36 | print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std())) 37 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/12_boston_standardized_larger.py: -------------------------------------------------------------------------------- 1 | # Regression Example With Boston Dataset: Standardized and Larger 2 | import numpy 3 | import pandas 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.wrappers.scikit_learn import KerasRegressor 7 | from sklearn.model_selection import cross_val_score 8 | from sklearn.model_selection import KFold 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.pipeline import Pipeline 11 | # load dataset 12 | dataframe = pandas.read_csv("housing.csv", delim_whitespace=True, header=None) 13 | dataset = dataframe.values 14 | # split into input (X) and output (Y) variables 15 | X = dataset[:,0:13] 16 | Y = dataset[:,13] 17 | # define the model 18 | def larger_model(): 19 | # create model 20 | model = Sequential() 21 | model.add(Dense(13, input_dim=13, init='normal', activation='relu')) 22 | model.add(Dense(6, init='normal', 
activation='relu')) 23 | model.add(Dense(1, init='normal')) 24 | # Compile model 25 | model.compile(loss='mean_squared_error', optimizer='adam') 26 | return model 27 | # fix random seed for reproducibility 28 | seed = 7 29 | numpy.random.seed(seed) 30 | # evaluate model with standardized dataset 31 | estimators = [] 32 | estimators.append(('standardize', StandardScaler())) 33 | estimators.append(('mlp', KerasRegressor(build_fn=larger_model, nb_epoch=50, batch_size=5, verbose=0))) 34 | pipeline = Pipeline(estimators) 35 | kfold = KFold(n_splits=10, random_state=seed) 36 | results = cross_val_score(pipeline, X, Y, cv=kfold) 37 | print("Larger: %.2f (%.2f) MSE" % (results.mean(), results.std())) 38 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/12_boston_standardized_wider.py: -------------------------------------------------------------------------------- 1 | # Regression Example With Boston Dataset: Standardized and Wider 2 | import numpy 3 | import pandas 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.wrappers.scikit_learn import KerasRegressor 7 | from sklearn.model_selection import cross_val_score 8 | from sklearn.model_selection import KFold 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.pipeline import Pipeline 11 | # load dataset 12 | dataframe = pandas.read_csv("housing.csv", delim_whitespace=True, header=None) 13 | dataset = dataframe.values 14 | # split into input (X) and output (Y) variables 15 | X = dataset[:,0:13] 16 | Y = dataset[:,13] 17 | # define wider model 18 | def wider_model(): 19 | # create model 20 | model = Sequential() 21 | model.add(Dense(20, input_dim=13, init='normal', activation='relu')) 22 | model.add(Dense(1, init='normal')) 23 | # Compile model 24 | model.compile(loss='mean_squared_error', optimizer='adam') 25 | return model 26 | # fix random seed for reproducibility 27 | seed = 7 28 | numpy.random.seed(seed) 29 | # evaluate model with standardized dataset 30 | estimators = [] 31 | estimators.append(('standardize', StandardScaler())) 32 | estimators.append(('mlp', KerasRegressor(build_fn=wider_model, nb_epoch=100, batch_size=5, verbose=0))) 33 | pipeline = Pipeline(estimators) 34 | kfold = KFold(n_splits=10, random_state=seed) 35 | results = cross_val_score(pipeline, X, Y, cv=kfold) 36 | print("Wider: %.2f (%.2f) MSE" % (results.mean(), results.std())) 37 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/14_checkpoint_best_model.py: -------------------------------------------------------------------------------- 1 | # Checkpoint the weights for best model on validation accuracy 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | from keras.callbacks import ModelCheckpoint 5 | import matplotlib.pyplot as plt 6 | import numpy 7 | # fix random seed for reproducibility 8 | seed = 7 9 | numpy.random.seed(seed) 10 | # load pima indians dataset 11 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 12 | # split into input (X) and output (Y) variables 13 | X = dataset[:,0:8] 14 | Y = dataset[:,8] 15 | # create model 16 | model = Sequential() 17 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 18 | model.add(Dense(8, init='uniform', activation='relu')) 19 | model.add(Dense(1, init='uniform', activation='sigmoid')) 20 | # Compile model 21 | model.compile(loss='binary_crossentropy', optimizer='adam', 
metrics=['accuracy']) 22 | # checkpoint 23 | filepath="weights.best.hdf5" 24 | checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max') 25 | callbacks_list = [checkpoint] 26 | # Fit the model 27 | model.fit(X, Y, validation_split=0.33, nb_epoch=150, batch_size=10, callbacks=callbacks_list, verbose=0) 28 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/14_checkpoint_load.py: -------------------------------------------------------------------------------- 1 | # How to load and use weights from a checkpoint 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | from keras.callbacks import ModelCheckpoint 5 | import matplotlib.pyplot as plt 6 | import numpy 7 | # fix random seed for reproducibility 8 | seed = 7 9 | numpy.random.seed(seed) 10 | # create model 11 | model = Sequential() 12 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 13 | model.add(Dense(8, init='uniform', activation='relu')) 14 | model.add(Dense(1, init='uniform', activation='sigmoid')) 15 | # load weights 16 | model.load_weights("weights.best.hdf5") 17 | # Compile model (required to make predictions) 18 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 19 | print("Created model and loaded weights from file") 20 | # load pima indians dataset 21 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 22 | # split into input (X) and output (Y) variables 23 | X = dataset[:,0:8] 24 | Y = dataset[:,8] 25 | # estimate accuracy on whole dataset using loaded weights 26 | scores = model.evaluate(X, Y, verbose=0) 27 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 28 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/14_checkpoint_model_improvements.py: -------------------------------------------------------------------------------- 1 | # Checkpoint the weights when validation accuracy improves 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | from keras.callbacks import ModelCheckpoint 5 | import matplotlib.pyplot as plt 6 | import numpy 7 | # fix random seed for reproducibility 8 | seed = 7 9 | numpy.random.seed(seed) 10 | # load pima indians dataset 11 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 12 | # split into input (X) and output (Y) variables 13 | X = dataset[:,0:8] 14 | Y = dataset[:,8] 15 | # create model 16 | model = Sequential() 17 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 18 | model.add(Dense(8, init='uniform', activation='relu')) 19 | model.add(Dense(1, init='uniform', activation='sigmoid')) 20 | # Compile model 21 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 22 | # checkpoint 23 | filepath="weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5" 24 | checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max') 25 | callbacks_list = [checkpoint] 26 | # Fit the model 27 | model.fit(X, Y, validation_split=0.33, nb_epoch=150, batch_size=10, callbacks=callbacks_list, verbose=0) 28 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/15_plot_history.py: -------------------------------------------------------------------------------- 1 | # Visualize training history 2 | from keras.models import Sequential 3 | from keras.layers 
import Dense 4 | import matplotlib.pyplot as plt 5 | import numpy 6 | # fix random seed for reproducibility 7 | seed = 7 8 | numpy.random.seed(seed) 9 | # load pima indians dataset 10 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 11 | # split into input (X) and output (Y) variables 12 | X = dataset[:,0:8] 13 | Y = dataset[:,8] 14 | # create model 15 | model = Sequential() 16 | model.add(Dense(12, input_dim=8, init='uniform', activation='relu')) 17 | model.add(Dense(8, init='uniform', activation='relu')) 18 | model.add(Dense(1, init='uniform', activation='sigmoid')) 19 | # Compile model 20 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 21 | # Fit the model 22 | history = model.fit(X, Y, validation_split=0.33, nb_epoch=150, batch_size=10, verbose=0) 23 | # list all data in history 24 | print(history.history.keys()) 25 | # summarize history for accuracy 26 | plt.plot(history.history['acc']) 27 | plt.plot(history.history['val_acc']) 28 | plt.title('model accuracy') 29 | plt.ylabel('accuracy') 30 | plt.xlabel('epoch') 31 | plt.legend(['train', 'test'], loc='upper left') 32 | plt.show() 33 | # summarize history for loss 34 | plt.plot(history.history['loss']) 35 | plt.plot(history.history['val_loss']) 36 | plt.title('model loss') 37 | plt.ylabel('loss') 38 | plt.xlabel('epoch') 39 | plt.legend(['train', 'test'], loc='upper left') 40 | plt.show() 41 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/17_decay_drop_based.py: -------------------------------------------------------------------------------- 1 | # Drop-Based Learning Rate Decay 2 | import pandas 3 | import pandas 4 | import numpy 5 | import math 6 | from keras.models import Sequential 7 | from keras.layers import Dense 8 | from keras.optimizers import SGD 9 | from sklearn.preprocessing import LabelEncoder 10 | from keras.callbacks import LearningRateScheduler 11 | 12 | # learning rate schedule 13 | def step_decay(epoch): 14 | initial_lrate = 0.1 15 | drop = 0.5 16 | epochs_drop = 10.0 17 | lrate = initial_lrate * math.pow(drop, math.floor((1+epoch)/epochs_drop)) 18 | return lrate 19 | 20 | # fix random seed for reproducibility 21 | seed = 7 22 | numpy.random.seed(seed) 23 | # load dataset 24 | dataframe = pandas.read_csv("ionosphere.csv", header=None) 25 | dataset = dataframe.values 26 | # split into input (X) and output (Y) variables 27 | X = dataset[:,0:34].astype(float) 28 | Y = dataset[:,34] 29 | # encode class values as integers 30 | encoder = LabelEncoder() 31 | encoder.fit(Y) 32 | Y = encoder.transform(Y) 33 | # create model 34 | model = Sequential() 35 | model.add(Dense(34, input_dim=34, init='normal', activation='relu')) 36 | model.add(Dense(1, init='normal', activation='sigmoid')) 37 | # Compile model 38 | sgd = SGD(lr=0.0, momentum=0.9, decay=0.0, nesterov=False) 39 | model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy']) 40 | # learning schedule callback 41 | lrate = LearningRateScheduler(step_decay) 42 | callbacks_list = [lrate] 43 | # Fit the model 44 | model.fit(X, Y, validation_split=0.33, nb_epoch=50, batch_size=28, callbacks=callbacks_list, verbose=2) 45 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/17_decay_time_based.py: -------------------------------------------------------------------------------- 1 | # Time Based Learning Rate Decay 2 | import pandas 3 | import numpy 4 | from keras.models import 
Sequential 5 | from keras.layers import Dense 6 | from keras.optimizers import SGD 7 | from sklearn.preprocessing import LabelEncoder 8 | # fix random seed for reproducibility 9 | seed = 7 10 | numpy.random.seed(seed) 11 | # load dataset 12 | dataframe = pandas.read_csv("ionosphere.csv", header=None) 13 | dataset = dataframe.values 14 | # split into input (X) and output (Y) variables 15 | X = dataset[:,0:34].astype(float) 16 | Y = dataset[:,34] 17 | # encode class values as integers 18 | encoder = LabelEncoder() 19 | encoder.fit(Y) 20 | Y = encoder.transform(Y) 21 | # create model 22 | model = Sequential() 23 | model.add(Dense(34, input_dim=34, init='normal', activation='relu')) 24 | model.add(Dense(1, init='normal', activation='sigmoid')) 25 | # Compile model 26 | epochs = 50 27 | learning_rate = 0.1 28 | decay_rate = learning_rate / epochs 29 | momentum = 0.8 30 | sgd = SGD(lr=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False) 31 | model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy']) 32 | # Fit the model 33 | model.fit(X, Y, validation_split=0.33, nb_epoch=epochs, batch_size=28, verbose=2) 34 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/19_mnist_mlp_baseline.py: -------------------------------------------------------------------------------- 1 | # Baseline MLP for MNIST dataset 2 | import numpy 3 | from keras.datasets import mnist 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import Dropout 7 | from keras.utils import np_utils 8 | # fix random seed for reproducibility 9 | seed = 7 10 | numpy.random.seed(seed) 11 | # load data 12 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 13 | # flatten 28*28 images to a 784 vector for each image 14 | num_pixels = X_train.shape[1] * X_train.shape[2] 15 | X_train = X_train.reshape(X_train.shape[0], num_pixels).astype('float32') 16 | X_test = X_test.reshape(X_test.shape[0], num_pixels).astype('float32') 17 | # normalize inputs from 0-255 to 0-1 18 | X_train = X_train / 255 19 | X_test = X_test / 255 20 | # one hot encode outputs 21 | y_train = np_utils.to_categorical(y_train) 22 | y_test = np_utils.to_categorical(y_test) 23 | num_classes = y_test.shape[1] 24 | # define baseline model 25 | def baseline_model(): 26 | # create model 27 | model = Sequential() 28 | model.add(Dense(num_pixels, input_dim=num_pixels, init='normal', activation='relu')) 29 | model.add(Dense(num_classes, init='normal', activation='softmax')) 30 | # Compile model 31 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 32 | return model 33 | # build the model 34 | model = baseline_model() 35 | # Fit the model 36 | model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=10, batch_size=200, verbose=2) 37 | # Final evaluation of the model 38 | scores = model.evaluate(X_test, y_test, verbose=0) 39 | print("Baseline Error: %.2f%%" % (100-scores[1]*100)) 40 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/19_mnist_plot.py: -------------------------------------------------------------------------------- 1 | # Plot ad hoc mnist instances 2 | from keras.datasets import mnist 3 | import matplotlib.pyplot as plt 4 | # load (downloaded if needed) the MNIST dataset 5 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 6 | # plot 4 images as gray scale 7 | plt.subplot(221) 8 | plt.imshow(X_train[0], 
cmap=plt.get_cmap('gray')) 9 | plt.subplot(222) 10 | plt.imshow(X_train[1], cmap=plt.get_cmap('gray')) 11 | plt.subplot(223) 12 | plt.imshow(X_train[2], cmap=plt.get_cmap('gray')) 13 | plt.subplot(224) 14 | plt.imshow(X_train[3], cmap=plt.get_cmap('gray')) 15 | # show the plot 16 | plt.show() 17 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_baseline.py: -------------------------------------------------------------------------------- 1 | # Plot of images as baseline for comparison 2 | from keras.datasets import mnist 3 | from matplotlib import pyplot 4 | # load data 5 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 6 | # create a grid of 3x3 images 7 | for i in range(0, 9): 8 | pyplot.subplot(330 + 1 + i) 9 | pyplot.imshow(X_train[i], cmap=pyplot.get_cmap('gray')) 10 | 11 | # show the plot 12 | pyplot.show() -------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_feature_standardize.py: -------------------------------------------------------------------------------- 1 | # Standardize images across the dataset, mean=0, stdev=1 2 | from keras.datasets import mnist 3 | from keras.preprocessing.image import ImageDataGenerator 4 | from matplotlib import pyplot 5 | # load data 6 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 7 | # reshape to be [samples][pixels][width][height] 8 | X_train = X_train.reshape(X_train.shape[0], 1, 28, 28) 9 | X_test = X_test.reshape(X_test.shape[0], 1, 28, 28) 10 | # convert from int to float 11 | X_train = X_train.astype('float32') 12 | X_test = X_test.astype('float32') 13 | # define data preparation 14 | datagen = ImageDataGenerator(featurewise_center=True, featurewise_std_normalization=True) 15 | # fit parameters from data 16 | datagen.fit(X_train) 17 | # configure batch size and retrieve one batch of images 18 | for X_batch, y_batch in datagen.flow(X_train, y_train, batch_size=9): 19 | # create a grid of 3x3 images 20 | for i in range(0, 9): 21 | pyplot.subplot(330 + 1 + i) 22 | pyplot.imshow(X_batch[i].reshape(28, 28), cmap=pyplot.get_cmap('gray')) 23 | # show the plot 24 | pyplot.show() 25 | break 26 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_flips.py: -------------------------------------------------------------------------------- 1 | # Random Flips 2 | from keras.datasets import mnist 3 | from keras.preprocessing.image import ImageDataGenerator 4 | from matplotlib import pyplot 5 | # load data 6 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 7 | # reshape to be [samples][pixels][width][height] 8 | X_train = X_train.reshape(X_train.shape[0], 1, 28, 28) 9 | X_test = X_test.reshape(X_test.shape[0], 1, 28, 28) 10 | # convert from int to float 11 | X_train = X_train.astype('float32') 12 | X_test = X_test.astype('float32') 13 | # define data preparation 14 | datagen = ImageDataGenerator(horizontal_flip=True, vertical_flip=True) 15 | # fit parameters from data 16 | datagen.fit(X_train) 17 | # configure batch size and retrieve one batch of images 18 | for X_batch, y_batch in datagen.flow(X_train, y_train, batch_size=9): 19 | # create a grid of 3x3 images 20 | for i in range(0, 9): 21 | pyplot.subplot(330 + 1 + i) 22 | pyplot.imshow(X_batch[i].reshape(28, 28), cmap=pyplot.get_cmap('gray')) 23 | # show the plot 24 | pyplot.show() 25 | break 
-------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_rotations.py: -------------------------------------------------------------------------------- 1 | # Random Rotations 2 | from keras.datasets import mnist 3 | from keras.preprocessing.image import ImageDataGenerator 4 | from matplotlib import pyplot 5 | # load data 6 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 7 | # reshape to be [samples][pixels][width][height] 8 | X_train = X_train.reshape(X_train.shape[0], 1, 28, 28) 9 | X_test = X_test.reshape(X_test.shape[0], 1, 28, 28) 10 | # convert from int to float 11 | X_train = X_train.astype('float32') 12 | X_test = X_test.astype('float32') 13 | # define data preparation 14 | datagen = ImageDataGenerator(rotation_range=90) 15 | # fit parameters from data 16 | datagen.fit(X_train) 17 | # configure batch size and retrieve one batch of images 18 | for X_batch, y_batch in datagen.flow(X_train, y_train, batch_size=9): 19 | # create a grid of 3x3 images 20 | for i in range(0, 9): 21 | pyplot.subplot(330 + 1 + i) 22 | pyplot.imshow(X_batch[i].reshape(28, 28), cmap=pyplot.get_cmap('gray')) 23 | # show the plot 24 | pyplot.show() 25 | break -------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_save_to_file.py: -------------------------------------------------------------------------------- 1 | # Save augmented images to file 2 | from keras.datasets import mnist 3 | from keras.preprocessing.image import ImageDataGenerator 4 | from matplotlib import pyplot 5 | import os 6 | from keras import backend as K 7 | K.set_image_dim_ordering('th') 8 | # load data 9 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 10 | # reshape to be [samples][pixels][width][height] 11 | X_train = X_train.reshape(X_train.shape[0], 1, 28, 28) 12 | X_test = X_test.reshape(X_test.shape[0], 1, 28, 28) 13 | # convert from int to float 14 | X_train = X_train.astype('float32') 15 | X_test = X_test.astype('float32') 16 | # define data preparation 17 | datagen = ImageDataGenerator() 18 | # fit parameters from data 19 | datagen.fit(X_train) 20 | # configure batch size and retrieve one batch of images 21 | os.makedirs('images') 22 | for X_batch, y_batch in datagen.flow(X_train, y_train, batch_size=9, save_to_dir='images', save_prefix='aug', save_format='png'): 23 | # create a grid of 3x3 images 24 | for i in range(0, 9): 25 | pyplot.subplot(330 + 1 + i) 26 | pyplot.imshow(X_batch[i].reshape(28, 28), cmap=pyplot.get_cmap('gray')) 27 | # show the plot 28 | pyplot.show() 29 | break -------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_shifts.py: -------------------------------------------------------------------------------- 1 | # Random Shifts 2 | from keras.datasets import mnist 3 | from keras.preprocessing.image import ImageDataGenerator 4 | from matplotlib import pyplot 5 | # load data 6 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 7 | # reshape to be [samples][pixels][width][height] 8 | X_train = X_train.reshape(X_train.shape[0], 1, 28, 28) 9 | X_test = X_test.reshape(X_test.shape[0], 1, 28, 28) 10 | # convert from int to float 11 | X_train = X_train.astype('float32') 12 | X_test = X_test.astype('float32') 13 | # define data preparation 14 | shift = 0.2 15 | datagen = ImageDataGenerator(width_shift_range=shift, height_shift_range=shift) 16 | # fit parameters from data 17 | 
datagen.fit(X_train) 18 | # configure batch size and retrieve one batch of images 19 | for X_batch, y_batch in datagen.flow(X_train, y_train, batch_size=9): 20 | # create a grid of 3x3 images 21 | for i in range(0, 9): 22 | pyplot.subplot(330 + 1 + i) 23 | pyplot.imshow(X_batch[i].reshape(28, 28), cmap=pyplot.get_cmap('gray')) 24 | # show the plot 25 | pyplot.show() 26 | break -------------------------------------------------------------------------------- /deep_learning_with_python_code/20_augment_zca.py: -------------------------------------------------------------------------------- 1 | # ZCA whitening 2 | from keras.datasets import mnist 3 | from keras.preprocessing.image import ImageDataGenerator 4 | from matplotlib import pyplot 5 | # load data 6 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 7 | # reshape to be [samples][pixels][width][height] 8 | X_train = X_train.reshape(X_train.shape[0], 1, 28, 28) 9 | X_test = X_test.reshape(X_test.shape[0], 1, 28, 28) 10 | # convert from int to float 11 | X_train = X_train.astype('float32') 12 | X_test = X_test.astype('float32') 13 | # define data preparation 14 | datagen = ImageDataGenerator(zca_whitening=True) 15 | # fit parameters from data 16 | datagen.fit(X_train) 17 | # configure batch size and retrieve one batch of images 18 | for X_batch, y_batch in datagen.flow(X_train, y_train, batch_size=9): 19 | # create a grid of 3x3 images 20 | for i in range(0, 9): 21 | pyplot.subplot(330 + 1 + i) 22 | pyplot.imshow(X_batch[i].reshape(28, 28), cmap=pyplot.get_cmap('gray')) 23 | # show the plot 24 | pyplot.show() 25 | break 26 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/21_cifar10_plot.py: -------------------------------------------------------------------------------- 1 | # Plot ad hoc CIFAR10 instances 2 | from keras.datasets import cifar10 3 | from matplotlib import pyplot 4 | from scipy.misc import toimage 5 | # load data 6 | (X_train, y_train), (X_test, y_test) = cifar10.load_data() 7 | # create a grid of 3x3 images 8 | for i in range(0, 9): 9 | pyplot.subplot(330 + 1 + i) 10 | pyplot.imshow(toimage(X_train[i])) 11 | # show the plot 12 | pyplot.show() -------------------------------------------------------------------------------- /deep_learning_with_python_code/22_imdb_cnn.py: -------------------------------------------------------------------------------- 1 | # CNN for the IMDB problem 2 | import numpy 3 | from keras.datasets import imdb 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import Flatten 7 | from keras.layers.convolutional import Convolution1D 8 | from keras.layers.convolutional import MaxPooling1D 9 | from keras.layers.embeddings import Embedding 10 | from keras.preprocessing import sequence 11 | # fix random seed for reproducibility 12 | seed = 7 13 | numpy.random.seed(seed) 14 | # load the dataset but only keep the top n words, zero the rest 15 | top_words = 5000 16 | (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words) 17 | # pad dataset to a maximum review length in words 18 | max_words = 500 19 | X_train = sequence.pad_sequences(X_train, maxlen=max_words) 20 | X_test = sequence.pad_sequences(X_test, maxlen=max_words) 21 | # create the model 22 | model = Sequential() 23 | model.add(Embedding(top_words, 32, input_length=max_words)) 24 | model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu')) 25 | model.add(MaxPooling1D(pool_length=2)) 
26 | model.add(Flatten()) 27 | model.add(Dense(250, activation='relu')) 28 | model.add(Dense(1, activation='sigmoid')) 29 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 30 | print(model.summary()) 31 | # Fit the model 32 | model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=2, batch_size=128, verbose=1) 33 | # Final evaluation of the model 34 | scores = model.evaluate(X_test, y_test, verbose=0) 35 | print("Accuracy: %.2f%%" % (scores[1]*100)) 36 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/22_imdb_mlp.py: -------------------------------------------------------------------------------- 1 | # MLP for the IMDB problem 2 | import numpy 3 | from keras.datasets import imdb 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import Flatten 7 | from keras.layers.embeddings import Embedding 8 | from keras.preprocessing import sequence 9 | # fix random seed for reproducibility 10 | seed = 7 11 | numpy.random.seed(seed) 12 | # load the dataset but only keep the top n words, zero the rest 13 | top_words = 5000 14 | (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words) 15 | max_words = 500 16 | X_train = sequence.pad_sequences(X_train, maxlen=max_words) 17 | X_test = sequence.pad_sequences(X_test, maxlen=max_words) 18 | # create the model 19 | model = Sequential() 20 | model.add(Embedding(top_words, 32, input_length=max_words)) 21 | model.add(Flatten()) 22 | model.add(Dense(250, activation='relu')) 23 | model.add(Dense(1, activation='sigmoid')) 24 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 25 | print(model.summary()) 26 | # Fit the model 27 | model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=2, batch_size=128, verbose=1) 28 | # Final evaluation of the model 29 | scores = model.evaluate(X_test, y_test, verbose=0) 30 | print("Accuracy: %.2f%%" % (scores[1]*100)) 31 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/22_imdb_plot.py: -------------------------------------------------------------------------------- 1 | # Load and Plot the IMDB dataset 2 | import numpy 3 | from keras.datasets import imdb 4 | from matplotlib import pyplot 5 | # load the dataset 6 | (X_train, y_train), (X_test, y_test) = imdb.load_data() 7 | X = numpy.concatenate((X_train, X_test), axis=0) 8 | y = numpy.concatenate((y_train, y_test), axis=0) 9 | # summarize size 10 | print("Training data: ") 11 | print(X.shape) 12 | print(y.shape) 13 | # Summarize number of classes 14 | print("Classes: ") 15 | print(numpy.unique(y)) 16 | # Summarize number of words 17 | print("Number of words: ") 18 | print(len(numpy.unique(numpy.hstack(X)))) 19 | # Summarize review length 20 | print("Review length: ") 21 | result = map(len, X) 22 | print("Mean %.2f words (%f)" % (numpy.mean(result), numpy.std(result))) 23 | # plot review length as a boxplot and histogram 24 | pyplot.subplot(121) 25 | pyplot.boxplot(result) 26 | pyplot.subplot(122) 27 | pyplot.hist(result) 28 | pyplot.show() 29 | -------------------------------------------------------------------------------- /deep_learning_with_python_code/26_lstm_cnn.py: -------------------------------------------------------------------------------- 1 | # LSTM and CNN for sequence classification in the IMDB dataset 2 | import numpy 3 | from keras.datasets import imdb 4 | from keras.models import 
Sequential 5 | from keras.layers import Dense 6 | from keras.layers import LSTM 7 | from keras.layers.convolutional import Convolution1D 8 | from keras.layers.convolutional import MaxPooling1D 9 | from keras.layers.embeddings import Embedding 10 | from keras.preprocessing import sequence 11 | # fix random seed for reproducibility 12 | numpy.random.seed(7) 13 | # load the dataset but only keep the top n words, zero the rest 14 | top_words = 5000 15 | (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words) 16 | # truncate and pad input sequences 17 | max_review_length = 500 18 | X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) 19 | X_test = sequence.pad_sequences(X_test, maxlen=max_review_length) 20 | # create the model 21 | embedding_vecor_length = 32 22 | model = Sequential() 23 | model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length)) 24 | model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu')) 25 | model.add(MaxPooling1D(pool_length=2)) 26 | model.add(LSTM(100)) 27 | model.add(Dense(1, activation='sigmoid')) 28 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 29 | print(model.summary()) 30 | model.fit(X_train, y_train, nb_epoch=3, batch_size=64) 31 | # Final evaluation of the model 32 | scores = model.evaluate(X_test, y_test, verbose=0) 33 | print("Accuracy: %.2f%%" % (scores[1]*100)) -------------------------------------------------------------------------------- /deep_learning_with_python_code/26_lstm_dropout_gates.py: -------------------------------------------------------------------------------- 1 | # LSTM with dropout for sequence classification in the IMDB dataset 2 | import numpy 3 | from keras.datasets import imdb 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import LSTM 7 | from keras.layers.embeddings import Embedding 8 | from keras.preprocessing import sequence 9 | # fix random seed for reproducibility 10 | numpy.random.seed(7) 11 | # load the dataset but only keep the top n words, zero the rest 12 | top_words = 5000 13 | (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words) 14 | # truncate and pad input sequences 15 | max_review_length = 500 16 | X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) 17 | X_test = sequence.pad_sequences(X_test, maxlen=max_review_length) 18 | # create the model 19 | embedding_vecor_length = 32 20 | model = Sequential() 21 | model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length, dropout=0.2)) 22 | model.add(LSTM(100, dropout_W=0.2, dropout_U=0.2)) 23 | model.add(Dense(1, activation='sigmoid')) 24 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 25 | print(model.summary()) 26 | model.fit(X_train, y_train, nb_epoch=3, batch_size=64) 27 | # Final evaluation of the model 28 | scores = model.evaluate(X_test, y_test, verbose=0) 29 | print("Accuracy: %.2f%%" % (scores[1]*100)) -------------------------------------------------------------------------------- /deep_learning_with_python_code/26_lstm_dropout_layers.py: -------------------------------------------------------------------------------- 1 | # LSTM with Dropout for sequence classification in the IMDB dataset 2 | import numpy 3 | from keras.datasets import imdb 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import LSTM 7 | from keras.layers import Dropout 8 
| from keras.layers.embeddings import Embedding 9 | from keras.preprocessing import sequence 10 | # fix random seed for reproducibility 11 | numpy.random.seed(7) 12 | # load the dataset but only keep the top n words, zero the rest 13 | top_words = 5000 14 | (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words) 15 | # truncate and pad input sequences 16 | max_review_length = 500 17 | X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) 18 | X_test = sequence.pad_sequences(X_test, maxlen=max_review_length) 19 | # create the model 20 | embedding_vecor_length = 32 21 | model = Sequential() 22 | model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length, dropout=0.2)) 23 | model.add(Dropout(0.2)) 24 | model.add(LSTM(100)) 25 | model.add(Dropout(0.2)) 26 | model.add(Dense(1, activation='sigmoid')) 27 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 28 | print(model.summary()) 29 | model.fit(X_train, y_train, nb_epoch=3, batch_size=64) 30 | # Final evaluation of the model 31 | scores = model.evaluate(X_test, y_test, verbose=0) 32 | print("Accuracy: %.2f%%" % (scores[1]*100)) -------------------------------------------------------------------------------- /deep_learning_with_python_code/26_lstm_simple.py: -------------------------------------------------------------------------------- 1 | # LSTM for sequence classification in the IMDB dataset 2 | import numpy 3 | from keras.datasets import imdb 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.layers import LSTM 7 | from keras.layers.embeddings import Embedding 8 | from keras.preprocessing import sequence 9 | # fix random seed for reproducibility 10 | numpy.random.seed(7) 11 | # load the dataset but only keep the top n words, zero the rest 12 | top_words = 5000 13 | (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words) 14 | # truncate and pad input sequences 15 | max_review_length = 500 16 | X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) 17 | X_test = sequence.pad_sequences(X_test, maxlen=max_review_length) 18 | # create the model 19 | embedding_vecor_length = 32 20 | model = Sequential() 21 | model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length)) 22 | model.add(LSTM(100)) 23 | model.add(Dense(1, activation='sigmoid')) 24 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 25 | print(model.summary()) 26 | model.fit(X_train, y_train, nb_epoch=3, batch_size=64) 27 | # Final evaluation of the model 28 | scores = model.evaluate(X_test, y_test, verbose=0) 29 | print("Accuracy: %.2f%%" % (scores[1]*100)) -------------------------------------------------------------------------------- /deep_learning_with_python_code/weights-improvement-19-1.9435.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/deep_learning_with_python_code/weights-improvement-19-1.9435.hdf5 -------------------------------------------------------------------------------- /deep_learning_with_python_code/weights-improvement-47-1.2219-bigger.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/deep_learning_with_python_code/weights-improvement-47-1.2219-bigger.hdf5 
-------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/1-AnalyzeData/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/1-LoadData/datasets-mlbench.R: -------------------------------------------------------------------------------- 1 | # Datasets from the mlbench library 2 | 3 | # load the library 4 | library(mlbench) 5 | 6 | # list the contents of the library 7 | library(help = "mlbench") 8 | 9 | # Boston Housing dataset 10 | data(BostonHousing) 11 | dim(BostonHousing) 12 | head(BostonHousing) 13 | 14 | # Wisconsin Breast Cancer dataset 15 | data(BreastCancer) 16 | dim(BreastCancer) 17 | levels(BreastCancer$Class) 18 | head(BreastCancer) 19 | 20 | # Glass Identification dataset 21 | data(Glass) 22 | dim(Glass) 23 | levels(Glass$Type) 24 | head(Glass) 25 | 26 | # Johns Hopkins University Ionosphere dataset 27 | data(Ionosphere) 28 | dim(Ionosphere) 29 | levels(Ionosphere$Class) 30 | head(Ionosphere) 31 | 32 | # Pima Indians Diabetes dataset 33 | data(PimaIndiansDiabetes) 34 | dim(PimaIndiansDiabetes) 35 | levels(PimaIndiansDiabetes$diabetes) 36 | head(PimaIndiansDiabetes) 37 | 38 | # Sonar, Mines vs. 
Rocks dataset 39 | data(Sonar) 40 | dim(Sonar) 41 | levels(Sonar$Class) 42 | head(Sonar) 43 | 44 | # Soybean dataset 45 | data(Soybean) 46 | dim(Soybean) 47 | levels(Soybean$Class) 48 | head(Soybean) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/1-LoadData/datasets_appliedpredictivemodeling.R: -------------------------------------------------------------------------------- 1 | # Datasets from the AppliedPredictiveModeling library 2 | 3 | # load the library 4 | library(AppliedPredictiveModeling) 5 | 6 | # list the contents of the library 7 | library(help = "AppliedPredictiveModeling") 8 | 9 | # Abalone Data 10 | data(abalone) 11 | dim(abalone) 12 | head(abalone) 13 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/1-LoadData/datasets_datasets.R: -------------------------------------------------------------------------------- 1 | # Datasets from the dataset library 2 | 3 | # list the contents of the library 4 | library(help = "datasets") 5 | 6 | # list all available datasets in all loaded libraries 7 | data() 8 | 9 | # Iris flowers datasets 10 | data(iris) 11 | dim(iris) 12 | levels(iris$Species) 13 | head(iris) 14 | 15 | # Longley's Economic Regression Data 16 | data(longley) 17 | dim(longley) 18 | head(longley) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/1-LoadData/load_csv_file.R: -------------------------------------------------------------------------------- 1 | # Load data from a CSV file in the local directory 2 | 3 | # define the filename 4 | filename <- "iris.csv" 5 | # load the CSV file from the local directory 6 | dataset <- read.csv(filename, header=FALSE) 7 | # preview the first 5 rows 8 | head(dataset) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/1-LoadData/load_csv_url.R: -------------------------------------------------------------------------------- 1 | # Load CSV From a URL 2 | 3 | # load the library 4 | library(RCurl) 5 | # specify the URL for the Iris data CSV 6 | urlfile <-'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data' 7 | # download the file 8 | downloaded <- getURL(urlfile, ssl.verifypeer=FALSE) 9 | # treat the text data as a steam so we can read from it 10 | connection <- textConnection(downloaded) 11 | # parse the downloaded data as CSV 12 | dataset <- read.csv(connection, header=FALSE) 13 | # preview the first 5 rows 14 | head(dataset) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/class_distribution.R: -------------------------------------------------------------------------------- 1 | # Class Distribution 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # distribution of class variable 8 | y <- PimaIndiansDiabetes$diabetes 9 | cbind(freq=table(y), percentage=prop.table(table(y))*100) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/correction_spearman.R: -------------------------------------------------------------------------------- 1 | # Pair-wise correlations using pearson spearman coefficients 2 | 3 | # load the libraries 4 | 
library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # calculate a correlation matrix for numeric variables 8 | correlations <- cor(PimaIndiansDiabetes[,1:8], method="spearman") 9 | # display the correlation matrix 10 | print(correlations) 11 | 12 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/correlation_pearson.R: -------------------------------------------------------------------------------- 1 | # Pair-wise correlations using pearson correlation coefficients 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # calculate a correlation matrix for numeric variables 8 | correlations <- cor(PimaIndiansDiabetes[,1:8]) 9 | # display the correlation matrix 10 | print(correlations) 11 | 12 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/data_types.R: -------------------------------------------------------------------------------- 1 | # Data Types 2 | 3 | # load library 4 | library(mlbench) 5 | # load dataset 6 | data(BostonHousing) 7 | # list types for each attribute 8 | sapply(BostonHousing, class) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/dimensions.R: -------------------------------------------------------------------------------- 1 | # Dimensions of your dataset 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # display the dimensions of the dataset 8 | dim(PimaIndiansDiabetes) 9 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/peek.R: -------------------------------------------------------------------------------- 1 | # Peek at raw data 2 | 3 | # load the library 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # display first 20 rows of data 8 | head(PimaIndiansDiabetes, n=20) 9 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/skewness.R: -------------------------------------------------------------------------------- 1 | # Calculate Skewness 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(e1071) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # calculate skewness for each variable 9 | skew <- apply(PimaIndiansDiabetes[,1:8], 2, skewness) 10 | # display skewness, larger/smaller deviations from 0 show more skew 11 | print(skew) 12 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/standard_deviation.R: -------------------------------------------------------------------------------- 1 | # Standard Deviation 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # calculate standard deviation for all attributes 8 | sapply(PimaIndiansDiabetes[,1:8], sd) 9 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/2-DataSummarization/summary.R: -------------------------------------------------------------------------------- 1 | # Summarize each attribute of a 
dataset using min, max, mean, 25%, 50% and 75%. 2 | 3 | 4 | # load the iris dataset 5 | data(iris) 6 | # summarize the dataset 7 | summary(iris) 8 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/1-Univariate/barplot.R: -------------------------------------------------------------------------------- 1 | # Plot Factor 2 | 3 | # load the library 4 | library(mlbench) 5 | # load the dataset 6 | data(BreastCancer) 7 | # create a bar plot of each categorical attribute 8 | par(mfrow=c(2,4)) 9 | for(i in 2:9) { 10 | counts <- table(BreastCancer[,i]) 11 | name <- names(BreastCancer)[i] 12 | barplot(counts, main=name) 13 | } 14 | 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/1-Univariate/boxplot.R: -------------------------------------------------------------------------------- 1 | # Univarate Box And Whisker Plots 2 | 3 | # load dataset 4 | data(iris) 5 | # Create separate boxplots for each attribute 6 | par(mfrow=c(1,4)) 7 | for(i in 1:4) { 8 | boxplot(iris[,i], main=names(iris)[i]) 9 | } 10 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/1-Univariate/density_plot.R: -------------------------------------------------------------------------------- 1 | # Univariate Density Plots 2 | 3 | # load libraries 4 | library(lattice) 5 | # load dataset 6 | data(iris) 7 | # create a panel of simpler density plots by attribute 8 | par(mfrow=c(1,4)) 9 | for(i in 1:4) { 10 | plot(density(iris[,i]), main=names(iris)[i]) 11 | } 12 | 13 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/1-Univariate/histogram.R: -------------------------------------------------------------------------------- 1 | # Univariate Histograms 2 | 3 | # load the data 4 | data(iris) 5 | # create histograms for each attribute 6 | par(mfrow=c(1,4)) 7 | for(i in 1:4) { 8 | hist(iris[,i], main=names(iris)[i]) 9 | } -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/1-Univariate/missing_plot.R: -------------------------------------------------------------------------------- 1 | # Plot missing data 2 | 3 | # load libraries 4 | library(Amelia) 5 | library(mlbench) 6 | # load dataset 7 | data(Soybean) 8 | # create a missing map 9 | missmap(Soybean, col=c("black", "grey"), legend=FALSE) 10 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/2-Multivaraite/boxplot_by_class.R: -------------------------------------------------------------------------------- 1 | # Create a box and whisker plots for each variable organized by class. 
2 | 3 | # load the caret library 4 | library(caret) 5 | # load the iris dataset 6 | data(iris) 7 | # box and whisker plots for each attribute by class value 8 | x <- iris[,1:4] 9 | y <- iris[,5] 10 | featurePlot(x=x, y=y, plot="box") 11 | 12 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/2-Multivaraite/correlation_plot.R: -------------------------------------------------------------------------------- 1 | # Correlation Plot 2 | 3 | # load library 4 | library(corrplot) 5 | # load the data 6 | data(iris) 7 | # calculate correlations 8 | correlations <- cor(iris[,1:4]) 9 | # create correlation plot 10 | corrplot(correlations, method="circle") -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/2-Multivaraite/density_plot_by_class.R: -------------------------------------------------------------------------------- 1 | # Create a density plot for each variable-class combination. 2 | 3 | # load the library 4 | library(caret) 5 | # load the data 6 | data(iris) 7 | # density plots for each attribute by class value 8 | x <- iris[,1:4] 9 | y <- iris[,5] 10 | scales <- list(x=list(relation="free"), y=list(relation="free")) 11 | featurePlot(x=x, y=y, plot="density", scales=scales) 12 | 13 | 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/2-Multivaraite/scatterplot_matrix.R: -------------------------------------------------------------------------------- 1 | # Multivariate Scatterplot Matrix 2 | 3 | # load the data 4 | data(iris) 5 | # pair-wise scatterplots of all 4 attributes 6 | pairs(iris) 7 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/2-Multivaraite/scatterplot_matrix_by_class.R: -------------------------------------------------------------------------------- 1 | # Multivariate Scatterplot Matrix By Class 2 | 3 | # load the data 4 | data(iris) 5 | # pair-wise scatterplots colored by class 6 | pairs(Species~., data=iris, col=iris$Species) 7 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/3-Projection/andrews_curves.R: -------------------------------------------------------------------------------- 1 | # Andrews Curves 2 | 3 | # load library 4 | library(andrews) 5 | # load dataset 6 | data(iris) 7 | # generate andres curves 8 | andrews(iris, clr=5, ymax=3) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/3-Projection/parallel_coordinates.R: -------------------------------------------------------------------------------- 1 | # Parallel Coordinates 2 | 3 | # load library 4 | library(MASS) 5 | # load dataset 6 | data(iris) 7 | # convert data frame to matrix 8 | iris_matrix <- data.matrix(iris) 9 | parcoord(iris_matrix) 10 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/3-Projection/pca.R: -------------------------------------------------------------------------------- 1 | # Principal Component Analysis 2 | 3 | # load the dataset 4 | data(iris) 5 | # separate numerical 
inputs 6 | x <- data.matrix(iris[,1:4]) 7 | y <- iris[,5] 8 | # calculate components 9 | components <- prcomp(x, center=TRUE, scale=TRUE) 10 | # display components 11 | print(components) 12 | # summarize components 13 | summary(components) 14 | # plot the components 15 | biplot(components) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/3-Projection/sammons.R: -------------------------------------------------------------------------------- 1 | # Sammons Mapping 2 | 3 | # load library 4 | library(MASS) 5 | # load dataset 6 | data(iris) 7 | # remove duplicates 8 | clean <- unique(iris) 9 | # split out numerical inputs 10 | x <- data.matrix(clean[, 1:4]) 11 | # create a sammon mapping 12 | mapping <- sammon(dist(x)) 13 | # plot mapping by class 14 | plot(mapping$points, type="n") 15 | text(mapping$points, labels=clean[,5]) 16 | 17 | # TODO colour dots by class -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/1-AnalyzeData/3-DataVisualization/3-Projection/som.R: -------------------------------------------------------------------------------- 1 | # Self Organizing Map (Kohonen) 2 | 3 | # load the library 4 | library("kohonen") 5 | # load the dataset 6 | data(iris) 7 | # split input and output 8 | x <- data.matrix(iris[,1:4]) 9 | y <- iris[,5] 10 | # set the random seed for repeatable results 11 | set.seed(7) 12 | # create a map of the x values 13 | iris_map <- som(data=x, grid=somgrid(5, 5, "hexagonal")) 14 | # plot the map 15 | plot(iris_map) 16 | 17 | # TODO label the map by class 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/2-PrepareData/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/1-DataCleaning/impute_missing_values.R: -------------------------------------------------------------------------------- 1 | # Impute missing values 2 | 3 | # load the libraries 4 | library(mlbench) 5 | library(Hmisc) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # mark a pressure of 0 as N/A, it is impossible 9 | invalid <- 0 10 | PimaIndiansDiabetes$pressure[PimaIndiansDiabetes$pressure==invalid] <- NA 11 | # impute missing pressure values using the mean 12 | PimaIndiansDiabetes$pressure <- with(PimaIndiansDiabetes, impute(pressure, mean)) 13 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/1-DataCleaning/mark_missing_values.R: -------------------------------------------------------------------------------- 1 | # Mark Missing Values as N/A 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # mark a pressure of 0 as N/A, it is impossible 8 | invalid <- 0 9 | PimaIndiansDiabetes$pressure[PimaIndiansDiabetes$pressure==invalid] <- NA 10 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/1-DataCleaning/rebalance_SMOTE.R: -------------------------------------------------------------------------------- 1 |
# Rebalance a dataset using Synthetic Minority Over-sampling Technique (SMOTE) 2 | 3 | # load the libraries 4 | library(mlbench) 5 | library(DMwR) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # display count of instances of each class (unbalanced) 9 | table(PimaIndiansDiabetes$diabetes) 10 | # use SMOTE to created a "more balance" version of the dataset 11 | balanced <- SMOTE(diabetes~., PimaIndiansDiabetes, perc.over=300, perc.under=100) 12 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/1-DataCleaning/remove_duplicates.R: -------------------------------------------------------------------------------- 1 | # Remove Duplicate Instances 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(iris) 7 | dim(iris) 8 | # remove duplicates 9 | clean <- unique(iris) 10 | dim(clean) 11 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/1-DataCleaning/remove_na.R: -------------------------------------------------------------------------------- 1 | # Remove rows with NA 2 | 3 | # load library 4 | library(mlbench) 5 | # load dataset 6 | data(BreastCancer) 7 | # summarize dimensions of dataset 8 | dim(BreastCancer) 9 | # Remove all incomplete rows 10 | dataset <- BreastCancer[complete.cases(BreastCancer),] 11 | # summarize dimensions of resulting dataset 12 | dim(dataset) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/1-DataCleaning/remove_outliers.R: -------------------------------------------------------------------------------- 1 | # Update Data Frame to Remove Outliers 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # calculate stats for pregnant (number of times pregnant) 8 | pregnant.mean <- mean(PimaIndiansDiabetes$pregnant) 9 | pregnant.sd <- sd(PimaIndiansDiabetes$pregnant) 10 | # max reasonable value is within 99.7% of the data (if Gaussian) 11 | pregnant.max <- pregnant.mean + (3*pregnant.sd) 12 | # mark outlier pregnant values as N/A 13 | PimaIndiansDiabetes$pregnant[PimaIndiansDiabetes$pregnant>pregnant.max] <- NA 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/2-FeatureSelection/rank_features_by_importance_caret.R: -------------------------------------------------------------------------------- 1 | # Rank features by their importance. 
2 | 3 | # load the libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="cv", number=10) 10 | # train the model 11 | model <- train(diabetes~., data=PimaIndiansDiabetes, method="lvq", preProcess="scale", trControl=control) 12 | # estimate variable importance 13 | importance <- varImp(model, scale=FALSE) 14 | # summarize importance 15 | print(importance) 16 | # plot importance 17 | plot(importance) 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/2-FeatureSelection/recursive_feature_elimination_caret.R: -------------------------------------------------------------------------------- 1 | # Use RFE to select features 2 | 3 | # load the libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the data 7 | data(Sonar) 8 | # define the control using a random forest selection function 9 | control <- rfeControl(functions=rfFuncs, method="cv", number=10) 10 | # run the RFE algorithm 11 | x <- Sonar[,1:60] 12 | y <- Sonar[,61] 13 | sizes <- c(10,20,30,40,50,60) 14 | results <- rfe(x, y, sizes=sizes, rfeControl=control) 15 | # summarize the results 16 | print(results) 17 | # list the chosen features 18 | predictors(results) 19 | # plot accuracy versus the number of features 20 | plot(results, type=c("g", "o")) 21 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/2-FeatureSelection/remove_highly_correlated_features_caret.R: -------------------------------------------------------------------------------- 1 | # Identify and remove highly correlated features 2 | 3 | # load the libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the data 7 | data(PimaIndiansDiabetes) 8 | # calculate correlation matrix 9 | correlationMatrix <- cor(PimaIndiansDiabetes[,1:8]) 10 | # find attributes that are highly correlated (ideally >0.75) 11 | cutoff <- 0.50 12 | highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=cutoff) 13 | # create a new dataset without highly correlated features 14 | dataset <- PimaIndiansDiabetes[,-highlyCorrelated] 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/boxcox_transform.R: -------------------------------------------------------------------------------- 1 | # Box-Cox Transform (attributes must be numeric and >0) 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # summarize pedigree and age 9 | summary(PimaIndiansDiabetes[,7:8]) 10 | # calculate the pre-process parameters from the dataset 11 | preprocessParams <- preProcess(PimaIndiansDiabetes[,7:8], method=c("BoxCox")) 12 | # summarize transform parameters 13 | print(preprocessParams) 14 | # transform the dataset using the parameters 15 | transformed <- predict(preprocessParams, PimaIndiansDiabetes[,7:8]) 16 | # summarize the transformed dataset (note pedigree and age) 17 | summary(transformed) 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/center.R: -------------------------------------------------------------------------------- 1 | # Center attributes by subtracting the mean 2 | 3 | # load libraries 4 | library(caret) 5 | # load the dataset 6 |
data(iris) 7 | # summarize data 8 | summary(iris[,1:4]) 9 | # calculate the pre-process parameters from the dataset 10 | preprocessParams <- preProcess(iris[,1:4], method=c("center")) 11 | # summarize transform parameters 12 | print(preprocessParams) 13 | # transform the dataset using the parameters 14 | transformed <- predict(preprocessParams, iris[,1:4]) 15 | # summarize the transformed dataset 16 | summary(transformed) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/ica_transform.R: -------------------------------------------------------------------------------- 1 | # Independent Component Analysis Pre-processing 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # summarize dataset 9 | summary(PimaIndiansDiabetes[,1:8]) 10 | # calculate the pre-process parameters from the dataset 11 | preprocessParams <- preProcess(PimaIndiansDiabetes[,1:8], method=c("center", "scale", "ica"), n.comp=5) 12 | # summarize transform parameters 13 | print(preprocessParams) 14 | # transform the dataset using the parameters 15 | transformed <- predict(preprocessParams, PimaIndiansDiabetes[,1:8]) 16 | # summarize the transformed dataset 17 | summary(transformed) 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/normalize.R: -------------------------------------------------------------------------------- 1 | # Normalize numeric attributes to the range [0,1] 2 | 3 | # load libraries 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # summarize data 8 | summary(iris[,1:4]) 9 | # calculate the pre-process parameters from the dataset 10 | preprocessParams <- preProcess(iris[,1:4], method=c("range")) 11 | # summarize transform parameters 12 | print(preprocessParams) 13 | # transform the dataset using the parameters 14 | transformed <- predict(preprocessParams, iris[,1:4]) 15 | # summarize the transformed dataset 16 | summary(transformed) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/pca_transform.R: -------------------------------------------------------------------------------- 1 | # Principal Component Analysis Pre-processing 2 | 3 | # load the libraries 4 | library(mlbench) 5 | # load the dataset 6 | data(iris) 7 | # summarize dataset 8 | summary(iris) 9 | # calculate the pre-process parameters from the dataset 10 | preprocessParams <- preProcess(iris, method=c("center", "scale", "pca")) 11 | # summarize transform parameters 12 | print(preprocessParams) 13 | # transform the dataset using the parameters 14 | transformed <- predict(preprocessParams, iris) 15 | # summarize the transformed dataset 16 | summary(transformed) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/scale.R: -------------------------------------------------------------------------------- 1 | # Scale attributes by dividing by standard deviation 2 | 3 | # load libraries 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # summarize data 8 | summary(iris[,1:4]) 9 | # calculate the pre-process parameters from the dataset 10 | preprocessParams <- preProcess(iris[,1:4], method=c("scale")) 11 | # summarize transform parameters 12 | 
print(preprocessParams) 13 | # transform the dataset using the parameters 14 | transformed <- predict(preprocessParams, iris[,1:4]) 15 | # summarize the transformed dataset 16 | summary(transformed) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/standardize.R: -------------------------------------------------------------------------------- 1 | # Standardize numeric attributes so they have zero mean and unit variance. 2 | 3 | # load libraries 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # summarize data 8 | summary(iris[,1:4]) 9 | # calculate the pre-process parameters from the dataset 10 | preprocessParams <- preProcess(iris[,1:4], method=c("center", "scale")) 11 | # summarize transform parameters 12 | print(preprocessParams) 13 | # transform the dataset using the parameters 14 | transformed <- predict(preprocessParams, iris[,1:4]) 15 | # summarize the transformed dataset 16 | summary(transformed) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/2-PrepareData/3-TransformData/yeojohnson_transform.R: -------------------------------------------------------------------------------- 1 | # Yeo-Johnson Transform 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # summarize pedigree and age 9 | summary(PimaIndiansDiabetes[,7:8]) 10 | # calculate the pre-process parameters from the dataset 11 | preprocessParams <- preProcess(PimaIndiansDiabetes[,7:8], method=c("YeoJohnson")) 12 | # summarize transform parameters 13 | print(preprocessParams) 14 | # transform the dataset using the parameters 15 | transformed <- predict(preprocessParams, PimaIndiansDiabetes[,7:8]) 16 | # summarize the transformed dataset (note pedigree and age) 17 | summary(transformed) 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/3-Algorithms/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/1-LinearRegression/ordinary_least_squares_regression.R: -------------------------------------------------------------------------------- 1 | # Ordinary Least Squares Regression 2 | 3 | # load data 4 | data(longley) 5 | # fit model 6 | fit <- lm(Employed~., longley) 7 | # summarize the fit 8 | print(fit) 9 | # make predictions 10 | predictions <- predict(fit, longley) 11 | # summarize accuracy 12 | mse <- mean((longley$Employed - predictions)^2) 13 | print(mse) 14 | 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/1-LinearRegression/partial_least_squares_regression.R: 
-------------------------------------------------------------------------------- 1 | # Partial Least Squares Regression 2 | 3 | # load the package 4 | library(pls) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- plsr(Employed~., data=longley, validation="CV") 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley, ncomp=6) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/1-LinearRegression/principal_component_regression.R: -------------------------------------------------------------------------------- 1 | # Principal Component Regression 2 | 3 | # load the package 4 | library(pls) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- pcr(Employed~., data=longley, validation="CV") 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley, ncomp=6) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/1-LinearRegression/stepwise_linear_regression.R: -------------------------------------------------------------------------------- 1 | # Stepwise Linear Regression 2 | 3 | # load data 4 | data(longley) 5 | # fit model 6 | base <- lm(Employed~., longley) 7 | # summarize the fit 8 | summary(base) 9 | # perform step-wise feature selection 10 | fit <- step(base) 11 | # summarize the selected model 12 | print(fit) 13 | # make predictions 14 | predictions <- predict(fit, longley) 15 | # summarize accuracy 16 | mse <- mean((longley$Employed - predictions)^2) 17 | print(mse) 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/2-PenalizedLinearRegression/LASSO.R: -------------------------------------------------------------------------------- 1 | # Least Absolute Shrinkage and Selection Operator 2 | 3 | # load the package 4 | library(lars) 5 | # load data 6 | data(longley) 7 | x <- as.matrix(longley[,1:6]) 8 | y <- as.matrix(longley[,7]) 9 | # fit model 10 | fit <- lars(x, y, type="lasso") 11 | # summarize the fit 12 | print(fit) 13 | # select a step with a minimum error 14 | best_step <- fit$df[which.min(fit$RSS)] 15 | # make predictions 16 | predictions <- predict(fit, x, s=best_step, type="fit")$fit 17 | # summarize accuracy 18 | mse <- mean((y - predictions)^2) 19 | print(mse) 20 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/2-PenalizedLinearRegression/elastic_net.R: -------------------------------------------------------------------------------- 1 | # Elastic Net 2 | 3 | # load the package 4 | library(glmnet) 5 | # load data 6 | data(longley) 7 | x <- as.matrix(longley[,1:6]) 8 | y <- as.matrix(longley[,7]) 9 | # fit model 10 | fit <- glmnet(x, y, family="gaussian", alpha=0.5, lambda=0.001) 11 | # summarize the fit 12 | print(fit) 13 | # make predictions 14 | predictions <- predict(fit, x, type="link") 15 | # summarize accuracy 16 | mse <- mean((y - predictions)^2) 17 | print(mse) 18 | -------------------------------------------------------------------------------- 
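# Editor's sketch (not one of the original repository scripts): the LASSO, elastic net
# and ridge scripts around this point fix the penalty by hand (e.g. lambda=0.001);
# cv.glmnet from the same glmnet package can instead pick lambda by cross-validation.
# Assumes the longley data used in those scripts.
library(glmnet)
data(longley)
x <- as.matrix(longley[,1:6])
y <- as.matrix(longley[,7])
# 5-fold cross-validation over glmnet's default lambda path at alpha=0.5
cvfit <- cv.glmnet(x, y, family="gaussian", alpha=0.5, nfolds=5)
# lambda with the lowest cross-validated error
print(cvfit$lambda.min)
# predictions and training MSE at the selected lambda
predictions <- predict(cvfit, x, s="lambda.min")
mse <- mean((y - predictions)^2)
print(mse)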
/machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/2-PenalizedLinearRegression/ridge_regression.R: -------------------------------------------------------------------------------- 1 | # Ridge Regression 2 | 3 | # load the package 4 | library(glmnet) 5 | # load data 6 | data(longley) 7 | x <- as.matrix(longley[,1:6]) 8 | y <- as.matrix(longley[,7]) 9 | # fit model 10 | fit <- glmnet(x, y, family="gaussian", alpha=0, lambda=0.001) 11 | # summarize the fit 12 | print(fit) 13 | # make predictions 14 | predictions <- predict(fit, x, type="link") 15 | # summarize accuracy 16 | mse <- mean((y - predictions)^2) 17 | print(mse) 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/M5P.R: -------------------------------------------------------------------------------- 1 | # Model Trees 2 | 3 | # load the package 4 | library(RWeka) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- M5P(Employed~., data=longley) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/M5Rules.R: -------------------------------------------------------------------------------- 1 | # Rule System 2 | 3 | # load the package 4 | library(RWeka) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- M5Rules(Employed~., data=longley) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/bagging_CART.R: -------------------------------------------------------------------------------- 1 | # Bagging CART 2 | 3 | # load the package 4 | library(ipred) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- bagging(Employed~., data=longley, control=rpart.control(minsplit=5)) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/classification_and_regression_trees.R: -------------------------------------------------------------------------------- 1 | # Classification and Regression Trees 2 | 3 | # load the package 4 | library(rpart) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- rpart(Employed~., data=longley, control=rpart.control(minsplit=5)) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/conditional_decision_trees.R: 
-------------------------------------------------------------------------------- 1 | # Conditional Decision Trees 2 | 3 | # load the package 4 | library(party) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- ctree(Employed~., data=longley, controls=ctree_control(minsplit=2,minbucket=2,testtype="Univariate")) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | 17 | 18 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/cubist.R: -------------------------------------------------------------------------------- 1 | # Cubist 2 | 3 | # load the package 4 | library(Cubist) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- cubist(longley[,1:6], longley[,7]) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/feed_forward_neural_network.R: -------------------------------------------------------------------------------- 1 | # Feed Forward Neural Network 2 | 3 | # load the package 4 | library(nnet) 5 | # load data 6 | data(longley) 7 | x <- longley[,1:6] 8 | y <- longley[,7] 9 | # fit model 10 | fit <- nnet(Employed~., longley, size=12, maxit=500, linout=T, decay=0.01) 11 | # summarize the fit 12 | print(fit) 13 | # make predictions 14 | predictions <- predict(fit, x, type="raw") 15 | # summarize accuracy 16 | mse <- mean((y - predictions)^2) 17 | print(mse) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/gradient_boosted_machine.R: -------------------------------------------------------------------------------- 1 | # Gradient Boosted Machine 2 | 3 | # load the package 4 | library(gbm) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- gbm(Employed~., data=longley, distribution="gaussian", n.minobsinnode=1) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6], n.trees=1) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/k-nearest_neighbor.R: -------------------------------------------------------------------------------- 1 | # k-Nearest Neighbor 2 | 3 | # load the package 4 | library(caret) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- knnreg(longley[,1:6], longley[,7], k=3) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/multivariate_adaptive_regression_splines.R: -------------------------------------------------------------------------------- 1 | # 
Multivariate Adaptive Regression Splines 2 | 3 | # load the package 4 | library(earth) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- earth(Employed~., longley) 9 | # summarize the fit 10 | print(fit) 11 | # summarize the importance of input variables 12 | evimp(fit) 13 | # make predictions 14 | predictions <- predict(fit, longley) 15 | # summarize accuracy 16 | mse <- mean((longley$Employed - predictions)^2) 17 | print(mse) 18 | 19 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/random_forest.R: -------------------------------------------------------------------------------- 1 | # Random Forest 2 | 3 | # load the package 4 | library(randomForest) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- randomForest(Employed~., data=longley) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley[,1:6]) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/3-NonLinearRegression/support_vector_machine.R: -------------------------------------------------------------------------------- 1 | # Support Vector Machine 2 | 3 | # load the package 4 | library(kernlab) 5 | # load data 6 | data(longley) 7 | # fit model 8 | fit <- ksvm(Employed~., longley) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, longley) 13 | # summarize accuracy 14 | mse <- mean((longley$Employed - predictions)^2) 15 | print(mse) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/4-LinearClassification/linear_discriminant_analysis.R: -------------------------------------------------------------------------------- 1 | # Linear Discriminant Analysis 2 | 3 | # load the package 4 | library(MASS) 5 | data(iris) 6 | # fit model 7 | fit <- lda(Species~., data=iris) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4])$class 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/4-LinearClassification/logistic_regression.R: -------------------------------------------------------------------------------- 1 | # Logistic Regression 2 | 3 | # load the package 4 | library(mlbench) 5 | # load the dataset 6 | data(PimaIndiansDiabetes) 7 | # fit model 8 | fit <- glm(diabetes~., data=PimaIndiansDiabetes, family=binomial(link='logit')) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | probabilities <- predict(fit, PimaIndiansDiabetes[,1:8], type='response') 13 | predictions <- ifelse(probabilities > 0.5,'pos','neg') 14 | # summarize accuracy 15 | table(predictions, PimaIndiansDiabetes$diabetes) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/4-LinearClassification/logistic_regression_multiclass.R: -------------------------------------------------------------------------------- 1 | # Logistic Regression 2 | 3 | # load the package 4 | library(VGAM) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- vglm(Species~., family=multinomial,
data=iris) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | probabilities <- predict(fit, iris[,1:4], type="response") 13 | predictions <- apply(probabilities, 1, which.max) 14 | predictions[which(predictions=="1")] <- levels(iris$Species)[1] 15 | predictions[which(predictions=="2")] <- levels(iris$Species)[2] 16 | predictions[which(predictions=="3")] <- levels(iris$Species)[3] 17 | # summarize accuracy 18 | table(predictions, iris$Species) 19 | 20 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/4-LinearClassification/partial_least_squares_discriminant_analysis.R: -------------------------------------------------------------------------------- 1 | # Partial Least Squares Discriminant Analysis 2 | 3 | # load the package 4 | library(caret) 5 | data(iris) 6 | x <- iris[,1:4] 7 | y <- iris[,5] 8 | # fit model 9 | fit <- plsda(x, y, probMethod="Bayes") 10 | # summarize the fit 11 | print(fit) 12 | # make predictions 13 | predictions <- predict(fit, iris[,1:4]) 14 | # summarize accuracy 15 | table(predictions, iris$Species) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/C4.5.R: -------------------------------------------------------------------------------- 1 | # C4.5 2 | 3 | # load the package 4 | library(RWeka) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- J48(Species~., data=iris) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, iris[,1:4]) 13 | # summarize accuracy 14 | table(predictions, iris$Species) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/C5.0.R: -------------------------------------------------------------------------------- 1 | # C5.0 2 | 3 | # load the package 4 | library(C50) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- C5.0(Species~., data=iris, trials=10) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, iris[,1:4]) 13 | # summarize accuracy 14 | table(predictions, iris$Species) 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/PART.R: -------------------------------------------------------------------------------- 1 | # PART 2 | 3 | # load the package 4 | library(RWeka) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- PART(Species~., data=iris) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, iris[,1:4]) 13 | # summarize accuracy 14 | table(predictions, iris$Species) 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/bagging_CART.R: -------------------------------------------------------------------------------- 1 | # Bagging CART 2 | 3 | # load the package 4 | library(ipred) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- bagging(Species~., data=iris) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, iris[,1:4], type="class") 13 | # summarize accuracy 14 | table(predictions, iris$Species) 15 | 
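# Editor's sketch (not one of the original repository scripts): the classification
# scripts in this folder report a confusion table on the same data used to fit the
# model, which flatters accuracy. Assuming the caret package, out-of-sample accuracy
# can instead be estimated with 10-fold cross-validation, e.g. for a CART model:
library(caret)
data(iris)
control <- trainControl(method="cv", number=10)
fit <- train(Species~., data=iris, method="rpart", trControl=control)
# cross-validated Accuracy and Kappa for the CART model
print(fit)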
-------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/classification_and_regression_trees.R: -------------------------------------------------------------------------------- 1 | # Classification and Regression Trees 2 | 3 | # load the package 4 | library(rpart) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- rpart(Species~., data=iris) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, iris[,1:4], type="class") 13 | # summarize accuracy 14 | table(predictions, iris$Species) 15 | 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/feed_forward_neural_network.R: -------------------------------------------------------------------------------- 1 | # Feed Forward Neural Network 2 | 3 | # load the package 4 | library(nnet) 5 | data(iris) 6 | # fit model 7 | fit <- nnet(Species~., data=iris, size=4, decay=0.0001, maxit=500) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4], type="class") 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/flexible_discriminant_analysis.R: -------------------------------------------------------------------------------- 1 | # Flexible Discriminant Analysis 2 | 3 | # load the package 4 | library(mda) 5 | data(iris) 6 | # fit model 7 | fit <- fda(Species~., data=iris) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4]) 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/gradient_boosted_machine.R: -------------------------------------------------------------------------------- 1 | # Gradient Boosted Machine 2 | 3 | # load the package 4 | library(gbm) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- gbm(Species~., data=iris, distribution="multinomial") 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | probabilities <- predict(fit, iris[,1:4], n.trees=1) 13 | predictions <- colnames(probabilities)[apply(probabilities, 1, which.max)] 14 | # summarize accuracy 15 | table(predictions, iris$Species) 16 | 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/k-nearest_neighbors.R: -------------------------------------------------------------------------------- 1 | # k-Nearest Neighbors 2 | 3 | # load the package 4 | library(caret) 5 | data(iris) 6 | # fit model 7 | fit <- knn3(Species~., data=iris, k=5) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4], type="class") 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/mixture_discriminant_analysis.R: 
-------------------------------------------------------------------------------- 1 | # Mixture Discriminant Analysis 2 | 3 | # load the package 4 | library(mda) 5 | data(iris) 6 | # fit model 7 | fit <- mda(Species~., data=iris) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4]) 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/naive_bayes.R: -------------------------------------------------------------------------------- 1 | # Naive Bayes 2 | 3 | # load the package 4 | library(e1071) 5 | data(iris) 6 | # fit model 7 | fit <- naiveBayes(Species~., data=iris) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4]) 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/quadratic_discriminant_analysis.R: -------------------------------------------------------------------------------- 1 | # Quadratic Discriminant Analysis 2 | 3 | # load the package 4 | library(MASS) 5 | data(iris) 6 | # fit model 7 | fit <- qda(Species~., data=iris) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4])$class 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/random_forest.R: -------------------------------------------------------------------------------- 1 | # Random Forest 2 | 3 | # load the package 4 | library(randomForest) 5 | # load data 6 | data(iris) 7 | # fit model 8 | fit <- randomForest(Species~., data=iris) 9 | # summarize the fit 10 | print(fit) 11 | # make predictions 12 | predictions <- predict(fit, iris[,1:4]) 13 | # summarize accuracy 14 | table(predictions, iris$Species) 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/regularized_discriminant_analysis.R: -------------------------------------------------------------------------------- 1 | # Regularized Discriminant Analysis 2 | 3 | # load the package 4 | library(klaR) 5 | data(iris) 6 | # fit model 7 | fit <- rda(Species~., data=iris, gamma=0.05, lambda=0.01) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4])$class 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/5-NonLinearClassiication/support_vector_machine.R: -------------------------------------------------------------------------------- 1 | # Support Vector Machine 2 | 3 | # load the package 4 | library(kernlab) 5 | data(iris) 6 | # fit model 7 | fit <- ksvm(Species~., data=iris) 8 | # summarize the fit 9 | print(fit) 10 | # make predictions 11 | predictions <- predict(fit, iris[,1:4], type="response") 12 | # summarize accuracy 13 | table(predictions, iris$Species) 14 | 
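Each classification recipe above ends with a confusion matrix from table(predictions, iris$Species). As a small illustrative addition (not one of the original recipes), overall accuracy can be derived from that same table; it assumes the predictions vector produced by any of the recipes above:

# compute overall accuracy from the confusion matrix
confusion <- table(predictions, iris$Species)
accuracy <- sum(diag(confusion)) / sum(confusion)
print(accuracy)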
-------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/6-Optimization/bfgs.R: -------------------------------------------------------------------------------- 1 | # BFGS 2 | 3 | # definition of the 2D Rosenbrock function, optimum is at (1,1) 4 | rosenbrock <- function(v) { 5 | (1 - v[1])^2 + 100 * (v[2] - v[1]*v[1])^2 6 | } 7 | 8 | # definition of the gradient of the 2D Rosenbrock function 9 | derivative <- function(v) { 10 | c(-400 * v[1] * (v[2] - v[1]*v[1]) - 2 * (1 - v[1]), 11 | 200 * (v[2] - v[1]*v[1])) 12 | } 13 | 14 | # locate the minimum of the function using the BFGS method 15 | result <- optim( 16 | c(runif(1,-3,3), runif(1,-3,3)), # start at a random position 17 | rosenbrock, # the function to minimize 18 | derivative, # the gradient of the function 19 | method="BFGS", # use the BFGS method 20 | control=c( # configure BFGS 21 | maxit=100, # maximum iterations of 100 22 | reltol=1e-8)) # relative convergence tolerance per step 23 | 24 | # summarise results 25 | print(result$par) # the coordinates of the minimum 26 | print(result$value) # the function response of the minimum 27 | print(result$counts) # the number of function calls performed 28 | 29 | # display the function as a contour plot 30 | x <- seq(-3, 3, length.out=100) 31 | y <- seq(-3, 3, length.out=100) 32 | z <- rosenbrock(expand.grid(x, y)) 33 | contour(x, y, matrix(log10(z), length(x)), xlab="x", ylab="y") 34 | # draw the optimum as a point 35 | points(result$par[1], result$par[2], col="red", pch=19) 36 | # draw a square around the optimum to highlight it 37 | rect(result$par[1]-0.2, result$par[2]-0.2, result$par[1]+0.2, 38 | result$par[2]+0.2, lwd=2) 39 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/6-Optimization/conjugate_gradient.R: -------------------------------------------------------------------------------- 1 | # Conjugate Gradient method 2 | 3 | # definition of the 2D Rosenbrock function, optimum is at (1,1) 4 | rosenbrock <- function(v) { 5 | (1 - v[1])^2 + 100 * (v[2] - v[1]*v[1])^2 6 | } 7 | 8 | # definition of the gradient of the 2D Rosenbrock function 9 | derivative <- function(v) { 10 | c(-400 * v[1] * (v[2] - v[1]*v[1]) - 2 * (1 - v[1]), 11 | 200 * (v[2] - v[1]*v[1])) 12 | } 13 | 14 | # locate the minimum of the function using the Conjugate Gradient method 15 | result <- optim( 16 | c(runif(1,-3,3), runif(1,-3,3)), # start at a random position 17 | rosenbrock, # the function to minimize 18 | derivative, # the gradient of the function 19 | method="CG", # use the Conjugate Gradient method 20 | control=c( # configure Conjugate Gradient 21 | maxit=100, # maximum iterations of 100 22 | reltol=1e-8, # relative convergence tolerance per step 23 | type=2)) # use the Polak-Ribiere update method 24 | 25 | # summarise results 26 | print(result$par) # the coordinates of the minimum 27 | print(result$value) # the function response of the minimum 28 | print(result$counts) # the number of function calls performed 29 | 30 | # display the function as a contour plot 31 | x <- seq(-3, 3, length.out=100) 32 | y <- seq(-3, 3, length.out=100) 33 | z <- rosenbrock(expand.grid(x, y)) 34 | contour(x, y, matrix(log10(z), length(x)), xlab="x", ylab="y") 35 | # draw the optimum as a point 36 | points(result$par[1], result$par[2], col="red", pch=19) 37 | # draw a square around the optimum to highlight it 38 | rect(result$par[1]-0.2, result$par[2]-0.2, result$par[1]+0.2, 39 | 
result$par[2]+0.2, lwd=2) 40 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/6-Optimization/golden_section_search.R: -------------------------------------------------------------------------------- 1 | # Golden Section Line Search 2 | 3 | # define a 1D basin function, optimum at f(0)=0 4 | basin <- function(x) { 5 | x[1]^2 6 | } 7 | 8 | # locate the minimum of the function using a Golden Section Line Search 9 | result <- optimize( 10 | basin, # the function to be minimized 11 | c(-5, 5), # the bounds on the function parameter 12 | maximum=FALSE, # we are concerned with the function minimum 13 | tol=1e-8) # the size of the final bracketing 14 | 15 | # display the results 16 | print(result$minimum) # function parameter 17 | print(result$objective) # function response 18 | 19 | # plot the function 20 | x <- seq(-5, 5, length.out=100) 21 | y <- basin(expand.grid(x)) 22 | plot(x, y, xlab="x",ylab="f(x)", type="l") 23 | # plot the solution as a point 24 | points(result$minimum, result$objective, col="red", pch=19) 25 | # draw a square around the optimum to highlight it 26 | rect(result$minimum-0.3, result$objective-0.7, result$minimum+0.3, 27 | result$objective+0.7, lwd=2) 28 | 29 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/3-Algorithms/1-Algorithms/6-Optimization/nelder_mead.R: -------------------------------------------------------------------------------- 1 | # Nelder-Mead method 2 | 3 | # definition of the 2D Rosenbrock function, optimum is at (1,1) 4 | rosenbrock <- function(v) { 5 | (1 - v[1])^2 + 100 * (v[2] - v[1]*v[1])^2 6 | } 7 | 8 | # locate the minimum of the function using the Nelder-Mead method 9 | result <- optim( 10 | c(runif(1,-3,3), runif(1,-3,3)), # start at a random position 11 | rosenbrock, # the function to minimize 12 | NULL, # no function gradient 13 | method="Nelder-Mead", # use the Nelder-Mead method 14 | control=c( # configure Nelder-Mead 15 | maxit=100, # maximum iterations of 100 16 | reltol=1e-8, # relative convergence tolerance per step 17 | alpha=1.0, # reflection factor 18 | beta=0.5, # contraction factor 19 | gamma=2.0)) # expansion factor 20 | 21 | # summarise results 22 | print(result$par) # the coordinates of the minimum 23 | print(result$value) # the function response of the minimum 24 | print(result$counts) # the number of function calls performed 25 | 26 | # display the function as a contour plot 27 | x <- seq(-3, 3, length.out=100) 28 | y <- seq(-3, 3, length.out=100) 29 | z <- rosenbrock(expand.grid(x, y)) 30 | contour(x, y, matrix(log10(z), length(x)), xlab="x",ylab="y") 31 | # draw the optimum as a point 32 | points(result$par[1], result$par[2], col="red", pch=19) 33 | # draw a square around the optimum to highlight it 34 | rect(result$par[1]-0.2, result$par[2]-0.2, result$par[1]+0.2, 35 | result$par[2]+0.2, lwd=2) 36 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/1-ResamplingMethods/bootstrap.R: 
-------------------------------------------------------------------------------- 1 | # Estimate accuracy using the bootstrap. 2 | 3 | # load the library 4 | library(caret) 5 | # load the iris dataset 6 | data(iris) 7 | # define training control 8 | train_control <- trainControl(method="boot", number=100) 9 | # train the model 10 | model <- train(Species~., data=iris, trControl=train_control, method="nb") 11 | # summarize results 12 | print(model) 13 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/1-ResamplingMethods/data_split.R: -------------------------------------------------------------------------------- 1 | # Estimate accuracy using a train/test split. 2 | 3 | # load the libraries 4 | library(caret) 5 | library(klaR) 6 | # load the iris dataset 7 | data(iris) 8 | # define an 80%/20% train/test split of the dataset 9 | split=0.80 10 | trainIndex <- createDataPartition(iris$Species, p=split, list=FALSE) 11 | data_train <- iris[ trainIndex,] 12 | data_test <- iris[-trainIndex,] 13 | # train a naive bayes model 14 | model <- NaiveBayes(Species~., data=data_train) 15 | # make predictions 16 | x_test <- data_test[,1:4] 17 | y_test <- data_test[,5] 18 | predictions <- predict(model, x_test) 19 | # summarize results 20 | confusionMatrix(predictions$class, y_test) 21 | 22 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/1-ResamplingMethods/kfold_cross_validation.R: -------------------------------------------------------------------------------- 1 | # Estimate accuracy using a k-fold cross validation. 2 | 3 | # load the library 4 | library(caret) 5 | # load the iris dataset 6 | data(iris) 7 | # define training control 8 | train_control <- trainControl(method="cv", number=10) 9 | # fix the parameters of the algorithm 10 | grid <- expand.grid(.fL=c(0), .usekernel=c(FALSE)) 11 | # train the model 12 | model <- train(Species~., data=iris, trControl=train_control, method="nb", tuneGrid=grid) 13 | # summarize results 14 | print(model) 15 | 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/1-ResamplingMethods/leave_one_out_cross_validation.R: -------------------------------------------------------------------------------- 1 | # Estimate accuracy using a leave one out cross validation. 2 | 3 | # load the library 4 | library(caret) 5 | # load the iris dataset 6 | data(iris) 7 | # define training control 8 | train_control <- trainControl(method="LOOCV") 9 | # train the model 10 | model <- train(Species~., data=iris, trControl=train_control, method="nb") 11 | # summarize results 12 | print(model) 13 | 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/1-ResamplingMethods/repeated_kfold_cross_validation.R: -------------------------------------------------------------------------------- 1 | # Estimate accuracy using repeated k-fold cross validation. 
2 | 3 | # load the library 4 | library(caret) 5 | # load the iris dataset 6 | data(iris) 7 | # define training control 8 | train_control <- trainControl(method="repeatedcv", number=10, repeats=3) 9 | # train the model 10 | model <- train(Species~., data=iris, trControl=train_control, method="nb") 11 | # summarize results 12 | print(model) 13 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/2-Metrics/Accuracy.R: -------------------------------------------------------------------------------- 1 | # Accuracy metric 2 | 3 | # load libraries 4 | library(caret) 5 | library(mlbench) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare resampling method 9 | control <- trainControl(method="cv", number=5) 10 | set.seed(7) 11 | fit <- train(diabetes~., data=PimaIndiansDiabetes, method="glm", metric="Accuracy", trControl=control) 12 | # display results 13 | print(fit) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/2-Metrics/Kappa.R: -------------------------------------------------------------------------------- 1 | # Kappa metric 2 | 3 | # load libraries 4 | library(caret) 5 | library(mlbench) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare resampling method 9 | control <- trainControl(method="cv", number=5) 10 | set.seed(7) 11 | fit <- train(diabetes~., data=PimaIndiansDiabetes, method="glm", metric="Kappa", trControl=control) 12 | # display results 13 | print(fit) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/2-Metrics/LogLoss.R: -------------------------------------------------------------------------------- 1 | # MultiNomialLogLoss Metric 2 | 3 | # load libraries 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # prepare resampling method 8 | control <- trainControl(method="cv", number=5, classProbs=TRUE, summaryFunction=mnLogLoss) 9 | set.seed(7) 10 | fit <- train(Species~., data=iris, method="rpart", metric="logLoss", trControl=control) 11 | # display results 12 | print(fit) 13 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/2-Metrics/RMSE.R: -------------------------------------------------------------------------------- 1 | # RMSE metric 2 | 3 | # load libraries 4 | library(caret) 5 | # load data 6 | data(longley) 7 | # prepare resampling method 8 | control <- trainControl(method="cv", number=5) 9 | set.seed(7) 10 | fit <- train(Employed~., data=longley, method="lm", metric="RMSE", trControl=control) 11 | # display results 12 | print(fit) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/2-Metrics/ROC.R: -------------------------------------------------------------------------------- 1 | # ROC: AUC, sensitivity, specificity metrics 2 | 3 | # load libraries 4 | library(caret) 5 | library(mlbench) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare resampling method 9 | control <- trainControl(method="cv", number=5, classProbs=TRUE, summaryFunction=twoClassSummary) 10 | set.seed(7) 11 | fit <- train(diabetes~., data=PimaIndiansDiabetes, method="glm", metric="ROC", trControl=control) 12 | # display results 13 | print(fit) 
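The metric recipes above and below score models during resampling via trainControl and train. As a small illustrative addition (not one of the original recipes), caret's postResample() computes RMSE and R^2 (and MAE in recent caret versions) for a vector of predictions against observed values; this sketch reuses the longley data from the RMSE recipe above:

# score raw predictions with caret's postResample (assumes caret and longley are loaded as in the RMSE recipe)
fit <- lm(Employed~., data=longley)
predictions <- predict(fit, longley)
print(postResample(pred=predictions, obs=longley$Employed))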
-------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/2-Metrics/RSquared.R: -------------------------------------------------------------------------------- 1 | # Rsquared metric 2 | 3 | # load libraries 4 | library(caret) 5 | # load data 6 | data(longley) 7 | # prepare resampling method 8 | control <- trainControl(method="cv", number=5) 9 | set.seed(7) 10 | fit <- train(Employed~., data=longley, method="lm", metric="Rsquared", trControl=control) 11 | # display results 12 | print(fit) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_boxplots.R: -------------------------------------------------------------------------------- 1 | # Compare models using box and whisker plots 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # box and whisker plots to compare models 28 | scales <- list(x=list(relation="free"), y=list(relation="free")) 29 | bwplot(results, scales=scales) 30 | 31 | 32 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_densityplot.R: -------------------------------------------------------------------------------- 1 | # Compare models using density plots plots 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., 
data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # density plots of accuracy 28 | scales <- list(x=list(relation="free"), y=list(relation="free")) 29 | densityplot(results, scales=scales, pch = "|") 30 | 31 | 32 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_dotplot.R: -------------------------------------------------------------------------------- 1 | # Compare models using dotplots 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # dot plots of accuracy 28 | scales <- list(x=list(relation="free"), y=list(relation="free")) 29 | dotplot(results, scales=scales) 30 | 31 | 32 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_parallelplot.R: -------------------------------------------------------------------------------- 1 | # Compare models using parallel plots 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # parallel plots to compare models 28 | parallelplot(results) 29 | 30 | 31 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_scatterplot_matrix.R: -------------------------------------------------------------------------------- 1 | # Compare models using scatterplot matrix 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare 
training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # pair-wise scatterplots of predictions to compare models 28 | splom(results) 29 | # pair-wise scatterplots of accuracy measures to compare models 30 | splom(results, variables="metrics") 31 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_summary.R: -------------------------------------------------------------------------------- 1 | # Compare models using a table summary 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # summarize differences between modes 28 | summary(results) 29 | 30 | 31 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/compare_xyplot.R: -------------------------------------------------------------------------------- 1 | # Compare models using xyplot 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- 
resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # xyplot plots to compare models 28 | xyplot(results, models=c("LDA", "SVM")) 29 | 30 | 31 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/4-EvaluateAlgorithms/3-ModelSelection/significant_difference.R: -------------------------------------------------------------------------------- 1 | # Calculate statistical significance of difference between model predictions 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", trControl=control) 13 | # LDA 14 | set.seed(7) 15 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", trControl=control) 16 | # SVM 17 | set.seed(7) 18 | fit.svm <- train(diabetes~., data=PimaIndiansDiabetes, method="svmRadial", trControl=control) 19 | # kNN 20 | set.seed(7) 21 | fit.knn <- train(diabetes~., data=PimaIndiansDiabetes, method="knn", trControl=control) 22 | # Random Forest 23 | set.seed(7) 24 | fit.rf <- train(diabetes~., data=PimaIndiansDiabetes, method="rf", trControl=control) 25 | # collect resamples 26 | results <- resamples(list(CART=fit.cart, LDA=fit.lda, SVM=fit.svm, KNN=fit.knn, RF=fit.rf)) 27 | # difference in model predictions 28 | diffs <- diff(results) 29 | # summarize p-values for pair-wise comparisons 30 | summary(diffs) 31 | # plot of differences 32 | scales <- list(x=list(relation="free"), y=list(relation="free")) 33 | bwplot(diffs, scales=scales) 34 | # t-test between two models 35 | compare_models(fit.svm, fit.lda) 36 | 37 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/5-ImproveResults/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/1-TuneAlgorithms/automatic_grid_search.R: -------------------------------------------------------------------------------- 1 | # Tune algorithm parameters using an automatic grid search. 
2 | 3 | # load the library 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # prepare training scheme 8 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 9 | # train the model 10 | model <- train(Species~., data=iris, method="lvq", trControl=control, tuneLength=5) 11 | # summarize the model 12 | print(model) 13 | # plot the effect of parameters on accuracy 14 | plot(model) 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/1-TuneAlgorithms/custom_search.R: -------------------------------------------------------------------------------- 1 | # Custom Parameter Search 2 | 3 | # load the packages 4 | library(randomForest) 5 | library(mlbench) 6 | library(caret) 7 | # configure multi-core (not supported on Windows) 8 | library(doMC) 9 | registerDoMC(cores=8) 10 | 11 | # define the custom caret algorithm (wrapper for Random Forest) 12 | customRF <- list(type="Classification", library="randomForest", loop=NULL) 13 | customRF$parameters <- data.frame(parameter=c("mtry", "ntree"), class=rep("numeric", 2), label=c("mtry", "ntree")) 14 | customRF$grid <- function(x, y, len=NULL, search="grid") {} 15 | customRF$fit <- function(x, y, wts, param, lev, last, weights, classProbs, ...) { 16 | randomForest(x, y, mtry=param$mtry, ntree=param$ntree, ...) 17 | } 18 | customRF$predict <- function(modelFit, newdata, preProc=NULL, submodels=NULL) 19 | predict(modelFit, newdata) 20 | customRF$prob <- function(modelFit, newdata, preProc=NULL, submodels=NULL) 21 | predict(modelFit, newdata, type = "prob") 22 | customRF$sort <- function(x) x[order(x[,1]),] 23 | customRF$levels <- function(x) x$classes 24 | 25 | # Load Dataset 26 | data(Sonar) 27 | dataset <- Sonar 28 | seed <- 7 29 | metric <- "Accuracy" 30 | 31 | # train model 32 | trainControl <- trainControl(method="repeatedcv", number=10, repeats=3) 33 | tunegrid <- expand.grid(.mtry=c(1:15), .ntree=c(1000, 1500, 2000, 2500)) 34 | set.seed(seed) 35 | custom <- train(Class~., data=dataset, method=customRF, metric=metric, tuneGrid=tunegrid, trControl=trainControl) 36 | print(custom) 37 | plot(custom) 38 | 39 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/1-TuneAlgorithms/manual_grid_search.R: -------------------------------------------------------------------------------- 1 | # Tune algorithm parameters using a manual grid search. 
2 | 3 | # load the library 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # prepare training scheme 8 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 9 | # design the parameter tuning grid 10 | grid <- expand.grid(size=c(5,10,20,50), k=c(1,2,3,4,5)) 11 | # train the model 12 | model <- train(Species~., data=iris, method="lvq", trControl=control, tuneGrid=grid) 13 | # summarize the model 14 | print(model) 15 | # plot the effect of parameters on accuracy 16 | plot(model) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/1-TuneAlgorithms/manual_search.R: -------------------------------------------------------------------------------- 1 | # Manually search parametres 2 | 3 | # load the packages 4 | library(randomForest) 5 | library(mlbench) 6 | library(caret) 7 | # Load Dataset 8 | data(Sonar) 9 | dataset <- Sonar 10 | x <- dataset[,1:60] 11 | y <- dataset[,61] 12 | seed <- 7 13 | metric <- "Accuracy" 14 | # Manual Search 15 | trainControl <- trainControl(method="repeatedcv", number=10, repeats=3, search="grid") 16 | tunegrid <- expand.grid(.mtry=c(sqrt(ncol(x)))) 17 | modellist <- list() 18 | for (ntree in c(1000, 1500, 2000, 2500)) { 19 | set.seed(seed) 20 | fit <- train(Class~., data=dataset, method="rf", metric=metric, tuneGrid=tunegrid, trControl=trainControl, ntree=ntree) 21 | key <- toString(ntree) 22 | modellist[[key]] <- fit 23 | } 24 | # compare results 25 | results <- resamples(modellist) 26 | summary(results) 27 | dotplot(results) 28 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/1-TuneAlgorithms/optimal_parameters.R: -------------------------------------------------------------------------------- 1 | # Select the best tuning configuration 2 | 3 | # load libraries 4 | library(mlbench) 5 | library(caret) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # prepare training scheme 9 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 10 | # CART 11 | set.seed(7) 12 | tunegrid <- expand.grid(.cp=seq(0,0.1,by=0.01)) 13 | fit.cart <- train(diabetes~., data=PimaIndiansDiabetes, method="rpart", metric="Accuracy", tuneGrid=tunegrid, trControl=control) 14 | # display the best configuration 15 | print(fit.cart$bestTune) 16 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/1-TuneAlgorithms/random_search.R: -------------------------------------------------------------------------------- 1 | # Randomly search algorithm parameters 2 | 3 | # load the library 4 | library(caret) 5 | # load the dataset 6 | data(iris) 7 | # prepare training scheme 8 | control <- trainControl(method="repeatedcv", number=10, repeats=3, search="random") 9 | # train the model 10 | model <- train(Species~., data=iris, method="lvq", trControl=control, tuneLength=25) 11 | # summarize the model 12 | print(model) 13 | # plot the effect of parameters on accuracy 14 | plot(model) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/2-Ensembles/bagging.R: -------------------------------------------------------------------------------- 1 | # Bagging or Bootstrap Aggregation of Decision Trees 2 | 3 | # load the libraries 4 | library(ipred) 5 | library(rpart) 6 | library(mlbench) 7 | # load the dataset 8 | 
data(PimaIndiansDiabetes) 9 | # bag the decision tree 10 | model <- bagging(diabetes~., data=PimaIndiansDiabetes, nbagg=25, coob=TRUE) 11 | # make predictions on the training dataset 12 | predictions <- predict(model, PimaIndiansDiabetes[,1:8]) 13 | # summarize accuracy 14 | table(predictions, PimaIndiansDiabetes$diabetes) 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/2-Ensembles/blending.R: -------------------------------------------------------------------------------- 1 | # Blending (linear combination of models) 2 | 3 | # load libraries 4 | library(caret) 5 | library(caretEnsemble) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # define training control 9 | train_control <- trainControl(method="cv", number=10, savePredictions=TRUE, classProbs=TRUE) 10 | # train a list of models 11 | methodList <- c('glm', 'lda', 'knn') 12 | models <- caretList(diabetes~., data=PimaIndiansDiabetes, trControl=train_control, methodList=methodList) 13 | # create ensemble of trained models 14 | ensemble <- caretEnsemble(models) 15 | # summarize ensemble 16 | summary(ensemble) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/5-ImproveResults/2-Ensembles/stacking.R: -------------------------------------------------------------------------------- 1 | # Stacking (non-linear combination of models) 2 | 3 | # load libraries 4 | library(caret) 5 | library(caretEnsemble) 6 | # load the dataset 7 | data(PimaIndiansDiabetes) 8 | # define training control 9 | train_control <- trainControl(method="cv", number=10, savePredictions=TRUE, classProbs=TRUE) 10 | # train a list of models 11 | methodList <- c('glm', 'lda', 'knn') 12 | models <- caretList(diabetes~., data=PimaIndiansDiabetes, trControl=train_control, methodList=methodList) 13 | # create stacked ensemble of trained models 14 | ensemble <- caretStack(models, method='rpart') 15 | # summarize ensemble 16 | summary(ensemble) 17 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/6-FinalizeModel/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/6-FinalizeModel/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/6-FinalizeModel/1-Predict/predict_caret.R: -------------------------------------------------------------------------------- 1 | # Make predictions using caret model 2 | 3 | # load libraries 4 | library(caret) 5 | library(mlbench) 6 | # load dataset 7 | data(PimaIndiansDiabetes) 8 | # create 80%/20% for training and validation datasets 9 | set.seed(9) 10 | validation_index <- createDataPartition(PimaIndiansDiabetes$diabetes, p=0.80, list=FALSE) 11 | validation <- PimaIndiansDiabetes[-validation_index,] 12 | training <- PimaIndiansDiabetes[validation_index,] 13 | # train a model and summarize model 14 | set.seed(9) 15 | control <- trainControl(method="cv", number=10) 16 | fit.lda <- train(diabetes~., data=training, method="lda", metric="Accuracy", trControl=control) 17 | print(fit.lda) 18 | print(fit.lda$finalModel) 19 | # estimate skill on validation dataset 20 | set.seed(9) 21 | predictions <- predict(fit.lda, newdata=validation) 22 | confusionMatrix(predictions, 
validation$diabetes) 23 | 24 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/6-FinalizeModel/1-Predict/train_all_dataset.R: -------------------------------------------------------------------------------- 1 | # Train a model on the entire training dataset 2 | 3 | # load libraries 4 | library(caret) 5 | library(mlbench) 6 | # load dataset 7 | data(PimaIndiansDiabetes) 8 | set.seed(9) 9 | control <- trainControl(method="none", number=10) 10 | fit.lda <- train(diabetes~., data=PimaIndiansDiabetes, method="lda", metric="Accuracy", trControl=control) 11 | print(fit.lda) 12 | print(fit.lda$finalModel) 13 | 14 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/6-FinalizeModel/2-FinalModel/standalone_model.R: -------------------------------------------------------------------------------- 1 | # Create a Standalone Model 2 | 3 | # load libraries 4 | library(caret) 5 | library(mlbench) 6 | library(randomForest) 7 | library(doMC) 8 | registerDoMC(cores=8) 9 | # load dataset 10 | data(Sonar) 11 | set.seed(7) 12 | # create 80%/20% for training and validation datasets 13 | validation_index <- createDataPartition(Sonar$Class, p=0.80, list=FALSE) 14 | validation <- Sonar[-validation_index,] 15 | training <- Sonar[validation_index,] 16 | # train a model and summarize model 17 | set.seed(7) 18 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 19 | fit.rf <- train(Class~., data=training, method="rf", metric="Accuracy", trControl=control, ntree=2000) 20 | print(fit.rf) 21 | print(fit.rf$finalModel) 22 | # create standalone model using all training data 23 | set.seed(7) 24 | finalModel <- randomForest(Class~., training, mtry=2, ntree=2000) 25 | # make predictions on "new data" using the final model 26 | final_predictions <- predict(finalModel, validation[,1:60]) 27 | confusionMatrix(final_predictions, validation$Class) 28 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/6-FinalizeModel/3-SaveLoadModel/save_load_model.R: -------------------------------------------------------------------------------- 1 | # Save and Load model 2 | 3 | 4 | # load libraries 5 | library(caret) 6 | library(mlbench) 7 | library(randomForest) 8 | library(doMC) 9 | registerDoMC(cores=8) 10 | # load dataset 11 | data(Sonar) 12 | set.seed(7) 13 | # create 80%/20% for training and validation datasets 14 | validation_index <- createDataPartition(Sonar$Class, p=0.80, list=FALSE) 15 | validation <- Sonar[-validation_index,] 16 | training <- Sonar[validation_index,] 17 | # create final standalone model using all training data 18 | set.seed(7) 19 | final_model <- randomForest(Class~., training, mtry=2, ntree=2000) 20 | # save the model to disk 21 | saveRDS(final_model, "./final_model.rds") 22 | 23 | # later... 
24 | 25 | # load the model 26 | super_model <- readRDS("./final_model.rds") 27 | print(super_model) 28 | # make a predictions on "new data" using the final model 29 | final_predictions <- predict(super_model, validation[,1:60]) 30 | confusionMatrix(final_predictions, validation$Class) 31 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/7-Other/install_list_of_packages.R: -------------------------------------------------------------------------------- 1 | # Install a list of packages 2 | 3 | # preferred repo 4 | repository <- "http://cran.ms.unimelb.edu.au/" 5 | # list of packages used by project 6 | packages <- c("ggplot2", "caret", "mlbench", "caretEnsemble", "ipred", "rpart", 7 | "doMC", "AppliedPredictiveModeling", "corrplot", "Hmisc", "DMwR", "lattice", 8 | "RWeka", "e1071", "C50") 9 | 10 | for (p in packages) { 11 | if(p %in% rownames(installed.packages()) == FALSE) { 12 | install.packages(p, repos=repository) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/7-Other/install_package_with_dependencies.R: -------------------------------------------------------------------------------- 1 | # install package with dependencies 2 | 3 | # Install package with dependencies 4 | install.packages("caret", dependencies = c("Depends", "Suggests")) -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/8-CaseStudies/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/8-CaseStudies/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/8-CaseStudies/MultiClassClassification/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/ML-mastery/f950dc4bc28c93caa6589fde57ccf04dd6d413c3/machine_learning_mastery_with_r_code/8-CaseStudies/MultiClassClassification/.DS_Store -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/8-CaseStudies/project_template.R: -------------------------------------------------------------------------------- 1 | # R Project Template 2 | 3 | # 1. Prepare Problem 4 | # a) Load libraries 5 | # b) Load dataset 6 | # c) Split-out validation dataset 7 | 8 | # 2. Summarize Data 9 | # a) Descriptive statistics 10 | # b) Data visualizations 11 | 12 | # 3. Prepare Data 13 | # a) Data Cleaning 14 | # b) Feature Selection 15 | # c) Data Transforms 16 | 17 | # 4. Evaluate Algorithms 18 | # a) Test options and evaluation metric 19 | # b) Spot Check Algorithms 20 | # c) Compare Algorithms 21 | 22 | # 5. Improve Accuracy 23 | # a) Algorithm Tuning 24 | # b) Ensembles 25 | 26 | # 6. 
Finalize Model 27 | # a) Predictions on validation dataset 28 | # b) Create standalone model on entire training dataset 29 | # c) Save model for later use 30 | -------------------------------------------------------------------------------- /machine_learning_mastery_with_r_code/README.txt: -------------------------------------------------------------------------------- 1 | Machine Learning Mastery With R: Recipes 2 | ======================================== 3 | 4 | Recipes that you can use to bootstrap your machine learning project in R. 5 | 6 | 7 | About 8 | ----- 9 | 10 | - Recipes are code snippets not tutorials. 11 | - Recipes provide just enough code to work. 12 | - Recipes are demonstrative not exhaustive. 13 | - Recipes run as-is and produce a result. 14 | - Recipes assume that required packages are installed. 15 | - Recipes use built-in datasets or datasets provided in specific packages. 16 | - Recipes are limited to regression and classification predictive modeling problems. 17 | 18 | Usage 19 | ----- 20 | 21 | 1. Find a useful recipe. 22 | 2. Copy and paste it into your project. 23 | 3. Adapt it to your needs. 24 | -------------------------------------------------------------------------------- /ml_with_python_code/02_scipy_versions.py: -------------------------------------------------------------------------------- 1 | # scipy 2 | import scipy 3 | print('scipy: {}'.format(scipy.__version__)) 4 | # numpy 5 | import numpy 6 | print('numpy: {}'.format(numpy.__version__)) 7 | # matplotlib 8 | import matplotlib 9 | print('matplotlib: {}'.format(matplotlib.__version__)) 10 | # pandas 11 | import pandas 12 | print('pandas: {}'.format(pandas.__version__)) -------------------------------------------------------------------------------- /ml_with_python_code/02_sklearn_version.py: -------------------------------------------------------------------------------- 1 | # scikit-learn 2 | import sklearn 3 | print('sklearn: {}'.format(sklearn.__version__)) -------------------------------------------------------------------------------- /ml_with_python_code/03_matplotlib_crash_course.py: -------------------------------------------------------------------------------- 1 | # matplotlib crash course 2 | 3 | 4 | # basic line plot 5 | import matplotlib.pyplot as plt 6 | import numpy 7 | myarray = numpy.array([1, 2, 3]) 8 | plt.plot(myarray) 9 | plt.xlabel('some x axis') 10 | plt.ylabel('some y axis') 11 | plt.show() 12 | 13 | 14 | # basic scatter plot 15 | import matplotlib.pyplot as plt 16 | import numpy 17 | x = numpy.array([1, 2, 3]) 18 | y = numpy.array([2, 4, 6]) 19 | plt.scatter(x,y) 20 | plt.xlabel('some x axis') 21 | plt.ylabel('some y axis') 22 | plt.show() 23 | -------------------------------------------------------------------------------- /ml_with_python_code/03_numpy_crash_course.py: -------------------------------------------------------------------------------- 1 | # numpy crash course 2 | 3 | # define an array 4 | import numpy 5 | mylist = [1, 2, 3] 6 | myarray = numpy.array(mylist) 7 | print(myarray) 8 | print(myarray.shape) 9 | 10 | # access values 11 | import numpy 12 | mylist = [[1, 2, 3], [3, 4, 5]] 13 | myarray = numpy.array(mylist) 14 | print(myarray) 15 | print(myarray.shape) 16 | print("First row: %s") % myarray[0] 17 | print("Last row: %s") % myarray[-1] 18 | print("Specific row and col: %s") % myarray[0, 2] 19 | print("Whole col: %s") % myarray[:, 2] 20 | 21 | # arithmetic 22 | import numpy 23 | myarray1 = numpy.array([2, 2, 2]) 24 | myarray2 = numpy.array([3, 3, 3]) 25 | 
print("Addition: %s") % (myarray1 + myarray2) 26 | print("Multiplication: %s") % (myarray1 * myarray2) 27 | 28 | -------------------------------------------------------------------------------- /ml_with_python_code/03_pandas_crash_course.py: -------------------------------------------------------------------------------- 1 | # pandas crash course 2 | 3 | 4 | # series 5 | import numpy 6 | import pandas 7 | myarray = numpy.array([1, 2, 3]) 8 | rownames = ['a', 'b', 'c'] 9 | myseries = pandas.Series(myarray, index=rownames) 10 | print(myseries) 11 | 12 | print(myseries[0]) 13 | print(myseries['a']) 14 | 15 | 16 | # dataframe 17 | import numpy 18 | import pandas 19 | myarray = numpy.array([[1, 2, 3], [4, 5, 6]]) 20 | rownames = ['a', 'b'] 21 | colnames = ['one', 'two', 'three'] 22 | mydataframe = pandas.DataFrame(myarray, index=rownames, columns=colnames) 23 | print(mydataframe) 24 | 25 | print("one column: %s") % mydataframe['one'] 26 | print("one column: %s") % mydataframe.one 27 | -------------------------------------------------------------------------------- /ml_with_python_code/03_python_crash_course.py: -------------------------------------------------------------------------------- 1 | # Python Crash Course 2 | 3 | 4 | # Assignment 5 | # ========== 6 | 7 | # Strings 8 | data = 'hello world' 9 | print(data[0]) 10 | print(len(data)) 11 | print(data) 12 | 13 | # Numbers 14 | value = 123.1 15 | print(value) 16 | value = 10 17 | print(value) 18 | 19 | # Boolean 20 | a = True 21 | b = False 22 | print(a, b) 23 | 24 | # Multiple Assignment 25 | a, b, c = 1, 2, 3 26 | print(a, b, c) 27 | 28 | # No value 29 | a = None 30 | print(a) 31 | 32 | 33 | 34 | # Flow Control 35 | # ============ 36 | 37 | # If-Then-Else 38 | 39 | value = 99 40 | if value >= 99: 41 | print 'That is fast' 42 | elif value > 200: 43 | print 'That is too fast' 44 | else: 45 | print 'That that is safe' 46 | 47 | # For-Loop 48 | for i in range(10): 49 | print i 50 | 51 | # While-Loop 52 | i = 0 53 | while i < 10: 54 | print i 55 | i += 1 56 | 57 | 58 | # Data Structures 59 | # =============== 60 | 61 | # Tuple (cannot change) 62 | a = (1, 2, 3) 63 | print a 64 | 65 | 66 | # Lists 67 | mylist = [1, 2, 3] 68 | print("Zeroth Value: %d") % mylist[0] 69 | mylist.append(4) 70 | print("List Length: %d") % len(mylist) 71 | for value in mylist: 72 | print value 73 | 74 | 75 | 76 | # Dictionaries 77 | 78 | mydict = {'a': 1, 'b': 2, 'c': 3} 79 | print("A value: %d") % mydict['a'] 80 | mydict['a'] = 11 81 | print("A value: %d") % mydict['a'] 82 | print("Keys: %s") % mydict.keys() 83 | print("Values: %s") % mydict.values() 84 | for key in mydict.keys(): 85 | print mydict[key] 86 | 87 | 88 | # Functions 89 | # =========== 90 | 91 | # Sum function 92 | def mysum(x, y): 93 | return x + y 94 | 95 | # Test sum function 96 | mysum(1, 3) 97 | 98 | 99 | -------------------------------------------------------------------------------- /ml_with_python_code/04_load_csv.py: -------------------------------------------------------------------------------- 1 | # Load CSV Using Python Standard Library 2 | import csv 3 | import numpy 4 | filename = 'pima-indians-diabetes.data.csv' 5 | raw_data = open(filename, 'rb') 6 | reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE) 7 | x = list(reader) 8 | data = numpy.array(x).astype('float') 9 | print(data.shape) 10 | -------------------------------------------------------------------------------- /ml_with_python_code/04_load_csv_np.py: 
-------------------------------------------------------------------------------- 1 | # Load CSV using NumPy 2 | from numpy import loadtxt 3 | filename = 'pima-indians-diabetes.data.csv' 4 | raw_data = open(filename, 'rb') 5 | data = loadtxt(raw_data, delimiter=",") 6 | print(data.shape) 7 | -------------------------------------------------------------------------------- /ml_with_python_code/04_load_csv_np_url.py: -------------------------------------------------------------------------------- 1 | # Load CSV from URL using NumPy 2 | from numpy import loadtxt 3 | from urllib import urlopen 4 | url = 'https://goo.gl/vhm1eU' 5 | raw_data = urlopen(url) 6 | dataset = loadtxt(raw_data, delimiter=",") 7 | print(dataset.shape) 8 | -------------------------------------------------------------------------------- /ml_with_python_code/04_load_csv_pandas.py: -------------------------------------------------------------------------------- 1 | # Load CSV using Pandas 2 | from pandas import read_csv 3 | filename = 'pima-indians-diabetes.data.csv' 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(filename, names=names) 6 | print(data.shape) 7 | -------------------------------------------------------------------------------- /ml_with_python_code/04_load_csv_pandas_url.py: -------------------------------------------------------------------------------- 1 | # Load CSV using Pandas from URL 2 | from pandas import read_csv 3 | url = 'https://goo.gl/vhm1eU' 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(url, names=names) 6 | print(data.shape) 7 | -------------------------------------------------------------------------------- /ml_with_python_code/05_class_distribution.py: -------------------------------------------------------------------------------- 1 | # Class Distribution 2 | from pandas import read_csv 3 | filename = "pima-indians-diabetes.data.csv" 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(filename, names=names) 6 | class_counts = data.groupby('class').size() 7 | print(class_counts) 8 | -------------------------------------------------------------------------------- /ml_with_python_code/05_data_types.py: -------------------------------------------------------------------------------- 1 | # Data Types for Each Attribute 2 | from pandas import read_csv 3 | filename = "pima-indians-diabetes.data.csv" 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(filename, names=names) 6 | types = data.dtypes 7 | print(types) 8 | -------------------------------------------------------------------------------- /ml_with_python_code/05_describe.py: -------------------------------------------------------------------------------- 1 | # Statistical Summary 2 | from pandas import read_csv 3 | from pandas import set_option 4 | filename = "pima-indians-diabetes.data.csv" 5 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 6 | data = read_csv(filename, names=names) 7 | set_option('display.width', 100) 8 | set_option('precision', 3) 9 | description = data.describe() 10 | print(description) 11 | -------------------------------------------------------------------------------- /ml_with_python_code/05_dimensions.py: -------------------------------------------------------------------------------- 1 | # Dimensions of your data 2 | from pandas import read_csv 3 | filename = 
"pima-indians-diabetes.data.csv" 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(filename, names=names) 6 | shape = data.shape 7 | print(shape) 8 | -------------------------------------------------------------------------------- /ml_with_python_code/05_head.py: -------------------------------------------------------------------------------- 1 | # View first 20 rows 2 | from pandas import read_csv 3 | filename = "pima-indians-diabetes.data.csv" 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(filename, names=names) 6 | peek = data.head(20) 7 | print(peek) 8 | -------------------------------------------------------------------------------- /ml_with_python_code/05_pearson_correlation.py: -------------------------------------------------------------------------------- 1 | # Pairwise Pearson correlations 2 | from pandas import read_csv 3 | from pandas import set_option 4 | filename = "pima-indians-diabetes.data.csv" 5 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 6 | data = read_csv(filename, names=names) 7 | set_option('display.width', 100) 8 | set_option('precision', 3) 9 | correlations = data.corr(method='pearson') 10 | print(correlations) 11 | -------------------------------------------------------------------------------- /ml_with_python_code/05_skew.py: -------------------------------------------------------------------------------- 1 | # Skew for each attribute 2 | from pandas import read_csv 3 | filename = "pima-indians-diabetes.data.csv" 4 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 5 | data = read_csv(filename, names=names) 6 | skew = data.skew() 7 | print(skew) 8 | -------------------------------------------------------------------------------- /ml_with_python_code/06_boxplot.py: -------------------------------------------------------------------------------- 1 | # Box and Whisker Plots 2 | from matplotlib import pyplot 3 | from pandas import read_csv 4 | filename = "pima-indians-diabetes.data.csv" 5 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 6 | data = read_csv(filename, names=names) 7 | data.plot(kind='box', subplots=True, layout=(3,3), sharex=False, sharey=False) 8 | pyplot.show() 9 | -------------------------------------------------------------------------------- /ml_with_python_code/06_correlation_matrix.py: -------------------------------------------------------------------------------- 1 | # Correction Matrix Plot 2 | from matplotlib import pyplot 3 | from pandas import read_csv 4 | import numpy 5 | filename = 'pima-indians-diabetes.data.csv' 6 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 7 | data = read_csv(filename, names=names) 8 | correlations = data.corr() 9 | # plot correlation matrix 10 | fig = pyplot.figure() 11 | ax = fig.add_subplot(111) 12 | cax = ax.matshow(correlations, vmin=-1, vmax=1) 13 | fig.colorbar(cax) 14 | ticks = numpy.arange(0,9,1) 15 | ax.set_xticks(ticks) 16 | ax.set_yticks(ticks) 17 | ax.set_xticklabels(names) 18 | ax.set_yticklabels(names) 19 | pyplot.show() 20 | -------------------------------------------------------------------------------- /ml_with_python_code/06_correlation_matrix_generic.py: -------------------------------------------------------------------------------- 1 | # Correction Matrix Plot (generic) 2 | from matplotlib import pyplot 3 | from pandas import read_csv 4 | 
import numpy 5 | filename = 'pima-indians-diabetes.data.csv' 6 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 7 | data = read_csv(filename, names=names) 8 | correlations = data.corr() 9 | # plot correlation matrix 10 | fig = pyplot.figure() 11 | ax = fig.add_subplot(111) 12 | cax = ax.matshow(correlations, vmin=-1, vmax=1) 13 | fig.colorbar(cax) 14 | pyplot.show() 15 | -------------------------------------------------------------------------------- /ml_with_python_code/06_density_plots.py: -------------------------------------------------------------------------------- 1 | # Univariate Density Plots 2 | from matplotlib import pyplot 3 | from pandas import read_csv 4 | filename = 'pima-indians-diabetes.data.csv' 5 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 6 | data = read_csv(filename, names=names) 7 | data.plot(kind='density', subplots=True, layout=(3,3), sharex=False) 8 | pyplot.show() 9 | -------------------------------------------------------------------------------- /ml_with_python_code/06_histograms.py: -------------------------------------------------------------------------------- 1 | # Univariate Histograms 2 | from matplotlib import pyplot 3 | from pandas import read_csv 4 | filename = 'pima-indians-diabetes.data.csv' 5 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 6 | data = read_csv(filename, names=names) 7 | data.hist() 8 | pyplot.show() 9 | -------------------------------------------------------------------------------- /ml_with_python_code/06_scatterplot_matrix.py: -------------------------------------------------------------------------------- 1 | # Scatterplot Matrix 2 | from matplotlib import pyplot 3 | from pandas import read_csv 4 | from pandas.tools.plotting import scatter_matrix 5 | filename = "pima-indians-diabetes.data.csv" 6 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 7 | data = read_csv(filename, names=names) 8 | scatter_matrix(data) 9 | pyplot.show() 10 | -------------------------------------------------------------------------------- /ml_with_python_code/07_binarization.py: -------------------------------------------------------------------------------- 1 | # binarization 2 | from sklearn.preprocessing import Binarizer 3 | from pandas import read_csv 4 | from numpy import set_printoptions 5 | filename = 'pima-indians-diabetes.data.csv' 6 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 7 | dataframe = read_csv(filename, names=names) 8 | array = dataframe.values 9 | # separate array into input and output components 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | binarizer = Binarizer(threshold=0.0).fit(X) 13 | binaryX = binarizer.transform(X) 14 | # summarize transformed data 15 | set_printoptions(precision=3) 16 | print(binaryX[0:5,:]) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/07_normalize_data.py: -------------------------------------------------------------------------------- 1 | # Normalize data (length of 1) 2 | from sklearn.preprocessing import Normalizer 3 | from pandas import read_csv 4 | from numpy import set_printoptions 5 | filename = 'pima-indians-diabetes.data.csv' 6 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 7 | dataframe = read_csv(filename, names=names) 8 | array = dataframe.values 9 | # separate array into input and output components 10 | X = array[:,0:8] 11 | Y 
= array[:,8]
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
# summarize transformed data
set_printoptions(precision=3)
print(normalizedX[0:5,:])
--------------------------------------------------------------------------------
/ml_with_python_code/07_rescale_data.py:
--------------------------------------------------------------------------------
# Rescale data (between 0 and 1)
from pandas import read_csv
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
# summarize transformed data
set_printoptions(precision=3)
print(rescaledX[0:5,:])
--------------------------------------------------------------------------------
/ml_with_python_code/07_standardize_data.py:
--------------------------------------------------------------------------------
# Standardize data (0 mean, 1 stdev)
from sklearn.preprocessing import StandardScaler
from pandas import read_csv
from numpy import set_printoptions
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)
# summarize transformed data
set_printoptions(precision=3)
print(rescaledX[0:5,:])
--------------------------------------------------------------------------------
/ml_with_python_code/08_feature_importance.py:
--------------------------------------------------------------------------------
# Feature Importance with Extra Trees Classifier
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)
print(model.feature_importances_)
--------------------------------------------------------------------------------
/ml_with_python_code/08_pca.py:
--------------------------------------------------------------------------------
# Feature Extraction with PCA
from pandas import read_csv
from sklearn.decomposition import PCA
# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
pca = PCA(n_components=3)
fit = pca.fit(X)
# summarize components
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)
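Note: 08_pca.py above only fits the decomposition and prints the components; if the reduced feature matrix is also needed, the fitted object can project the data directly. A minimal sketch under the same assumptions as the listing above (same CSV, first eight columns as inputs); reduced_X is an illustrative name, not part of the original scripts.

# Sketch: project the Pima inputs onto the 3 retained principal components
from pandas import read_csv
from sklearn.decomposition import PCA
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
array = read_csv(filename, names=names).values
X = array[:, 0:8]
pca = PCA(n_components=3)
# fit_transform learns the components and returns the projected rows in one step
reduced_X = pca.fit_transform(X)
print(reduced_X.shape)  # (number of rows, 3)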
--------------------------------------------------------------------------------
/ml_with_python_code/08_recursive_feature_elimination.py:
--------------------------------------------------------------------------------
# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(X, Y)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)
--------------------------------------------------------------------------------
/ml_with_python_code/08_univariate_selection.py:
--------------------------------------------------------------------------------
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
# summarize scores
set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
# summarize selected features
print(features[0:5,:])
--------------------------------------------------------------------------------
/ml_with_python_code/09_cross_validation.py:
--------------------------------------------------------------------------------
# Evaluate using Cross Validation
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
--------------------------------------------------------------------------------
/ml_with_python_code/09_loocv.py:
--------------------------------------------------------------------------------
# Evaluate using Leave One Out Cross Validation
from pandas import read_csv
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
num_folds = 10
loocv = LeaveOneOut()
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=loocv)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
--------------------------------------------------------------------------------
/ml_with_python_code/09_shuffle_split.py:
--------------------------------------------------------------------------------
# Evaluate using Shuffle Split Cross Validation
from pandas import read_csv
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
n_splits = 10
test_size = 0.33
seed = 7
kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
--------------------------------------------------------------------------------
/ml_with_python_code/09_train_test.py:
--------------------------------------------------------------------------------
# Evaluate using a train and a test set
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result*100.0))
--------------------------------------------------------------------------------
/ml_with_python_code/10_classification_accuracy.py:
--------------------------------------------------------------------------------
# Cross Validation Classification Accuracy
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
kfold = KFold(n_splits=10, random_state=7)
model = LogisticRegression()
scoring = 'accuracy'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))
--------------------------------------------------------------------------------
/ml_with_python_code/10_classification_auc.py:
--------------------------------------------------------------------------------
# Cross Validation Classification ROC AUC
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
kfold = KFold(n_splits=10, random_state=7)
model = LogisticRegression()
scoring = 'roc_auc'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("AUC: %.3f (%.3f)" % (results.mean(), results.std()))
--------------------------------------------------------------------------------
/ml_with_python_code/10_classification_confusion_matrix.py:
--------------------------------------------------------------------------------
# Cross Validation Classification Confusion Matrix
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)
--------------------------------------------------------------------------------
/ml_with_python_code/10_classification_logloss.py:
--------------------------------------------------------------------------------
# Cross Validation Classification LogLoss
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
kfold = KFold(n_splits=10, random_state=7)
model = LogisticRegression()
scoring = 'neg_log_loss'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("Logloss: %.3f (%.3f)" % (results.mean(), results.std()))
--------------------------------------------------------------------------------
/ml_with_python_code/10_classification_report.py:
--------------------------------------------------------------------------------
# Cross Validation Classification Report
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)
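Note: the raw array printed by 10_classification_confusion_matrix.py is easy to misread. One option is to wrap it in a DataFrame so the actual/predicted axes are labelled; in scikit-learn the rows are the actual classes and the columns are the predicted classes. A hedged sketch only, reusing the same data and split as that script; the label strings are illustrative.

# Sketch: a labelled confusion matrix for the Pima logistic regression model
from pandas import read_csv, DataFrame
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
array = read_csv(filename, names=names).values
X = array[:, 0:8]
Y = array[:, 8]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
model = LogisticRegression()
model.fit(X_train, Y_train)
matrix = confusion_matrix(Y_test, model.predict(X_test))
# rows are the actual classes, columns are the predicted classes
print(DataFrame(matrix, index=['actual 0', 'actual 1'], columns=['predicted 0', 'predicted 1']))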
--------------------------------------------------------------------------------
/ml_with_python_code/10_regression_mae.py:
--------------------------------------------------------------------------------
# Cross Validation Regression MAE
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
filename = 'housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = read_csv(filename, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
kfold = KFold(n_splits=10, random_state=7)
model = LinearRegression()
scoring = 'neg_mean_absolute_error'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("MAE: %.3f (%.3f)" % (results.mean(), results.std()))
--------------------------------------------------------------------------------
/ml_with_python_code/10_regression_mse.py:
--------------------------------------------------------------------------------
# Cross Validation Regression MSE
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
filename = 'housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = read_csv(filename, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
num_folds = 10
kfold = KFold(n_splits=10, random_state=7)
model = LinearRegression()
scoring = 'neg_mean_squared_error'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("MSE: %.3f (%.3f)" % (results.mean(), results.std()))
--------------------------------------------------------------------------------
/ml_with_python_code/10_regression_rsquared.py:
--------------------------------------------------------------------------------
# Cross Validation Regression R^2
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
filename = 'housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = read_csv(filename, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
kfold = KFold(n_splits=10, random_state=7)
model = LinearRegression()
scoring = 'r2'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("R^2: %.3f (%.3f)" % (results.mean(), results.std()))
--------------------------------------------------------------------------------
/ml_with_python_code/11_classification_and_regression_trees_classification.py:
--------------------------------------------------------------------------------
# CART Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe =
read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = DecisionTreeClassifier() 14 | results = cross_val_score(model, X, Y, cv=kfold) 15 | print(results.mean()) 16 | -------------------------------------------------------------------------------- /ml_with_python_code/11_gaussian_naive_bayes.py: -------------------------------------------------------------------------------- 1 | # Gaussian Naive Bayes Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.naive_bayes import GaussianNB 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = GaussianNB() 14 | results = cross_val_score(model, X, Y, cv=kfold) 15 | print(results.mean()) 16 | -------------------------------------------------------------------------------- /ml_with_python_code/11_k_nearest_neighbors_classification.py: -------------------------------------------------------------------------------- 1 | # KNN Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.neighbors import KNeighborsClassifier 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | num_folds = 10 13 | kfold = KFold(n_splits=10, random_state=7) 14 | model = KNeighborsClassifier() 15 | results = cross_val_score(model, X, Y, cv=kfold) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/11_linear_discriminant_analysis.py: -------------------------------------------------------------------------------- 1 | # LDA Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | num_folds = 10 13 | kfold = KFold(n_splits=10, random_state=7) 14 | model = LinearDiscriminantAnalysis() 15 | results = cross_val_score(model, X, Y, cv=kfold) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/11_logistic_regression.py: -------------------------------------------------------------------------------- 1 | # Logistic Regression Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.linear_model import LogisticRegression 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 
| X = array[:,0:8] 11 | Y = array[:,8] 12 | num_folds = 10 13 | kfold = KFold(n_splits=10, random_state=7) 14 | model = LogisticRegression() 15 | results = cross_val_score(model, X, Y, cv=kfold) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/11_support_vector_machines_classification.py: -------------------------------------------------------------------------------- 1 | # SVM Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.svm import SVC 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = SVC() 14 | results = cross_val_score(model, X, Y, cv=kfold) 15 | print(results.mean()) 16 | -------------------------------------------------------------------------------- /ml_with_python_code/12_classification_and_regression_trees_regression.py: -------------------------------------------------------------------------------- 1 | # Decision Tree Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.tree import DecisionTreeRegressor 6 | filename = 'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = DecisionTreeRegressor() 14 | scoring = 'neg_mean_squared_error' 15 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/12_elastic_net.py: -------------------------------------------------------------------------------- 1 | # ElasticNet Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.linear_model import ElasticNet 6 | filename = 'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = ElasticNet() 14 | scoring = 'neg_mean_squared_error' 15 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/12_k_nearest_neighbors_regression.py: -------------------------------------------------------------------------------- 1 | # KNN Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.neighbors import KNeighborsRegressor 6 | filename = 'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = 
read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = KNeighborsRegressor() 14 | scoring = 'neg_mean_squared_error' 15 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/12_lasso_regression.py: -------------------------------------------------------------------------------- 1 | # Lasso Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.linear_model import Lasso 6 | filename = 'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = Lasso() 14 | scoring = 'neg_mean_squared_error' 15 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/12_linear_regression.py: -------------------------------------------------------------------------------- 1 | # Linear Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.linear_model import LinearRegression 6 | filename = 'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | kfold = KFold(n_splits=10, random_state=7) 13 | model = LinearRegression() 14 | scoring = 'neg_mean_squared_error' 15 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 16 | print(results.mean()) 17 | -------------------------------------------------------------------------------- /ml_with_python_code/12_ridge_regression.py: -------------------------------------------------------------------------------- 1 | # Ridge Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.linear_model import Ridge 6 | filename = 'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | num_folds = 10 13 | kfold = KFold(n_splits=10, random_state=7) 14 | model = Ridge() 15 | scoring = 'neg_mean_squared_error' 16 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 17 | print(results.mean()) 18 | -------------------------------------------------------------------------------- /ml_with_python_code/12_support_vector_machines_regression.py: -------------------------------------------------------------------------------- 1 | # SVM Regression 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.svm import SVR 6 | filename = 
'housing.csv' 7 | names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 8 | dataframe = read_csv(filename, delim_whitespace=True, names=names) 9 | array = dataframe.values 10 | X = array[:,0:13] 11 | Y = array[:,13] 12 | num_folds = 10 13 | kfold = KFold(n_splits=10, random_state=7) 14 | model = SVR() 15 | scoring = 'neg_mean_squared_error' 16 | results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) 17 | print(results.mean()) 18 | -------------------------------------------------------------------------------- /ml_with_python_code/14_feature_union_model_pipeline.py: -------------------------------------------------------------------------------- 1 | # Create a pipeline that extracts features from the data then creates a model 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.pipeline import FeatureUnion 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.decomposition import PCA 9 | from sklearn.feature_selection import SelectKBest 10 | # load data 11 | filename = 'pima-indians-diabetes.data.csv' 12 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 13 | dataframe = read_csv(filename, names=names) 14 | array = dataframe.values 15 | X = array[:,0:8] 16 | Y = array[:,8] 17 | # create feature union 18 | features = [] 19 | features.append(('pca', PCA(n_components=3))) 20 | features.append(('select_best', SelectKBest(k=6))) 21 | feature_union = FeatureUnion(features) 22 | # create pipeline 23 | estimators = [] 24 | estimators.append(('feature_union', feature_union)) 25 | estimators.append(('logistic', LogisticRegression())) 26 | model = Pipeline(estimators) 27 | # evaluate pipeline 28 | kfold = KFold(n_splits=10, random_state=7) 29 | results = cross_val_score(model, X, Y, cv=kfold) 30 | print(results.mean()) 31 | -------------------------------------------------------------------------------- /ml_with_python_code/14_standardize_model_pipeline.py: -------------------------------------------------------------------------------- 1 | # Create a pipeline that standardizes the data then creates a model 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 8 | # load data 9 | filename = 'pima-indians-diabetes.data.csv' 10 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 11 | dataframe = read_csv(filename, names=names) 12 | array = dataframe.values 13 | X = array[:,0:8] 14 | Y = array[:,8] 15 | # create pipeline 16 | estimators = [] 17 | estimators.append(('standardize', StandardScaler())) 18 | estimators.append(('lda', LinearDiscriminantAnalysis())) 19 | model = Pipeline(estimators) 20 | # evaluate pipeline 21 | kfold = KFold(n_splits=10, random_state=7) 22 | results = cross_val_score(model, X, Y, cv=kfold) 23 | print(results.mean()) 24 | -------------------------------------------------------------------------------- /ml_with_python_code/15_adaboost_classification.py: -------------------------------------------------------------------------------- 1 | # AdaBoost Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from 
sklearn.model_selection import cross_val_score 5 | from sklearn.ensemble import AdaBoostClassifier 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | num_trees = 30 13 | seed=7 14 | kfold = KFold(n_splits=10, random_state=seed) 15 | model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed) 16 | results = cross_val_score(model, X, Y, cv=kfold) 17 | print(results.mean()) -------------------------------------------------------------------------------- /ml_with_python_code/15_bagged_cart_classification.py: -------------------------------------------------------------------------------- 1 | # Bagged Decision Trees for Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.ensemble import BaggingClassifier 6 | from sklearn.tree import DecisionTreeClassifier 7 | filename = 'pima-indians-diabetes.data.csv' 8 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 9 | dataframe = read_csv(filename, names=names) 10 | array = dataframe.values 11 | X = array[:,0:8] 12 | Y = array[:,8] 13 | seed = 7 14 | kfold = KFold(n_splits=10, random_state=seed) 15 | cart = DecisionTreeClassifier() 16 | num_trees = 100 17 | model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed) 18 | results = cross_val_score(model, X, Y, cv=kfold) 19 | print(results.mean()) 20 | -------------------------------------------------------------------------------- /ml_with_python_code/15_extra_trees_classification.py: -------------------------------------------------------------------------------- 1 | # Extra Trees Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.ensemble import ExtraTreesClassifier 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | num_trees = 100 13 | max_features = 7 14 | kfold = KFold(n_splits=10, random_state=7) 15 | model = ExtraTreesClassifier(n_estimators=num_trees, max_features=max_features) 16 | results = cross_val_score(model, X, Y, cv=kfold) 17 | print(results.mean()) 18 | -------------------------------------------------------------------------------- /ml_with_python_code/15_gradient_boosting_classification.py: -------------------------------------------------------------------------------- 1 | # Stochastic Gradient Boosting Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.ensemble import GradientBoostingClassifier 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | seed = 7 13 | num_trees = 100 14 | kfold = KFold(n_splits=10, random_state=seed) 15 | model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed) 16 | results = cross_val_score(model, X, Y, cv=kfold) 17 | 
print(results.mean()) 18 | -------------------------------------------------------------------------------- /ml_with_python_code/15_random_forest_classification.py: -------------------------------------------------------------------------------- 1 | # Random Forest Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.ensemble import RandomForestClassifier 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | num_trees = 100 13 | max_features = 3 14 | kfold = KFold(n_splits=10, random_state=7) 15 | model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features) 16 | results = cross_val_score(model, X, Y, cv=kfold) 17 | print(results.mean()) 18 | -------------------------------------------------------------------------------- /ml_with_python_code/15_voting_ensemble_classification.py: -------------------------------------------------------------------------------- 1 | # Voting Ensemble for Classification 2 | from pandas import read_csv 3 | from sklearn.model_selection import KFold 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.tree import DecisionTreeClassifier 7 | from sklearn.svm import SVC 8 | from sklearn.ensemble import VotingClassifier 9 | filename = 'pima-indians-diabetes.data.csv' 10 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 11 | dataframe = read_csv(filename, names=names) 12 | array = dataframe.values 13 | X = array[:,0:8] 14 | Y = array[:,8] 15 | kfold = KFold(n_splits=10, random_state=7) 16 | # create the sub models 17 | estimators = [] 18 | model1 = LogisticRegression() 19 | estimators.append(('logistic', model1)) 20 | model2 = DecisionTreeClassifier() 21 | estimators.append(('cart', model2)) 22 | model3 = SVC() 23 | estimators.append(('svm', model3)) 24 | # create the ensemble model 25 | ensemble = VotingClassifier(estimators) 26 | results = cross_val_score(ensemble, X, Y, cv=kfold) 27 | print(results.mean()) 28 | -------------------------------------------------------------------------------- /ml_with_python_code/16_grid_search.py: -------------------------------------------------------------------------------- 1 | # Grid Search for Algorithm Tuning 2 | import numpy 3 | from pandas import read_csv 4 | from sklearn.linear_model import Ridge 5 | from sklearn.model_selection import GridSearchCV 6 | filename = 'pima-indians-diabetes.data.csv' 7 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 8 | dataframe = read_csv(filename, names=names) 9 | array = dataframe.values 10 | X = array[:,0:8] 11 | Y = array[:,8] 12 | alphas = numpy.array([1,0.1,0.01,0.001,0.0001,0]) 13 | param_grid = dict(alpha=alphas) 14 | model = Ridge() 15 | grid = GridSearchCV(estimator=model, param_grid=param_grid) 16 | grid.fit(X, Y) 17 | print(grid.best_score_) 18 | print(grid.best_estimator_.alpha) 19 | -------------------------------------------------------------------------------- /ml_with_python_code/16_random_search.py: -------------------------------------------------------------------------------- 1 | # Randomized for Algorithm Tuning 2 | import numpy 3 | from pandas import read_csv 4 | from scipy.stats import uniform 5 | 
from sklearn.linear_model import Ridge 6 | from sklearn.model_selection import RandomizedSearchCV 7 | filename = 'pima-indians-diabetes.data.csv' 8 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 9 | dataframe = read_csv(filename, names=names) 10 | array = dataframe.values 11 | X = array[:,0:8] 12 | Y = array[:,8] 13 | param_grid = {'alpha': uniform()} 14 | model = Ridge() 15 | rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100, random_state=7) 16 | rsearch.fit(X, Y) 17 | print(rsearch.best_score_) 18 | print(rsearch.best_estimator_.alpha) 19 | -------------------------------------------------------------------------------- /ml_with_python_code/17_save_model_joblib.py: -------------------------------------------------------------------------------- 1 | # Save Model Using joblib 2 | from pandas import read_csv 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.externals.joblib import dump 6 | from sklearn.externals.joblib import load 7 | filename = 'pima-indians-diabetes.data.csv' 8 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 9 | dataframe = read_csv(filename, names=names) 10 | array = dataframe.values 11 | X = array[:,0:8] 12 | Y = array[:,8] 13 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7) 14 | # Fit the model on 33% 15 | model = LogisticRegression() 16 | model.fit(X_train, Y_train) 17 | # save the model to disk 18 | filename = 'finalized_model.sav' 19 | dump(model, filename) 20 | 21 | # some time later... 22 | 23 | # load the model from disk 24 | loaded_model = load(filename) 25 | result = loaded_model.score(X_test, Y_test) 26 | print(result) 27 | -------------------------------------------------------------------------------- /ml_with_python_code/17_save_model_pickel.py: -------------------------------------------------------------------------------- 1 | # Save Model Using Pickle 2 | from pandas import read_csv 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.linear_model import LogisticRegression 5 | from pickle import dump 6 | from pickle import load 7 | filename = 'pima-indians-diabetes.data.csv' 8 | names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 9 | dataframe = read_csv(filename, names=names) 10 | array = dataframe.values 11 | X = array[:,0:8] 12 | Y = array[:,8] 13 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7) 14 | # Fit the model on 33% 15 | model = LogisticRegression() 16 | model.fit(X_train, Y_train) 17 | # save the model to disk 18 | filename = 'finalized_model.sav' 19 | dump(model, open(filename, 'wb')) 20 | 21 | # some time later... 22 | 23 | # load the model from disk 24 | loaded_model = load(open(filename, 'rb')) 25 | result = loaded_model.score(X_test, Y_test) 26 | print(result) 27 | -------------------------------------------------------------------------------- /ml_with_python_code/18_project_template.py: -------------------------------------------------------------------------------- 1 | # Python Project Template 2 | 3 | # 1. Prepare Problem 4 | # a) Load libraries 5 | # b) Load dataset 6 | 7 | # 2. Summarize Data 8 | # a) Descriptive statistics 9 | # b) Data visualizations 10 | 11 | # 3. Prepare Data 12 | # a) Data Cleaning 13 | # b) Feature Selection 14 | # c) Data Transforms 15 | 16 | # 4. 
Evaluate Algorithms 17 | # a) Split-out validation dataset 18 | # b) Test options and evaluation metric 19 | # c) Spot Check Algorithms 20 | # d) Compare Algorithms 21 | 22 | # 5. Improve Accuracy 23 | # a) Algorithm Tuning 24 | # b) Ensembles 25 | 26 | # 6. Finalize Model 27 | # a) Predictions on validation dataset 28 | # b) Create standalone model on entire training dataset 29 | # c) Save model for later use 30 | -------------------------------------------------------------------------------- /xgboost_with_python_code/04_first_model.py: -------------------------------------------------------------------------------- 1 | # First XGBoost model for Pima Indians dataset 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from sklearn.cross_validation import train_test_split 5 | from sklearn.metrics import accuracy_score 6 | # load data 7 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 8 | # split data into X and y 9 | X = dataset[:,0:8] 10 | Y = dataset[:,8] 11 | # split data into train and test sets 12 | seed = 7 13 | test_size = 0.33 14 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 15 | # fit model no training data 16 | model = XGBClassifier() 17 | model.fit(X_train, y_train) 18 | # make predictions for test data 19 | y_pred = model.predict(X_test) 20 | predictions = [round(value) for value in y_pred] 21 | # evaluate predictions 22 | accuracy = accuracy_score(y_test, predictions) 23 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) -------------------------------------------------------------------------------- /xgboost_with_python_code/05_horse_colic_missing.py: -------------------------------------------------------------------------------- 1 | # binary classification, missing data 2 | from pandas import read_csv 3 | from xgboost import XGBClassifier 4 | from sklearn.cross_validation import train_test_split 5 | from sklearn.metrics import accuracy_score 6 | from sklearn.preprocessing import LabelEncoder 7 | # load data 8 | dataframe = read_csv("horse-colic.csv", delim_whitespace=True, header=None) 9 | dataset = dataframe.values 10 | # split data into X and y 11 | X = dataset[:,0:27] 12 | Y = dataset[:,27] 13 | # set missing values to 0 14 | X[X == '?'] = 0 15 | # convert to numeric 16 | X = X.astype('float32') 17 | # encode Y class values as integers 18 | label_encoder = LabelEncoder() 19 | label_encoder = label_encoder.fit(Y) 20 | label_encoded_y = label_encoder.transform(Y) 21 | # split data into train and test sets 22 | seed = 7 23 | test_size = 0.33 24 | X_train, X_test, y_train, y_test = train_test_split(X, label_encoded_y, test_size=test_size, random_state=seed) 25 | # fit model no training data 26 | model = XGBClassifier() 27 | model.fit(X_train, y_train) 28 | print(model) 29 | # make predictions for test data 30 | y_pred = model.predict(X_test) 31 | predictions = [round(value) for value in y_pred] 32 | # evaluate predictions 33 | accuracy = accuracy_score(y_test, predictions) 34 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) -------------------------------------------------------------------------------- /xgboost_with_python_code/05_horse_colic_missing_imputer.py: -------------------------------------------------------------------------------- 1 | # binary classification, missing data, impute with mean 2 | import numpy 3 | from pandas import read_csv 4 | from xgboost import XGBClassifier 5 | from sklearn.cross_validation import train_test_split 6 | from sklearn.metrics import accuracy_score 
7 | from sklearn.preprocessing import LabelEncoder 8 | from sklearn.preprocessing import Imputer 9 | # load data 10 | dataframe = read_csv("horse-colic.csv", delim_whitespace=True, header=None) 11 | dataset = dataframe.values 12 | # split data into X and y 13 | X = dataset[:,0:27] 14 | Y = dataset[:,27] 15 | # set missing values to NaN 16 | X[X == '?'] = numpy.nan 17 | # convert to numeric 18 | X = X.astype('float32') 19 | # impute missing values as the mean 20 | imputer = Imputer() 21 | imputed_x = imputer.fit_transform(X) 22 | # encode Y class values as integers 23 | label_encoder = LabelEncoder() 24 | label_encoder = label_encoder.fit(Y) 25 | label_encoded_y = label_encoder.transform(Y) 26 | # split data into train and test sets 27 | seed = 7 28 | test_size = 0.33 29 | X_train, X_test, y_train, y_test = train_test_split(imputed_x, label_encoded_y, test_size=test_size, random_state=seed) 30 | # fit model no training data 31 | model = XGBClassifier() 32 | model.fit(X_train, y_train) 33 | print(model) 34 | # make predictions for test data 35 | y_pred = model.predict(X_test) 36 | predictions = [round(value) for value in y_pred] 37 | # evaluate predictions 38 | accuracy = accuracy_score(y_test, predictions) 39 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) -------------------------------------------------------------------------------- /xgboost_with_python_code/05_iris_label_encode.py: -------------------------------------------------------------------------------- 1 | # multiclass classification 2 | from pandas import read_csv 3 | from xgboost import XGBClassifier 4 | from sklearn import cross_validation 5 | from sklearn.metrics import accuracy_score 6 | from sklearn.preprocessing import LabelEncoder 7 | # load data 8 | data = read_csv('iris.csv', header=None) 9 | dataset = data.values 10 | # split data into X and y 11 | X = dataset[:,0:4] 12 | Y = dataset[:,4] 13 | # encode string class values as integers 14 | label_encoder = LabelEncoder() 15 | label_encoder = label_encoder.fit(Y) 16 | label_encoded_y = label_encoder.transform(Y) 17 | seed = 7 18 | test_size = 0.33 19 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, label_encoded_y, test_size=test_size, random_state=seed) 20 | # fit model no training data 21 | model = XGBClassifier() 22 | model.fit(X_train, y_train) 23 | print(model) 24 | # make predictions for test data 25 | y_pred = model.predict(X_test) 26 | predictions = [round(value) for value in y_pred] 27 | # evaluate predictions 28 | accuracy = accuracy_score(y_test, predictions) 29 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) -------------------------------------------------------------------------------- /xgboost_with_python_code/06_cross_validation.py: -------------------------------------------------------------------------------- 1 | # k-fold cross validation evaluation of xgboost model 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from sklearn.cross_validation import KFold 5 | from sklearn.cross_validation import cross_val_score 6 | # load data 7 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 8 | # split data into X and y 9 | X = dataset[:,0:8] 10 | Y = dataset[:,8] 11 | # CV model 12 | model = XGBClassifier() 13 | kfold = KFold(n=len(X), n_folds=10, random_state=7) 14 | results = cross_val_score(model, X, Y, cv=kfold) 15 | print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100)) -------------------------------------------------------------------------------- 
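Note: the XGBoost scripts in this folder import from sklearn.cross_validation, which was removed in later scikit-learn releases. If those imports fail, the same k-fold evaluation can be written against sklearn.model_selection; the sketch below assumes that newer API and is not the original listing. The newer KFold takes n_splits rather than n/n_folds, and random_state only applies when shuffle=True.

# Sketch: k-fold evaluation of an XGBoost model with the newer scikit-learn API
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
X = dataset[:, 0:8]
Y = dataset[:, 8]
model = XGBClassifier()
# n_splits replaces the old n/n_folds arguments; shuffle makes random_state meaningful
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))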
/xgboost_with_python_code/06_stratified_cross_validation.py: -------------------------------------------------------------------------------- 1 | # stratified k-fold cross validation evaluation of xgboost model 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from sklearn.cross_validation import StratifiedKFold 5 | from sklearn.cross_validation import cross_val_score 6 | # load data 7 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 8 | # split data into X and y 9 | X = dataset[:,0:8] 10 | Y = dataset[:,8] 11 | # CV model 12 | model = XGBClassifier() 13 | kfold = StratifiedKFold(Y, n_folds=10, random_state=7) 14 | results = cross_val_score(model, X, Y, cv=kfold) 15 | print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100)) -------------------------------------------------------------------------------- /xgboost_with_python_code/06_train_test_split.py: -------------------------------------------------------------------------------- 1 | # train-test split evaluation of xgboost model 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from sklearn.cross_validation import train_test_split 5 | from sklearn.metrics import accuracy_score 6 | # load data 7 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 8 | # split data into X and y 9 | X = dataset[:,0:8] 10 | Y = dataset[:,8] 11 | # split data into train and test sets 12 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7) 13 | # fit model no training data 14 | model = XGBClassifier() 15 | model.fit(X_train, y_train) 16 | # make predictions for test data 17 | y_pred = model.predict(X_test) 18 | predictions = [round(value) for value in y_pred] 19 | # evaluate predictions 20 | accuracy = accuracy_score(y_test, predictions) 21 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) -------------------------------------------------------------------------------- /xgboost_with_python_code/07_plot_tree-left-to-right.py: -------------------------------------------------------------------------------- 1 | # plot decision tree 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from xgboost import plot_tree 5 | from matplotlib import pyplot 6 | # load data 7 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 8 | # split data into X and y 9 | X = dataset[:,0:8] 10 | y = dataset[:,8] 11 | # fit model no training data 12 | model = XGBClassifier() 13 | model.fit(X, y) 14 | # plot single tree 15 | plot_tree(model, num_trees=0, rankdir='LR') 16 | pyplot.show() -------------------------------------------------------------------------------- /xgboost_with_python_code/07_plot_tree.py: -------------------------------------------------------------------------------- 1 | # plot decision tree 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from xgboost import plot_tree 5 | from matplotlib import pyplot 6 | # load data 7 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 8 | # split data into X and y 9 | X = dataset[:,0:8] 10 | y = dataset[:,8] 11 | # fit model no training data 12 | model = XGBClassifier() 13 | model.fit(X, y) 14 | # plot single tree 15 | plot_tree(model) 16 | pyplot.show() -------------------------------------------------------------------------------- /xgboost_with_python_code/08_serialize_with_joblib.py: -------------------------------------------------------------------------------- 1 | # Train XGBoost model, save to file using joblib, load and make 
predictions 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | from sklearn.externals import joblib 5 | from sklearn.cross_validation import train_test_split 6 | from sklearn.metrics import accuracy_score 7 | # load data 8 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 9 | # split data into X and y 10 | X = dataset[:,0:8] 11 | Y = dataset[:,8] 12 | # split data into train and test sets 13 | seed = 7 14 | test_size = 0.33 15 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 16 | # fit model no training data 17 | model = XGBClassifier() 18 | model.fit(X_train, y_train) 19 | # save model to file 20 | joblib.dump(model, "pima.joblib.dat") 21 | print("Saved model to: pima.joblib.dat") 22 | 23 | # some time later... 24 | 25 | # load model from file 26 | loaded_model = joblib.load("pima.joblib.dat") 27 | print("Loaded model from: pima.joblib.dat") 28 | # make predictions for test data 29 | y_pred = loaded_model.predict(X_test) 30 | predictions = [round(value) for value in y_pred] 31 | # evaluate predictions 32 | accuracy = accuracy_score(y_test, predictions) 33 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) -------------------------------------------------------------------------------- /xgboost_with_python_code/08_serialize_with_pickle.py: -------------------------------------------------------------------------------- 1 | # Train XGBoost model, save to file using pickle, load and make predictions 2 | from numpy import loadtxt 3 | from xgboost import XGBClassifier 4 | import pickle 5 | from sklearn.cross_validation import train_test_split 6 | from sklearn.metrics import accuracy_score 7 | # load data 8 | dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",") 9 | # split data into X and y 10 | X = dataset[:,0:8] 11 | Y = dataset[:,8] 12 | # split data into train and test sets 13 | seed = 7 14 | test_size = 0.33 15 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 16 | # fit model no training data 17 | model = XGBClassifier() 18 | model.fit(X_train, y_train) 19 | # save model to file 20 | pickle.dump(model, open("pima.pickle.dat", "wb")) 21 | print("Saved model to: pima.pickle.dat") 22 | 23 | # some time later... 

# load model from file
loaded_model = pickle.load(open("pima.pickle.dat", "rb"))
print("Loaded model from: pima.pickle.dat")
# make predictions for test data
y_pred = loaded_model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
--------------------------------------------------------------------------------
/xgboost_with_python_code/09_automatic_feature_importance.py:
--------------------------------------------------------------------------------
# plot feature importance using the built-in function
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
y = dataset[:,8]
# fit model on training data
model = XGBClassifier()
model.fit(X, y)
# plot feature importance
plot_importance(model)
pyplot.show()
--------------------------------------------------------------------------------
/xgboost_with_python_code/09_feature_selection.py:
--------------------------------------------------------------------------------
# use feature importance for feature selection
from numpy import loadtxt
from numpy import sort
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
# fit model on all training data
model = XGBClassifier()
model.fit(X_train, y_train)
# make predictions for test data and evaluate
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# fit a model using each importance value as a selection threshold
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # select features using threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train model
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    # eval model
    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))
--------------------------------------------------------------------------------
/xgboost_with_python_code/09_manual_feature_importance.py:
--------------------------------------------------------------------------------
# plot feature importance manually
from numpy import loadtxt
from xgboost import XGBClassifier
from matplotlib import pyplot
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
y = dataset[:,8]
# fit model on training data
model = XGBClassifier()
model.fit(X, y)
# feature importance
print(model.feature_importances_)
# plot
pyplot.bar(range(len(model.feature_importances_)), model.feature_importances_)
pyplot.show()
--------------------------------------------------------------------------------
/xgboost_with_python_code/10_early_stopping.py:
--------------------------------------------------------------------------------
# early stopping
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# fit model on training data, stopping early if test-set log loss does not improve for 10 rounds
model = XGBClassifier()
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
--------------------------------------------------------------------------------
/xgboost_with_python_code/10_evaluate_validation_set.py:
--------------------------------------------------------------------------------
# monitor training performance
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
# fit model on training data, reporting classification error on the test set each round
model = XGBClassifier()
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_metric="error", eval_set=eval_set, verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
--------------------------------------------------------------------------------
/xgboost_with_python_code/11_eval_num_threads.py:
--------------------------------------------------------------------------------
# Otto, tune number of threads
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# evaluate the effect of the number of threads
results = []
num_threads = [1, 2, 3, 4]
for n in num_threads:
    start = time.time()
    model = XGBClassifier(nthread=n)
    model.fit(X, label_encoded_y)
    elapsed = time.time() - start
    print(n, elapsed)
    results.append(elapsed)
# plot results
pyplot.plot(num_threads, results)
pyplot.ylabel('Training Time (seconds)')
pyplot.xlabel('Number of Threads')
pyplot.title('XGBoost Training Speed vs Number of Threads')
pyplot.show()
--------------------------------------------------------------------------------
/xgboost_with_python_code/11_eval_parallel_cv_and_xgboost.py:
--------------------------------------------------------------------------------
# Otto, parallel cross validation
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
import time
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# prepare cross validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
# Single Thread XGBoost, Parallel Thread CV
start = time.time()
model = XGBClassifier(nthread=1)
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='neg_log_loss', n_jobs=-1)
elapsed = time.time() - start
print("Single Thread XGBoost, Parallel Thread CV: %f" % (elapsed))
# Parallel Thread XGBoost, Single Thread CV
start = time.time()
model = XGBClassifier(nthread=-1)
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='neg_log_loss', n_jobs=1)
elapsed = time.time() - start
print("Parallel Thread XGBoost, Single Thread CV: %f" % (elapsed))
# Parallel Thread XGBoost and CV
start = time.time()
model = XGBClassifier(nthread=-1)
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='neg_log_loss', n_jobs=-1)
elapsed = time.time() - start
print("Parallel Thread XGBoost and CV: %f" % (elapsed))
--------------------------------------------------------------------------------
/xgboost_with_python_code/12_check_num_threads.py:
--------------------------------------------------------------------------------
# Otto multi-core test
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# evaluate the effect of the number of threads
results = []
num_threads = [1, 16, 32]
for n in num_threads:
    start = time.time()
    model = XGBClassifier(nthread=n)
    model.fit(X, label_encoded_y)
    elapsed = time.time() - start
    print(n, elapsed)
    results.append(elapsed)
--------------------------------------------------------------------------------
/xgboost_with_python_code/14_tune_depth.py:
--------------------------------------------------------------------------------
# XGBoost on Otto dataset, Tune max_depth
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = XGBClassifier()
max_depth = range(1, 11, 2)
print(max_depth)
param_grid = dict(max_depth=max_depth)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold, verbose=1)
result = grid_search.fit(X, label_encoded_y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stdevs = result.cv_results_['std_test_score']
params = result.cv_results_['params']
for mean, stdev, param in zip(means, stdevs, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot
pyplot.errorbar(max_depth, means, yerr=stdevs)
pyplot.title("XGBoost max_depth vs Log Loss")
pyplot.xlabel('max_depth')
pyplot.ylabel('Log Loss')
pyplot.savefig('max_depth.png')
--------------------------------------------------------------------------------
/xgboost_with_python_code/14_tune_trees.py:
--------------------------------------------------------------------------------
# XGBoost on Otto dataset, Tune n_estimators
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = XGBClassifier()
n_estimators = range(50, 400, 50)
param_grid = dict(n_estimators=n_estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
result = grid_search.fit(X, label_encoded_y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stdevs = result.cv_results_['std_test_score']
params = result.cv_results_['params']
for mean, stdev, param in zip(means, stdevs, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot
pyplot.errorbar(n_estimators, means, yerr=stdevs)
pyplot.title("XGBoost n_estimators vs Log Loss")
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.savefig('n_estimators.png')
--------------------------------------------------------------------------------
/xgboost_with_python_code/15_plot_performance.py:
--------------------------------------------------------------------------------
# Plot performance for learning_rate=0.1
from matplotlib import pyplot
n_estimators = [100, 200, 300, 400, 500]
loss = [-0.001239, -0.001153, -0.001152, -0.001153, -0.001153]
pyplot.plot(n_estimators, loss)
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost learning_rate=0.1 n_estimators vs Log Loss')
pyplot.show()
--------------------------------------------------------------------------------
/xgboost_with_python_code/15_tune_learning_rate.py:
--------------------------------------------------------------------------------
# XGBoost on Otto dataset, Tune learning_rate
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = XGBClassifier()
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
param_grid = dict(learning_rate=learning_rate)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
result = grid_search.fit(X, label_encoded_y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stdevs = result.cv_results_['std_test_score']
params = result.cv_results_['params']
for mean, stdev, param in zip(means, stdevs, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot
pyplot.errorbar(learning_rate, means, yerr=stdevs)
pyplot.title("XGBoost learning_rate vs Log Loss")
pyplot.xlabel('learning_rate')
pyplot.ylabel('Log Loss')
pyplot.savefig('learning_rate.png')
--------------------------------------------------------------------------------
/xgboost_with_python_code/16_tune_column_sample_rate_bytree.py:
--------------------------------------------------------------------------------
# XGBoost on Otto dataset, tune colsample_bytree
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = XGBClassifier()
colsample_bytree = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
param_grid = dict(colsample_bytree=colsample_bytree)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
result = grid_search.fit(X, label_encoded_y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stdevs = result.cv_results_['std_test_score']
params = result.cv_results_['params']
for mean, stdev, param in zip(means, stdevs, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot
pyplot.errorbar(colsample_bytree, means, yerr=stdevs)
pyplot.title("XGBoost colsample_bytree vs Log Loss")
pyplot.xlabel('colsample_bytree')
pyplot.ylabel('Log Loss')
pyplot.savefig('colsample_bytree.png')
--------------------------------------------------------------------------------
/xgboost_with_python_code/16_tune_column_sample_rate_split.py:
--------------------------------------------------------------------------------
# XGBoost on Otto dataset, tune colsample_bylevel
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = XGBClassifier()
colsample_bylevel = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
param_grid = dict(colsample_bylevel=colsample_bylevel)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
result = grid_search.fit(X, label_encoded_y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stdevs = result.cv_results_['std_test_score']
params = result.cv_results_['params']
for mean, stdev, param in zip(means, stdevs, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot
pyplot.errorbar(colsample_bylevel, means, yerr=stdevs)
pyplot.title("XGBoost colsample_bylevel vs Log Loss")
pyplot.xlabel('colsample_bylevel')
pyplot.ylabel('Log Loss')
pyplot.savefig('colsample_bylevel.png')
--------------------------------------------------------------------------------
/xgboost_with_python_code/16_tune_row_sample_rate.py:
--------------------------------------------------------------------------------
# XGBoost on Otto dataset, tune subsample
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = XGBClassifier()
subsample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
param_grid = dict(subsample=subsample)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
result = grid_search.fit(X, label_encoded_y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stdevs = result.cv_results_['std_test_score']
params = result.cv_results_['params']
for mean, stdev, param in zip(means, stdevs, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot
pyplot.errorbar(subsample, means, yerr=stdevs)
pyplot.title("XGBoost subsample vs Log Loss")
pyplot.xlabel('subsample')
pyplot.ylabel('Log Loss')
pyplot.savefig('subsample.png')
--------------------------------------------------------------------------------