├── README.md ├── LICENSE ├── Outlier Removal - IQR Approach.ipynb ├── Identify and Remove Duplicate Rows.ipynb ├── SimpleImputer and Model Evaluation.ipynb ├── IterativeImputer and Model Evaluation.ipynb ├── Outlier Removal - Standard Deviation Approach.ipynb ├── IterativeImputer Data Transform.ipynb ├── Automatic Outlier Detection.ipynb ├── Data Cleansing Master Class - Data Preparation With Training and Testing Sets.ipynb ├── Statistical Imputation With KNN.ipynb ├── Statistical Imputation With SimpleImputer.ipynb ├── Remove Missing Values.ipynb ├── Mark Missing Values.ipynb ├── Comparing Different Imputed Statistics.ipynb ├── KNNImputer and Model Evaluation Different K-Values.ipynb ├── IterativeImputer and Different Number of Iterations.ipynb ├── rescaling.tscproj ├── Sparse Column Identification and Removal.ipynb ├── Polynomial Feature Transform.ipynb └── Categorical Feature Selection.ipynb /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Data-Cleansing-Master-Class-in-Python -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Outlier Removal - IQR Approach.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Percentiles: 25th=46.685, 75th=53.359, IQR=6.674\n", 13 | "Identified outliers: 81\n", 14 | "Non-outlier observations: 9919\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "# identify outliers with interquartile range\n", 20 | "from numpy.random import seed\n", 21 | "from numpy.random import randn\n", 22 | "from numpy import percentile\n", 23 | "# seed the random number generator\n", 24 | "seed(1)\n", 25 | "# generate univariate observations\n", 26 | "data = 5 * randn(10000) + 50\n", 27 | "# calculate interquartile range\n", 28 | "q25, q75 = percentile(data, 25), percentile(data, 75)\n", 29 | "iqr = q75 - q25\n", 30 | "print('Percentiles: 25th=%.3f, 75th=%.3f, IQR=%.3f' % (q25, q75, iqr))\n", 31 | "# calculate the outlier cutoff\n", 32 | "cut_off = iqr * 1.5\n", 33 | "lower, upper = q25 - cut_off, q75 + cut_off\n", 34 | "# identify outliers\n", 35 | "outliers = [x for x in data if x < lower or x > upper]\n", 36 | "print('Identified outliers: %d' % len(outliers))\n", 37 | "# remove outliers\n", 38 | "outliers_removed = [x for x in data if x >= lower and x <= upper]\n", 39 | "print('Non-outlier observations: %d' % len(outliers_removed))" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [] 48 | } 49 | ], 50 | "metadata": { 51 | "kernelspec": { 52 | "display_name": "Python 3", 53 | "language": "python", 54 | "name": "python3" 55 | }, 56 | "language_info": { 57 | "codemirror_mode": { 58 | "name": "ipython", 59 | "version": 3 60 | }, 61 | "file_extension": ".py", 62 | "mimetype": "text/x-python", 63 | "name": "python", 64 | "nbconvert_exporter": "python", 65 | "pygments_lexer": "ipython3", 66 | "version": "3.6.4" 67 | } 68 | }, 69 | "nbformat": 4, 70 | "nbformat_minor": 2 71 | } 72 | -------------------------------------------------------------------------------- /Identify and Remove Duplicate Rows.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "True\n", 13 | " 0 1 2 3 4\n", 14 | "34 4.9 3.1 1.5 0.1 Iris-setosa\n", 15 | "37 4.9 3.1 1.5 0.1 Iris-setosa\n", 16 | "142 5.8 2.7 5.1 1.9 Iris-virginica\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "# locate rows of duplicate data\n", 22 | "from pandas import read_csv\n", 23 | "# load the dataset\n", 24 | "df = read_csv('iris.csv', header=None)\n", 25 | "# calculate duplicates\n", 26 | "dups = df.duplicated()\n", 27 | "# report if there are any duplicates\n", 28 | "print(dups.any())\n", 29 | "# list all duplicate rows\n", 30 | "print(df[dups])\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "(150, 5)\n", 43 | "(147, 5)\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "# delete rows of duplicate data from the dataset\n", 49 | "from pandas import read_csv\n", 50 | "# load the dataset\n", 51 | "df = 
read_csv('iris.csv', header=None)\n", 52 | "print(df.shape)\n", 53 | "# delete duplicate rows\n", 54 | "df.drop_duplicates(inplace=True)\n", 55 | "print(df.shape)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "Python 3", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.6.4" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 2 87 | } 88 | -------------------------------------------------------------------------------- /SimpleImputer and Model Evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Mean Accuracy: 0.863 (0.057)\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "# evaluate mean imputation and random forest for the horse colic dataset\n", 18 | "from numpy import mean\n", 19 | "from numpy import std\n", 20 | "from pandas import read_csv\n", 21 | "from sklearn.ensemble import RandomForestClassifier\n", 22 | "from sklearn.impute import SimpleImputer\n", 23 | "from sklearn.model_selection import cross_val_score\n", 24 | "from sklearn.model_selection import RepeatedStratifiedKFold\n", 25 | "from sklearn.pipeline import Pipeline\n", 26 | "# load dataset\n", 27 | "dataframe = read_csv('horse-colic.csv', header=None, na_values='?')\n", 28 | "# split into input and output elements\n", 29 | "data = dataframe.values\n", 30 | "ix = [i for i in range(data.shape[1]) if i != 23]\n", 31 | "X, y = data[:, ix], data[:, 23]\n", 32 | "# define modeling pipeline\n", 33 | "model = RandomForestClassifier()\n", 34 | "imputer = SimpleImputer(strategy='mean')\n", 35 | "pipeline = Pipeline(steps=[('i', imputer), ('m', model)])\n", 36 | "# define model evaluation\n", 37 | "cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n", 38 | "# evaluate model\n", 39 | "scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)\n", 40 | "print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [] 49 | } 50 | ], 51 | "metadata": { 52 | "kernelspec": { 53 | "display_name": "Python 3", 54 | "language": "python", 55 | "name": "python3" 56 | }, 57 | "language_info": { 58 | "codemirror_mode": { 59 | "name": "ipython", 60 | "version": 3 61 | }, 62 | "file_extension": ".py", 63 | "mimetype": "text/x-python", 64 | "name": "python", 65 | "nbconvert_exporter": "python", 66 | "pygments_lexer": "ipython3", 67 | "version": "3.6.4" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 2 72 | } 73 | -------------------------------------------------------------------------------- /IterativeImputer and Model Evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# define modeling 
pipeline\n", 10 | "model = RandomForestClassifier()\n", 11 | "imputer = IterativeImputer()\n", 12 | "pipeline = Pipeline(steps=[('i', imputer), ('m', model)])" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "Mean Accuracy: 0.866 (0.053)\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "# evaluate iterative imputation and random forest for the horse colic dataset\n", 30 | "from numpy import mean\n", 31 | "from numpy import std\n", 32 | "from pandas import read_csv\n", 33 | "from sklearn.ensemble import RandomForestClassifier\n", 34 | "from sklearn.experimental import enable_iterative_imputer\n", 35 | "from sklearn.impute import IterativeImputer\n", 36 | "from sklearn.model_selection import cross_val_score\n", 37 | "from sklearn.model_selection import RepeatedStratifiedKFold\n", 38 | "from sklearn.pipeline import Pipeline\n", 39 | "# load dataset\n", 40 | "dataframe = read_csv('horse-colic.csv', header=None, na_values='?')\n", 41 | "# split into input and output elements\n", 42 | "data = dataframe.values\n", 43 | "ix = [i for i in range(data.shape[1]) if i != 23]\n", 44 | "X, y = data[:, ix], data[:, 23]\n", 45 | "# define modeling pipeline\n", 46 | "model = RandomForestClassifier()\n", 47 | "imputer = IterativeImputer()\n", 48 | "pipeline = Pipeline(steps=[('i', imputer), ('m', model)])\n", 49 | "# define model evaluation\n", 50 | "cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n", 51 | "# evaluate model\n", 52 | "scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)\n", 53 | "print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [] 62 | } 63 | ], 64 | "metadata": { 65 | "kernelspec": { 66 | "display_name": "Python 3", 67 | "language": "python", 68 | "name": "python3" 69 | }, 70 | "language_info": { 71 | "codemirror_mode": { 72 | "name": "ipython", 73 | "version": 3 74 | }, 75 | "file_extension": ".py", 76 | "mimetype": "text/x-python", 77 | "name": "python", 78 | "nbconvert_exporter": "python", 79 | "pygments_lexer": "ipython3", 80 | "version": "3.6.4" 81 | } 82 | }, 83 | "nbformat": 4, 84 | "nbformat_minor": 2 85 | } 86 | -------------------------------------------------------------------------------- /Outlier Removal - Standard Deviation Approach.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "mean=50.049 stdv=4.994\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "# generate gaussian data\n", 18 | "from numpy.random import seed\n", 19 | "from numpy.random import randn\n", 20 | "from numpy import mean\n", 21 | "from numpy import std\n", 22 | "# seed the random number generator\n", 23 | "seed(1)\n", 24 | "# generate univariate observations\n", 25 | "data = 5 * randn(10000) + 50\n", 26 | "# summarize\n", 27 | "print('mean=%.3f stdv=%.3f' % (mean(data), std(data)))" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Identified outliers: 29\n", 40 | "Non-outlier observations: 9971\n" 41 | 
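
The SimpleImputer and IterativeImputer evaluation notebooks above share the same scaffolding: wrap the imputer and a RandomForestClassifier in a Pipeline and score it with repeated stratified k-fold cross-validation. Below is a minimal supplementary sketch (not part of the repository) that factors that pattern into a reusable helper; it assumes the same local `horse-colic.csv` file with `?` marking missing values and column 23 as the target, exactly as in those notebooks.

```python
# Sketch: reusable, leakage-safe evaluation of any imputer on the horse colic
# dataset, mirroring the SimpleImputer/IterativeImputer notebooks above.
from numpy import mean, std
from pandas import read_csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 (enables IterativeImputer)
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

def evaluate_imputer(imputer, path='horse-colic.csv'):
    # load the dataset and split into inputs and output (column 23 is the target)
    dataframe = read_csv(path, header=None, na_values='?')
    data = dataframe.values
    ix = [i for i in range(data.shape[1]) if i != 23]
    X, y = data[:, ix], data[:, 23]
    # imputation happens inside the pipeline, so it is fit on training folds only
    pipeline = Pipeline(steps=[('i', imputer), ('m', RandomForestClassifier())])
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return mean(scores), std(scores)

for name, imp in [('mean', SimpleImputer(strategy='mean')), ('iterative', IterativeImputer())]:
    acc, dev = evaluate_imputer(imp)
    print('%s: %.3f (%.3f)' % (name, acc, dev))
```
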
] 42 | } 43 | ], 44 | "source": [ 45 | "# identify outliers with standard deviation\n", 46 | "from numpy.random import seed\n", 47 | "from numpy.random import randn\n", 48 | "from numpy import mean\n", 49 | "from numpy import std\n", 50 | "# seed the random number generator\n", 51 | "seed(1)\n", 52 | "# generate univariate observations\n", 53 | "data = 5 * randn(10000) + 50\n", 54 | "# calculate summary statistics\n", 55 | "data_mean, data_std = mean(data), std(data)\n", 56 | "# define outliers\n", 57 | "cut_off = data_std * 3\n", 58 | "lower, upper = data_mean - cut_off, data_mean + cut_off\n", 59 | "# identify outliers\n", 60 | "outliers = [x for x in data if x < lower or x > upper]\n", 61 | "print('Identified outliers: %d' % len(outliers))\n", 62 | "# remove outliers\n", 63 | "outliers_removed = [x for x in data if x >= lower and x <= upper]\n", 64 | "print('Non-outlier observations: %d' % len(outliers_removed))" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.6.4" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 2 96 | } 97 | -------------------------------------------------------------------------------- /IterativeImputer Data Transform.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# define imputer\n", 10 | "imputer = IterativeImputer(estimator=BayesianRidge(), n_nearest_features=None, imputation_order='ascending')" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# fit on the dataset\n", 20 | "imputer.fit(X)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# transform the dataset\n", 30 | "Xtrans = imputer.transform(X)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 1, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "Missing: 1605\n", 43 | "Missing: 0\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "# iterative imputation transform for the horse colic dataset\n", 49 | "from numpy import isnan\n", 50 | "from pandas import read_csv\n", 51 | "from sklearn.experimental import enable_iterative_imputer\n", 52 | "from sklearn.impute import IterativeImputer\n", 53 | "# load dataset\n", 54 | "url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'\n", 55 | "dataframe = read_csv(url, header=None, na_values='?')\n", 56 | "# split into input and output elements\n", 57 | "data = dataframe.values\n", 58 | "ix = [i for i in range(data.shape[1]) if i != 23]\n", 59 | "X, y = data[:, ix], data[:, 23]\n", 60 | "# print total missing\n", 61 | "print('Missing: %d' % sum(isnan(X).flatten()))\n", 62 | "# define imputer\n", 63 | "imputer = IterativeImputer()\n", 64 | "# fit on the dataset\n", 65 | 
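
The two outlier notebooks above (standard deviation and IQR) identify and remove outliers with list comprehensions. As a supplementary sketch only, the same cut-offs can be applied with NumPy boolean masks, which is faster and lets the mask index the array directly; the data generation matches the notebooks.

```python
# Sketch: the standard-deviation and IQR cut-offs from the notebooks above,
# applied with NumPy boolean masks instead of list comprehensions.
from numpy import mean, std, percentile
from numpy.random import seed, randn

seed(1)
data = 5 * randn(10000) + 50
# standard-deviation rule: keep values within 3 standard deviations of the mean
data_mean, data_std = mean(data), std(data)
lower, upper = data_mean - 3 * data_std, data_mean + 3 * data_std
sd_mask = (data >= lower) & (data <= upper)
print('Std-dev rule keeps %d of %d observations' % (sd_mask.sum(), len(data)))
# IQR rule: keep values within 1.5 * IQR of the 25th/75th percentiles
q25, q75 = percentile(data, 25), percentile(data, 75)
cut_off = (q75 - q25) * 1.5
iqr_mask = (data >= q25 - cut_off) & (data <= q75 + cut_off)
print('IQR rule keeps %d of %d observations' % (iqr_mask.sum(), len(data)))
# the masks can be used to filter directly, e.g. data[sd_mask]
```
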
"imputer.fit(X)\n", 66 | "# transform the dataset\n", 67 | "Xtrans = imputer.transform(X)\n", 68 | "# print total missing\n", 69 | "print('Missing: %d' % sum(isnan(Xtrans).flatten()))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.6.4" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /Automatic Outlier Detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "(506, 0) (506,)\n", 13 | "(339, 0) (167, 0) (339,) (167,)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "# load and summarize the dataset\n", 19 | "from pandas import read_csv\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "# load the dataset\n", 22 | "df = read_csv('housing.csv', header=None)\n", 23 | "# retrieve the array\n", 24 | "data = df.values\n", 25 | "# split into input and output elements\n", 26 | "X, y = data[:, :-1], data[:, -1]\n", 27 | "# summarize the shape of the dataset\n", 28 | "print(X.shape, y.shape)\n", 29 | "# split into train and test sets\n", 30 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)\n", 31 | "# summarize the shape of the train and test sets\n", 32 | "print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 10, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "MAE: 3.417\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "# evaluate model on the raw dataset\n", 50 | "from pandas import read_csv\n", 51 | "from sklearn.model_selection import train_test_split\n", 52 | "from sklearn.linear_model import LinearRegression\n", 53 | "from sklearn.metrics import mean_absolute_error\n", 54 | "# load the dataset\n", 55 | "url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'\n", 56 | "df = read_csv(url, header=None)\n", 57 | "# retrieve the array\n", 58 | "data = df.values\n", 59 | "# split into inpiut and output elements\n", 60 | "X, y = data[:, :-1], data[:, -1]\n", 61 | "# split into train and test sets\n", 62 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)\n", 63 | "# fit the model\n", 64 | "model = LinearRegression()\n", 65 | "model.fit(X_train, y_train)\n", 66 | "# evaluate the model\n", 67 | "yhat = model.predict(X_test)\n", 68 | "# evaluate predictions\n", 69 | "mae = mean_absolute_error(y_test, yhat)\n", 70 | "print('MAE: %.3f' % mae)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 11, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "(339, 13) (339,)\n", 83 | "(305, 13) (305,)\n", 84 | 
"MAE: 3.356\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "# evaluate model on training dataset with outliers removed\n", 90 | "from pandas import read_csv\n", 91 | "from sklearn.model_selection import train_test_split\n", 92 | "from sklearn.linear_model import LinearRegression\n", 93 | "from sklearn.neighbors import LocalOutlierFactor\n", 94 | "from sklearn.metrics import mean_absolute_error\n", 95 | "# load the dataset\n", 96 | "url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'\n", 97 | "df = read_csv(url, header=None)\n", 98 | "# retrieve the array\n", 99 | "data = df.values\n", 100 | "# split into inpiut and output elements\n", 101 | "X, y = data[:, :-1], data[:, -1]\n", 102 | "# split into train and test sets\n", 103 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)\n", 104 | "# summarize the shape of the training dataset\n", 105 | "print(X_train.shape, y_train.shape)\n", 106 | "# identify outliers in the training dataset\n", 107 | "lof = LocalOutlierFactor()\n", 108 | "yhat = lof.fit_predict(X_train)\n", 109 | "# select all rows that are not outliers\n", 110 | "mask = yhat != -1\n", 111 | "X_train, y_train = X_train[mask, :], y_train[mask]\n", 112 | "# summarize the shape of the updated training dataset\n", 113 | "print(X_train.shape, y_train.shape)\n", 114 | "# fit the model\n", 115 | "model = LinearRegression()\n", 116 | "model.fit(X_train, y_train)\n", 117 | "# evaluate the model\n", 118 | "yhat = model.predict(X_test)\n", 119 | "# evaluate predictions\n", 120 | "mae = mean_absolute_error(y_test, yhat)\n", 121 | "print('MAE: %.3f' % mae)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [] 130 | } 131 | ], 132 | "metadata": { 133 | "kernelspec": { 134 | "display_name": "Python 3", 135 | "language": "python", 136 | "name": "python3" 137 | }, 138 | "language_info": { 139 | "codemirror_mode": { 140 | "name": "ipython", 141 | "version": 3 142 | }, 143 | "file_extension": ".py", 144 | "mimetype": "text/x-python", 145 | "name": "python", 146 | "nbconvert_exporter": "python", 147 | "pygments_lexer": "ipython3", 148 | "version": "3.6.4" 149 | } 150 | }, 151 | "nbformat": 4, 152 | "nbformat_minor": 2 153 | } 154 | -------------------------------------------------------------------------------- /Data Cleansing Master Class - Data Preparation With Training and Testing Sets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "(1000, 20) (1000,)\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "# test classification dataset\n", 18 | "from sklearn.datasets import make_classification\n", 19 | "# define dataset\n", 20 | "X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)\n", 21 | "# summarize the dataset\n", 22 | "print(X.shape, y.shape) " 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "Accuracy: 84.848\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "# naive approach to normalizing the data before splitting the data and evaluating the model\n", 40 | "from sklearn.datasets import make_classification\n", 41 | 
"from sklearn.model_selection import train_test_split\n", 42 | "from sklearn.preprocessing import MinMaxScaler\n", 43 | "from sklearn.linear_model import LogisticRegression\n", 44 | "from sklearn.metrics import accuracy_score\n", 45 | "# define dataset\n", 46 | "X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)\n", 47 | "# standardize the dataset\n", 48 | "scaler = MinMaxScaler()\n", 49 | "X = scaler.fit_transform(X)\n", 50 | "# split into train and test sets\n", 51 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)\n", 52 | "# fit the model\n", 53 | "model = LogisticRegression()\n", 54 | "model.fit(X_train, y_train)\n", 55 | "# evaluate the model\n", 56 | "yhat = model.predict(X_test)\n", 57 | "# evaluate predictions\n", 58 | "accuracy = accuracy_score(y_test, yhat)\n", 59 | "print('Accuracy: %.3f' % (accuracy*100))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "Accuracy: 85.455\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "# correct approach for normalizing the data after the data is split before the model is evaluated\n", 77 | "from sklearn.datasets import make_classification\n", 78 | "from sklearn.model_selection import train_test_split\n", 79 | "from sklearn.preprocessing import MinMaxScaler\n", 80 | "from sklearn.linear_model import LogisticRegression\n", 81 | "from sklearn.metrics import accuracy_score\n", 82 | "# define dataset\n", 83 | "X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)\n", 84 | "# split into train and test sets\n", 85 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)\n", 86 | "# define the scaler\n", 87 | "scaler = MinMaxScaler()\n", 88 | "# fit on the training dataset\n", 89 | "scaler.fit(X_train)\n", 90 | "# scale the training dataset\n", 91 | "X_train = scaler.transform(X_train)\n", 92 | "# scale the test dataset\n", 93 | "X_test = scaler.transform(X_test)\n", 94 | "# fit the model\n", 95 | "model = LogisticRegression()\n", 96 | "model.fit(X_train, y_train)\n", 97 | "# evaluate the model\n", 98 | "yhat = model.predict(X_test)\n", 99 | "# evaluate predictions\n", 100 | "accuracy = accuracy_score(y_test, yhat)\n", 101 | "print('Accuracy: %.3f' % (accuracy*100))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "Accuracy: 85.300 (3.607)\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "# naive data preparation for model evaluation with k-fold cross-validation\n", 119 | "from numpy import mean\n", 120 | "from numpy import std\n", 121 | "from sklearn.datasets import make_classification\n", 122 | "from sklearn.model_selection import cross_val_score\n", 123 | "from sklearn.model_selection import RepeatedStratifiedKFold\n", 124 | "from sklearn.preprocessing import MinMaxScaler\n", 125 | "from sklearn.linear_model import LogisticRegression\n", 126 | "# define dataset\n", 127 | "X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)\n", 128 | "# standardize the dataset\n", 129 | "scaler = MinMaxScaler()\n", 130 | "X = scaler.fit_transform(X)\n", 131 | "# define the model\n", 132 | "model = LogisticRegression()\n", 133 | 
"# define the evaluation procedure\n", 134 | "cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n", 135 | "# evaluate the model using cross-validation\n", 136 | "scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)\n", 137 | "# report performance\n", 138 | "print('Accuracy: %.3f (%.3f)' % (mean(scores)*100, std(scores)*100))" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 5, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "Accuracy: 85.433 (3.471)\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "# correct data preparation for model evaluation with k-fold cross-validation\n", 156 | "from numpy import mean\n", 157 | "from numpy import std\n", 158 | "from sklearn.datasets import make_classification\n", 159 | "from sklearn.model_selection import cross_val_score\n", 160 | "from sklearn.model_selection import RepeatedStratifiedKFold\n", 161 | "from sklearn.preprocessing import MinMaxScaler\n", 162 | "from sklearn.linear_model import LogisticRegression\n", 163 | "from sklearn.pipeline import Pipeline\n", 164 | "# define dataset\n", 165 | "X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)\n", 166 | "# define the pipeline\n", 167 | "steps = list()\n", 168 | "steps.append(('scaler', MinMaxScaler()))\n", 169 | "steps.append(('model', LogisticRegression()))\n", 170 | "pipeline = Pipeline(steps=steps)\n", 171 | "# define the evaluation procedure\n", 172 | "cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n", 173 | "# evaluate the model using cross-validation\n", 174 | "scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)\n", 175 | "# report performance\n", 176 | "print('Accuracy: %.3f (%.3f)' % (mean(scores)*100, std(scores)*100))" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [] 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "display_name": "Python 3", 190 | "language": "python", 191 | "name": "python3" 192 | }, 193 | "language_info": { 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 3 197 | }, 198 | "file_extension": ".py", 199 | "mimetype": "text/x-python", 200 | "name": "python", 201 | "nbconvert_exporter": "python", 202 | "pygments_lexer": "ipython3", 203 | "version": "3.6.4" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 2 208 | } 209 | -------------------------------------------------------------------------------- /Statistical Imputation With KNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | " 0 1 2 3 4 5 6 7 8 9 ... 18 19 20 21 22 23 \\\n", 13 | "0 2 1 530101 38.50 66 28 3 3 ? 2 ... 45.00 8.40 ? ? 2 2 \n", 14 | "1 1 1 534817 39.2 88 20 ? ? 4 1 ... 50 85 2 2 3 2 \n", 15 | "2 2 1 530334 38.30 40 24 1 1 3 1 ... 33.00 6.70 ? ? 1 2 \n", 16 | "3 1 9 5290409 39.10 164 84 4 1 6 2 ... 48.00 7.20 3 5.30 2 1 \n", 17 | "4 2 1 530255 37.30 104 35 ? ? 6 2 ... 74.00 7.40 ? ? 
2 2 \n", 18 | "\n", 19 | " 24 25 26 27 \n", 20 | "0 11300 0 0 2 \n", 21 | "1 2208 0 0 2 \n", 22 | "2 0 0 0 1 \n", 23 | "3 2208 0 0 1 \n", 24 | "4 4300 0 0 2 \n", 25 | "\n", 26 | "[5 rows x 28 columns]\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "# summarize the horse colic dataset\n", 32 | "from pandas import read_csv\n", 33 | "# load dataset\n", 34 | "dataframe = read_csv('horse-colic.csv', header=None,)\n", 35 | "# summarize the first few rows\n", 36 | "print(dataframe.head())" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 13, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | " 0 1 2 3 4 5 6 7 8 9 ... 18 19 \\\n", 49 | "0 2.0 1 530101 38.5 66.0 28.0 3.0 3.0 NaN 2.0 ... 45.0 8.4 \n", 50 | "1 1.0 1 534817 39.2 88.0 20.0 NaN NaN 4.0 1.0 ... 50.0 85.0 \n", 51 | "2 2.0 1 530334 38.3 40.0 24.0 1.0 1.0 3.0 1.0 ... 33.0 6.7 \n", 52 | "3 1.0 9 5290409 39.1 164.0 84.0 4.0 1.0 6.0 2.0 ... 48.0 7.2 \n", 53 | "4 2.0 1 530255 37.3 104.0 35.0 NaN NaN 6.0 2.0 ... 74.0 7.4 \n", 54 | "\n", 55 | " 20 21 22 23 24 25 26 27 \n", 56 | "0 NaN NaN 2.0 2 11300 0 0 2 \n", 57 | "1 2.0 2.0 3.0 2 2208 0 0 2 \n", 58 | "2 NaN NaN 1.0 2 0 0 0 1 \n", 59 | "3 3.0 5.3 2.0 1 2208 0 0 1 \n", 60 | "4 NaN NaN 2.0 2 4300 0 0 2 \n", 61 | "\n", 62 | "[5 rows x 28 columns]\n", 63 | "> 27, Missing: 0 (0.0%)\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "# summarize the horse colic dataset\n", 69 | "from pandas import read_csv\n", 70 | "# load dataset\n", 71 | "dataframe = read_csv('horse-colic.csv', header=None, na_values='?')\n", 72 | "# summarize the first few rows\n", 73 | "print(dataframe.head())\n", 74 | "# summarize the number of rows with missing values for each column\n", 75 | "for i in range(dataframe.shape[1]):\n", 76 | "# count number of rows with missing values\n", 77 | " n_miss = dataframe[[i]].isnull().sum()\n", 78 | " perc = n_miss / dataframe.shape[0] * 100\n", 79 | "print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 17, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | " 0 1 2 3 4 5 6 7 8 9 ... 18 19 \\\n", 92 | "0 2.0 1 530101 38.5 66.0 28.0 3.0 3.0 NaN 2.0 ... 45.0 8.4 \n", 93 | "1 1.0 1 534817 39.2 88.0 20.0 NaN NaN 4.0 1.0 ... 50.0 85.0 \n", 94 | "2 2.0 1 530334 38.3 40.0 24.0 1.0 1.0 3.0 1.0 ... 33.0 6.7 \n", 95 | "3 1.0 9 5290409 39.1 164.0 84.0 4.0 1.0 6.0 2.0 ... 48.0 7.2 \n", 96 | "4 2.0 1 530255 37.3 104.0 35.0 NaN NaN 6.0 2.0 ... 
74.0 7.4 \n", 97 | "\n", 98 | " 20 21 22 23 24 25 26 27 \n", 99 | "0 NaN NaN 2.0 2 11300 0 0 2 \n", 100 | "1 2.0 2.0 3.0 2 2208 0 0 2 \n", 101 | "2 NaN NaN 1.0 2 0 0 0 1 \n", 102 | "3 3.0 5.3 2.0 1 2208 0 0 1 \n", 103 | "4 NaN NaN 2.0 2 4300 0 0 2 \n", 104 | "\n", 105 | "[5 rows x 28 columns]\n", 106 | "> 0, Missing: 1 (0.3%)\n", 107 | "> 1, Missing: 0 (0.0%)\n", 108 | "> 2, Missing: 0 (0.0%)\n", 109 | "> 3, Missing: 60 (20.0%)\n", 110 | "> 4, Missing: 24 (8.0%)\n", 111 | "> 5, Missing: 58 (19.3%)\n", 112 | "> 6, Missing: 56 (18.7%)\n", 113 | "> 7, Missing: 69 (23.0%)\n", 114 | "> 8, Missing: 47 (15.7%)\n", 115 | "> 9, Missing: 32 (10.7%)\n", 116 | "> 10, Missing: 55 (18.3%)\n", 117 | "> 11, Missing: 44 (14.7%)\n", 118 | "> 12, Missing: 56 (18.7%)\n", 119 | "> 13, Missing: 104 (34.7%)\n", 120 | "> 14, Missing: 106 (35.3%)\n", 121 | "> 15, Missing: 247 (82.3%)\n", 122 | "> 16, Missing: 102 (34.0%)\n", 123 | "> 17, Missing: 118 (39.3%)\n", 124 | "> 18, Missing: 29 (9.7%)\n", 125 | "> 19, Missing: 33 (11.0%)\n", 126 | "> 20, Missing: 165 (55.0%)\n", 127 | "> 21, Missing: 198 (66.0%)\n", 128 | "> 22, Missing: 1 (0.3%)\n", 129 | "> 23, Missing: 0 (0.0%)\n", 130 | "> 24, Missing: 0 (0.0%)\n", 131 | "> 25, Missing: 0 (0.0%)\n", 132 | "> 26, Missing: 0 (0.0%)\n", 133 | "> 27, Missing: 0 (0.0%)\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "# summarize the horse colic dataset\n", 139 | "from pandas import read_csv\n", 140 | "# load dataset\n", 141 | "dataframe = read_csv('horse-colic.csv', header=None, na_values='?')\n", 142 | "# summarize the first few rows\n", 143 | "print(dataframe.head())\n", 144 | "# summarize the number of rows with missing values for each column\n", 145 | "for i in range(dataframe.shape[1]):\n", 146 | "# count number of rows with missing values\n", 147 | " n_miss = dataframe[[i]].isnull().sum()\n", 148 | " perc = n_miss / dataframe.shape[0] * 100\n", 149 | " print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 1, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "Missing: 1605\n", 162 | "Missing: 0\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "# knn imputation transform for the horse colic dataset\n", 168 | "from numpy import isnan\n", 169 | "from pandas import read_csv\n", 170 | "from sklearn.impute import KNNImputer\n", 171 | "# load dataset\n", 172 | "dataframe = read_csv('horse-colic.csv', header=None, na_values='?')\n", 173 | "# split into input and output elements\n", 174 | "data = dataframe.values\n", 175 | "ix = [i for i in range(data.shape[1]) if i != 23]\n", 176 | "X, y = data[:, ix], data[:, 23]\n", 177 | "# summarize total missing\n", 178 | "print('Missing: %d' % sum(isnan(X).flatten()))\n", 179 | "# define imputer\n", 180 | "imputer = KNNImputer()\n", 181 | "# fit on the dataset\n", 182 | "imputer.fit(X)\n", 183 | "# transform the dataset\n", 184 | "Xtrans = imputer.transform(X)\n", 185 | "# summarize total missing\n", 186 | "print('Missing: %d' % sum(isnan(Xtrans).flatten()))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python 3", 200 | "language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | 
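
The KNN imputation cell above uses KNNImputer with its default of five neighbours. The repository also contains a notebook comparing different k values; as a hedged sketch (the specific k grid here is illustrative, not the repository's), the comparison can be run inside the same leakage-safe pipeline used for SimpleImputer.

```python
# Sketch: evaluate KNNImputer with several neighbour counts, assuming the
# same horse-colic.csv layout ('?' marks missing values, column 23 is the target).
from numpy import mean, std
from pandas import read_csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

dataframe = read_csv('horse-colic.csv', header=None, na_values='?')
data = dataframe.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
for k in [1, 3, 5, 7, 9]:
    # imputation is fit inside each training fold via the pipeline
    pipeline = Pipeline(steps=[('i', KNNImputer(n_neighbors=k)), ('m', RandomForestClassifier())])
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('>k=%d %.3f (%.3f)' % (k, mean(scores), std(scores)))
```
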
"file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.6.4" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 2 218 | } 219 | -------------------------------------------------------------------------------- /Statistical Imputation With SimpleImputer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | " 0 1 2 3 4 5 6 7 8 9 ... 18 19 20 21 22 23 \\\n", 13 | "0 2 1 530101 38.50 66 28 3 3 ? 2 ... 45.00 8.40 ? ? 2 2 \n", 14 | "1 1 1 534817 39.2 88 20 ? ? 4 1 ... 50 85 2 2 3 2 \n", 15 | "2 2 1 530334 38.30 40 24 1 1 3 1 ... 33.00 6.70 ? ? 1 2 \n", 16 | "3 1 9 5290409 39.10 164 84 4 1 6 2 ... 48.00 7.20 3 5.30 2 1 \n", 17 | "4 2 1 530255 37.30 104 35 ? ? 6 2 ... 74.00 7.40 ? ? 2 2 \n", 18 | "\n", 19 | " 24 25 26 27 \n", 20 | "0 11300 0 0 2 \n", 21 | "1 2208 0 0 2 \n", 22 | "2 0 0 0 1 \n", 23 | "3 2208 0 0 1 \n", 24 | "4 4300 0 0 2 \n", 25 | "\n", 26 | "[5 rows x 28 columns]\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "# summarize the horse colic dataset\n", 32 | "from pandas import read_csv\n", 33 | "# load dataset\n", 34 | "dataframe = read_csv('horse-colic.csv', header=None,)\n", 35 | "# summarize the first few rows\n", 36 | "print(dataframe.head())" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 13, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | " 0 1 2 3 4 5 6 7 8 9 ... 18 19 \\\n", 49 | "0 2.0 1 530101 38.5 66.0 28.0 3.0 3.0 NaN 2.0 ... 45.0 8.4 \n", 50 | "1 1.0 1 534817 39.2 88.0 20.0 NaN NaN 4.0 1.0 ... 50.0 85.0 \n", 51 | "2 2.0 1 530334 38.3 40.0 24.0 1.0 1.0 3.0 1.0 ... 33.0 6.7 \n", 52 | "3 1.0 9 5290409 39.1 164.0 84.0 4.0 1.0 6.0 2.0 ... 48.0 7.2 \n", 53 | "4 2.0 1 530255 37.3 104.0 35.0 NaN NaN 6.0 2.0 ... 74.0 7.4 \n", 54 | "\n", 55 | " 20 21 22 23 24 25 26 27 \n", 56 | "0 NaN NaN 2.0 2 11300 0 0 2 \n", 57 | "1 2.0 2.0 3.0 2 2208 0 0 2 \n", 58 | "2 NaN NaN 1.0 2 0 0 0 1 \n", 59 | "3 3.0 5.3 2.0 1 2208 0 0 1 \n", 60 | "4 NaN NaN 2.0 2 4300 0 0 2 \n", 61 | "\n", 62 | "[5 rows x 28 columns]\n", 63 | "> 27, Missing: 0 (0.0%)\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "# summarize the horse colic dataset\n", 69 | "from pandas import read_csv\n", 70 | "# load dataset\n", 71 | "dataframe = read_csv('horse-colic.csv', header=None, na_values='?')\n", 72 | "# summarize the first few rows\n", 73 | "print(dataframe.head())\n", 74 | "# summarize the number of rows with missing values for each column\n", 75 | "for i in range(dataframe.shape[1]):\n", 76 | "# count number of rows with missing values\n", 77 | " n_miss = dataframe[[i]].isnull().sum()\n", 78 | " perc = n_miss / dataframe.shape[0] * 100\n", 79 | "print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 17, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | " 0 1 2 3 4 5 6 7 8 9 ... 18 19 \\\n", 92 | "0 2.0 1 530101 38.5 66.0 28.0 3.0 3.0 NaN 2.0 ... 45.0 8.4 \n", 93 | "1 1.0 1 534817 39.2 88.0 20.0 NaN NaN 4.0 1.0 ... 50.0 85.0 \n", 94 | "2 2.0 1 530334 38.3 40.0 24.0 1.0 1.0 3.0 1.0 ... 
33.0 6.7 \n", 95 | "3 1.0 9 5290409 39.1 164.0 84.0 4.0 1.0 6.0 2.0 ... 48.0 7.2 \n", 96 | "4 2.0 1 530255 37.3 104.0 35.0 NaN NaN 6.0 2.0 ... 74.0 7.4 \n", 97 | "\n", 98 | " 20 21 22 23 24 25 26 27 \n", 99 | "0 NaN NaN 2.0 2 11300 0 0 2 \n", 100 | "1 2.0 2.0 3.0 2 2208 0 0 2 \n", 101 | "2 NaN NaN 1.0 2 0 0 0 1 \n", 102 | "3 3.0 5.3 2.0 1 2208 0 0 1 \n", 103 | "4 NaN NaN 2.0 2 4300 0 0 2 \n", 104 | "\n", 105 | "[5 rows x 28 columns]\n", 106 | "> 0, Missing: 1 (0.3%)\n", 107 | "> 1, Missing: 0 (0.0%)\n", 108 | "> 2, Missing: 0 (0.0%)\n", 109 | "> 3, Missing: 60 (20.0%)\n", 110 | "> 4, Missing: 24 (8.0%)\n", 111 | "> 5, Missing: 58 (19.3%)\n", 112 | "> 6, Missing: 56 (18.7%)\n", 113 | "> 7, Missing: 69 (23.0%)\n", 114 | "> 8, Missing: 47 (15.7%)\n", 115 | "> 9, Missing: 32 (10.7%)\n", 116 | "> 10, Missing: 55 (18.3%)\n", 117 | "> 11, Missing: 44 (14.7%)\n", 118 | "> 12, Missing: 56 (18.7%)\n", 119 | "> 13, Missing: 104 (34.7%)\n", 120 | "> 14, Missing: 106 (35.3%)\n", 121 | "> 15, Missing: 247 (82.3%)\n", 122 | "> 16, Missing: 102 (34.0%)\n", 123 | "> 17, Missing: 118 (39.3%)\n", 124 | "> 18, Missing: 29 (9.7%)\n", 125 | "> 19, Missing: 33 (11.0%)\n", 126 | "> 20, Missing: 165 (55.0%)\n", 127 | "> 21, Missing: 198 (66.0%)\n", 128 | "> 22, Missing: 1 (0.3%)\n", 129 | "> 23, Missing: 0 (0.0%)\n", 130 | "> 24, Missing: 0 (0.0%)\n", 131 | "> 25, Missing: 0 (0.0%)\n", 132 | "> 26, Missing: 0 (0.0%)\n", 133 | "> 27, Missing: 0 (0.0%)\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "# summarize the horse colic dataset\n", 139 | "from pandas import read_csv\n", 140 | "# load dataset\n", 141 | "dataframe = read_csv('horse-colic.csv', header=None, na_values='?')\n", 142 | "# summarize the first few rows\n", 143 | "print(dataframe.head())\n", 144 | "# summarize the number of rows with missing values for each column\n", 145 | "for i in range(dataframe.shape[1]):\n", 146 | "# count number of rows with missing values\n", 147 | " n_miss = dataframe[[i]].isnull().sum()\n", 148 | " perc = n_miss / dataframe.shape[0] * 100\n", 149 | " print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 19, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "Missing: 1605\n", 162 | "Missing: 0\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "# statistical imputation transform for the horse colic dataset\n", 168 | "from numpy import isnan\n", 169 | "from pandas import read_csv\n", 170 | "from sklearn.impute import SimpleImputer\n", 171 | "# load dataset\n", 172 | "dataframe = read_csv('horse-colic.csv', header=None, na_values='?')\n", 173 | "# split into input and output elements\n", 174 | "data = dataframe.values\n", 175 | "ix = [i for i in range(data.shape[1]) if i != 23]\n", 176 | "X, y = data[:, ix], data[:, 23]\n", 177 | "# summarize total missing\n", 178 | "print('Missing: %d' % sum(isnan(X).flatten()))\n", 179 | "# define imputer\n", 180 | "imputer = SimpleImputer(strategy='mean')\n", 181 | "# fit on the dataset\n", 182 | "imputer.fit(X)\n", 183 | "# transform the dataset\n", 184 | "Xtrans = imputer.transform(X)\n", 185 | "# summarize total missing\n", 186 | "print('Missing: %d' % sum(isnan(Xtrans).flatten()))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python 3", 200 | 
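
The per-column missing-value summaries in the KNN and SimpleImputer notebooks above loop over column indices. Pandas can produce the same table in a single vectorized call; a short supplementary sketch, assuming the same `horse-colic.csv` file with `?` as the missing marker:

```python
# Sketch: the per-column missing-value summary, vectorized with pandas.
from pandas import read_csv

dataframe = read_csv('horse-colic.csv', header=None, na_values='?')
n_miss = dataframe.isnull().sum()        # missing count per column
perc = n_miss / len(dataframe) * 100     # percentage missing per column
for col in dataframe.columns:
    print('> %d, Missing: %d (%.1f%%)' % (col, n_miss[col], perc[col]))
```
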
"language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.6.4" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 2 218 | } 219 | -------------------------------------------------------------------------------- /Remove Missing Values.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Accuracy: nan\n" 13 | ] 14 | }, 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:552: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", 20 | "Traceback (most recent call last):\n", 21 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 531, in _fit_and_score\n", 22 | " estimator.fit(X_train, y_train, **fit_params)\n", 23 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\discriminant_analysis.py\", line 425, in fit\n", 24 | " dtype=[np.float64, np.float32])\n", 25 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\base.py\", line 432, in _validate_data\n", 26 | " X, y = check_X_y(X, y, **check_params)\n", 27 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 72, in inner_f\n", 28 | " return f(**kwargs)\n", 29 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 802, in check_X_y\n", 30 | " estimator=estimator)\n", 31 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 72, in inner_f\n", 32 | " return f(**kwargs)\n", 33 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 645, in check_array\n", 34 | " allow_nan=force_all_finite == 'allow-nan')\n", 35 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 99, in _assert_all_finite\n", 36 | " msg_dtype if msg_dtype is not None else X.dtype)\n", 37 | "ValueError: Input contains NaN, infinity or a value too large for dtype('float64').\n", 38 | "\n", 39 | " FitFailedWarning)\n", 40 | "C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:552: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. 
Details: \n", 41 | "Traceback (most recent call last):\n", 42 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 531, in _fit_and_score\n", 43 | " estimator.fit(X_train, y_train, **fit_params)\n", 44 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\discriminant_analysis.py\", line 425, in fit\n", 45 | " dtype=[np.float64, np.float32])\n", 46 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\base.py\", line 432, in _validate_data\n", 47 | " X, y = check_X_y(X, y, **check_params)\n", 48 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 72, in inner_f\n", 49 | " return f(**kwargs)\n", 50 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 802, in check_X_y\n", 51 | " estimator=estimator)\n", 52 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 72, in inner_f\n", 53 | " return f(**kwargs)\n", 54 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 645, in check_array\n", 55 | " allow_nan=force_all_finite == 'allow-nan')\n", 56 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 99, in _assert_all_finite\n", 57 | " msg_dtype if msg_dtype is not None else X.dtype)\n", 58 | "ValueError: Input contains NaN, infinity or a value too large for dtype('float64').\n", 59 | "\n", 60 | " FitFailedWarning)\n", 61 | "C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:552: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", 62 | "Traceback (most recent call last):\n", 63 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 531, in _fit_and_score\n", 64 | " estimator.fit(X_train, y_train, **fit_params)\n", 65 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\discriminant_analysis.py\", line 425, in fit\n", 66 | " dtype=[np.float64, np.float32])\n", 67 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\base.py\", line 432, in _validate_data\n", 68 | " X, y = check_X_y(X, y, **check_params)\n", 69 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 72, in inner_f\n", 70 | " return f(**kwargs)\n", 71 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 802, in check_X_y\n", 72 | " estimator=estimator)\n", 73 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 72, in inner_f\n", 74 | " return f(**kwargs)\n", 75 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 645, in check_array\n", 76 | " allow_nan=force_all_finite == 'allow-nan')\n", 77 | " File \"C:\\Users\\PSMike\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\", line 99, in _assert_all_finite\n", 78 | " msg_dtype if msg_dtype is not None else X.dtype)\n", 79 | "ValueError: Input contains NaN, infinity or a value too large for dtype('float64').\n", 80 | "\n", 81 | " FitFailedWarning)\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "# example where missing values cause errors\n", 87 | "from numpy import nan\n", 88 | "from pandas import read_csv\n", 89 | "from sklearn.discriminant_analysis import 
LinearDiscriminantAnalysis\n", 90 | "from sklearn.model_selection import KFold\n", 91 | "from sklearn.model_selection import cross_val_score\n", 92 | "# load the dataset\n", 93 | "dataset = read_csv('pima-indians-diabetes.csv', header=None)\n", 94 | "# replace '0' values with 'nan'\n", 95 | "dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)\n", 96 | "# split dataset into inputs and outputs\n", 97 | "values = dataset.values\n", 98 | "X = values[:,0:8]\n", 99 | "y = values[:,8]\n", 100 | "# define the model\n", 101 | "model = LinearDiscriminantAnalysis()\n", 102 | "# define the model evaluation procedure\n", 103 | "cv = KFold(n_splits=3, shuffle=True, random_state=1)\n", 104 | "# evaluate the model\n", 105 | "result = cross_val_score(model, X, y, cv=cv, scoring='accuracy')\n", 106 | "# report the mean performance\n", 107 | "print('Accuracy: %.3f' % result.mean())" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "(768, 9)\n", 120 | "(392, 9)\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "# example of removing rows that contain missing values\n", 126 | "from numpy import nan\n", 127 | "from pandas import read_csv\n", 128 | "# load the dataset\n", 129 | "dataset = read_csv('pima-indians-diabetes.csv', header=None)\n", 130 | "# summarize the shape of the raw data\n", 131 | "print(dataset.shape)\n", 132 | "# replace '0' values with 'nan'\n", 133 | "dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)\n", 134 | "# drop rows with missing values\n", 135 | "dataset.dropna(inplace=True)\n", 136 | "# summarize the shape of the data with missing rows removed\n", 137 | "print(dataset.shape)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 6, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "Accuracy: 0.781\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "# evaluate model on data after rows with missing data are removed\n", 155 | "from numpy import nan\n", 156 | "from pandas import read_csv\n", 157 | "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", 158 | "from sklearn.model_selection import KFold\n", 159 | "from sklearn.model_selection import cross_val_score\n", 160 | "# load the dataset\n", 161 | "dataset = read_csv('pima-indians-diabetes.csv', header=None)\n", 162 | "# replace '0' values with 'nan'\n", 163 | "dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)\n", 164 | "# drop rows with missing values\n", 165 | "dataset.dropna(inplace=True)\n", 166 | "# split dataset into inputs and outputs\n", 167 | "values = dataset.values\n", 168 | "X = values[:,0:8]\n", 169 | "y = values[:,8]\n", 170 | "# define the model\n", 171 | "model = LinearDiscriminantAnalysis()\n", 172 | "# define the model evaluation procedure\n", 173 | "cv = KFold(n_splits=3, shuffle=True, random_state=1)\n", 174 | "# evaluate the model\n", 175 | "result = cross_val_score(model, X, y, cv=cv, scoring='accuracy')\n", 176 | "# report the mean performance\n", 177 | "print('Accuracy: %.3f' % result.mean())" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 3", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 
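
The Remove Missing Values notebook above first shows cross-validation failing on NaNs and then recovers accuracy by dropping incomplete rows, which discards almost half of the Pima dataset. As an illustrative alternative (my own sketch, not repository code, and the choice of mean imputation is an assumption), the marked NaNs can instead be imputed inside a pipeline so all 768 rows are kept.

```python
# Sketch: keep every row by imputing the marked NaNs instead of dropping them,
# reusing the leakage-safe pipeline pattern from the imputation notebooks.
from numpy import nan
from pandas import read_csv
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline

# load the dataset and mark zeros in columns 1-5 as missing, as above
dataset = read_csv('pima-indians-diabetes.csv', header=None)
dataset[[1, 2, 3, 4, 5]] = dataset[[1, 2, 3, 4, 5]].replace(0, nan)
values = dataset.values
X, y = values[:, 0:8], values[:, 8]
# impute within each training fold, then fit LDA
pipeline = Pipeline(steps=[('i', SimpleImputer(strategy='mean')),
                           ('m', LinearDiscriminantAnalysis())])
cv = KFold(n_splits=3, shuffle=True, random_state=1)
result = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')
print('Accuracy: %.3f' % result.mean())
```
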
195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.6.4" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 2 209 | } 210 | -------------------------------------------------------------------------------- /Mark Missing Values.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | " 0 1 2 3 4 5 6 7 8\n", 13 | "0 6 148 72 35 0 33.6 0.627 50 1\n", 14 | "1 1 85 66 29 0 26.6 0.351 31 0\n", 15 | "2 8 183 64 0 0 23.3 0.672 32 1\n", 16 | "3 1 89 66 23 94 28.1 0.167 21 0\n", 17 | "4 0 137 40 35 168 43.1 2.288 33 1\n", 18 | "5 5 116 74 0 0 25.6 0.201 30 0\n", 19 | "6 3 78 50 32 88 31.0 0.248 26 1\n", 20 | "7 10 115 0 0 0 35.3 0.134 29 0\n", 21 | "8 2 197 70 45 543 30.5 0.158 53 1\n", 22 | "9 8 125 96 0 0 0.0 0.232 54 1\n", 23 | "10 4 110 92 0 0 37.6 0.191 30 0\n", 24 | "11 10 168 74 0 0 38.0 0.537 34 1\n", 25 | "12 10 139 80 0 0 27.1 1.441 57 0\n", 26 | "13 1 189 60 23 846 30.1 0.398 59 1\n", 27 | "14 5 166 72 19 175 25.8 0.587 51 1\n", 28 | "15 7 100 0 0 0 30.0 0.484 32 1\n", 29 | "16 0 118 84 47 230 45.8 0.551 31 1\n", 30 | "17 7 107 74 0 0 29.6 0.254 31 1\n", 31 | "18 1 103 30 38 83 43.3 0.183 33 0\n", 32 | "19 1 115 70 30 96 34.6 0.529 32 1\n", 33 | "20 3 126 88 41 235 39.3 0.704 27 0\n", 34 | "21 8 99 84 0 0 35.4 0.388 50 0\n", 35 | "22 7 196 90 0 0 39.8 0.451 41 1\n", 36 | "23 9 119 80 35 0 29.0 0.263 29 1\n", 37 | "24 11 143 94 33 146 36.6 0.254 51 1\n", 38 | "25 10 125 70 26 115 31.1 0.205 41 1\n", 39 | "26 7 147 76 0 0 39.4 0.257 43 1\n", 40 | "27 1 97 66 15 140 23.2 0.487 22 0\n", 41 | "28 13 145 82 19 110 22.2 0.245 57 0\n", 42 | "29 5 117 92 0 0 34.1 0.337 38 0\n", 43 | ".. .. ... .. .. ... ... ... .. 
..\n", 44 | "738 2 99 60 17 160 36.6 0.453 21 0\n", 45 | "739 1 102 74 0 0 39.5 0.293 42 1\n", 46 | "740 11 120 80 37 150 42.3 0.785 48 1\n", 47 | "741 3 102 44 20 94 30.8 0.400 26 0\n", 48 | "742 1 109 58 18 116 28.5 0.219 22 0\n", 49 | "743 9 140 94 0 0 32.7 0.734 45 1\n", 50 | "744 13 153 88 37 140 40.6 1.174 39 0\n", 51 | "745 12 100 84 33 105 30.0 0.488 46 0\n", 52 | "746 1 147 94 41 0 49.3 0.358 27 1\n", 53 | "747 1 81 74 41 57 46.3 1.096 32 0\n", 54 | "748 3 187 70 22 200 36.4 0.408 36 1\n", 55 | "749 6 162 62 0 0 24.3 0.178 50 1\n", 56 | "750 4 136 70 0 0 31.2 1.182 22 1\n", 57 | "751 1 121 78 39 74 39.0 0.261 28 0\n", 58 | "752 3 108 62 24 0 26.0 0.223 25 0\n", 59 | "753 0 181 88 44 510 43.3 0.222 26 1\n", 60 | "754 8 154 78 32 0 32.4 0.443 45 1\n", 61 | "755 1 128 88 39 110 36.5 1.057 37 1\n", 62 | "756 7 137 90 41 0 32.0 0.391 39 0\n", 63 | "757 0 123 72 0 0 36.3 0.258 52 1\n", 64 | "758 1 106 76 0 0 37.5 0.197 26 0\n", 65 | "759 6 190 92 0 0 35.5 0.278 66 1\n", 66 | "760 2 88 58 26 16 28.4 0.766 22 0\n", 67 | "761 9 170 74 31 0 44.0 0.403 43 1\n", 68 | "762 9 89 62 0 0 22.5 0.142 33 0\n", 69 | "763 10 101 76 48 180 32.9 0.171 63 0\n", 70 | "764 2 122 70 27 0 36.8 0.340 27 0\n", 71 | "765 5 121 72 23 112 26.2 0.245 30 0\n", 72 | "766 1 126 60 0 0 30.1 0.349 47 1\n", 73 | "767 1 93 70 31 0 30.4 0.315 23 0\n", 74 | "\n", 75 | "[768 rows x 9 columns]\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "# load and summarize the dataset\n", 81 | "from pandas import read_csv\n", 82 | "# load the dataset\n", 83 | "dataset = read_csv('pima-indians-diabetes.csv', header=None)\n", 84 | "# summarize the dataset\n", 85 | "print(dataset)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 6, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | " 0 1 2 3 4 5 \\\n", 98 | "count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 \n", 99 | "mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 \n", 100 | "std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 \n", 101 | "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", 102 | "25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 \n", 103 | "50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 \n", 104 | "75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 \n", 105 | "max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 \n", 106 | "\n", 107 | " 6 7 8 \n", 108 | "count 768.000000 768.000000 768.000000 \n", 109 | "mean 0.471876 33.240885 0.348958 \n", 110 | "std 0.331329 11.760232 0.476951 \n", 111 | "min 0.078000 21.000000 0.000000 \n", 112 | "25% 0.243750 24.000000 0.000000 \n", 113 | "50% 0.372500 29.000000 0.000000 \n", 114 | "75% 0.626250 41.000000 1.000000 \n", 115 | "max 2.420000 81.000000 1.000000 \n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "# load and summarize the dataset\n", 121 | "from pandas import read_csv\n", 122 | "# load the dataset\n", 123 | "dataset = read_csv('pima-indians-diabetes.csv', header=None)\n", 124 | "# summarize the dataset\n", 125 | "print(dataset.describe())" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 2, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | " 0 1 2 3 4 5 6 7 8\n", 138 | "0 6 148 72 35 0 33.6 0.627 50 1\n", 139 | "1 1 85 66 29 0 26.6 0.351 31 0\n", 140 | "2 8 183 64 0 0 23.3 0.672 32 1\n", 141 | "3 1 89 66 23 94 
28.1 0.167 21 0\n", 142 | "4 0 137 40 35 168 43.1 2.288 33 1\n", 143 | "5 5 116 74 0 0 25.6 0.201 30 0\n", 144 | "6 3 78 50 32 88 31.0 0.248 26 1\n", 145 | "7 10 115 0 0 0 35.3 0.134 29 0\n", 146 | "8 2 197 70 45 543 30.5 0.158 53 1\n", 147 | "9 8 125 96 0 0 0.0 0.232 54 1\n", 148 | "10 4 110 92 0 0 37.6 0.191 30 0\n", 149 | "11 10 168 74 0 0 38.0 0.537 34 1\n", 150 | "12 10 139 80 0 0 27.1 1.441 57 0\n", 151 | "13 1 189 60 23 846 30.1 0.398 59 1\n", 152 | "14 5 166 72 19 175 25.8 0.587 51 1\n", 153 | "15 7 100 0 0 0 30.0 0.484 32 1\n", 154 | "16 0 118 84 47 230 45.8 0.551 31 1\n", 155 | "17 7 107 74 0 0 29.6 0.254 31 1\n", 156 | "18 1 103 30 38 83 43.3 0.183 33 0\n", 157 | "19 1 115 70 30 96 34.6 0.529 32 1\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "# load the dataset and review rows\n", 163 | "from pandas import read_csv\n", 164 | "# load the dataset\n", 165 | "dataset = read_csv('pima-indians-diabetes.csv', header=None)\n", 166 | "# summarize the first 20 rows of data\n", 167 | "print(dataset.head(20))" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 3, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "1 5\n", 180 | "2 35\n", 181 | "3 227\n", 182 | "4 374\n", 183 | "5 11\n", 184 | "dtype: int64\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "# example of summarizing the number of missing values for each variable\n", 190 | "from pandas import read_csv\n", 191 | "# load the dataset\n", 192 | "dataset = read_csv('pima-indians-diabetes.csv', header=None)\n", 193 | "# count the number of missing values for each column\n", 194 | "num_missing = (dataset[[1,2,3,4,5]] == 0).sum()\n", 195 | "# report the results\n", 196 | "print(num_missing)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 4, 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "name": "stdout", 206 | "output_type": "stream", 207 | "text": [ 208 | "0 0\n", 209 | "1 5\n", 210 | "2 35\n", 211 | "3 227\n", 212 | "4 374\n", 213 | "5 11\n", 214 | "6 0\n", 215 | "7 0\n", 216 | "8 0\n", 217 | "dtype: int64\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "# example of marking missing values with nan values\n", 223 | "from numpy import nan\n", 224 | "from pandas import read_csv\n", 225 | "# load the dataset\n", 226 | "dataset = read_csv('pima-indians-diabetes.csv', header=None)\n", 227 | "# replace '0' values with 'nan'\n", 228 | "dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)\n", 229 | "# count the number of nan values in each column\n", 230 | "print(dataset.isnull().sum())" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 5, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | " 0 1 2 3 4 5 6 7 8\n", 243 | "0 6 148.0 72.0 35.0 NaN 33.6 0.627 50 1\n", 244 | "1 1 85.0 66.0 29.0 NaN 26.6 0.351 31 0\n", 245 | "2 8 183.0 64.0 NaN NaN 23.3 0.672 32 1\n", 246 | "3 1 89.0 66.0 23.0 94.0 28.1 0.167 21 0\n", 247 | "4 0 137.0 40.0 35.0 168.0 43.1 2.288 33 1\n", 248 | "5 5 116.0 74.0 NaN NaN 25.6 0.201 30 0\n", 249 | "6 3 78.0 50.0 32.0 88.0 31.0 0.248 26 1\n", 250 | "7 10 115.0 NaN NaN NaN 35.3 0.134 29 0\n", 251 | "8 2 197.0 70.0 45.0 543.0 30.5 0.158 53 1\n", 252 | "9 8 125.0 96.0 NaN NaN NaN 0.232 54 1\n", 253 | "10 4 110.0 92.0 NaN NaN 37.6 0.191 30 0\n", 254 | "11 10 168.0 74.0 NaN NaN 38.0 0.537 34 1\n", 255 | "12 10 139.0 80.0 NaN NaN 27.1 1.441 57 0\n", 256 | "13 1 189.0 
60.0 23.0 846.0 30.1 0.398 59 1\n", 257 | "14 5 166.0 72.0 19.0 175.0 25.8 0.587 51 1\n", 258 | "15 7 100.0 NaN NaN NaN 30.0 0.484 32 1\n", 259 | "16 0 118.0 84.0 47.0 230.0 45.8 0.551 31 1\n", 260 | "17 7 107.0 74.0 NaN NaN 29.6 0.254 31 1\n", 261 | "18 1 103.0 30.0 38.0 83.0 43.3 0.183 33 0\n", 262 | "19 1 115.0 70.0 30.0 96.0 34.6 0.529 32 1\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "# example of review data with missing values marked with a nan\n", 268 | "from numpy import nan\n", 269 | "from pandas import read_csv\n", 270 | "# load the dataset\n", 271 | "dataset = read_csv('pima-indians-diabetes.csv', header=None)\n", 272 | "# replace '0' values with 'nan'\n", 273 | "dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)\n", 274 | "# summarize the first 20 rows of data\n", 275 | "print(dataset.head(20))" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [] 284 | } 285 | ], 286 | "metadata": { 287 | "kernelspec": { 288 | "display_name": "Python 3", 289 | "language": "python", 290 | "name": "python3" 291 | }, 292 | "language_info": { 293 | "codemirror_mode": { 294 | "name": "ipython", 295 | "version": 3 296 | }, 297 | "file_extension": ".py", 298 | "mimetype": "text/x-python", 299 | "name": "python", 300 | "nbconvert_exporter": "python", 301 | "pygments_lexer": "ipython3", 302 | "version": "3.6.4" 303 | } 304 | }, 305 | "nbformat": 4, 306 | "nbformat_minor": 2 307 | } 308 | -------------------------------------------------------------------------------- /Comparing Different Imputed Statistics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | ">mean 0.862 (0.053)\n", 13 | ">median 0.866 (0.057)\n", 14 | ">most_frequent 0.876 (0.058)\n", 15 | ">constant 0.878 (0.047)\n" 16 | ] 17 | }, 18 | { 19 | "data": { 20 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAD9CAYAAABQvqc9AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAF8FJREFUeJzt3X9wXeV95/H3x4qwSSHExtpsFmPsZNxGrjaFroZkEyex0/wA2o0TyKa4SRa6WjyeKeoOkCwkYjaMGdGwZeg2lES1I5eFoXIJu6XuNgNlHDlUbTy1HGzzQ3VwTBoUZxKxEJrEYGT7u3/cR+Raln2vpCPdq/t8XjN3fM5znnPu9z4++ujo3HPPVURgZmZ5mFfrAszMbPY49M3MMuLQNzPLiEPfzCwjDn0zs4w49M3MMuLQNzPLiEPfzCwjDn0zs4y8rtYFjLd48eJYtmxZrcswM5tTdu/e/XxEtFTqV3ehv2zZMgYHB2tdhpnZnCLpn6vp59M7ZmYZceibmWXEoW9mlhGHvplZRhz6ZmYZqRj6krZI+rGkJ0+xXJK+JOmApH2Sfr1s2VWSnkmPq4os3MxsJvX19dHW1kZTUxNtbW309fXVuqRCVHPJ5j3AnwD3nmL5pcCK9HgH8BXgHZIWAV8A2oEAdkvaFhEvTrdoM7OZ1NfXR1dXF729vaxatYqBgQE6OjoAWLduXY2rm56KR/oR8Rjwwmm6rAXujZKdwBslvRn4MPBoRLyQgv5R4JIiijYzm0nd3d309vayZs0ampubWbNmDb29vXR3d9e6tGkr4pz+ecBzZfPDqe1U7SeRtF7SoKTBkZGRAkoyM5u6oaEhVq1adULbqlWrGBoaqlFFxSki9DVBW5ym/eTGiE0R0R4R7S0tFT9FbGY2o1pbWxkYGDihbWBggNbW1hpVVJwiQn8YOL9sfglw6DTtZmZ1rauri46ODvr7+xkdHaW/v5+Ojg66urpqXdq0FXHvnW3AtZK2Unoj96WI+KGkR4DbJC1M/T4EfK6A5zMzm1Fjb9Z2dnYyNDREa2sr3d3dc/5NXKgi9CX1AauBxZKGKV2R0wwQET3A14HLgAPAYeB307IXJN0K7Eqb2hgRp3tD2Mysbqxbt64hQn68iqEfEad91RERwO+dYtkWYMvUSjMzs6L5E7lmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWkYrfkZsjSYVur/Q1wvkqcjw9lt43bXoc+hOo5gdBkn9gquTxLE61Y+TxtFPx6R0zs4w49M3MMuLQNzPLiEPfzCwjDn0zs4w49M3MMuLQNzPLSFWhL+kSSfslHZB00wTLL5C0XdI+STskLSlbdkzSnvTYVmTxZmY2ORU/nCWpCbgb+CAwDOyStC0ini7rdgdwb0T8L0nvB/4A+HRa9nJEXFhw3WZmNgXVHOlfDByIiIMR8SqwFVg7rs9KYHua7p9guZmZ1YFqQv884Lmy+eHUVm4vcEWa/hhwtqRz0/wCSYOSdkr66ERPIGl96jM4MjIyifLNzGwyqgn9ie7wNP6mHp8B3ifpceB9wA+Ao2nZ0ohoB34H+J+S3nrSxiI2RUR7RLS3tLRUX72ZmU1KNTdcGwbOL5tfAhwq7xARh4DLASSdBVwRES+VLSMiDkraAVwEfHfalZuZ2aRVc6S/C1ghabmkM4ArgROuwpG0WNLYtj4HbEntCyXNH+sDvBsofwPYzMxmUcXQj4ijwLXAI8AQ8EBEPCVpo6SPpG6rgf2SvgO8CehO7a3AoKS9lN7g/eK4q37MzGwWqd7uud3e3h6Dg4O1LqMi36+8WB7PYnk88yNpd3r/9LT8iVwzs4w49M3MMuLQNzPLiEPfzCwjDn0zs4w49M3MMuLQNzPLSDW3YTAza0jSRLcWm7q58NkIh76ZZauakG60D7r59I6ZWUYc+mZmGXHom5llxKFvZpYRh76ZWUYc+mZmGXHom5llxKFvZpYRh76ZWUYc+mZmGXHom5llxKFvZpYRh76ZWUYc+mZmGXHom5llxKFvZpYRh76ZWUYc+mZmGXHom5llpKrQl3SJpP2SDki6aYLlF0jaLmmfpB2SlpQtu0rSM+lxVZHFm5nZ5FQMfUlNwN3ApcBKYJ2kleO63QHcGxFvBzYCf5DWXQR8AXgHcDHwBUkLiyvfzMwmo5oj/YuBAxFxMCJeBbYCa8f1WQlsT9P9Zcs/DDwaES9ExIvAo8Al0y/bzMymoprQPw94rmx+OLWV2wtckaY/Bpwt6dwq10XSekmDkgZHRkaqrX3SFi1ahKRCHqnuQh6LFi2asdc8kzyexfJ4Fquo8YTGGsvXVdFHE7TFuPnPAH8i6WrgMeAHwNEq1yUiNgGbANrb209aXpQXX3yRiBnb/JSN7VhzjcezWB7PYtXjeNbDWFYT+sPA+WXzS4BD5R0i4hBwOYCks4ArIuIlScPA6nHr7phGvWZmNg3VnN7ZBayQtFzSGcCVwLbyDpIWSxrb1ueALWn6EeBDkhaq9Abuh1KbmZnVQMXQj4ijwLWUwnoIeCAinpK0UdJHUrfVwH5J3wHeBHSndV8AbqX0i2MXsDG1mZlZDajeznm1t7fH4ODgjGxbUt2d44P6rauSeq27XuuqpF7rrte6KqnHumeyJkm7I6K9Uj9/ItfMLCMOfTOzjDj0zcwy4tA3M8uIQ9/MLCMOfTOzjDj0zcwy4tA3M8uIQ9/M7BRGDo9w9cNX8/zLz9e6lMI49M3MTqFnXw/f/tG36dnbU+tSCuPQN2swjXh0Wgsjh0f4qwN/RRA8dOChhhlPh75Zg2nEo9Na6NnXw/E4DsDxON4w4+nQN2sgjXp0OtvGxnH0+CgAo8dHG2Y8HfpmDaRRj05nW/k4jmmU8XTomzWIRj46nW17f7z3tXEcM3p8lD0/3lOjiopTzdclmtkccLqj05vfeXONqpqbHvzIg7UuYcb4SN+sQTTy0akVx0f6VnMjh0f47GOf5Y733cHiMxfXupw5q5GPTq04PtK3mvMlhmazJ6vvyOWWcwrZzEjTPD7bspg7Rp5n8bHjlVeoxi0vFbOd2VTAeI40zePSJf+GI/PmMf/4cR4ePlTMmGY6njPG41mcGRrLar8jN6vQL+pLiW/deStf2/81PvErnyjkDbJ6/ALnahRR9607b+Uvn/lLRo+P0jyvmctXXD7tMc15PGdCvdY1W+bK6/cXo88Qf/ilOL7E0Gz2OfQnyR9+KU4jfwDGrF459CfBR6bF8iWGZrPPl2xOgj/8UixfYmg2+3ykPwk+MjWzuc5H+pPgI1Mzm+t8pG9mlpGqQl/SJZL2Szog6aYJli+V1C/pcUn7JF2W2pdJelnSnvTwZRlmZjVU8fSOpCbgbuCDwDCwS9K2iHi6rNvNwAMR8RVJK4GvA8vSsu9GxIXFlm1mZlNRzZH+xcCBiDgYEa8CW4G14/oE8IY0fQ5wqL
gSzcysKNWE/nnAc2Xzw6mt3C3ApyQNUzrK7yxbtjyd9vmmpPdMp1gzM5ueakJfE7SNvxHFOuCeiFgCXAbcJ2ke8ENgaURcBFwP/LmkN4xbF0nrJQ1KGhwZGZncKzAzs6pVE/rDwPll80s4+fRNB/AAQER8C1gALI6IIxHx/1L7buC7wC+Pf4KI2BQR7RHR3tLSMvlXYWZmVakm9HcBKyQtl3QGcCWwbVyf7wO/ASCplVLoj0hqSW8EI+ktwArgYFHFm5nZ5FS8eicijkq6FngEaAK2RMRTkjYCgxGxDbgB2CzpOkqnfq6OiJD0XmCjpKPAMWBDRLwwY6/GzMxOy/fTrwP1Wlcl9Vp3vdZVSb3WXa91zZa58vp9P30zMzuJQ9/MLCMOfTOzjDj0zcwy4tA3M8uIQ9/MLCMOfTOzjGT3zVnSRLcSqq2FCxfWugSrE94/Z1e1411tv7lwPX9WoV/kf8hc+cCGzR3eP2dfjmPk0ztmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGsvoSFSuev+nJbG5x6NuU+ZuezOYen94xM8tIVaEv6RJJ+yUdkHTTBMuXSuqX9LikfZIuK1v2ubTefkkfLrJ4MzObnIqhL6kJuBu4FFgJrJO0cly3m4EHIuIi4Ergy2ndlWn+V4FLgC+n7ZmZ1bXOzk4WLFiAJBYsWEBnZ2etSypENUf6FwMHIuJgRLwKbAXWjusTwBvS9DnAoTS9FtgaEUci4lngQNqemVnd6uzspKenh9tuu42f//zn3HbbbfT09DRE8FcT+ucBz5XND6e2crcAn5I0DHwdGBuZatY1M6srmzdv5vbbb+f666/n9a9/Pddffz233347mzdvrnVp01ZN6E90Td74yyzWAfdExBLgMuA+SfOqXBdJ6yUNShocGRmpoiSzPEmq6lFtX5vYkSNH2LBhwwltGzZs4MiRIzWqqDjVhP4wcH7Z/BJ+cfpmTAfwAEBEfAtYACyucl0iYlNEtEdEe0tLS/XVm2UmIgp92MTmz59PT0/PCW09PT3Mnz+/RhUVp5rQ3wWskLRc0hmU3pjdNq7P94HfAJDUSin0R1K/KyXNl7QcWAH8Y1HFm5nNhGuuuYYbb7yRO++8k8OHD3PnnXdy4403cs0119S6tGmr+OGsiDgq6VrgEaAJ2BIRT0naCAxGxDbgBmCzpOsonb65OkqHEU9JegB4GjgK/F5EHJupF2NmVoS77roLgM9//vPccMMNzJ8/nw0bNrzWPpep3v7Ea29vj8HBwVqXUZE/QVosj6fZ9EjaHRHtlfr5E7lmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhPo6+ujra2NpqYm2tra6Ovrq3VJhaj4dYlmZrnp6+ujq6uL3t5eVq1axcDAAB0dHQCsW7euxtVNj4/0zczG6e7upre3lzVr1tDc3MyaNWvo7e2lu7u71qVNm78jdwKSCt1evY3xbCtyPHMfS5sdTU1NvPLKKzQ3N7/WNjo6yoIFCzh27FgNKzs1f0fuNEREoY/ceSxtrmltbWVgYOCEtoGBAVpbW2tUUXEc+mZm43R1ddHR0UF/fz+jo6P09/fT0dFBV1dXrUubNr+Ra2Y2ztibtZ2dnQwNDdHa2kp3d/ecfxMXfE7fzKwh+Jy+mZmdxKFvZpYRh76ZWUYc+mZmGakq9CVdImm/pAOSbppg+R9J2pMe35H0k7Jlx8qWbSuyeDMzm5yKl2xKagLuBj4IDAO7JG2LiKfH+kTEdWX9O4GLyjbxckRcWFzJZmY2VdUc6V8MHIiIgxHxKrAVWHua/uuAxrgdnZlZg6km9M8DniubH05tJ5F0AbAc+EZZ8wJJg5J2SvrolCs1M7Npq+YTuRPdLetUn+i6EngwIsrvSLQ0Ig5JegvwDUlPRMR3T3gCaT2wHmDp0qVVlGRmZlNRzZH+MHB+2fwS4NAp+l7JuFM7EXEo/XsQ2MGJ5/vH+myKiPaIaG9paamiJDMzm4pqQn8XsELScklnUAr2k67CkfQrwELgW2VtCyXNT9OLgXcDT49f18zMZkfF0zsRcVTStcAjQBOwJSKekrQRGIyIsV8A64CtceLNfFqBP5V0nNIvmC+WX/VjZmazyzdcMzNrAL7hmpmZncShb2aWEYe+mVlGHPpmZhlx6Js1kL6+Ptra2mhqaqKtrY2+Pt8RxU7k78g1axB9fX10dXXR29vLqlWrGBgYoKOjA6AhvtvViuFLNs0aRFtbG3fddRdr1qx5ra2/v5/Ozk6efPLJGlZms6HaSzYd+mYNoqmpiVdeeYXm5ubX2kZHR1mwYAHHjh07zZrWCHydvllmWltbGRgYOKFtYGCA1tbWGlVk9cihb9Ygurq66OjooL+/n9HRUfr7++no6KCrq6vWpVkd8Ru5Zg1i7M3azs5OhoaGaG1tpbu722/i2gl8Tt/MrAH4nL6ZmZ3EoW9mlhGHvplZRhz6ZmYZceibmWWk7q7ekTQC/HOt66jCYuD5WhfRQDyexfJ4FmeujOUFEdFSqVPdhf5cIWmwmsujrDoez2J5PIvTaGPp0ztmZhlx6JuZZcShP3Wbal1Ag/F4FsvjWZyGGkuf0zczy4iP9M3MMuLQt7ohaYek9jT9dUlvrHVNZmMkfX6a639U0sqi6pkqh77VpYi4LCJ+Uus6akXSMkm/U0W/Pkn7JF03G3VVq9r655hphT7wUcChXw/SDvpPkr4q6UlJ90v6gKS/l/SMpIsl/ZKkLZJ2SXpc0tqydf9O0rfT412pfXU6cn0wbft+SartKy3eNMfuTElbU2j9BXBm2Xa/J2lxmn5I0m5JT0laX9bnZ5K6Je2VtFPSm2Z9AGbOMuC0oSnpXwPvioi3R8QfjVtW6+/KWEaF+meKpP+U9qm9ku6TdIGk7altu6Slqd89kr4k6R8kHZT08dT+ZkmPSdqT9un3SPoicGZquz/1q3q/TLnwEeAP0zbeWoOhKYmI7B+UdtCjwL+l9ItwN7AFELAWeAi4DfhU6v9G4DvALwGvBxak9hXAYJpeDbwELEnb/Bawqtavtc7G7npgS2p/e9pOe5r/HrA4TS9K/54JPAmcm+YD+A9p+n8AN9d4HP4J+Gqq8X7gA8DfA88AFwOL0njsA3YCb0/rvg/Ykx6PA2en5S+ltutO8Zz7gJdTn/cAO9JYfxO4AWgB/jewKz3endY7F/jb9Fx/SukT8IvTa3iybPufAW5J028FHk7/v38HvC213wN8CfgH4CDw8dResf4Z+n/4VWB/+b4D/DVwVZr/z8BDZbV/Le23K4EDqf0GoCtNNwFnp+mfjXuuSe2X6fk+XvOf2VoXUA+PtLM/UzZ/L/DJNP2WtOMOpv/YsR/O7wOtwDnAfcATqf1wWm818GjZNr9CCr5Gekxz7B4C3l+27
reZOPRvAfamx0vAO1P7EX5xBdpvA1+t8ThU+uV3F/CF1P/9wJ40/df8IpDPovSNdquB/1vFc5aH9A7gy2Xzf0460ACWAkNp+kvAf0/Tv5lCqlLobwdWpOl3AN9I0/cwcXBWrH+G/h86ge5xbc8DzWm6GXi+rPZPlvX7afr3vcCBtN9dWLZ8fOhPar+kTkK/1n8C1pMjZdPHy+aPU/ohPAZcERH7y1eSdAvwI+DXKO34r5xim8do3K+nnOrYQSlwTknSakpHzP8+Ig5L2gEsSItHI/00UR/j+2xEPAEg6Slge0SEpCcoBeoFwBUAEfENSedKOofSXwN3ptMG/ycihqdxJvAvyqY/AKws29YbJJ1NKdQuT3X8jaQXT7dBSWcB7wK+Vrat+WVdHoqI48DTdXCKTVTYp8YtL993SztkxGOS3kvpF+J9kv4wIu494Unm1n55Ap/Tr94jQOfYeXlJF6X2c4Afpp3+05T+HLQTnWrsHgM+mdraKJ3iGe8c4MX0g/U24J2zUO9UVfrlN1GSR0R8EfgvlE4T7Eyvc6p+XjY9j1IoXZge50XET8eed4J1j3JiJoyF2DzgJ2XbuTAiWsv6nRScNbQd+ISkcwEkLaJ06unKtPyTwMDpNiDpAuDHEbEZ6AV+PS0aldScpqeyX/6U0qm7mnLoV+9WSn8a7pP0ZJoH+DJwlaSdwC9z4g+dlZxq7L4CnCVpH/DfgH+cYN2HgdelPrdSOlc8V5X/kltN6TTDv0h6a0Q8ERG3UzoV9jaKCYi/Ba4dm5F04QR1XAosTO0/Av5V+gtkPvBbABHxL8Czkv5jWkeSfq3Cc9ck4CLiKaAb+KakvcCdwO8Dv5v2oU8D/7XCZlYDeyQ9Tukvsz9O7Zso7cP3M7X9civw2XQxQ83eyPUncs0KIGkZpXPYbWn+njT/4NgySqdV/gxYDhwG1kfEPkl3AWsonQp4Gria0l8HD1M6135PjLs65xTPuQP4TEQMpvnFwN2U3j95HfBYRGxIR8F9advfpHSq599FxPOSfp9SSD4L/AD4XkTcImk5pV/Sb6b0C3xrRGwsf53pOX8WEWelI+LT1m+14dA3y5yk71F6A30u3DPepsmnd8zMMuIjfbM6J+nDwO3jmp+NiI/Voh6b2xz6ZmYZ8ekdM7OMOPTNzDLi0Dczy4hD38wsIw59M7OM/H8Ti6xFFCEowAAAAABJRU5ErkJggg==\n", 21 | "text/plain": [ 22 | "" 23 | ] 24 | }, 25 | "metadata": { 26 | "needs_background": "light" 27 | }, 28 | "output_type": "display_data" 29 | } 30 | ], 31 | "source": [ 32 | "# compare statistical imputation strategies for the horse colic dataset\n", 33 | "from numpy import mean\n", 34 | "from numpy import std\n", 35 | "from pandas import read_csv\n", 36 | "from sklearn.ensemble import RandomForestClassifier\n", 37 | "from sklearn.impute import SimpleImputer\n", 38 | "from sklearn.model_selection import cross_val_score\n", 39 | "from sklearn.model_selection import RepeatedStratifiedKFold\n", 40 | "from sklearn.pipeline import Pipeline\n", 41 | "from matplotlib import pyplot\n", 42 | "# load dataset\n", 43 | "url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'\n", 44 | "dataframe = read_csv(url, header=None, na_values='?')\n", 45 | "# split into input and output elements\n", 46 | "data = dataframe.values\n", 47 | "ix = [i for i in range(data.shape[1]) if i != 23]\n", 48 | "X, y = data[:, ix], data[:, 23]\n", 49 | "# evaluate each strategy on the dataset\n", 50 | "results = list()\n", 51 | "strategies = ['mean', 'median', 'most_frequent', 'constant']\n", 52 | "for s in strategies:\n", 53 | "\t# create the modeling pipeline\n", 54 | "\tpipeline = Pipeline(steps=[('i', SimpleImputer(strategy=s)), ('m', RandomForestClassifier())])\n", 55 | "\t# evaluate the model\n", 56 | "\tcv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n", 57 | "\tscores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)\n", 58 | "\t# store results\n", 59 | "\tresults.append(scores)\n", 60 | "\tprint('>%s %.3f (%.3f)' % (s, mean(scores), std(scores)))\n", 61 | "# plot model performance for comparison\n", 62 | "pyplot.boxplot(results, labels=strategies, showmeans=True)\n", 63 | "pyplot.show()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "Predicted Class: 2\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "# constant imputation strategy and prediction for the hose colic dataset\n", 81 | "from numpy import nan\n", 82 | "from pandas import read_csv\n", 83 | "from sklearn.ensemble import 
RandomForestClassifier\n", 84 | "from sklearn.impute import SimpleImputer\n", 85 | "from sklearn.pipeline import Pipeline\n", 86 | "# load dataset\n", 87 | "url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'\n", 88 | "dataframe = read_csv(url, header=None, na_values='?')\n", 89 | "# split into input and output elements\n", 90 | "data = dataframe.values\n", 91 | "ix = [i for i in range(data.shape[1]) if i != 23]\n", 92 | "X, y = data[:, ix], data[:, 23]\n", 93 | "# create the modeling pipeline\n", 94 | "pipeline = Pipeline(steps=[('i', SimpleImputer(strategy='constant')), ('m', RandomForestClassifier())])\n", 95 | "# fit the model\n", 96 | "pipeline.fit(X, y)\n", 97 | "# define new data\n", 98 | "row = [2, 1, 530101, 38.50, 66, 28, 3, 3, nan, 2, 5, 4, 4, nan, nan, nan, 3, 5, 45.00, 8.40, nan, nan, 2, 11300, 00000, 00000, 2]\n", 99 | "# make a prediction\n", 100 | "yhat = pipeline.predict([row])\n", 101 | "# summarize prediction\n", 102 | "print('Predicted Class: %d' % yhat[0])" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 3", 116 | "language": "python", 117 | "name": "python3" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 3 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython3", 129 | "version": "3.6.4" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 2 134 | } 135 | -------------------------------------------------------------------------------- /KNNImputer and Model Evaluation Different K-Values.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# define modeling pipeline\n", 10 | "model = RandomForestClassifier()\n", 11 | "imputer = KNNImputer()\n", 12 | "pipeline = Pipeline(steps=[('i', imputer), ('m', model)])" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "Mean Accuracy: 0.867 (0.051)\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "# evaluate knn imputation and random forest for the horse colic dataset\n", 30 | "from numpy import mean\n", 31 | "from numpy import std\n", 32 | "from pandas import read_csv\n", 33 | "from sklearn.ensemble import RandomForestClassifier\n", 34 | "from sklearn.impute import KNNImputer\n", 35 | "from sklearn.model_selection import cross_val_score\n", 36 | "from sklearn.model_selection import RepeatedStratifiedKFold\n", 37 | "from sklearn.pipeline import Pipeline\n", 38 | "# load dataset\n", 39 | "url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'\n", 40 | "dataframe = read_csv(url, header=None, na_values='?')\n", 41 | "# split into input and output elements\n", 42 | "data = dataframe.values\n", 43 | "ix = [i for i in range(data.shape[1]) if i != 23]\n", 44 | "X, y = data[:, ix], data[:, 23]\n", 45 | "# define modeling pipeline\n", 46 | "model = RandomForestClassifier()\n", 47 | "imputer = KNNImputer()\n", 48 | "pipeline = Pipeline(steps=[('i', imputer), ('m', model)])\n", 49 | "# define model 
evaluation\n", 50 | "cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n", 51 | "# evaluate model\n", 52 | "scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')\n", 53 | "print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | ">1 0.863 (0.049)\n", 66 | ">3 0.863 (0.053)\n", 67 | ">5 0.867 (0.054)\n", 68 | ">7 0.868 (0.050)\n", 69 | ">9 0.859 (0.052)\n", 70 | ">15 0.857 (0.054)\n", 71 | ">18 0.863 (0.052)\n", 72 | ">21 0.860 (0.054)\n" 73 | ] 74 | }, 75 | { 76 | "data": { 77 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAD8CAYAAAB3u9PLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAGlJJREFUeJzt3X+QHOV95/H3RwsSzoGwxK5dnFYgOaezkY1POGPZVTkjYg5bUuUQPxwiHf5BijrFvoiKsaEsXfCZE6Ec13FxkisCJc5CNoXRCc4Y1cVGdnSSk7oCRyPrt3SCRXasZXVmsDHBJVtI6Ht/9LO4Gc3u9K5m1TO7n1fV1PZ0P/3Mt3dn5rP9dM+0IgIzM7NJZRdgZmbtwYFgZmaAA8HMzBIHgpmZAQ4EMzNLHAhmZgY4EMzMLCkUCJIWSjooqU/SygbLL5a0WdJuSVsl9ab5vyNpZ+72K0nXpGXrJP0wt2xeazfNzMxGQs0+mCapC3gGuAroB7YByyJif67No8D/ioivSvog8AcR8bG6fqYDfUBvRByVtC6t81grN8jMzEbnrAJt5gN9EXEIQNJ6YAmwP9dmLnBrmt4CfLNBPx8Bvh0RR0dbbHd3d8yaNWu0q5uZTUjbt29/MSJ6mrUrEggzgMO5+/3A++ra7AKuB/4SuBY4T9IFEfHTXJulwJ/XrXe3pP8EbAZWRsSx4QqZNWsW1Wq1QMlmZjZI0j8WaVfkGIIazKsfZ7oNWCBpB7AAeB44kSvmQuBSYFNunVXAO4D3AtOBzzV8cGm5pKqkaq1WK1CumZmNRpFA6Adm5u73AgP5BhExEBHXRcRlwJ+keS/nmtwAPB4Rx3PrHInMMeBBsqGpU0TEmoioRESlp6fpHo+ZmY1SkUDYBsyRNFvSZLKhn435BpK6JQ32tQpYW9fHMuCRunUuTD8FXAPsHXn5ZmbWKk0DISJOACvIhnsOABsiYp+k1ZKuTs2uAA5KegZ4K3D34PqSZpHtYXyvruuHJe0B9gDdwJ+e1paYmdlpaXraaTupVCrhg8pmZiMjaXtEVJq18yeVzcwMcCCYmVniQDAzM6DYB9PGjeyEpmI66diKDW0kf3Mo7+/eKc/NTqmzE7Tjc3NCBUKjX6gkP3HHsU75m7vOiWeo31mZv08PGZmZGeBAMDOzxIFgZmaAA8HMzBIHgpmZAQ4EMzNLHAhmZgY4EMzMLHEgmJkZ4EAwM7PEgWBmZkDBQJC0UNJBSX2SVjZYfrGkzZJ2S9oqqTe37DVJO9NtY27+bEnfl/SspP+RLs9pZmYlaRoIkrqAe4FFwFxgmaS5dc3uAb4WEe8GVgNfzC37ZUTMS7erc/O/BHw5IuYALwE3n8Z2mJnZaSqyhzAf6IuIQxHxKrAeWFLXZi6wOU1vabD8DZR97+sHgcfSrK8C1xQt2szMWq9IIMwADufu96d5ebuA69P0tcB5ki5I98+RVJX0tKTBN/0LgJ9HxIlh+jQzszOoSCA0uopD/Zd13wYskLQDWAA8Dwy+2V+ULu7874C/kPSbBfvMHlxangKlWqvVCpRrZmajUSQQ+oGZufu9wEC+QUQMRMR1EXEZ8Cdp3suDy9LPQ8BW4DLgReDNks4aqs9c32siohIRlZ6enqLbZWZmI1QkELYBc9JZQZOBpcDGfANJ3ZIG+1oFrE3zp0maMtgG+G1gf2SXA9oCfCSt8wngidPdGDMzG72mgZDG+VcAm4ADwIaI2CdptaTBs4auAA5KegZ4K3B3mn8JUJW0iywA/iwi9qdlnwM+I6mP7JjCV1q0TWZmNgrqpGuhViqVqFarLe3T14OdeDrlb+46J6ax+H1K2p6O5Q7Ln1Q2MzPAgWBmZokDwczMAAeCmZklDgQzMwMcCGZmljgQzMwMcCCYmVniQDAzM8CBYGZmiQPBzMwAOKt5EzuTsovJFVfWd8h0Sp02MY3k+enn5q85ENrMUE/OdvsCsUa1tFuNNnH5+Tk6HjIyMzPAgWBmZokDwczMgIKBIGmhpIOS+iStbLD8YkmbJe2WtFVSb5o/T9JTkvalZb+fW2edpB9K2plu81q3WWZmNlJNA0FSF3AvsAiYCyyTNLeu2T3A1yLi3cBq4Itp/lHg4xHxTmAh8BeS3pxb7/aImJduO09zW8zM7DQU2UOYD/RFxKGIeBVYDyypazMX2Jymtwwuj4hnIuLZND0AvAD0tKJwMzNrrSKBMAM4nLvfn+bl7QKuT9PXAudJuiDfQNJ8YDLwXG723Wko6cuSpjR6cEnLJVUlVWu1WoFyzcxsNIoEQqNPeNSfzHsbsEDSDmAB8Dxw4vUOpAuBh4A/iIiTafYq4B3Ae4HpwOcaPXhErImISkRUenq8c2FmNlaKfDCtH5iZu98LDOQbpOGg6wAknQtcHxEvp/tTgb8B7oiIp3PrHEmTxyQ9SBYqZmZWkiJ7CNuAOZJmS5oMLAU25htI6pY02NcqYG2aPxl4nOyA86N161yYfgq4Bth7OhtiZmanp2kgRMQJYAWwCTgAbIiIfZJWS7o6NbsCOCjpGeCtwN1p/g3A5cBNDU4vfVjSHmAP0A38aas2yszMRk6d9N0elUolqtVqS/vslO836YQ6O6FGcJ2t5jpbayzqlLQ9IirN2vmTymZmBjgQzMwscSCYmRngQDAzs8SBYGZmgAPBzMwSB4KZmQEOBDMzSxwIZmYGOBDMzCxxIJiZGeBAMDOzxIFgZmaAA8HMzBIHgpmZAQUDQdJCSQcl9Ula2WD5xZI2S9otaauk3tyyT0
h6Nt0+kZv/W5L2pD7/Kl05zczMStI0ECR1AfcCi4C5wDJJc+ua3UN2mcx3A6uBL6Z1pwNfAN4HzAe+IGlaWuc+YDkwJ90WnvbWmJnZqBXZQ5gP9EXEoYh4FVgPLKlrMxfYnKa35JZ/GPhuRPwsIl4CvgssTNdTnhoRT0V2aaCvkV1X2czMSlIkEGYAh3P3+9O8vF3A9Wn6WuA8SRcMs+6MND1cn2ZmdgYVCYRGY/v1F/y8DVggaQewAHgeODHMukX6zB5cWi6pKqlaq9UKlAvTp09HUqFbeoymt+nTpxd67JEYb3UWrdF1Tsw6/Rpq/zrPKtCmH5iZu98LDOQbRMQAcB2ApHOB6yPiZUn9wBV1625NffbWzX9Dn7m+1wBrACqVSqErT7/00ktjcZHqlvYHrrPVXGdrtbrOTqgRJnadRfYQtgFzJM2WNBlYCmysK6xb0mBfq4C1aXoT8CFJ05QdTP4QsCkijgCvSHq/sq36OPBEC7bHzMxGqWkgRMQJYAXZm/sBYENE7JO0WtLVqdkVwEFJzwBvBe5O6/4MuIssVLYBq9M8gE8B/x3oA54Dvt2qjTIzs5FTq3djxlKlUolqtdq0naQx2T1zn+7TfbZPf+6zeJ+StkdEpVk7f1LZzMwAB4KZmSUOBDMzAxwIZmaWOBDMzAxwIJiZWeJAMDMzwIFgZmaJA8HMzAAHgpmZJQ4EMzMDHAhmZpY4EMzMDHAgmJlZ4kAwMzPAgWBmZkmhQJC0UNJBSX2SVjZYfpGkLZJ2SNotaXGaf6OknbnbSUnz0rKtqc/BZW9p7aaZmdlInNWsgaQu4F7gKqAf2CZpY0TszzW7g+zSmvdJmgt8C5gVEQ8DD6d+LgWeiIidufVujIjml0AzM7MxV2QPYT7QFxGHIuJVYD2wpK5NAFPT9PnAQIN+lgGPjLZQMzMbW0UCYQZwOHe/P83LuxP4qKR+sr2DWxr08/ucGggPpuGiz0tSsZLNzGwsFAmERm/U9Vd2Xgasi4heYDHwkKTX+5b0PuBoROzNrXNjRFwKfCDdPtbwwaXlkqqSqrVarUC5ZmY2GkUCoR+Ymbvfy6lDQjcDGwAi4ingHKA7t3wpdXsHEfF8+vkK8HWyoalTRMSaiKhERKWnp6dAuWZmNhpFAmEbMEfSbEmTyd7cN9a1+TFwJYCkS8gCoZbuTwJ+j+zYA2neWZK60/TZwO8CezEzs9I0PcsoIk5IWgFsArqAtRGxT9JqoBoRG4HPAg9IupVsOOmmiBgcVroc6I+IQ7lupwCbUhh0AX8LPNCyrTIzsxHTr9+321+lUolqtflZqpJo9Xa5z/HRZ+1ojdv/7nbuWXAP3W/qHrZt0T5HaqL22Qk1jtc+JW2PiEqzdv6ksrVE7WiNm568iRd/+WLZpQzr/t3384Of/ID7d91fdilmbceBYC3RCW+0taM1nuh7giD4Zt832z68zM40B4Kdtk55o71/9/2cjJMAnIyTbR1eZmUYl8cQuPP8Qv3VuiZxe08399RepPu1kwX6fblQv4WNkzrvumAaj597LscnibNPBtf94hfc8dOXCvR75uqsdU1iUe8/59ikX/8PNOXkSZ7sH2j+Oy3p7z7yfjugTr+GWtxvsTqLHkMYl4FQ9GDLXU/fxaMHH+WGt9/AHe+/oyV9jsR4qLN2tMaibyzi2GvHXp83pWsKT17/5LAHbc90nXc9fRePP/s4x08ef33e2ZPO5ro51w37Oy37YOB46tOvIR9UbludMszR7nXmh2EGteNwzK4Xdr0hDACOnzzOzhd2DrGGjRft/hpqJ00/hzBeNRpPbvafQxnavc5OeaN97OrHyi7BStLur6F2MiGHjEYzzFHGLl+n1Ok+3WcZ/RXps9NeQyP5nIyHjFqkU4Y5OqVOa71O+VxHu+u011DZp29PyEDolGGOTqnTWq/sN4bxopNeQ+1wrGNCDhmNhvt0n2eqz/wwR5EztsqqE8ZuiKOo8dRn/ky4Vp8B5yEjsw7VSR+g855MawzuHQzuzRw/ebyUvQQHglkbaZc3hiLaYYhjvGiXYx0OBLM20i5vDEV00p5Mu2uXYx0T9nMIZu2oXd4YmhlqT+aT/+qThb5W3N6oXT4n40AwayPt8sbQzHB7Mv7QV+cqNGQkaaGkg5L6JK1ssPwiSVsk7ZC0W9LiNH+WpF9K2plu9+fW+S1Je1KffyVJrdssMxtLnbInYyPTdA9BUhdwL3AV0A9sk7QxIvbnmt0BbIiI+yTNBb4FzErLnouIeQ26vg9YDjyd2i8Evj3aDTGzM6dT9mRsZIrsIcwH+iLiUES8CqwHltS1CWBqmj4fGBiuQ0kXAlMj4ql07eWvAdeMqHIzM2upIoEwAzicu9+f5uXdCXxUUj/Zf/u35JbNTkNJ35P0gVyf/U36NDOzM6hIIDQa26//eNwyYF1E9AKLgYckTQKOABdFxGXAZ4CvS5pasM/swaXlkqqSqrVarUC5ZmY2GkUCoR+Ymbvfy6lDQjcDGwAi4ingHKA7Io5FxE/T/O3Ac8C/TH32NumTtN6aiKhERKWnp6dAuWZmNhpFAmEbMEfSbEmTgaXAxro2PwauBJB0CVkg1CT1pIPSSHobMAc4FBFHgFckvT+dXfRx4ImWbJGZmY1K07OMIuKEpBXAJqALWBsR+yStBqoRsRH4LPCApFvJhn5uioiQdDmwWtIJ4DXgkxHxs9T1p4B1wJvIzi7yGUZmZiXyt50W5D7dp/tsr/7cp7/t1MzMxogDwczMAAeCmZklDgQzMwMcCGZmljgQzMwMcCCYmVniQDAzM8CBYGZmiQPBzMwAB4KZmSVNv9yuU7X6Es3Tpk1raX+dplN+n51Sp1k7GpeBMJIvkRqLL50ab0bwBVql/i47pU6zduUhIzMzAxwIZmaWOBDMzAwoGAiSFko6KKlP0soGyy+StEXSDkm7JS1O86+StF3SnvTzg7l1tqY+d6bbW1q3WWZmNlJNDyqnayLfC1wF9APbJG2MiP25ZncAGyLiPklzgW8Bs4AXgX8bEQOS3kV2Gc4ZufVujIjml0AzM7MxV2QPYT7QFxGHIuJVYD2wpK5NAFPT9PnAAEBE7IiIgTR/H3COpCmnX7aZmbVakUCYARzO3e/njf/lA9wJfFRSP9newS0N+rke2BERx3LzHkzDRZ/XECeQS1ouqSqpWqvVCpRrZmajUSQQGr1R15/EvQxYFxG9wGLgIUmv9y3pncCXgD/MrXNjRFwKfCDdPtbowSNiTURUIqLS09NToFwzMxuNIoHQD8zM3e8lDQnl3AxsAIiIp4BzgG4ASb3A48DHI+K5wRUi4vn08xXg62RDU2ZmVpIigbANmCNptqTJwFJgY12bHwNXAki6hCwQapLeDPwNsCoi/s9gY0lnSRoMjLOB3wX2nu7GmJnZ6DUNhIg4AawgO0PoANnZRPskrZZ0dWr2WeDfS9oFPALcFNl3A6wA/gXw+brTS6cAmyTtBnYCzwMPtHrjzMysOHXSd7pUKpWoVlt7lmqZ32szFo893rZnLJRdZ6f83VvdZyfUOF77lLQ9IirN2vmTy
mZmBjgQzMwscSCYmRngQDAzs8SBYGZmgAPBzMwSB4KZmQEOBDMzSxwIZmYGOBDMzCxxIJiZGVDgEpo2toa4LtCoTZs2raX92cTWyufnWD03O+U11Al1OhBKNJIvuyr7C9ls4hnBF6eV9tzslNdQp9TpISMzMwMcCGZmlhQKBEkLJR2U1CdpZYPlF0naImmHpN2SFueWrUrrHZT04aJ9mpnZmdU0ECR1AfcCi4C5wDJJc+ua3UF2JbXLyC6x+ddp3bnp/juBhcBfS+oq2KeZmZ1BRfYQ5gN9EXEoIl4F1gNL6toEMDVNnw8MpOklwPqIOBYRPwT6Un9F+jQzszOoSCDMAA7n7veneXl3Ah+V1A98C7ilybpF+jQzszOoSCA0Onm2/pyoZcC6iOgFFgMPSZo0zLpF+sweXFouqSqpWqvVCpRr1v4ktfTmz59YKxT5HEI/MDN3v5dfDwkNupnsGAER8ZSkc4DuJus265PU3xpgDUClUvGJ+NbxOuH8fpuYiuwhbAPmSJotaTLZQeKNdW1+DFwJIOkS4BygltotlTRF0mxgDvAPBfs0M7MzqOkeQkSckLQC2AR0AWsjYp+k1UA1IjYCnwUekHQr2dDPTZH9a7NP0gZgP3AC+KOIeA2gUZ9jsH1mZlaQOmmXtFKpRLVabWmfnbJb3gl1dkKN4DpbzXW21ljUKWl7RFSatfMnlc3MDHAgmJlZ4kAwMzPAgWBmZokDwczMAAeCmZklDgQzMwMcCGZmljgQzMwMcCCYmVniQDAzM8CBYGZmiQPBzMwAB4KZmSUOBDMzAxwIZmaWFAoESQslHZTUJ2llg+VflrQz3Z6R9PM0/3dy83dK+pWka9KydZJ+mFs2r7WbZmZmI9H0EpqSuoB7gauAfmCbpI0RsX+wTUTcmmt/C3BZmr8FmJfmTwf6gO/kur89Ih5rwXaYmdlpKrKHMB/oi4hDEfEqsB5YMkz7ZcAjDeZ/BPh2RBwdeZlmZjbWigTCDOBw7n5/mncKSRcDs4H/3WDxUk4Nirsl7U5DTlMK1GJmZmOkSCCowbyhrgC9FHgsIl57QwfShcClwKbc7FXAO4D3AtOBzzV8cGm5pKqkaq1WK1CumZmNRpFA6Adm5u73AgNDtG20FwBwA/B4RBwfnBERRyJzDHiQbGjqFBGxJiIqEVHp6ekpUK6ZmY1GkUDYBsyRNFvSZLI3/Y31jSS9HZgGPNWgj1OOK6S9BiQJuAbYO7LSzcyslZqeZRQRJyStIBvu6QLWRsQ+SauBakQMhsMyYH1EvGE4SdIssj2M79V1/bCkHrIhqZ3AJ09nQ8zM7PSo7v27rVUqlahWqy3tUxKd8DvohDo7oUZwna3mOltrLOqUtD0iKs3a+ZPKZmYGOBDMzCxxIJiZGVDgoPJ4kp3QVGx+WWONQ9U41LJ2q3Oo+a5zeJ3w3ATX2Urt+FqfUIHQCQeUOqFGcJ2t5jpbqxPqbMcaPWRkZmaAA8HMzBIHgpmZAQ4EMzNLHAhmZgY4EMzMLHEgmJkZ4EAwM7Oko77tVFIN+McWd9sNvNjiPsdCJ9TZCTWC62w119laY1HnxRHR9ApjHRUIY0FStcjXwpatE+rshBrBdbaa62ytMuv0kJGZmQEOBDMzSxwIsKbsAgrqhDo7oUZwna3mOlurtDon/DEEMzPLeA/BzMyACRwIktZKekHS3rJrGYqkcyT9g6RdkvZJ+s9l1zQUST+StEfSTknVsutpRNLbU32Dt3+S9Omy62pE0h9L2pv+7m1TY6PXjaQ7JT2f+70uLrPGVFOjOudJenrwOSppfsk1zpS0RdKB9Hf+4zT/99L9k5LO7NlGETEhb8DlwHuAvWXXMkyNAs5N02cD3wfeX3ZdQ9T6I6C77DpGUG8X8P/Izs8uvZ662t4F7AV+g+wiVn8LzCm7rlTbKa8b4E7gtrJrK1Dnd4BFaXoxsLXkGi8E3pOmzwOeAeYClwBvB7YClTNZ04TdQ4iIvwN+VnYdw4nML9Lds9PNB31a40rguYho9QcdW+ES4OmIOBoRJ4DvAdeWXBPQGa8bGLLOAKam6fOBgTNaVJ2IOBIRP0jTrwAHgBkRcSAiDpZR04QNhE4hqUvSTuAF4LsR8f2yaxpCAN+RtF3S8rKLKWAp8EjZRQxhL3C5pAsk/QbZf7MzS66pmRWSdqehmmllFzOETwP/RdJh4B5gVcn1vE7SLOAyslGA0jgQ2lxEvBYR84BeYL6kd5Vd0xB+OyLeAywC/kjS5WUXNBRJk4GrgUfLrqWRiDgAfAn4LvAksAs4UWpRw7sP+E1gHnAE+K/lljOkTwG3RsRM4FbgKyXXA4Ckc4H/CXw6Iv6pzFocCB0iIn5ONqa4sORSGoqIgfTzBeBxoNQDdk0sAn4QET8pu5ChRMRXIuI9EXE52dDHs2XXNJSI+En6x+Uk8ADt+7f/BPCNNP0obVCnpLPJwuDhiPhGs/ZjzYHQxiT1SHpzmn4T8G+A/1tuVaeS9M8knTc4DXyIbNijXS2jfYeLAJD0lvTzIuA62rheSRfm7l5L+/7tB4AFafqDlByykkS2l3IgIv68zFoGTdgPpkl6BLiC7JsFfwJ8ISLaYhdykKR3A18lOyNmErAhIlaXW9WpJL2NbK8AsrNivh4Rd5dY0pDSmPxh4G0R8XLZ9QxF0t8DFwDHgc9ExOaSSwIav27S/Xlkx5F+BPxhRBwpp8LMEHUeBP6S7Dn6K+A/RMT2Emv818DfA3uAk2n2fwSmAP8N6AF+DuyMiA+fkZomaiCYmdkbecjIzMwAB4KZmSUOBDMzAxwIZmaWOBDMzAxwIJiZWeJAMDMzwIFgZmbJ/wcmWaYUotj9XAAAAABJRU5ErkJggg==\n", 78 | "text/plain": [ 79 | "" 80 | ] 81 | }, 82 | "metadata": { 83 | "needs_background": "light" 84 | }, 85 | "output_type": "display_data" 86 | } 87 | ], 88 | "source": [ 89 | "# compare knn imputation strategies for the horse colic dataset\n", 90 | "from numpy import mean\n", 91 | "from numpy import std\n", 92 | "from pandas import read_csv\n", 93 | "from sklearn.ensemble import RandomForestClassifier\n", 94 | "from sklearn.impute import KNNImputer\n", 95 | "from sklearn.model_selection import cross_val_score\n", 96 | "from sklearn.model_selection import RepeatedStratifiedKFold\n", 97 | "from sklearn.pipeline import Pipeline\n", 98 | "from matplotlib import pyplot\n", 99 | "# 
load dataset\n", 100 | "url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'\n", 101 | "dataframe = read_csv(url, header=None, na_values='?')\n", 102 | "# split into input and output elements\n", 103 | "data = dataframe.values\n", 104 | "ix = [i for i in range(data.shape[1]) if i != 23]\n", 105 | "X, y = data[:, ix], data[:, 23]\n", 106 | "# evaluate each strategy on the dataset\n", 107 | "results = list()\n", 108 | "strategies = [str(i) for i in [1,3,5,7,9,15,18,21]]\n", 109 | "for s in strategies:\n", 110 | "\t# create the modeling pipeline\n", 111 | "\tpipeline = Pipeline(steps=[('i', KNNImputer(n_neighbors=int(s))), ('m', RandomForestClassifier())])\n", 112 | "\t# evaluate the model\n", 113 | "\tcv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n", 114 | "\tscores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)\n", 115 | "\t# store results\n", 116 | "\tresults.append(scores)\n", 117 | "\tprint('>%s %.3f (%.3f)' % (s, mean(scores), std(scores)))\n", 118 | "# plot model performance for comparison\n", 119 | "pyplot.boxplot(results, labels=strategies, showmeans=True)\n", 120 | "pyplot.show()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.6.4" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 2 152 | } 153 | -------------------------------------------------------------------------------- /IterativeImputer and Different Number of Iterations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | ">1 0.871 (0.051)\n", 13 | ">2 0.868 (0.055)\n", 14 | ">3 0.871 (0.048)\n", 15 | ">4 0.868 (0.049)\n", 16 | ">5 0.874 (0.049)\n", 17 | ">6 0.874 (0.055)\n", 18 | ">7 0.872 (0.054)\n", 19 | ">8 0.874 (0.054)\n", 20 | ">9 0.866 (0.048)\n", 21 | ">10 0.870 (0.057)\n", 22 | ">11 0.872 (0.050)\n", 23 | ">12 0.873 (0.050)\n", 24 | ">13 0.871 (0.052)\n", 25 | ">14 0.863 (0.053)\n", 26 | ">15 0.871 (0.051)\n", 27 | ">16 0.872 (0.053)\n", 28 | ">17 0.872 (0.051)\n", 29 | ">18 0.862 (0.051)\n", 30 | ">19 0.872 (0.051)\n", 31 | ">20 0.868 (0.049)\n" 32 | ] 33 | }, 34 | { 35 | "data": { 36 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAECCAYAAAASDQdFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAG5JJREFUeJzt3X+8HXV95/HXOwnhh5KQkCg/AgQ1gOCq4G3CLw1V+amCgLVk5Zf1wY99SC0q7gOWIGwiBS1qaxflR00pshJjrBpbCktdsO2jUHIjwgo2EukWrlG5CGu3D1lCyGf/mLlmcnLuPTPnTO6dc7/v5+NxHvecmfl+znfmzH2f78yZc68iAjMzS8OUie6AmZmNH4e+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWkGkT3YFWc+bMifnz5090N8zM+sq6deuejYi5nZZrXOjPnz+fwcHBie6GmVlfkfSvZZbz6R0zs4Q49M3MEuLQNzNLiEPfzCwhDn0zs4R0DH1JKyQ9I+mHo8yXpC9I2iDpUUlHFOadJ+mJ/HZenR03M7Pqyoz0bwNOGmP+ycCC/HYh8CUASbOBq4FFwELgakmzeumsmZn1pmPoR8TfAc+NschpwO2ReRDYQ9LewInAvRHxXEQ8D9zL2G8eZma2g9VxTn9f4OnC46F82mjTtyPpQkmDkgaHh4dr6FJ7ktrezMxSUUfot0vNGGP69hMjbomIgYgYmDu347eIuxYRv7kVH5uZpaKO0B8C9is8ngdsHGO6mZlNkDpCfw1wbn4Vz5HAryLiZ8A9wAmSZuUf4J6QTzMzswnS8Q+uSboTOA6YI2mI7IqcnQAi4ibgLuAUYAPwa+CD+bznJC0H1uallkXEWB8Im5nZDtYx9CNiSYf5AXx4lHkrgBXddc3MzOrmb+SamSXEoW9mlhCHvplZQhz6ZmYJceibmSXEoW9mlhCHvplZQhz6ZmYJceibmSXEoW9mlhCHvplZQhz6ZmYJceibmSXEoW9mlhCHvplZQhz6ZmYJceibmSXEoW9mlhCHvplZQhz6ZmYJceibmSXEoW9mlpBpE92BKiRtNy0i+q4PTajRrn0dNbwtumvflBreFs3qQ101ivoq9EdWVNK4h32dfWhCjWKbXmtMlm3R7+tRR40m9KEpNZrQh7pqFPn0jplZQhz6ZmYJceibmSXEoW9mlhCHvplZQhz6ZmYJceibmSXEoW9mlhCHvplZQkqFvqSTJK2XtEHS5W3mHyDpu5IelXS/pHmFeS9L+kF+W1Nn583MrJqOf4ZB0lTgRuB4YAhYK2lNRDxeWOwG4PaI+AtJbweuA87J570QEW+uud9mZtaFMiP9hcCGiHgyIjYBK4HTWpY5FPhufv++NvPNzKwByoT+vsDThcdD+bSiR4Az8/unA7tL2jN/vIukQUkPSnpvT701M7OelAn9dn9rtfVPvV0GLJb0MLAY+CmwOZ+3f0QMAP8R+GNJr93uCaQL8zeGweHh4fK9NzOzSsqE/hCwX+HxPGBjcYGI2BgRZ0TE4cCV+bRfjczLfz4J3A8c3voEEXFLRAxExMDcuXO7WQ8zMyuhTOivBRZIOlDSdOAsYJurcCTNkTRS6wpgRT59lqSdR5YBjgGKHwCbmdk46hj6EbEZuAS4B/gRsCoiHpO0TNKp+WLHAesl/Rh4NXBtPv31wKCkR8g+4L2+5aofMzMbR5qo/0A1moGBgRgcHBxzmTr+g0yvNZrQh6bUaEIf6qjRhD40pUYT+tCUGk3oQ5kaktbln5+Oyd/INTNLiEPfzCwhDn0zs4Q49M3MEuLQNzNLiEPfzCwhDn0zs4Q49M3MEuLQNzNLiEPfzCwhDn0zs4Q49M3MEuLQNzNLiEPfzCwhDn0zs4Q49M3MEuLQtwkxe/ZsJP3mBmzzePbs2RPeh/Hqh00+Tdi/RzNtwp7Zkvb88893+i9AE96H8eqHTT5N2L9H45G+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJaRU6Es6SdJ6SRskXd5m/gGSvivpUUn3S5pXmHeepCfy23l1dt7MzKrpGPqSpgI3AicDhwJLJB3astgNwO0R8UZgGXBd3nY2cDWwCFgIXC1pVn3dNzOzKsqM9BcCGyLiyYjYBKwETmtZ5lDgu/n9+wrzTwTujYjnIuJ54F7gpN67bWZm3SgT+vsCTxceD+XTih4Bzszvnw7sLmnPkm3NzGyclAn9dv/MsfWfP14GLJb0MLAY+CmwuWRbJF0oaVDS4PDw8HYNev0nw63t66hRtX1TatSxLbhm5ja3uHrGdtPGQxP++XQTXtM6atTxT+KbuB4TtS3qsKP2b5X4x9BHAddExIn54ysAIuK6UZZ/JfDPETFP0hLguIi4KJ93M3B/RNw52vMNDAzE4OBga82O/2S4l/n98hz90k9vi/F9jn7p52R5jqb2U9K6iBgYsyjlRvprgQWSDpQ0HTgLWNPy5HMkjdS6AliR378HOEHSLGUf4J6QTzMzswnQMfQjYjNwCVlY/whYFRGPSVom6dR8seOA9ZJ+DLwauDZv+xywnOyNYy2wLJ9mZmYTYFqZhSLiLuCulmmfLNxfDawepe0Kto78zcxsAvkbuWZmCXHom5klxKFvZpYQh76ZWUIc+mZmCXHom5klxKFvZpYQh76ZWUIc+mZmCXHom5klxKFvZpYQh76ZWUIc+mZmCXHom5klxKHfx4Z/Pcz5d5/Psy88O6E1etWEPjSFt4XtaA79PnbTozfx/V98n5seuWnCatQRUnWsRxN4W9TLg5odo+9CvwkbsCl9+PaGbxME39rwra76UkeNOt40eu1DUzRlWzRl/2zCG2AT3kSb0Ieivgv9JmzApvRhS2wBYEts6aovvdao602j1/WoQ68h1aRt0ZT9c6LfAJswoGhCH1r1Veg3YQM2qQ8vbXkJgJe2vFS5L3XUqOtNo5c+1KXXkGrKtmjS/jnRb4BNGFA0oQ+t+ir069qAvYzqmtaHEVX70muNut80uulDXXoNqSZtiyaETBPeAJswoKizD3Wesuub0K9zA3Y7qmtCHwAeeeaR3/RhxEtbXuIHz/xg3GrUEVJ1rEcdeg2ppmyLyRJ0TRjU1KHOPtR5ym5azxXGSV0bsHVUd/GbLmbOrnP6pg8Aq09dPeo8oXGpUUdI1bEeI4Z/Pcwn/u4T3LD4hkrbcrSQqvKaNGVbTJaga8Kgpg519aHXvGjVN6Ff1wZsN6pbeuTSvulDU9QZ2HUojoSqbMuxQqpsnaZsi8kSdE0Y1NShrj7UnReKiK4b7wgDAwMxODi47cRrZnZueM2vxpiXtR+eOoWT5+3Di1O2ntXaecsW7h7ayJyXt5Sq0XUf8ho99aGOfpRp36GGJMbabzrNL92PktuiuE232Zadalwzk/ftsxfrd56+3ayDX9zE6o0/L1Wjox39mtZRo9B+eOoUPjF3Dj
cMP7t1O1as0VUf6qhRw/49bjVqzgtJ6yJioNPT9kXo9xoyI/OXP7icbz7xzW1GIjtN2YkzFpzBVUddVctzjEUSyx5Y1nUf6uhH2X42/TmKyxRf15FtufTIpY3o53hvizrmL39wOV9f/3Xef/D7txlR9sO2qPs5RjttOF79rJIXZUO/bz7IrcNkOfy1rZrw4eVk0oRLPptkor/zsCPyom/O6ddhMp3ns0wTPrycTCbD5011qfsD1G7siLxIaqRvk4+PnOrjo6ZtNeE7DztCUiN9m3x85FSfOq5kmizquJS3qTzSNzPAR01Fk/m0oUf6ZgaMfdSUmsn8BujQNzNrMZnfAH16x8wsIQ59M7OElAp9SSdJWi9pg6TL28zfX9J9kh6W9KikU/Lp8yW9IOkH+a3/PwUxM+tjHc/pS5oK3AgcDwwBayWtiYjHC4stBVZFxJckHQrcBczP5/0kIt5cb7fNzKwbZUb6C4ENEfFkRGwCVgKntSwTwIz8/kxgY31dNDOzupQJ/X2BpwuPh/JpRdcAZ0saIhvl/35h3oH5aZ/vSXpruyeQdKGkQUmDw8PD5XtvZmaVlAn9dl9rbP3zcEuA2yJiHnAK8BVJU4CfAftHxOHAx4CvSprR0paIuCUiBiJiYO7cudXWwMzMSisT+kPAfoXH89j+9M2HgFUAEfEAsAswJyJejIhf5tPXAT8BDuq102Zm1p0yob8WWCDpQEnTgbOANS3LPAW8A0DS68lCf1jS3PyDYCS9BlgAPFlX583MrJqOV+9ExGZJlwD3AFOBFRHxmKRlwGBErAE+Dtwq6aNkp37Oj4iQ9DZgmaTNwMvAxRHx3A5bGzMzG1OpP8MQEXeRfUBbnPbJwv3HgWPatPsG8I0e+2hmZjXxN3LNzBLi0DczS4hD38wsIQ59M7OEOPTNzBLi0DczS4hD38wsIQ59M7OE9M3/yJXa/d23zKxZs3pqX0eNMu2bUsPbolz7Omr007aoQxPWY0e/pnXUmMjXtC9CP2LbP+opabtpVdrXUaNq+6bU8LZoVh+aVKNXTVgP79+d+fSOmVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWkL74x+hmtuNJGnP+rFmzxqkntiM59M2MiNjmsaTtptnk4NM7ZmYJKRX6kk6StF7SBkmXt5m/v6T7JD0s6VFJpxTmXZG3Wy/pxDo7b2Zm1XQ8vSNpKnAjcDwwBKyVtCYiHi8sthRYFRFfknQocBcwP79/FnAYsA/wt5IOioiX614RMzPrrMxIfyGwISKejIhNwErgtJZlApiR358JbMzvnwasjIgXI+JfgA15PTMzmwBlQn9f4OnC46F8WtE1wNmShshG+b9foa2ZmY2TMqHf7jqu1o/1lwC3RcQ84BTgK5KmlGyLpAslDUoaHB4eLtElMzPrRpnQHwL2Kzyex9bTNyM+BKwCiIgHgF2AOSXbEhG3RMRARAzMnTu3fO/NzKySMqG/Flgg6UBJ08k+mF3TssxTwDsAJL2eLPSH8+XOkrSzpAOBBcBDdXXezMyq6Xj1TkRslnQJcA8wFVgREY9JWgYMRsQa4OPArZI+Snb65vzIvtnxmKRVwOPAZuDDvnLHzGziqGnfuhsYGIjBwcExl6nj24K91mhCH5pSowl9qKNGE/rQlBpN6ENTajShD2VqSFoXEQOd6vgbuWZmCXHom5klxKFvZpYQh76ZWUIc+mZmCXHom5klxKFvZpYQh76ZWUIc+mZmCXHom5klxKFvZpYQh76ZWUIc+mZmCXHom5klxKFvZpYQh76ZWUIc+mZmCXHom5klxKFvZpYQh76ZWUIc+mZmCXHom5klxKFvZpaQaRPdgSokbXc/IrpqX0eNbto3pYa3xdjt66jhbTH5tkU/rkervgr9Xla0jvaTqUYT+tCUGk3oQ1NqNKEPTanRhD7UVaPIp3fMzBLi0DczS4hD38wsIQ59M7OEOPTNzBLi0DczS4hD38wsIQ59M7OEqO4L/3slaRj41w6LzQGe7fGpeq3RhD40pUYT+lBHjSb0oSk1mtCHptRoQh/K1DggIuZ2rBIRfXcDBie6RhP60JQaTeiD18Pbwtui3M2nd8zMEuLQNzNLSL+G/i0NqNGEPjSlRhP6UEeNJvShKTWa0Iem1GhCH+qq0bwPcs3MbMfp15G+mZl1waE/QdT6X0zG//lfUUONvSZ6Pcysmr4KfUlTe2j7OkkDknbuocZhkhZL2rPL9sdKOgcgIqKbwJT0Hkl/0M3zF2qcBnxa0qt6qHEi8E1gvx5qHCnpnPzn9C7aL8hf06m97BvWbL0OLDww2VZfhL6kgwAi4uVufrklvRv4S+CPgNtG6lWscTJwJ/BR4HZJe1VoO0XSK4GbgSskXQy/Cf7Sr4GkE4DlwOOVOr9tjcXAp4FvR8QzXdY4Ia+xN/DxLmucSvbB1DuBy4ADKrZ/L7AauAL4HHBRHUcvee1GhEQ/h52kXWuosRdkvyddtl/QS/s29Xrenl0O9PaTNH1k/66SGW3VcbH/jrwB7wZ+DXy1MG1qhfZHA/8MHJ4//iKwomIfjgN+DCzMH38TeGcX6/KfyULyduCjFdseDfyi0IeZZEG5W8U6HwMuy+/vAxwPLAJmlmz/TmADcBiwE/A/gLdV7MOewD3AG/LHK4DfAV4F7FKy/d8Ah+aPfw9YCywFdu/idVkELAZ+qzBNFWvMqPq8bWocARw78hp30f4o4CTg+B76cDJwTo/rcSLwiTKvZYd+rAJe12X744Fh4Pd66MPbgQuAC3qosRA4BhjoZt8C3gX8kGzAuAo4OJ8+pds+NXqkn7+zXQJcCmySdAd0NeK/PiIezu9fDcyueJrnF8BFEfFQPvpYBFwi6WZJ76vw7r2Z7HTIXwALJX1O0nXKdHotfgm8BOydn176FvAlsiOXqn0YsZosMC8BbpQ0q0T7qcC5EfEY8ApgPdkbQJVRzGZgV+AQSTPI3lTPBf4YWFpixL4ZeCUwMhJcQfanO+aSDRJKy4/g7gA+AFwp6ct5zdKn3ySdAfy9pEXdjsLyo9EvAxcCl0m6qGL7U4CbyILq0vxIamRe2fXYBfhPwC35KcDK8u35GWBtRPy/lnll+7GILORuiogNLfM6bl9JJ5Ed1f8N+T5SdYSdr8cXyAZXH5C0pDCv7Hq8C7iVLLg/IulmKLdv5ZmwH3A92e/nJ4F/Au6TdFhEbOl6xN/LO/p43MhGo68k+7sTq4E7KrafSj4Ky+/PAx4G5ubT9qxY70pgaX7/g8DXRmqVaPta4PL8/sfJjmBurPDcbwKeBIbIRiBTyEL7TmB2yRpvIAvqlcAH82mvIQuMEyv0ZUr+8yTg58B/qLgd3wesAx4ErsqnvR24DXhTifYXA18BzgGuJQvui6hwFJfvDyvJR7bADOAfgNWFZcYclQHz8zb35rUGOrVpU+Nw4NGR9SY76vl8hfZHAIPAUfnjT
wGnAq8qux6F5S7I1+NfgPOKr3WJtofm7S7MH+8JHFzcN8r0AzgbuDa/vw9ZaJ7buu+N0va4/Pf7LWSDgJ9T8ciHbDBzD/Cu/PElwBIqjNaB3cjedN6RP94feKaL/fMWYN+R5wM+AvwUOKjKOhVvjR7pA0TExoj494h4luyXeteREb+kIyQd0qH9yxHxb/lDAf8HeC4ihiV9APhUlfOPEXFtRHwqv//nwO6U/zDzBeBgSReQhdb1wP5lR3UR8QjZSPa6iLg1IrZENsqdRbZTlanxQ7Jz6IuAA/NpT5LtYJ3/WNPWOlvyn3eT7ZjvLnnEMtJ+Ndmpor8n+yUlIv4n2fYsc37/TuBusjeK3SLi7Ii4GXhVfvRQpg8vjzx3/vjfIuJY4NXFUVmHMluAKyPieLLPWj4JvEXStOJCHUZ2uwJfzF9f8j4dk5/LLTOqnAZcEhEPSJpNNhC4APispD8tsx6SdsrvPgN8g+xNeamkTwOfL3lkvStZ0G3JR9tfA5YBnyvbj9wQsEc+0v0r4K1kI+WVeY0tY7TdDbg4ItZFxDDZG+ASSTNLPG/RzwAkvZns9+W9wBckfaPkegj4v2RnCYiIp8iO8BdJ+uyYDbOLTn4L2IP8SGPk+SLiC8CfAP9F0i5dfc7Q7bvFRN3IRvx/Tnae/glgXhc1bgOuIxtplh6h0vLuDpyZ19irQo1lwFPAe/LHvw3s18P2GOnDqyu0mUZ2OuVJ4EP5bRB4bQ99+AcqfNZSaHty/nqeQDY6/T4wv0L7KYX75wL/CLyiQ5uDCvfPJjtnun/LPraa/DODEjVmFu5fBXyH/POBsfavlhojR55TyYLrO2w9Ql1Qov1UsiO/D7N1hD4PuA84rkwf8scHAnfm9y8DNtHhaLSlH8cAnwd+QjawEdmg6G+Bt5as8SZgDdlR9ccK0x8APjJK+4Pb7Rdk59TXkP0Fym32lw59uBT4OvAQ8JnC9IeAJSVrXEP2BvY7ZKer/hvZUfWtwB6jtH832VHf9/LlTwX+N3BFYZn5wM1lf0e2e45uG07kjewKmm5OKQiYnu+QT432y1Sizs5kQfkY+YeRFdruB7yldefsog8iG9E9DhzWZY0jgD8EPlt1W7aptYoKYV1otwfZIev3yA6pO57aGaXOyLYYcz3YemHAysK05cDTbBv8K4FFHWrcWZg2vXD/KuCrZEdyj1I4zdKhHyNBNQW4i+x00zl5aM3q1IeRfbPl8ZeBozusR/EiiVlk57Lfn2/PpWSfJ/1uhe25EDi9ZbnbgCMrbM+LyQYlf0r+pkp2IcQHS7af1rINvtPFfrEb2UDinYVpnwHe16HG1wrT/iDfhp8GdsqnfRvYu0371otObiE7UtmHLK+WAq8DzicbpM0aa51GXdduGk3kLd8p7wXe2EON8+kyKPP2OwGn0DK6qFij0nnfdu3Jzl8eMsGvR0/rUaizOz1cAUN2SmjMKz3IztXeTfZh6W0tIbEceITsFOKVwI+AA0vUuKMwb+fC/fuBjbR5E+pQYyrZkdjXgT/Lf7kPrdC+GHZnkF3VdEDFPlwPvAicmT9e3G7btqlRfPPYtXD/zAr9KNa4gGwgcCnwX/PX5JAK67Fz/nMO2SXbx5bcL4p9OI8scBfm8x+mzfn0sfatluXOJjsqntNm3tHA+YXHc4G/zu+/huwqty/m+0TXg7QJC4tebvRwKVjevpag8q3/bmx/YUAx+E8nu3rlzxjjCK5NjTta5h+Uh8OoRy0lanyLbKTddmAxVnuyQcmHyU77VVmPr+bTp4wEW6fflTY1/nvL/PPIAr9KP4qvybHAe8hGvJW3RT5/N7KR9qinYcdaD7Yeuf1VN9sznzeN7KKHh4A3j9J+tItO9s6nHZDXKXV59Wg3/8E1S1Z+6estwKaIWCLpMODfI6LTf25rV+OFiDg7/+BvBvB4ZBcfdFNjAdmVYXdERMcv4rVpfwjZtfJ/HS2XPFZcjxcj4kdl2o9S4/Vkn1ndHdnFAlVqjLwmbwR+GRE/7bIPA2Tn1Z+JsT8AblfjpYg4S9Jr2PqabuqyH28gG60/FBE/L9F+GrAL2Zco3yHpbLIPtC+NiBfK9GHU2g59S5mkOWTXdB9NNro6LiKGuqxxVF5jcURs7LLGMfmkt0bEL7pofzTZqb+3lQmXUWqMrMdv97AtRvqxOCJ+1kONyq9Jy3pMq9q+pcYxZOvR67aYQnf7xW1kVxKdQHbq539Vad9O4y/ZNNuR8tH4o2SXxp1e9Re7pcYewBlVf7FbaswgO5deOvBb2s/M21cK/JYaI+vRy7YY6UelwG9To/Jr0rIevb6mM6hnW1TaL/LLn6eTje4/AJxVR+BD9i5olqz8W8inACd0+0vVhBpN6ENTajShD73WiOwUzCZJy8m+3fxEN31o2y+f3rHUSdolWv5kQD/WaEIfmlKjCX2oo4YkRc0h7dA3M0uIz+mbmSXEoW9mlhCHvplZQhz6ZmYJceibmSXEoW9mlpD/DzGfhPs05MNMAAAAAElFTkSuQmCC\n", 37 | "text/plain": [ 38 | "" 39 | ] 40 | }, 41 | "metadata": { 42 | "needs_background": "light" 43 | }, 44 | "output_type": "display_data" 45 | } 46 | ], 47 | "source": [ 48 | "# compare iterative imputation number of iterations for the horse colic dataset\n", 49 | "from numpy import mean\n", 50 | "from numpy import std\n", 51 | "from pandas import read_csv\n", 52 | "from sklearn.ensemble import RandomForestClassifier\n", 53 | "from sklearn.experimental import enable_iterative_imputer\n", 54 | "from sklearn.impute import IterativeImputer\n", 55 | "from sklearn.model_selection import cross_val_score\n", 56 | "from sklearn.model_selection import RepeatedStratifiedKFold\n", 57 | "from sklearn.pipeline import Pipeline\n", 58 | "from matplotlib import pyplot\n", 59 | "# load dataset\n", 60 | "url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'\n", 61 | "dataframe = read_csv(url, header=None, na_values='?')\n", 62 | "# split into input and output elements\n", 63 | "data = 
dataframe.values\n", 64 | "ix = [i for i in range(data.shape[1]) if i != 23]\n", 65 | "X, y = data[:, ix], data[:, 23]\n", 66 | "# evaluate each strategy on the dataset\n", 67 | "results = list()\n", 68 | "strategies = [str(i) for i in range(1, 21)]\n", 69 | "for s in strategies:\n", 70 | "\t# create the modeling pipeline\n", 71 | "\tpipeline = Pipeline(steps=[('i', IterativeImputer(max_iter=int(s))), ('m', RandomForestClassifier())])\n", 72 | "\t# evaluate the model\n", 73 | "\tcv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n", 74 | "\tscores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)\n", 75 | "\t# store results\n", 76 | "\tresults.append(scores)\n", 77 | "\tprint('>%s %.3f (%.3f)' % (s, mean(scores), std(scores)))\n", 78 | "# plot model performance for comparison\n", 79 | "pyplot.boxplot(results, labels=strategies, showmeans=True)\n", 80 | "pyplot.xticks(rotation=45)\n", 81 | "pyplot.show()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "Predicted Class: 1\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "# iterative imputation strategy and prediction for the hose colic dataset\n", 99 | "from numpy import nan\n", 100 | "from pandas import read_csv\n", 101 | "from sklearn.ensemble import RandomForestClassifier\n", 102 | "from sklearn.experimental import enable_iterative_imputer\n", 103 | "from sklearn.impute import IterativeImputer\n", 104 | "from sklearn.pipeline import Pipeline\n", 105 | "\n", 106 | "# load dataset\n", 107 | "url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'\n", 108 | "dataframe = read_csv(url, header=None, na_values='?')\n", 109 | "\n", 110 | "# split into input and output elements\n", 111 | "data = dataframe.values\n", 112 | "X, y = data[:, :-1], data[:, -1]\n", 113 | "\n", 114 | "# create the modeling pipeline\n", 115 | "pipeline = Pipeline(steps=[('i', IterativeImputer()), ('m', RandomForestClassifier())])\n", 116 | "\n", 117 | "# fit the model\n", 118 | "pipeline.fit(X, y)\n", 119 | "\n", 120 | "# define new data\n", 121 | "row = [2, 1, 530101, 38.50, 66, 28, 3, 3, nan, 2, 5, 4, 4, nan, nan, nan, 3, 5, 45.00, 8.40, nan, nan, 2, 11300, 00000, 00000, 2]\n", 122 | "\n", 123 | "# make a prediction\n", 124 | "\n", 125 | "yhat = pipeline.predict([row])\n", 126 | "\n", 127 | "# summarize prediction\n", 128 | "print('Predicted Class: %d' % yhat[0])" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [] 137 | } 138 | ], 139 | "metadata": { 140 | "kernelspec": { 141 | "display_name": "Python 3", 142 | "language": "python", 143 | "name": "python3" 144 | }, 145 | "language_info": { 146 | "codemirror_mode": { 147 | "name": "ipython", 148 | "version": 3 149 | }, 150 | "file_extension": ".py", 151 | "mimetype": "text/x-python", 152 | "name": "python", 153 | "nbconvert_exporter": "python", 154 | "pygments_lexer": "ipython3", 155 | "version": "3.6.4" 156 | } 157 | }, 158 | "nbformat": 4, 159 | "nbformat_minor": 2 160 | } 161 | -------------------------------------------------------------------------------- /rescaling.tscproj: -------------------------------------------------------------------------------- 1 | { 2 | "title" : "", 3 | "description" : "", 4 | "author" : "", 5 | "width" : 1280.0, 6 | "height" : 720.0, 7 | "version" : "0.5", 8 | "editRate" : 30, 9 | 
"authoringClientName" : { 10 | "name" : "Camtasia Studio", 11 | "platform" : "Windows", 12 | "version" : "9.0" 13 | }, 14 | "sourceBin" : [ 15 | { 16 | "id" : 1, 17 | "src" : "C:\\Users\\PSMike\\AppData\\Local\\Microsoft\\Windows\\INetCache\\IE\\AG853QLC\\Building_Features_from_Nominal_and_Numeric_Data_in_Microsoft_Azure-m3-6[1].mp4", 18 | "rect" : [0, 0, 1280, 720], 19 | "lastMod" : "20191010T172702", 20 | "sourceTracks" : [ 21 | { 22 | "range" : [0, 1026089777], 23 | "type" : 0, 24 | "editRate" : 10000000, 25 | "trackRect" : [0, 0, 1280, 720], 26 | "sampleRate" : "10000000/333333", 27 | "bitDepth" : 24, 28 | "numChannels" : 0, 29 | "metaData" : "" 30 | }, 31 | { 32 | "range" : [0, 1026089777], 33 | "type" : 2, 34 | "editRate" : 10000000, 35 | "trackRect" : [0, 0, 0, 0], 36 | "sampleRate" : 44100, 37 | "bitDepth" : 16, 38 | "numChannels" : 2, 39 | "metaData" : "" 40 | } 41 | ] 42 | }, 43 | { 44 | "id" : 2, 45 | "src" : "C:\\Users\\PSMike\\Documents\\Camtasia Studio\\Rec 03-15-20 6.trec", 46 | "rect" : [0, 0, 1280, 720], 47 | "lastMod" : "20200315T202758", 48 | "sourceTracks" : [ 49 | { 50 | "range" : [0, 1434], 51 | "type" : 0, 52 | "editRate" : 30, 53 | "trackRect" : [0, 0, 1280, 720], 54 | "sampleRate" : 29, 55 | "bitDepth" : 0, 56 | "numChannels" : 0, 57 | "metaData" : "Rec 03-15-20 6.trec;" 58 | }, 59 | { 60 | "range" : [0, 2104320], 61 | "type" : 2, 62 | "editRate" : 44100, 63 | "trackRect" : [0, 0, 0, 0], 64 | "sampleRate" : 44100, 65 | "bitDepth" : 16, 66 | "numChannels" : 2, 67 | "metaData" : "" 68 | } 69 | ] 70 | }, 71 | { 72 | "id" : 3, 73 | "src" : "C:\\Users\\PSMike\\Desktop\\kdkdkee.png", 74 | "rect" : [0, 0, 1280, 720], 75 | "lastMod" : "20200315T202829", 76 | "sourceTracks" : [ 77 | { 78 | "range" : [0, 1], 79 | "type" : 1, 80 | "editRate" : 10000000, 81 | "trackRect" : [0, 0, 1280, 720], 82 | "sampleRate" : 0, 83 | "bitDepth" : 32, 84 | "numChannels" : 0, 85 | "metaData" : "" 86 | } 87 | ] 88 | }, 89 | { 90 | "id" : 4, 91 | "src" : "C:\\Users\\PSMike\\Desktop\\9d9d9d9.png", 92 | "rect" : [0, 0, 1280, 720], 93 | "lastMod" : "20200315T202946", 94 | "sourceTracks" : [ 95 | { 96 | "range" : [0, 1], 97 | "type" : 1, 98 | "editRate" : 10000000, 99 | "trackRect" : [0, 0, 1280, 720], 100 | "sampleRate" : 0, 101 | "bitDepth" : 32, 102 | "numChannels" : 0, 103 | "metaData" : "" 104 | } 105 | ] 106 | }, 107 | { 108 | "id" : 5, 109 | "src" : "C:\\Users\\PSMike\\Desktop\\noemreke.png", 110 | "rect" : [0, 0, 1280, 720], 111 | "lastMod" : "20200315T203030", 112 | "sourceTracks" : [ 113 | { 114 | "range" : [0, 1], 115 | "type" : 1, 116 | "editRate" : 10000000, 117 | "trackRect" : [0, 0, 1280, 720], 118 | "sampleRate" : 0, 119 | "bitDepth" : 32, 120 | "numChannels" : 0, 121 | "metaData" : "" 122 | } 123 | ] 124 | }, 125 | { 126 | "id" : 6, 127 | "src" : "C:\\Users\\PSMike\\Desktop\\0d0d0d0ddd.png", 128 | "rect" : [0, 0, 1280, 720], 129 | "lastMod" : "20200315T203108", 130 | "sourceTracks" : [ 131 | { 132 | "range" : [0, 1], 133 | "type" : 1, 134 | "editRate" : 10000000, 135 | "trackRect" : [0, 0, 1280, 720], 136 | "sampleRate" : 0, 137 | "bitDepth" : 32, 138 | "numChannels" : 0, 139 | "metaData" : "" 140 | } 141 | ] 142 | } 143 | ], 144 | "timeline" : { 145 | "id" : 7, 146 | "sceneTrack" : { 147 | "scenes" : [ 148 | { 149 | "duration" : 102.6, 150 | "title" : "", 151 | "type" : "", 152 | "csml" : { 153 | "tracks" : [ 154 | { 155 | "trackIndex" : 0, 156 | "medias" : [ 157 | ] 158 | }, 159 | { 160 | "trackIndex" : 1, 161 | "medias" : [ 162 | { 163 | "id" : 8, 164 | "_type" : "IMFile", 165 | 
"src" : 3, 166 | "trackNumber" : 0, 167 | "trimStartSum" : 0, 168 | "attributes" : { 169 | "ident" : "kdkdkee" 170 | }, 171 | "effects" : [ 172 | 173 | ], 174 | "start" : 0, 175 | "duration" : 548, 176 | "mediaStart" : 0, 177 | "mediaDuration" : 1, 178 | "scalar" : 1, 179 | "metadata" : { 180 | "WinSubProjectDisplayName" : "", 181 | "clipSpeedAttribute" : false 182 | }, 183 | "animationTracks" : { 184 | 185 | } 186 | }, 187 | { 188 | "id" : 9, 189 | "_type" : "UnifiedMedia", 190 | "video" : 191 | { 192 | "id" : 10, 193 | "_type" : "ScreenVMFile", 194 | "src" : 2, 195 | "trackNumber" : 0, 196 | "attributes" : { 197 | "ident" : "Rec 03-15-20 6" 198 | }, 199 | "parameters" : { 200 | "cursorScale" : { 201 | "type" : "double", 202 | "defaultValue" : 1.0, 203 | "interp" : "linr" 204 | }, 205 | "cursorOpacity" : { 206 | "type" : "double", 207 | "defaultValue" : 1.0, 208 | "interp" : "linr" 209 | } 210 | }, 211 | "effects" : [ 212 | 213 | ], 214 | "start" : 548, 215 | "duration" : 138, 216 | "mediaStart" : 393, 217 | "mediaDuration" : 138, 218 | "scalar" : 1, 219 | "animationTracks" : { 220 | 221 | } 222 | }, 223 | "audio" : 224 | { 225 | "id" : 11, 226 | "_type" : "AMFile", 227 | "src" : 2, 228 | "trackNumber" : 1, 229 | "attributes" : { 230 | "ident" : "", 231 | "sampleRate" : 44100, 232 | "bitDepth" : 16, 233 | "gain" : 1.0, 234 | "mixToMono" : false 235 | }, 236 | "channelNumber" : "0,1", 237 | "effects" : [ 238 | 239 | ], 240 | "start" : 548, 241 | "duration" : 138, 242 | "mediaStart" : 393, 243 | "mediaDuration" : 138, 244 | "scalar" : 1, 245 | "animationTracks" : { 246 | 247 | } 248 | } 249 | , 250 | "start" : 548, 251 | "duration" : 138, 252 | "mediaStart" : 0, 253 | "mediaDuration" : 1432, 254 | "scalar" : 1, 255 | "metadata" : { 256 | "AutoAppliedSmartFocus" : "False", 257 | "WinSubProjectDisplayName" : "", 258 | "clipSpeedAttribute" : false 259 | } 260 | }, 261 | { 262 | "id" : 12, 263 | "_type" : "IMFile", 264 | "src" : 4, 265 | "trackNumber" : 0, 266 | "trimStartSum" : 0, 267 | "attributes" : { 268 | "ident" : "9d9d9d9" 269 | }, 270 | "effects" : [ 271 | 272 | ], 273 | "start" : 686, 274 | "duration" : 567, 275 | "mediaStart" : 0, 276 | "mediaDuration" : 1, 277 | "scalar" : 1, 278 | "metadata" : { 279 | "WinSubProjectDisplayName" : "", 280 | "clipSpeedAttribute" : false 281 | }, 282 | "animationTracks" : { 283 | 284 | } 285 | }, 286 | { 287 | "id" : 13, 288 | "_type" : "IMFile", 289 | "src" : 5, 290 | "trackNumber" : 0, 291 | "trimStartSum" : 0, 292 | "attributes" : { 293 | "ident" : "noemreke" 294 | }, 295 | "effects" : [ 296 | 297 | ], 298 | "start" : 1253, 299 | "duration" : 573, 300 | "mediaStart" : 0, 301 | "mediaDuration" : 1, 302 | "scalar" : 1, 303 | "metadata" : { 304 | "WinSubProjectDisplayName" : "", 305 | "clipSpeedAttribute" : false 306 | }, 307 | "animationTracks" : { 308 | 309 | } 310 | }, 311 | { 312 | "id" : 14, 313 | "_type" : "VMFile", 314 | "src" : 1, 315 | "trackNumber" : 0, 316 | "attributes" : { 317 | "ident" : "Building_Features_from_Nominal_and_Numeric_Data_in_Microsoft_Azure-m3-6[1]" 318 | }, 319 | "effects" : [ 320 | 321 | ], 322 | "start" : 1826, 323 | "duration" : 612, 324 | "mediaStart" : 1826, 325 | "mediaDuration" : 612, 326 | "scalar" : 1, 327 | "metadata" : { 328 | "WinSubProjectDisplayName" : "", 329 | "clipSpeedAttribute" : false 330 | }, 331 | "animationTracks" : { 332 | 333 | } 334 | }, 335 | { 336 | "id" : 15, 337 | "_type" : "IMFile", 338 | "src" : 6, 339 | "trackNumber" : 0, 340 | "trimStartSum" : 0, 341 | "attributes" : { 342 | "ident" : 
"0d0d0d0ddd" 343 | }, 344 | "effects" : [ 345 | 346 | ], 347 | "start" : 2438, 348 | "duration" : 640, 349 | "mediaStart" : 0, 350 | "mediaDuration" : 1, 351 | "scalar" : 1, 352 | "metadata" : { 353 | "WinSubProjectDisplayName" : "", 354 | "clipSpeedAttribute" : false 355 | }, 356 | "animationTracks" : { 357 | 358 | } 359 | } 360 | ] 361 | }, 362 | { 363 | "trackIndex" : 2, 364 | "medias" : [ 365 | { 366 | "id" : 16, 367 | "_type" : "AMFile", 368 | "src" : 1, 369 | "trackNumber" : 1, 370 | "attributes" : { 371 | "ident" : "Building_Features_from_Nominal_and_Numeric_Data_in_Microsoft_Azure-m3-6[1]", 372 | "sampleRate" : 44100, 373 | "bitDepth" : 16, 374 | "gain" : 1.0, 375 | "mixToMono" : false 376 | }, 377 | "channelNumber" : "0,1", 378 | "parameters" : { 379 | "volume" : 1.53061224489796 380 | }, 381 | "effects" : [ 382 | 383 | ], 384 | "start" : 0, 385 | "duration" : 3078, 386 | "mediaStart" : 0, 387 | "mediaDuration" : 3078, 388 | "scalar" : 1, 389 | "metadata" : { 390 | "WinSubProjectDisplayName" : "", 391 | "clipSpeedAttribute" : false 392 | }, 393 | "animationTracks" : { 394 | 395 | } 396 | } 397 | ] 398 | }, 399 | { 400 | "trackIndex" : 3, 401 | "medias" : [ 402 | ] 403 | } 404 | ] 405 | } 406 | } 407 | ] 408 | }, 409 | "trackAttributes" : [ 410 | { 411 | "ident" : "", 412 | "audioMuted" : false, 413 | "videoHidden" : false, 414 | "metadata" : { 415 | "IsLocked" : "False", 416 | "WinTrackHeight" : "56" 417 | } 418 | }, 419 | { 420 | "ident" : "", 421 | "audioMuted" : false, 422 | "videoHidden" : false, 423 | "metadata" : { 424 | "IsLocked" : "False", 425 | "WinTrackHeight" : "56" 426 | } 427 | }, 428 | { 429 | "ident" : "", 430 | "audioMuted" : false, 431 | "videoHidden" : false, 432 | "metadata" : { 433 | "IsLocked" : "False", 434 | "WinTrackHeight" : "56" 435 | } 436 | }, 437 | { 438 | "ident" : "", 439 | "audioMuted" : false, 440 | "videoHidden" : false, 441 | "metadata" : { 442 | "IsLocked" : "False", 443 | "WinTrackHeight" : "56" 444 | } 445 | } 446 | ], 447 | "captionAttributes" : { 448 | "enabled" : true, 449 | "fontName" : "Arial", 450 | "fontSize" : 42, 451 | "backgroundColor" : [ 0, 0, 0, 191], 452 | "foregroundColor" : [ 255, 255, 255, 255], 453 | "lang" : "en", 454 | "alignment" : 0, 455 | "defaultFontSize" : true, 456 | "opacity" : 0.5, 457 | "backgroundEnabled" : true, 458 | "backgroundOnlyAroundText" : true 459 | }, 460 | "backgroundColor" : [ 0, 0, 0, 255] 461 | }, 462 | "metadata" : { 463 | "AutoSaveFile" : "C:\\Users\\PSMike\\AppData\\Local\\TechSmith\\Camtasia Studio\\9.0\\Auto-Saves\\Untitled Project3484d89d.autosave.tscproj", 464 | "CanvasZoom" : 41, 465 | "Date" : "2020-03-15 04:09:04 PM", 466 | "Fit" : 1, 467 | "IsAutoSave" : "0", 468 | "Language" : "ENU", 469 | "ProfileName" : "MP4 only (up to 1080p)", 470 | "ProjectDimensionsChanged" : "1", 471 | "audioNarrationNotes" : "", 472 | "calloutStyle" : "Basic", 473 | "canvasDetached" : "False", 474 | "canvasPositionHeight" : "0", 475 | "canvasPositionLeft" : "0", 476 | "canvasPositionTop" : "0", 477 | "canvasPositionWidth" : "0" 478 | } 479 | } 480 | -------------------------------------------------------------------------------- /Sparse Column Identification and Removal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "0 238\n", 13 | "1 297\n", 14 | "2 927\n", 15 | "3 933\n", 16 | "4 
179\n", 17 | "5 375\n", 18 | "6 820\n", 19 | "7 618\n", 20 | "8 561\n", 21 | "9 57\n", 22 | "10 577\n", 23 | "11 59\n", 24 | "12 73\n", 25 | "13 107\n", 26 | "14 53\n", 27 | "15 91\n", 28 | "16 893\n", 29 | "17 810\n", 30 | "18 170\n", 31 | "19 53\n", 32 | "20 68\n", 33 | "21 9\n", 34 | "22 1\n", 35 | "23 92\n", 36 | "24 9\n", 37 | "25 8\n", 38 | "26 9\n", 39 | "27 308\n", 40 | "28 447\n", 41 | "29 392\n", 42 | "30 107\n", 43 | "31 42\n", 44 | "32 4\n", 45 | "33 45\n", 46 | "34 141\n", 47 | "35 110\n", 48 | "36 3\n", 49 | "37 758\n", 50 | "38 9\n", 51 | "39 9\n", 52 | "40 388\n", 53 | "41 220\n", 54 | "42 644\n", 55 | "43 649\n", 56 | "44 499\n", 57 | "45 2\n", 58 | "46 937\n", 59 | "47 169\n", 60 | "48 286\n", 61 | "49 2\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "from numpy import loadtxt\n", 67 | "from numpy import unique\n", 68 | "\n", 69 | "data = loadtxt('oil-spill.csv', delimiter=',')\n", 70 | "# summarize the number of unique values in each column\n", 71 | "for i in range(data.shape[1]):\n", 72 | "\tprint(i, len(unique(data[:, i])))" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 2, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "0 238\n", 85 | "1 297\n", 86 | "2 927\n", 87 | "3 933\n", 88 | "4 179\n", 89 | "5 375\n", 90 | "6 820\n", 91 | "7 618\n", 92 | "8 561\n", 93 | "9 57\n", 94 | "10 577\n", 95 | "11 59\n", 96 | "12 73\n", 97 | "13 107\n", 98 | "14 53\n", 99 | "15 91\n", 100 | "16 893\n", 101 | "17 810\n", 102 | "18 170\n", 103 | "19 53\n", 104 | "20 68\n", 105 | "21 9\n", 106 | "22 1\n", 107 | "23 92\n", 108 | "24 9\n", 109 | "25 8\n", 110 | "26 9\n", 111 | "27 308\n", 112 | "28 447\n", 113 | "29 392\n", 114 | "30 107\n", 115 | "31 42\n", 116 | "32 4\n", 117 | "33 45\n", 118 | "34 141\n", 119 | "35 110\n", 120 | "36 3\n", 121 | "37 758\n", 122 | "38 9\n", 123 | "39 9\n", 124 | "40 388\n", 125 | "41 220\n", 126 | "42 644\n", 127 | "43 649\n", 128 | "44 499\n", 129 | "45 2\n", 130 | "46 937\n", 131 | "47 169\n", 132 | "48 286\n", 133 | "49 2\n", 134 | "dtype: int64\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "from pandas import read_csv\n", 140 | "df = read_csv('oil-spill.csv', header=None)\n", 141 | "print(df.nunique())" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 3, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "(937, 50)\n", 154 | "[22]\n", 155 | "(937, 49)\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "from pandas import read_csv\n", 161 | "# load the dataset\n", 162 | "df = read_csv('oil-spill.csv', header=None)\n", 163 | "print(df.shape)\n", 164 | "# get number of unique values for each column\n", 165 | "counts = df.nunique()\n", 166 | "# record columns to delete\n", 167 | "to_del = [i for i,v in enumerate(counts) if v == 1]\n", 168 | "print(to_del)\n", 169 | "# drop useless columns\n", 170 | "df.drop(to_del, axis=1, inplace=True)\n", 171 | "print(df.shape)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 4, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "0, 238, 25.4%\n", 184 | "1, 297, 31.7%\n", 185 | "2, 927, 98.9%\n", 186 | "3, 933, 99.6%\n", 187 | "4, 179, 19.1%\n", 188 | "5, 375, 40.0%\n", 189 | "6, 820, 87.5%\n", 190 | "7, 618, 66.0%\n", 191 | "8, 561, 59.9%\n", 192 | "9, 57, 6.1%\n", 193 | "10, 577, 61.6%\n", 194 | "11, 59, 6.3%\n", 
195 | "12, 73, 7.8%\n", 196 | "13, 107, 11.4%\n", 197 | "14, 53, 5.7%\n", 198 | "15, 91, 9.7%\n", 199 | "16, 893, 95.3%\n", 200 | "17, 810, 86.4%\n", 201 | "18, 170, 18.1%\n", 202 | "19, 53, 5.7%\n", 203 | "20, 68, 7.3%\n", 204 | "21, 9, 1.0%\n", 205 | "22, 1, 0.1%\n", 206 | "23, 92, 9.8%\n", 207 | "24, 9, 1.0%\n", 208 | "25, 8, 0.9%\n", 209 | "26, 9, 1.0%\n", 210 | "27, 308, 32.9%\n", 211 | "28, 447, 47.7%\n", 212 | "29, 392, 41.8%\n", 213 | "30, 107, 11.4%\n", 214 | "31, 42, 4.5%\n", 215 | "32, 4, 0.4%\n", 216 | "33, 45, 4.8%\n", 217 | "34, 141, 15.0%\n", 218 | "35, 110, 11.7%\n", 219 | "36, 3, 0.3%\n", 220 | "37, 758, 80.9%\n", 221 | "38, 9, 1.0%\n", 222 | "39, 9, 1.0%\n", 223 | "40, 388, 41.4%\n", 224 | "41, 220, 23.5%\n", 225 | "42, 644, 68.7%\n", 226 | "43, 649, 69.3%\n", 227 | "44, 499, 53.3%\n", 228 | "45, 2, 0.2%\n", 229 | "46, 937, 100.0%\n", 230 | "47, 169, 18.0%\n", 231 | "48, 286, 30.5%\n", 232 | "49, 2, 0.2%\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "# summarize the percentage of unique values for each column using numpy\n", 238 | "from numpy import loadtxt\n", 239 | "from numpy import unique\n", 240 | "\n", 241 | "data = loadtxt('oil-spill.csv', delimiter=',')\n", 242 | "# summarize the number of unique values in each column\n", 243 | "for i in range(data.shape[1]):\n", 244 | "\tnum = len(unique(data[:, i]))\n", 245 | "\tpercentage = float(num) / data.shape[0] * 100\n", 246 | "\tprint('%d, %d, %.1f%%' % (i, num, percentage))" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 5, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "21, 9, 1.0%\n", 259 | "22, 1, 0.1%\n", 260 | "24, 9, 1.0%\n", 261 | "25, 8, 0.9%\n", 262 | "26, 9, 1.0%\n", 263 | "32, 4, 0.4%\n", 264 | "36, 3, 0.3%\n", 265 | "38, 9, 1.0%\n", 266 | "39, 9, 1.0%\n", 267 | "45, 2, 0.2%\n", 268 | "49, 2, 0.2%\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "# summarize the percentage of unique values for each column using numpy\n", 274 | "from numpy import loadtxt\n", 275 | "from numpy import unique\n", 276 | "# load the dataset\n", 277 | "data = loadtxt('oil-spill.csv', delimiter=',')\n", 278 | "# summarize the number of unique values in each column\n", 279 | "for i in range(data.shape[1]):\n", 280 | "\tnum = len(unique(data[:, i]))\n", 281 | "\tpercentage = float(num) / data.shape[0] * 100\n", 282 | "\tif percentage < 1:\n", 283 | "\t\tprint('%d, %d, %.1f%%' % (i, num, percentage))" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 6, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "name": "stdout", 293 | "output_type": "stream", 294 | "text": [ 295 | "(937, 50)\n", 296 | "[21, 22, 24, 25, 26, 32, 36, 38, 39, 45, 49]\n", 297 | "(937, 39)\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "# delete columns where number of unique values is less than 1% of the rows\n", 303 | "from pandas import read_csv\n", 304 | "# load the dataset\n", 305 | "df = read_csv('oil-spill.csv', header=None)\n", 306 | "print(df.shape)\n", 307 | "# get number of unique values for each column\n", 308 | "counts = df.nunique()\n", 309 | "# record columns to delete\n", 310 | "to_del = [i for i,v in enumerate(counts) if (float(v)/df.shape[0]*100) < 1]\n", 311 | "print(to_del)\n", 312 | "# drop useless columns\n", 313 | "df.drop(to_del, axis=1, inplace=True)\n", 314 | "print(df.shape)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 10, 320 | "metadata": {}, 321 | 
"outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "(937, 49) (937,)\n", 327 | "(937, 48)\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "# example of applying the variance threshold for feature selection\n", 333 | "from pandas import read_csv\n", 334 | "from sklearn.feature_selection import VarianceThreshold\n", 335 | "# load the dataset\n", 336 | "df = read_csv('oil-spill.csv', header=None)\n", 337 | "# split data into inputs and outputs\n", 338 | "data = df.values\n", 339 | "X = data[:, :-1]\n", 340 | "y = data[:, -1]\n", 341 | "print(X.shape, y.shape)\n", 342 | "# define the transform\n", 343 | "transform = VarianceThreshold()\n", 344 | "# transform the input data\n", 345 | "X_sel = transform.fit_transform(X)\n", 346 | "print(X_sel.shape)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 9, 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "name": "stdout", 356 | "output_type": "stream", 357 | "text": [ 358 | "(937, 49) (937,)\n", 359 | ">Threshold=0.00, Features=48\n", 360 | ">Threshold=0.05, Features=37\n", 361 | ">Threshold=0.10, Features=36\n", 362 | ">Threshold=0.15, Features=35\n", 363 | ">Threshold=0.20, Features=35\n", 364 | ">Threshold=0.25, Features=35\n", 365 | ">Threshold=0.30, Features=35\n", 366 | ">Threshold=0.35, Features=35\n", 367 | ">Threshold=0.40, Features=35\n", 368 | ">Threshold=0.45, Features=33\n", 369 | ">Threshold=0.50, Features=31\n" 370 | ] 371 | }, 372 | { 373 | "data": { 374 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAHf9JREFUeJzt3XmcVOWd7/HPrxe6ARvophtEuqFaMYIKNlLdLmhimCSa4AAmwEWz6NyoWTR6rzNJxpvJ+Lom3mQycyfEJMYg48QkRoOYKIOJjlGIEmUpZBVFFlkaFFo22ZfmN3/0adLTqaaroapPV53v+/WqF3WqzlP1e15lvn3ynHOex9wdERGJhrywCxARkc6j0BcRiRCFvohIhCj0RUQiRKEvIhIhCn0RkQhR6IuIRIhCX0QkQhT6IiIRUhB2Aa2Vl5d7LBYLuwwRkayyePHi99y9or39ulzox2IxEolE2GWIiGQVM9uYyn4a3hERiRCFvohIhCj0RUQiRKEvIhIhCn0RkQhR6IuIRIhCX0QkQnIm9HcfOMIP/rCGlVv2hF2KiEiX1eVuzjpV+XnGD154i+PuXDiwd9jliIh0STlzpF9SXMjQM3uxaMPOsEsREemyUg59M8s3syVmNjvYftnMlgaPrWb2VBvtGlvsNytdhSdTV13Gkk27Odp4PJNfIyKStTpypH8n8Ebzhrtf6e417l4DvAr8po12B5v3c/dxp1Fru2pjZRw82sjrW9/P5NeIiGStlELfzCqBscD0JO+VAGOApEf6nam2uhSARW9riEdEJJlUj/SnAl8Dko2bXAe84O5tHV4Xm1nCzOab2YRTKTJV/UqKifXtwUKN64uIJNVu6JvZtcB2d1/cxi7XA4+d5CMGuXscuAGYambnJPmOW4M/DImGhoZU6m5TbayMxIadHD/up/U5IiK5KJUj/dHAODPbADwOjDGzXwKYWV+gDnimrcbuvjX4dz0wFxiZZJ9p7h5393hFRbtrAJxUbXUZuw4cZV3DvtP6HBGRXNRu6Lv73e5e6e4xYArwort/Jnh7EjDb3Q8la2tmpWZWFDwvp+kPyKq0VN6GulgZgIZ4RESSON3r9KfQamjHzOJm1nzCdxiQMLNlwBzgu+6e0dAf3LcHFSVFOpkrIpJEh+7Idfe5NA3RNG9flWSfBHBz8PwVYPjpFNhRZkZdrIxFG3Z15teKiGSFnLkjt6XaWClbdh9ky+6DYZciItKl5GboVzeN62uIR0Tkv8vJ0B96Zi9Kigp0MldEpJWcDP38POPiwaU60hcRaSUnQx+aJl9bs30fu/YfCbsUEZEuI2dDvza4Xj+xUVfxiIg0y9nQH1HZm275eZpfX0SkhZwN/eLCfC6q6s1CjeuLiJyQs6EPTUM8K7fs4cCRY2GXIiLSJeR26FeXcey4s3TT7rBLERHpEnI69EcNLsVMk6+JiDTL6dDvVVzIMC2WLiJyQk6HPjRdr//aRi2WLiICEQh9LZYuIvJnEQh9LZYuItIs50O/X69iBmuxdBERIAKhD39eLN1di6WLSLRFIvTrYlosXUQEIhL6zYuqLHxbk6+JSLRFIvRjfXtQfkaRrtcXkchLOfTNLN/MlpjZ7GD7Z2b2tpktDR41bbS70czWBI8b01V4R5gZddWlmnxNRCKvI0f6dwJvtHrtq+5eEzyWtm5gZmXAPcAlQB1wj5mVnnK1p6E2VsaW3QfZqsXSRSTCUgp9M6sExgLTO/j5VwPPu/tOd98FPA9c08HPSIvmRVU0xCMiUZbqkf5U4GtA67kM7jOz5Wb2fTMrStJuILC5xXZ98FqnGzYgWCxdQzwiEmHthr6ZXQtsd/fFrd66GxgK1AJlwNeTNU/y2l9cLG9mt5pZwswSDQ0N7Vd9Ck4slq4jfRGJsFSO9EcD48xsA/A4MMbMfunu73iTw8C/0zRm31o9UNViuxLY2nond5/m7nF3j1dUVHS4E6mqqy7jrW1aLF1Eoqvd0Hf3u9290t1jwBTgR
Xf/jJkNADAzAyYAK5M0fw74mJmVBidwPxa8For44KZzyFosXUSi6nSu03/UzFYAK4By4NsAZhY3s+kA7r4T+BawKHjcG7wWiouq+mixdBGJtIKO7Ozuc4G5wfMxbeyTAG5usf0w8PApV5hGxYX5jKjUYukiEl2RuCO3pdrqpsXSDx5pDLsUEZFOF7nQr4s1LZa+ZLPG9UUkeiIX+hcHi6Uv0uRrIhJBkQv93t0LGarF0kUkoiIX+gB1sVJe27SLY1osXUQiJpKhX1tdxoEjWixdRKInkqFfp8nXRCSiIhn6JxZL1/X6IhIxkQx9gPjgMhIbd2mxdBGJlMiGfl11KTv3H9Fi6SISKZEN/eZFVbRYuohESWRDv7q8J+VndNPJXBGJlMiGvplRGytT6ItIpEQ29KFpiKd+10He2aPF0kUkGiId+nXVzeP6OtoXkWiIdOgPG9CLM4oKNMQjIpER6dA/sVi6ruARkYiIdOhD0+Rrq7ftZfcBLZYuIrkv8qHffL1+YoOO9kUk90U+9LVYuohESeRDv7gwn+GVvVmo0BeRCEg59M0s38yWmNnsYPtRM1ttZivN7GEzK2yjXaOZLQ0es9JVeDrVxspYUa/F0kUk93XkSP9O4I0W248CQ4HhQHfg5jbaHXT3muAx7tTKzKy66lItli4ikZBS6JtZJTAWmN78mrv/zgPAQqAyMyVm3qjBZVosXUQiIdUj/anA14C/WFQ2GNb5LPBsG22LzSxhZvPNbEKyHczs1mCfRENDQ4olpU/v7oWc179EJ3NFJOe1G/pmdi2w3d0Xt7HLA8BL7v5yG+8Pcvc4cAMw1czOab2Du09z97i7xysqKlKtPa3qqsu0WLqI5LxUjvRHA+PMbAPwODDGzH4JYGb3ABXAXW01dvetwb/rgbnAyNMrOTNqY02Lpa96R4uli0juajf03f1ud6909xgwBXjR3T9jZjcDVwPXu3vSw2MzKzWzouB5OU1/QFalrfo00uRrIhIFp3Od/oNAf+DV4HLMfwQws7iZNZ/wHQYkzGwZMAf4rrt3ydDv36uYQWU9NK4vIjmtoCM7u/tcmoZocPekbd09QXD5pru/QtMlnVmhNlbG3NXbcXfMLOxyRETSLvJ35LZUV13Kjv1HWNewP+xSREQyQqHfQvPkaxriEZFcpdBv4cRi6TqZKyI5SqHfgpkRH1ymyddEJGcp9FuprdZi6SKSuxT6rdTFdL2+iOQuhX4rwwaU0LNbvk7mikhOUui3UpCfx8WDS7V8oojkJIV+EnWxMlZv28ueA0fDLkVEJK0U+knUVpfhDomNGuIRkdyi0E+ipqoPhfmmSzdFJOco9JMoLsxnRGUf3aQlIjlHod+G2lgZK7bs4dBRLZYuIrlDod+GuupSjjY6SzbtDrsUEZG0Uei3YdSgYLF0jeuLSA5R6Lehdw8tli4iuUehfxK1sTJe26jF0kUkdyj0T6K2uoz9WixdRHKIQv8kNPmaiOQahf5JnNm7mKqy7hrXF5GckXLom1m+mS0xs9nBdrWZLTCzNWb2azPr1ka7u81srZmtNrOr01V4Z6mNlZHYsAt3D7sUEZHT1pEj/TuBN1ps/xPwfXc/F9gFfL51AzM7H5gCXABcAzxgZvmnXm7nq4uVsWP/Eda/p8XSRST7pRT6ZlYJjAWmB9sGjAFmBrs8AkxI0nQ88Li7H3b3t4G1QN3pFt2ZaquDxdI1ri8iOSDVI/2pwNeA5msX+wK73f1YsF0PDEzSbiCwucV20v3M7FYzS5hZoqGhIcWSOsfZwWLpmnxNRHJBu6FvZtcC2919ccuXk+yabNA7pf3cfZq7x909XlFR0V5Jnap5sXSdzBWRXJDKkf5oYJyZbQAep2lYZyrQx8wKgn0qga1J2tYDVS2229qvS6utLmPzzoO8u+dQ2KWIiJyWdkPf3e9290p3j9F0UvZFd/80MAeYGOx2I/B0kuazgClmVmRm1cC5wMK0VN6JamOlABriEZGsdzrX6X8duMvM1tI0xv9vAGY2zszuBXD314EZwCrgWeA2d8+6uYrPH9CrabF0ncwVkSxX0P4uf+buc4G5wfP1JLkSx91n0XSE37x9H3Df6RQZtubF0jWuLyLZTnfkpqhWi6WLSA5Q6KeoNqbF0kUk+yn0UzRykBZLF5Hsp9BPUXFhPsMH9iaxYVfYpYiInDKFfgfUVpexvH63FksXkayl0O+AulgZRxudpZu1WLqIZCeFfgfEBweLpet6fRHJUgr9DmheLF0nc0UkWyn0O0iLpYtINlPod1A8Vsr+I4288c7esEsREekwhX4H1QWLqmiIR0SykUK/gwb07k5laXedzBWRrKTQPwV1saZFVbRYuohkG4X+Kait1mLpIpKdFPqnoDamxdJFJDsp9E/BORU96dtTi6WLSPZR6J8CMyMeK9XkayKSdRT6p6g2VsamnQfY9r4WSxeR7KHQP0UnrtfXuL6IZBGF/ik6sVi6xvVFJIu0uzC6mRUDLwFFwf4z3f0eM3sZKAl26wcsdPcJSdo3AiuCzU3uPi4tlYesebF0HemLSDZpN/SBw8AYd99nZoXAPDP7vbtf2byDmT0JPN1G+4PuXpOGWruc+OAypr7wFnsOHqV398KwyxERaVe7wzveZF+wWRg8TtyKamYlwBjgqYxU2IXVVpfiDou1WLqIZImUxvTNLN/MlgLbgefdfUGLt68DXnD399toXmxmCTObb2Z/MfwTfP6twT6JhoaGDnUgTCOrSpsWS39bl26KSHZIKfTdvTEYoqkE6szswhZvXw88dpLmg9w9DtwATDWzc5J8/jR3j7t7vKKiogPlh6t7t3wuHNhbJ3NFJGt06Oodd98NzAWuATCzvkAd8MxJ2mwN/l0ftB15aqV2TXUxLZYuItmj3dA3swoz6xM87w58BHgzeHsSMNvdk96hZGalZlYUPC8HRgOr0lF4V1GrxdJFJIukcqQ/AJhjZsuBRTSN6c8O3ptCq6EdM4ub2fRgcxiQMLNlwBzgu+6eU6Efj5UCmnxNRLJDu5dsuvty2hiScferkryWAG4Onr8CDD+9Eru2Pj26cV7/Ev5z1TYmjBxIVVmPsEsSEWmT7shNgyl1VazcuocrvzeHGx6az2+X1HPwiMb4RaTrsa62+lM8HvdEIhF2GR22ZfdBnlxczxOLN7N550FKigr465qzmByv4qLK3phZ2CWKSA4zs8XBlZIn30+hn17HjzsL3t7JE4nN/G7lOxw6epwP9D+DyfEqJowcSPkZRWGXKCI5SKHfBbx/6CjPLH+HGYnNLNm0m4I8Y8zQfkyOV3HVeRUU5Gt0TUTSQ6HfxazZtpcnFtfzm9fqeW/fESpKivjkxQOZNKqKIf3OCLs8EclyCv0u6mjjceaubmBGYjMvvrmdxuPOxYP6MDlexdgRAygp1sRtItJxCv0ssH3vIZ5asoUZiXrWbt9H98J8PjF8AJPjldRVl+nkr4ikTKGfRdyb7uidkajnP5ZtZd/hYwzu24NJoyr51KhKBvTuHnaJItLFKfSz1MEjjTz7+jvMWFTPq+t3kGdw5bkV
TI5X8ZHz+1FUkB92iSLSBSn0c8CmHQeYuXgzMxfXs3XPIfr0KGRCzUAmxSu54KzeYZcnIl2IQj+HNB53Xln3HjMS9Tz3+rscOXacC87qxeR4FeNrzqJPj25hlygiIVPo56jdB44wa9lWnkjUs2LLHrrl5/HRC/ozOV7FFUPKyc/TyV+RKFLoR8Cqre8zI7GZp5duYdeBowzoXczEUZVMHFXJ4L49wy5PRDqRQj9CDh9r5IU3tvNEYjN/fKuB4w6XVJcxOV7Fx4efSY9u7U6mKiJZTqEfUe/uOcSTr9XzRGIzG3Yc4IyiAq4dMYBJ8SouHtRH1/6L5CiFfsS5O4s27GJGYjO/W/EOB440ck5FTybHq7ju4oH0KykOu0QRSSOFvpyw7/AxfhdM/JbYuIv8POPD5/VjUrySMUP7UaiJ30SynkJfklrXsI+Zi+t5cnE92/cepvyMblw3ciCT4lV8oH9J2OWJyClS6MtJHWs8zktrGpixqJ4X3tzG0UanpqoPk+KV/PVFZ9FLE7+JZBWFvqRsx77DPLV0KzMWbWb1tr0UF+bx8QsHMCleyaXVfcnTtf8iXV7aQt/MioGXgCKaFlKf6e73mNnPgA8Be4Jdb3L3pUna3wj8Q7D5bXd/5GTfp9APj7uzYsue4Nr/rew9dIyqsu5MvLiKT40aSGWpFn0X6arSGfoG9HT3fWZWCMwD7gS+CMx295knaVsGJIA44MBiYJS772qrjUK/azh0tJHnXn+XJxL1zFv7HmZwxZByJsWr+Nj5/Sku1MRvIl1JqqHf7l073vRXYV+wWRg8Uh0Tuhp43t13BkU9D1wDPJZiewlJcWE+42sGMr5mIJt3Hgiu/a/njseW0Ku4gPE1A4nHSnXdv2RMrG8PRlT2CbuMnJPSmL6Z5dN0lD4E+LG7fz0Y3rkMOAy8APy9ux9u1e7vgGJ3/3aw/U3goLv/S6v9bgVuBRg0aNCojRs3nm6/JAOOH3fmr9/BjMRmfr/yXQ4fOx52SZLjbv/wEP73Rz+gOaVSkLYjfQB3bwRqzKwP8FszuxC4G3gX6AZMA74O3Nu6jmQfl+TzpwWfQTwe71pnluWEvDzj8iHlXD6knG8dOsq29w+330jklDgPvfQ2P5qzltc27eIHU0ZSUVIUdlE5oUOTsrj7bjObC1zT4mj9sJn9O/B3SZrUA1e12K4E5na8TOlqSooLtZ6vZNQ/TRzBqFgp33xqJWPvf5kfXj+SS87uG3ZZWa/dWzHNrCI4wsfMugMfAd40swHBawZMAFYmaf4c8DEzKzWzUuBjwWsiIu2aHK/iqdtG07OogBumL+DBP66jq11mnm1Suf9+ADDHzJYDi2g6MTsbeNTMVgArgHKgedw+bmbTAYITuN8K2i0C7m0+qSsikophA3ox6/bRXH1Bf777+ze55eeL2XPgaNhlZS3dnCUiWcHd+dkrG7jvmTcY0KeYB24YxfBKLRvaLNUTuZppS0SygpnxN6Or+fUXLuNYo/Opn7zCows2aringxT6IpJVRg0u5Zk7ruTSc/ryjd+u5K4Zyzhw5FjYZWUNhb6IZJ2ynt342U213PXRD/DU0i2M/9GfWLt9b9hlZQWFvohkpbw8446/Opdf/M9L2Ln/CON+9CeeXrol7LK6PIW+iGS1K84t55k7ruSCs3px5+NL+eZTKzl8rDHssroshb6IZL0zexfzq1su5dYPns0v5m9k0oOvsnnngbDL6pIU+iKSEwrz8/g/nxjGTz87irff28+1P5zHC29sC7usLkehLyI55eoLzmT2V66gsrQ7n38kwfeefZNjjZocsJlCX0RyzuC+PXnyS5dzfd0gHpi7jk9PX8D2vYfCLqtLUOiLSE4qLsznO58czv+fdBHL6ncz9v55zF+/I+yyQqfQF5Gc9qlRlTx92xWUFBdww0PzeWDuWo4fj+5dvAp9Ecl5551Zwqzbr+ATwwfwvWdXc8vPE+w+cCTsskKh0BeRSDijqIAfXj+Se8dfwEtrGhh7/zyW1+8Ou6xOp9AXkcgwMz53WYwnvng5ABN/8iq/mB+tSdsU+iISOTVVfZj9lSsYPaQv33xqJXc+vpT9h6MxaZtCX0QiqbRnN/7txlq+evV5zF6+lfE//hNrtuX+pG0KfRGJrLw847YPD+GXN1/C7gNNk7Y9tSS3J21T6ItI5F1+TtOkbcMH9uZ//Xop3/jtCg4dzc1J2xT6IiJA/17F/OqWS/jCh87m0QWbmPjgKzk5aZtCX0QkUJCfx90fH8ZDn4uzaccBxt7/Ms+vyq1J29oNfTMrNrOFZrbMzF43s/8bvP6oma02s5Vm9rCZFbbRvtHMlgaPWenugIhIun30/P48c8eVDOrbg1t+nuA7v38jZyZtS+VI/zAwxt0vAmqAa8zsUuBRYCgwHOgO3NxG+4PuXhM8xqWjaBGRTKsq68HML17Opy8ZxE//uJ4bpi9g+/vZP2lbu6HvTfYFm4XBw939d8F7DiwEKjNYp4hIpysuzOe+64Yz9X/UsKJ+D5+4fx6vrHsv7LJOS0pj+maWb2ZLge3A8+6+oMV7hcBngWfbaF5sZgkzm29mE9r4/FuDfRINDQ0d7IKISGZNGDmQWbePpnf3Aj4zfQE/npO9k7alFPru3ujuNTQdzdeZ2YUt3n4AeMndX26j+SB3jwM3AFPN7Jwknz/N3ePuHq+oqOhgF0REMu/c/k2Ttl074iz++bnVfP6RRezan32TtnXo6h133w3MBa4BMLN7gArgrpO02Rr8uz5oO/LUShURCVfPogJ+MKWGb024kD+t3cG1P5zHkk27wi6rQ1K5eqfCzPoEz7sDHwHeNLObgauB69096WltMys1s6LgeTkwGliVruJFRDqbmfHZSwcz80uXATD5p6/yyCsbsmbStlSO9AcAc8xsObCIpjH92cCDQH/g1eByzH8EMLO4mU0P2g4DEma2DJgDfNfdFfoikvVGVPbhmTuu4IPnVnDPrNe5/bEl7MuCSdusq/11isfjnkgkwi5DRCQlx487D760jn95bjWx8p785NOjOO/Mkk6vw8wWB+dPT0p35IqInIa8POPLVw3hV7dcyt5Dxxj/43k8ubg+7LLapNAXEUmDS8/uyzN3XEFNVR/+9oll3P2b5V1y0jaFvohImvQrKeaXn7+E2z58Do8t3MwnH3iFjTv2h13Wf6PQFxFJo4L8PL569VAevinOlt0HufaH83ju9XfDLusEhb6ISAaMGdqf2V+5grPLe/KFXyzmvmdWcbQLTNqm0BcRyZCqsh7M+OJlfO6ywTz08ttcP20+7+4Jd9I2hb6ISAYVFeRz7/gLuf/6kax6533G3v8y89aEN2mbQl9EpBOMu+gsZt0+mrKe3fjswwu4/4U1oUzaptAXEekkQ/qV8PTto5lQM5B/ff4tbvrZInZ28qRtCn0RkU7Uo1sB/zr5Iv7fdcOZv24HY+9/mdc6cdI2hb6ISCczM264ZBC/+fLlFOQbkx98lYfnvd0pk7Yp9EVEQnLhwN7Mvv1KrjqvH/fOXsXtv1qS8XH+gox+uoiInFTvHoU89Ll
RTHtpPXsPHSMvzzL6fQp9EZGQmRlf+NBfLCqYERreERGJEIW+iEiEKPRFRCJEoS8iEiEKfRGRCFHoi4hEiEJfRCRCFPoiIhFinTHXQ0eYWQOw8TQ+ohwIb7LqcEStz1HrL6jPUXE6fR7s7hXt7dTlQv90mVnC3eNh19GZotbnqPUX1Oeo6Iw+a3hHRCRCFPoiIhGSi6E/LewCQhC1Pketv6A+R0XG+5xzY/oiItK2XDzSFxGRNmRl6JvZNWa22szWmtnfJ3m/yMx+Hby/wMxinV9leqXQ5w+a2WtmdszMJoZRY7ql0Oe7zGyVmS03sxfMbHAYdaZTCn3+opmtMLOlZjbPzM4Po850aq/PLfabaGZuZll/RU8Kv/NNZtYQ/M5LzezmtH25u2fVA8gH1gFnA92AZcD5rfb5MvBg8HwK8Ouw6+6EPseAEcDPgYlh19xJff4w0CN4/qWI/M69WjwfBzwbdt2Z7nOwXwnwEjAfiIdddyf8zjcBP8rE92fjkX4dsNbd17v7EeBxYHyrfcYDjwTPZwJ/ZWaZXYMss9rts7tvcPflwPEwCsyAVPo8x90PBJvzgcpOrjHdUunz+y02ewLZflIulf89A3wL+B5wqDOLy5BU+5wR2Rj6A4HNLbbrg9eS7uPux4A9QN9OqS4zUulzrulonz8P/D6jFWVeSn02s9vMbB1NIXhHJ9WWKe322cxGAlXuPrszC8ugVP/b/lQwdDnTzKrS9eXZGPrJjthbH+2ksk82ybX+pCLlPpvZZ4A48M8ZrSjzUuqzu//Y3c8Bvg78Q8aryqyT9tnM8oDvA3/baRVlXiq/838AMXcfAfyBP49cnLZsDP16oOVfvUpga1v7mFkB0BvY2SnVZUYqfc41KfXZzD4CfAMY5+6HO6m2TOno7/w4MCGjFWVee30uAS4E5prZBuBSYFaWn8xt93d29x0t/nt+CBiVri/PxtBfBJxrZtVm1o2mE7WzWu0zC7gxeD4ReNGDsyNZKpU+55p2+xz83/6f0hT420OoMd1S6fO5LTbHAms6sb5MOGmf3X2Pu5e7e8zdYzSduxnn7olwyk2LVH7nAS02xwFvpO3bwz6TfYpnvz8BvEXTGfBvBK/dS9N/DADFwBPAWmAhcHbYNXdCn2tpOoLYD+wAXg+75k7o8x+AbcDS4DEr7Jo7oc8/AF4P+jsHuCDsmjPd51b7ziXLr95J8Xf+TvA7Lwt+56Hp+m7dkSsiEiHZOLwjIiKnSKEvIhIhCn0RkQhR6IuIRIhCX0QkQhT6IiIRotAXEYkQhb6ISIT8F0XYbTJXIgktAAAAAElFTkSuQmCC\n", 375 | "text/plain": [ 376 | "" 377 | ] 378 | }, 379 | "metadata": { 380 | "needs_background": "light" 381 | }, 382 | "output_type": "display_data" 383 | } 384 | ], 385 | "source": [ 386 | "from numpy import arange\n", 387 | "from pandas import read_csv\n", 388 | "from sklearn.feature_selection import VarianceThreshold\n", 389 | "from matplotlib import pyplot\n", 390 | "\n", 391 | "df = read_csv('oil-spill.csv', header=None)\n", 392 | "# split data into inputs and outputs\n", 393 | "data = df.values\n", 394 | "X = data[:, :-1]\n", 395 | "y = data[:, -1]\n", 396 | "print(X.shape, y.shape)\n", 397 | "# define thresholds to check\n", 398 | "thresholds = arange(0.0, 0.55, 0.05)\n", 399 | "# apply transform with each threshold\n", 400 | "results = list()\n", 401 | "for t in thresholds:\n", 402 | "\t# define the transform\n", 403 | "\ttransform = VarianceThreshold(threshold=t)\n", 404 | "\t# transform the input data\n", 405 | "\tX_sel = transform.fit_transform(X)\n", 406 | "\t# determine the number of input features\n", 407 | "\tn_features = X_sel.shape[1]\n", 408 | "\tprint('>Threshold=%.2f, Features=%d' % (t, n_features))\n", 409 | "\t# store the result\n", 410 | "\tresults.append(n_features)\n", 411 | "# plot the threshold vs the number of selected features\n", 412 | "pyplot.plot(thresholds, results)\n", 413 | "pyplot.show()" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [] 422 | } 423 | ], 424 | "metadata": { 425 | "kernelspec": { 426 | "display_name": "Python 3", 427 | "language": "python", 428 | "name": "python3" 429 | }, 430 | "language_info": { 431 | "codemirror_mode": { 432 | "name": "ipython", 433 | "version": 3 434 | }, 435 | "file_extension": ".py", 436 | "mimetype": "text/x-python", 437 | "name": "python", 438 | "nbconvert_exporter": "python", 439 | "pygments_lexer": "ipython3", 440 | "version": "3.6.4" 441 | } 442 | }, 443 | "nbformat": 4, 444 | "nbformat_minor": 2 445 | } 446 | -------------------------------------------------------------------------------- /Polynomial Feature Transform.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "[[2 3]\n", 13 | " [2 3]\n", 14 | " [2 3]]\n", 15 | "[[1. 2. 3. 4. 6. 9.]\n", 16 | " [1. 2. 3. 4. 6. 9.]\n", 17 | " [1. 2. 3. 4. 6. 9.]]\n" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "# demonstrate the types of features created\n", 23 | "from numpy import asarray\n", 24 | "from sklearn.preprocessing import PolynomialFeatures\n", 25 | "# define the dataset\n", 26 | "data = asarray([[2,3],[2,3],[2,3]])\n", 27 | "print(data)\n", 28 | "# perform a polynomial features transform of the dataset\n", 29 | "trans = PolynomialFeatures(degree=2)\n", 30 | "data = trans.fit_transform(data)\n", 31 | "print(data)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Accuracy: 0.797 (0.073)\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "# evaluate knn on the raw sonar dataset\n", 49 | "from numpy import mean\n", 50 | "from numpy import std\n", 51 | "from pandas import read_csv\n", 52 | "from sklearn.model_selection import cross_val_score\n", 53 | "from sklearn.model_selection import RepeatedStratifiedKFold\n", 54 | "from sklearn.neighbors import KNeighborsClassifier\n", 55 | "from sklearn.preprocessing import LabelEncoder\n", 56 | "# load dataset\n", 57 | "dataset = read_csv('sonar.csv', header=None)\n", 58 | "data = dataset.values\n", 59 | "# separate into input and output columns\n", 60 | "X, y = data[:, :-1], data[:, -1]\n", 61 | "# ensure inputs are floats and output is an integer label\n", 62 | "X = X.astype('float32')\n", 63 | "y = LabelEncoder().fit_transform(y.astype('str'))\n", 64 | "# define and configure the model\n", 65 | "model = KNeighborsClassifier()\n", 66 | "# evaluate the model\n", 67 | "cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n", 68 | "n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)\n", 69 | "# report model performance\n", 70 | "print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 3, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "(208, 39711)\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "# visualize a polynomial features transform of the sonar dataset\n", 88 | "from pandas import read_csv\n", 89 | "from pandas import DataFrame\n", 90 | "from sklearn.preprocessing import PolynomialFeatures\n", 91 | "# load dataset\n", 92 | "dataset = read_csv('sonar.csv', header=None)\n", 93 | "# retrieve just the numeric input values\n", 94 | "data = dataset.values[:, :-1]\n", 95 | "# perform a polynomial features transform of the dataset\n", 96 | "trans = PolynomialFeatures(degree=3)\n", 97 | "data = trans.fit_transform(data)\n", 98 | "# convert the array back to a dataframe\n", 99 | "dataset = DataFrame(data)\n", 100 | "# summarize\n", 101 | "print(dataset.shape)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "Accuracy: 0.800 (0.077)\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "# evaluate knn on the sonar dataset with polynomial features transform\n", 119 | "from numpy import mean\n", 120 | "from 
numpy import std\n", 121 | "from pandas import read_csv\n", 122 | "from sklearn.model_selection import cross_val_score\n", 123 | "from sklearn.model_selection import RepeatedStratifiedKFold\n", 124 | "from sklearn.neighbors import KNeighborsClassifier\n", 125 | "from sklearn.preprocessing import LabelEncoder\n", 126 | "from sklearn.preprocessing import PolynomialFeatures\n", 127 | "from sklearn.pipeline import Pipeline\n", 128 | "# load dataset\n", 129 | "dataset = read_csv('sonar.csv', header=None)\n", 130 | "data = dataset.values\n", 131 | "# separate into input and output columns\n", 132 | "X, y = data[:, :-1], data[:, -1]\n", 133 | "# ensure inputs are floats and output is an integer label\n", 134 | "X = X.astype('float32')\n", 135 | "y = LabelEncoder().fit_transform(y.astype('str'))\n", 136 | "# define the pipeline\n", 137 | "trans = PolynomialFeatures(degree=3)\n", 138 | "model = KNeighborsClassifier()\n", 139 | "pipeline = Pipeline(steps=[('t', trans), ('m', model)])\n", 140 | "# evaluate the pipeline\n", 141 | "cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n", 142 | "n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)\n", 143 | "# report pipeline performance\n", 144 | "print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 6, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "Degree: 1, Features: 61\n", 157 | "Degree: 2, Features: 1891\n", 158 | "Degree: 3, Features: 39711\n", 159 | "Degree: 4, Features: 635376\n", 160 | "Degree: 5, Features: 8259888\n" 161 | ] 162 | }, 163 | { 164 | "data": { 165 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEDCAYAAADOc0QpAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAG7RJREFUeJzt3X10VPed3/H3V+LJgIRtECAehW1sECQxIIMIu9lsbCfY2wNObMdgG9epuz7brnfbkz1ts22Pu03PntPuntNtu+s9u940pysZjLHT2NjBduvYSbyphBE2YB4dgjWSkLDEk3gQSEj69o8Z4WG4kkYwM3cePq9zdDxz72/mfvkZfbj6zdX3mrsjIiL5pSjsAkREJPUU7iIieUjhLiKShxTuIiJ5SOEuIpKHFO4iInko1HA3sx+aWbuZ7U1i7F+Y2a7Y1ydmdjoTNYqI5CIL8zp3M/sKcA6ocffFI3jdHwBL3P2fpK04EZEcFuqZu7v/AjgZv83MbjWzt8xsp5m9b2YLAl66HngxI0WKiOSgUWEXEOB54Pfc/VdmtgL4a+BrAzvNbC4wD3g3pPpERLJeVoW7mU0Evgy8bGYDm8cmDFsHvOLufZmsTUQkl2RVuBNdJjrt7ncOMWYd8PsZqkdEJCdl1aWQ7n4G+NTMHgawqC8N7DezO4CbgLqQShQRyQlhXwr5ItGgvsPMWszsKeAx4Ckz2w3sA9bGvWQ9sNnVylJEZEihXgopIiLpkVXLMiIikhqhfaA6ZcoUr6ioCOvwIiI5aefOncfdvWy4caGFe0VFBQ0NDWEdXkQkJ5lZJJlxWpYREclDCncRkTykcBcRyUMKdxGRPKRwFxHJQwp3EZE8pHAXEclDCncRkQxxd/70J/vZ33om7cdSuIuIZMgvD5/g797/lEOfKdxFRPJGTV0jN08Yw32Ly9N+LIW7iEgGHD19gXcOfMYjd81m3OjitB8vqXA3s9VmdsjMDpvZ9wL2zzGz98zsIzPbY2b3p75UEZHc9eL2JgAeWzEnI8cbNtzNrBh4DrgPqATWm1llwrB/D2xx9yVEb4P316kuVEQkV3X39rF5RxNfWzCNWTeNz8gxkzlzXw4cdvcj7t4DbObKuyMBOFAaezwJaE1diSIiue2tvcc4fq6HDSvnZuyYyYT7TKA57nlLbFu8PwEeN7MWYBvwB0FvZGZPm1mDmTV0dHRcQ7kiIrmnpi5CxeTx/OZtUzJ2zGTC3QK2Jd6bbz3wv9x9FnA/UGtmV723uz/v7lXuXlVWNmyveRGRnLevtZOdkVM8Xj2XoqKgOE2PZMK9BZgd93wWVy+7PAVsAXD3OmAckLl/okREstQL9RHGjS7i4WWzhx+cQsmE+w5gvpnNM7MxRD8w3Zowpgm4G8DMFhINd627iEhB67xwiVc/amXtl2YyafzojB572HB3917gGeBt4ADRq2L2mdn3zWxNbNgfAb9rZruBF4En3T1x6UZEpKC8srOFC5f6MvpB6oCk7qHq7tuIflAav+3ZuMf7gVWpLU1EJHf19zsv1EdYOudGFs+clPHj6zdURUTS4Je/Ps6nx8/zxMqKUI6vcBcRSYOaugiTJ4zhvi9MD+X4CncRkRQ7evoCP431kRk7Kv19ZIIo3EVEUmzT9ggAj1Vn/oPUAQp3EZEU6u7tY/MHzdy9cBozb7whtDoU7iIiKfTmx8c4cb6HDSGetYPCXUQkpWrqGpk3ZQK/kcE+MkEU7iIiKbL3aCcfNp3OeB+ZIAp3EZEUGegj89CyWW
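The degree sweep above shows the transformed sonar dataset growing from 61 features at degree 1 to 8,259,888 at degree 5. Those counts are exactly the number of monomials of total degree at most d in the 60 inputs (bias included), i.e. C(n + d, d). The short check below is not part of the original notebook; it reproduces the printed counts from that formula using only the standard library.

```python
# Sketch (not in the original notebook): the feature counts printed above follow
# the combinatorial formula C(n + d, d) for n inputs and maximum degree d.
from math import factorial

def n_output_features(n_inputs, degree):
    # monomials of total degree <= degree in n_inputs variables, constant term included
    return factorial(n_inputs + degree) // (factorial(degree) * factorial(n_inputs))

for d in range(1, 6):
    print('Degree: %d, Features: %d' % (d, n_output_features(60, d)))
# expected to match the cell above: 61, 1891, 39711, 635376, 8259888
```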
GXonAXEUmFzq5LvLrrKA/cOZNJN2S2j0wQhbuISAq8vLOZi5f6Q+kjE0ThLiJynfr7nY3bm1g29yYWzch8H5kgCncRkev0D4cH+shkx1k7KNxFRK7bQB+Z1YvD6SMTROEuInIdWk518e7Bz1i3PLw+MkEU7iIi12HT9iYAHl2RPUsyoHAXEblm3b19vLSjmXtC7iMTROEuInKNtn3cFu0jk0UfpA5QuIuIXKOaugi3TJnAqlvD7SMTROEuInIN9h7t5KMs6SMTROEuInINausi3DC6mAezoI9MEIW7iMgIdXZd4rXdR3lgyYys6CMTROEuIjJCl/vIVFeEXcqgFO4iIiPQ3++8UB+hau5NVM4oDbucQSncRURG4P3Dx2k80ZWVlz/GU7iLiIxAbV0jUyZmVx+ZIAp3EZEkNZ/s4qcH21l315ys6iMTROEuIpKkTR80YcCjK+aEXcqwFO4iIkm4eCnaR+beymnMyLI+MkEU7iIiSdj2cRsnz/dk9eWP8RTuIiJJqKmLcEvZBFbdNjnsUpKicBcRGcbHLZ3saj7Nhuq5mGVfH5kgCncRkWHU1jcyfkz29pEJonAXERnC6a4eXtvVygNLZlI6Ljv7yARRuIuIDOHlhha6e/vZUJ3dv5GaSOEuIjKI/n7nhe0R7qq4iYXl2dtHJojCXURkEL/4VQeRE11sWFkRdikjpnAXERlEbV2EKRPHsnpRdveRCaJwFxEJ0Hyyi3cPtbN++WzGjMq9qEyqYjNbbWaHzOywmX1vkDHfNrP9ZrbPzDaltkwRkczauL2JIrOc6CMTZNRwA8ysGHgOuBdoAXaY2VZ33x83Zj7wx8Aqdz9lZlPTVbCISLpF+8g0ce/CaZRPyv4+MkGSOXNfDhx29yPu3gNsBtYmjPld4Dl3PwXg7u2pLVNEJHN+sqeNU12XeCLLb8gxlGTCfSbQHPe8JbYt3u3A7Wb2SzOrN7PVQW9kZk+bWYOZNXR0dFxbxSIiaVZTH+HWsgmsvDU3+sgESSbcgxopeMLzUcB84KvAeuAHZnbjVS9yf97dq9y9qqysbKS1ioik3Z6W0+zOsT4yQZIJ9xZgdtzzWUBrwJjX3P2Su38KHCIa9iIiOaW2LsL4McV8K4f6yARJJtx3APPNbJ6ZjQHWAVsTxrwK/DaAmU0hukxzJJWFioik26nzPWzd3co3c6yPTJBhw93de4FngLeBA8AWd99nZt83szWxYW8DJ8xsP/Ae8K/c/US6ihYRSYeXdzZH+8jk8AepA4a9FBLA3bcB2xK2PRv32IHvxr5ERHJOf7/zQn0TyytuZsH03OojEyT3fu1KRCQNfv6rDppOduXFWTso3EVEgOgHqWUlY/lGDvaRCaJwF5GC13yyi/cOtbP+rtzsIxMkP/4UIiLX4YXtkVgfmfxYkgGFu4gUuIuX+tiyo5mvV05j+qRxYZeTMgp3ESlob8T6yOTLB6kDFO4iUtBq6xq5bepEVt6Su31kgijcRaRg7W4+ze6WzpzvIxNE4S4iBau2PsKEMcV8a2lio9vcp3AXkYJ06nwPr+9u5ZtLZ1KS431kgijcRaQgbWmI9ZGprgi7lLRQuItIwenvd17YHmH5vJu5Y3pJ2OWkhcJdRArOzz/poPnkhZy+jd5wFO4iUnBq6hrzqo9MEIW7iBSUphNd/OyTDtYvn8Po4vyNwPz9k4mIBNg40Edm+ZywS0krhbuIFIyLl/p4qaGZbyzKrz4yQRTuIlIwXt/dyumuS3l7+WM8hbuIFIza+gjzp06k+pabwy4l7RTuIlIQdjefZk9LJxtW5l8fmSAKdxEpCDV10T4y31ySf31kgijcRSTvnTzfw+t7WvnW0ll52UcmiMJdRPLeloZmenr78+6GHENRuItIXuvrdzZuj7Bi3s3cPi0/+8gEUbiLSF77+SftsT4yFWGXklEKdxHJazV1EaaWjOXri6aFXUpGKdxFJG9FTpzn5wXQRyZIYf1pRaSgbNzeRLEZj67I7z4yQRTuIpKXLl7qY0tDM99YNJ1ppfndRyaIwl1E8tLWgT4yBXT5YzyFu4jkHXenti7C7dMmsmJe/veRCaJwF5G8s7ulk4+PdrKhujD6yARRuItI3qmpa2Ti2FF8c+mssEsJjcJdRPLKyfM9vLGnjW8tncnEsaPCLic0CncRySsv7Yj2kXm8ujA/SB2gcBeRvDHQR6b6lsLqIxNE4S4ieeNnh9ppOVV4fWSCKNxFJG/U1EWYVjqWeysLq49MEIW7iOSFxuOF20cmSFIzYGarzeyQmR02s+8NMe4hM3Mzq0pdiSIiw9u4PcKoIuPR5YXXRybIsOFuZsXAc8B9QCWw3swqA8aVAH8IbE91kSIiQ7nQ08eWhha+sXg6Uwuwj0yQZM7clwOH3f2Iu/cAm4G1AeP+E/BnwMUU1iciMqzXd7fSeeESTxT45Y/xkgn3mUBz3POW2LbLzGwJMNvd3xjqjczsaTNrMLOGjo6OERcrIpLI3ampb+T2aRNZXqB9ZIIkE+5BjRn88k6zIuAvgD8a7o3c/Xl3r3L3qrKysuSrFBEZxK7m0+w9eoYNKysKto9MkGTCvQWYHfd8FtAa97wEWAz8zMwagWpgqz5UFZFMqK2LRPvILJk5/OACkky47wDmm9k8MxsDrAO2Dux09053n+LuFe5eAdQDa9y9IS0Vi4jEnDjXzRt72niwwPvIBBk23N29F3gGeBs4AGxx931m9n0zW5PuAkVEBvNSQzM9feojEySpf+rcfRuwLWHbs4OM/er1lyUiMrS+fmdjfRMrb5nM/ALvIxNEv8YlIjnpvYPtHD19gScK9DZ6w1G4i0hOqqlXH5mhKNxFJOd8evw8v/ikg0eXz2WU+sgE0qyISM7ZWB/tI7N++ezhBxcohbuI5JRoH5lmVquPzJAU7iKSU7buPsqZi726IccwFO4ikjPcnZq6CAuml3BXxU1hl5PVFO4ikjM+aj7NvtYzPF49V31khqFwF5GcUVsXoUR9ZJKicBeRnHD8XDc/2dPGg8tmMUF9ZIalcBeRnPDSDvWRGQmFu4hkvb5+Z9P2Jr5862Rumzox7HJygsJdRLLeu+ojM2IKdxHJejV1jZRPGsc9C9VHJlkKdxHJakc6zvH+r47z6PI56iMzApopEclqG7c3MbrYeER9ZEZE4S4iWetCTx8vNzSzenE5U0vUR2YkFO4ikrVe2zXQR0YfpI6Uwl1EslJ8H5mqueojM1IKdxHJSh82nWZ/2xk2rFQfmWuhcBeRrFRb10jJ2FE8cKf6yFwLhbuIZJ3j57rZ9vEx9ZG5Dgp3Eck6A31kNuiD1GumcBeRrNLb18/G+girbpvMrWXqI3OtFO4iklXePdhOa+dFNlRXhF1KTlO4i0hWqa2PMGPSOO5ZODXsUnKawl1EssblPjIr1Efmemn2RCRr1NZHon1k7poTdik5T+EuIlmhq6eXV3a2cN/icspKxoZdTs5TuItIVnhtVytn1UcmZRTuIhK6gT4yC8tLWaY+MimhcBeR0O2MnOJA2xk2VKuPTKoo3EUkdLX1EUrGjeKBJTPCL
iVvKNxFJFQdZ7vZ9nEbDy2bxfgx6iOTKgp3EQnVSzuauNTnPF6tD1JTSeEuIqHp7etn4/YmfuO2Keojk2IKdxEJzU8PttPWeVHdH9NA4S4ioamti/aRuXuB+sikmsJdRELx645z/MPh4zxWPVd9ZNIgqRk1s9VmdsjMDpvZ9wL2f9fM9pvZHjP7qZnpZywRGVJtXbSPzLerZoddSl4aNtzNrBh4DrgPqATWm1llwrCPgCp3/yLwCvBnqS5URPJHV08vP9rZwv1fUB+ZdEnmzH05cNjdj7h7D7AZWBs/wN3fc/eu2NN6YFZqyxSRfPLqR62c7VYfmXRKJtxnAs1xz1ti2wbzFPBm0A4ze9rMGsysoaOjI/kqRSRvRPvINFJZXsrSOeojky7JhHtQowcPHGj2OFAF/HnQfnd/3t2r3L2qrKws+SpFJG80RE5x8NhZNqxUH5l0SuZ3fVuA+E88ZgGtiYPM7B7g3wG/5e7dqSlPRPJNbV20j8zaO9VHJp2SOXPfAcw3s3lmNgZYB2yNH2BmS4C/Bda4e3vqyxSRfNBxtps397bx8LLZ6iOTZsOGu7v3As8AbwMHgC3uvs/Mvm9ma2LD/hyYCLxsZrvMbOsgbyciBWzzBwN9ZHQbvXRL6p9Od98GbEvY9mzc43tSXJeI5Jnevn42fdDEb86fwi3qI5N2+rUwEcmIdw7E+sio+2NGKNxFJCNq6xuZeeMN3L1wWtilFASFu4ik3eH2c/zy8AkeXTGH4iJd/pgJCncRSbsX6iOMKS7ikbvURyZTFO4iklbnuwf6yExnykT1kckUhbuIpNWru45ytruXDSsrwi6loCjcRSRt3J3augiLZpSydM6NYZdTUBTuIpI2OxpjfWSq1Ucm0xTuIpI2tfURSseNYu2dQzWSlXRQuItIWrSfvchbe9t4uGo2N4wpDrucgqNwF5G02PxBc6yPjH4jNQwKdxFJud6+fjZtj/aRmTdlQtjlFCSFu4ik3DsHPuPYmYs8ocsfQ6NwF5GUq6mLMPPGG/jagqlhl1KwFO4iklKH28/y/359gseq1UcmTAp3EUmp2rpYH5kq9ZEJk8JdRFLmfHcvP/rwKL/zxXImq49MqBTuIpIyP/7oKOe6e9mwUpc/hk3hLiIpMdBHZvHMUpbMVh+ZsCncRSQlPvj0JIc+Ux+ZbKFwF5GUqK2PMOmG0az5kvrIZAOFu4hct/YzF3lr7zEeXjZLfWSyhMJdRK7bix8009uvPjLZROEuItflUl8/mz6I8JXby6hQH5msoXAXkevyzv7P+OxMN0/orD2rKNxF5LoM9JH5bfWRySqjwi5ARHKDu9N+tpv9rWfY33bm8n8/PX6ef7N6gfrIZBmFu4hcpbevn0+Pn78ixPe3nuHE+Z7LY+ZOHk9leSmP3DWb76yqCK9YCaRwFylw57p7Odh25dn4oWNn6e7tB2DMqCLumFbCPQunUTmjlMoZpSyYXkLJuNEhVy5DUbiLFAh357Mz3exv67zibLzxRNflMTeNH03ljFKeWDk3GuTlk7ilbAKji/XxXK5RuIvkod6+fo4cP3/V+vjJuGWVisnjqZxRyoNLZ10+I59eOk6tA/KEwl0kxwUtqxw8dpaeuGWVBdNL+HplbFmlvJQ7tKyS9xTuIjnC3Tl25mI0wGMhfqDt6mWVRTMm8eSXK6gsj56N3zJlAqO0rFJwFO4iWehSXz9HOs5ftT5+quvS5TEDyyoPLZt1eX18WulYLasIoHAXCd3Zi5c4eOzsFWfkhz77fFllbGxZZfXi6Swsjy6rLCgvZeJYffvK4PS3QyRDgpZV9redIRK3rHLzhDEsmlHKd75ccXl9fJ6WVeQaKNxF0uBSXz+/7jjHgYRfAopfVpk3ZQKLZ0zi21WzL6+PTy3RsoqkhsJd5DqduXiJg21n2d/aefls/JNj5+jpu3pZZSDE75iuZRVJL/3tEkmSu9PWefGqa8ebTn6+rDJ5whgqZ5TynVVaVpFwJRXuZrYa+O9AMfADd//PCfvHAjXAMuAE8Ii7N6a2VJHMGVhWSVwfPx1bVjGDeZMn8IVZk3jkLi2rSPYZNtzNrBh4DrgXaAF2mNlWd98fN+wp4JS732Zm64D/AjySjoIlfdydvn6nz53+fuiLPe+/vM3jtsXtH3hd3ON+d3r7rnyv/v749/eE1xOwLWF/wHH63OnrS3xPgmu+/HquqDPxPbt6+jjScf7KZZXyUu5bXH75bHzB9BImaFlFslgyfzuXA4fd/QiAmW0G1gLx4b4W+JPY41eAvzIzc3dPYa0AbNnRzPPvH7lq+1CHGnTPENUNtmuw4wz1Bx2sNB/iVYO+5hpm1D0u9Nzp7eun37kqaFP/fyt9zKDYjKIio9iM4iKjyKC4yD7/Gtgf//jyNq7aNqqoiLGjjJvGj+G37iijsryURTNKqZisZRXJPcmE+0ygOe55C7BisDHu3mtmncBk4Hj8IDN7GngaYM6cOddU8E0TxnDHtJLgnUP8NDzYrqF+hB78NSM+/KDHGfIH+EGPM8h7DfFmiaFWXFREcREJ4fh5MEYfE7DtyoC8atvAe13eFj3eqOLPXz/4ezJIEF/5XsVFpqUPkWEkE+5B30WJ53jJjMHdnweeB6iqqrqm88R7K6dxb+W0a3mpiEjBSOZnzRZgdtzzWUDrYGPMbBQwCTiZigJFRGTkkgn3HcB8M5tnZmOAdcDWhDFbgX8ce/wQ8G461ttFRCQ5wy7LxNbQnwHeJnop5A/dfZ+ZfR9ocPetwP8Eas3sMNEz9nXpLFpERIaW1LVc7r4N2Jaw7dm4xxeBh1NbmoiIXCtd3yUikocU7iIieUjhLiKShxTuIiJ5yMK6YtHMOoDINb58Cgm//ZolVNfIqK6Ry9baVNfIXE9dc929bLhBoYX79TCzBnevCruORKprZFTXyGVrbaprZDJRl5ZlRETykMJdRCQP5Wq4Px92AYNQXSOjukYuW2tTXSOT9rpycs1dRESGlqtn7iIiMgSFu4hIHsracDezH5pZu5ntHWS/mdn/MLPDZrbHzJZmSV1fNbNOM9sV+3o2aFwa6pptZu+Z2QEz22dm/yJgTMbnLMm6Mj5nZjbOzD4ws92xuv5jwJixZvZSbL62m1lFltT1pJl1xM3XP013XXHHLjazj8zsjYB9GZ+vJOsKc74azezj2HEbAvan73vS3bPyC/gKsBTYO8j++4E3id4FqhrYniV1fRV4I4T5KgeWxh6XAJ8AlWHPWZJ1ZXzOYnMwMfZ4NLAdqE4Y88+Bv4k9Xge8lCV1PQn8Vab/jsWO/V1gU9D/rzDmK8m6wpyvRmDKEPvT9j2ZtWfu7v4Lhr6b01qgxqPqgRvNrDwL6gqFu7e5+4exx2eBA0TvbRsv43OWZF0ZF5uDc7Gno2NfiVcXrAX+Pvb4FeBuS/PNW5OsKxRmNgv4HeAHgwzJ+HwlWVc2S9v3ZNaGexKCbtwdemjErIz9WP2mmS3K9MFjPw4vIXrWFy/UORuiLghhzmI/yu8C2oH/6+6Dzpe79wIDN34Puy6AB2M/xr9iZrMD9qfDfwP+
NdA/yP5Q5iuJuiCc+YLoP8z/x8x2mtnTAfvT9j2Zy+Ge1E25Q/Ah0d4PXwL+Eng1kwc3s4nAj4B/6e5nEncHvCQjczZMXaHMmbv3ufudRO8LvNzMFicMCWW+kqjrdaDC3b8IvMPnZ8tpY2b/CGh3951DDQvYltb5SrKujM9XnFXuvhS4D/h9M/tKwv60zVkuh3syN+7OOHc/M/BjtUfvYDXazKZk4thmNppogG509/8dMCSUORuurjDnLHbM08DPgNUJu0K98ftgdbn7CXfvjj39O2BZBspZBawxs0ZgM/A1M3shYUwY8zVsXSHN18CxW2P/bQd+DCxPGJK278lcDvetwBOxT5urgU53bwu7KDObPrDOaGbLic7xiQwc14jey/aAu//XQYZlfM6SqSuMOTOzMjO7Mfb4BuAe4GDCsIzf+D2ZuhLWZNcQ/Rwjrdz9j919lrtXEP2w9F13fzxhWMbnK5m6wpiv2HEnmFnJwGPg60DiVXZp+55M6h6qYTCzF4leRTHFzFqA/0D0wyXc/W+I3tP1fuAw0AV8J0vqegj4Z2bWC1wA1qX7L3jMKmAD8HFsvRbg3wJz4moLY86SqSuMOSsH/t7Mion+Y7LF3d+w8G/8nkxdf2hma4DeWF1PZqCuQFkwX8nUFdZ8TQN+HDtvGQVscve3zOz3IP3fk2o/ICKSh3J5WUZERAahcBcRyUMKdxGRPKRwFxHJQwp3EZE8pHAXEclDCncRkTz0/wFjWwBZ/fHA9gAAAABJRU5ErkJggg==\n", 166 | "text/plain": [ 167 | "" 168 | ] 169 | }, 170 | "metadata": { 171 | "needs_background": "light" 172 | }, 173 | "output_type": "display_data" 174 | } 175 | ], 176 | "source": [ 177 | "# compare the effect of the degree on the number of created features\n", 178 | "from pandas import read_csv\n", 179 | "from sklearn.preprocessing import LabelEncoder\n", 180 | "from sklearn.preprocessing import PolynomialFeatures\n", 181 | "from matplotlib import pyplot\n", 182 | " \n", 183 | "# get the dataset\n", 184 | "def get_dataset():\n", 185 | "\t# load dataset\n", 186 | "\turl = \"https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv\"\n", 187 | "\tdataset = read_csv(url, header=None)\n", 188 | "\tdata = dataset.values\n", 189 | "\t# separate into input and output columns\n", 190 | "\tX, y = data[:, :-1], data[:, -1]\n", 191 | "\t# ensure inputs are floats and output is an integer label\n", 192 | "\tX = X.astype('float32')\n", 193 | "\ty = LabelEncoder().fit_transform(y.astype('str'))\n", 194 | "\treturn X, y\n", 195 | " \n", 196 | "# define dataset\n", 197 | "X, y = get_dataset()\n", 198 | "# calculate change in number of features\n", 199 | "num_features = list()\n", 200 | "degress = [i for i in range(1, 6)]\n", 201 | "for d in degress:\n", 202 | "\t# create transform\n", 203 | "\ttrans = PolynomialFeatures(degree=d)\n", 204 | "\t# fit and transform\n", 205 | "\tdata = trans.fit_transform(X)\n", 206 | "\t# record number of features\n", 207 | "\tnum_features.append(data.shape[1])\n", 208 | "\t# summarize\n", 209 | "\tprint('Degree: %d, Features: %d' % (d, data.shape[1]))\n", 210 | "# plot degree vs number of features\n", 211 | "pyplot.plot(degress, num_features)\n", 212 | "pyplot.show()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 7, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | ">1 0.797 (0.073)\n", 225 | ">2 0.793 (0.085)\n", 226 | ">3 0.800 (0.077)\n", 227 | ">4 0.795 (0.079)\n" 228 | ] 229 | }, 230 | { 231 | "data": { 232 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAFWtJREFUeJzt3X+MFOd9x/H3x2cDkRI7EC5RxIEh0TmCotRWVyQqtDZJ7ZzdChJbiiBKZEs0qFJwUjdJhYUlHCwU/xE16R+kDjUoP9pALTexr1Vk5Ma4LZHd3BKDU6DY50sTLkRlHXDSKpbNj2//2CEejrN37m642d3n85JW3DzzzN13RzOfHZ6d3UcRgZmZpeGyqgswM7Pp49A3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwScnnVBYw1d+7cWLhwYdVlmJl1lP37978YEb2t+rVd6C9cuJB6vV51GWZmHUXST4v08/COmVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZgkpFPqSBiQdlTQsaeM466+W9H1Jz0p6UlJfbt1ZSQeyx2CZxZuZ2cS0vE9fUg+wDbgRGAWGJA1GxOFcty8B34yIb0j6APBF4BPZupcj4tqS6zYzs0ko8uGsZcBwRIwASNoNrAbyob8EuCv7eS/wSJlFTjdJpf4+z0Ns1p5SPNeLDO/MA47llkeztryDwG3Zzx8B3iLpbdnyLEl1SU9L+vB4f0DS+qxPvdFoTKD8SyMiWj6K9uuEg+BSk1TaI3Vl7kvvzzTP9SKhP96RMfbZfQ64XtIzwPXAz4Ez2boFEVEDPgZ8RdK7L/plEdsjohYRtd7ell8dYR2mzBMrdRMJH+9PG0+R4Z1RYH5uuQ84nu8QEceBWwEkvRm4LSJ+lVtHRIxIehK4DnhhypWbmdmEFbnSHwL6JS2SNANYA1xwF46kuZLO/667gZ1Z+2xJM8/3AZZz4XsBZmY2jVqGfkScATYAe4AjwEMRcUjSFkmrsm43AEclPQe8A9iatS8G6pIO0nyD9/4xd/2Ymdk0UruN69VqteiEr1aW5DHREnl/lsv7szydsi8l7c/eP31D/kSumVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJaRQ6EsakHRU0rCkjeOsv1rS9yU9K+lJSX25dbdLej573F5m8WZmNjEtQ19SD7ANuBlYAqyVtGRMty8B34yI9wJbgC9m284BNgPvA5YBmyXNLq98MzObiCJX+suA4YgYiYhXgd3A6jF9lgDfz37em1v/IeDxiDgZEaeAx4GBqZdtZmaTUST05wHHcsujWVveQeC27OePAG+R9LaC25qZ2TQpEvoap23shJGfA66X9AxwPfBz4EzBbZG0XlJdUr3RaBQoyczMJqNI6I8C83PLfcDxfIeIOB4Rt0bEdcCmrO1XRbbN+m6PiFpE1Hp7eyf4FMzMrKgioT8E9EtaJGkGsAYYzHeQNFfS+d91N7Az+3kPcJOk2dkbuDdlbWZmVoGWoR8RZ4ANNMP6CPBQRByStEXSqqzbDcBRSc8B7wC2ZtueBO6j+cIxBGzJ2szMrAKKuGiIvVK1Wi3q9XrVZbQkiXbbd53M+7Nc3p/l6ZR9KWl/RNRa9fMncs3MEuLQNzNLiEPfzCwhDn0zs4Q49M3MEuLQNzNLiEPfzLrSnDlzkDTlB1DK75HEnDlzKt4rcHnVBZiZXQqnTp1qu/vrz7+IVMlX+mZmCXHom5klxKFvZpYQh76ZWUIc+mZmCXHom5klxKFvZpYQh76ZWUIKhb6kAUlHJQ1L2jjO+gWS9kp6RtKzkm7J2hdKelnSgezxQNlPwKpT1iceu/FTj5Ph/WnToeUnciX1ANuAG2lOdD4kaTAiDue63UNzGsW/kbQE+B6wMFv3QkRcW27Z1g7a8ROP0B6fepwM70+bDkWu9JcBwxExEhGvAruB1WP6BHBl9vNVwPHySjQzs7IUCf15wLHc8mjWlncv8HFJozSv8u/MrVuUDfv8q6Q/mEqxZmY2NUVCf7z/2439P+ha4OsR0QfcAnxL0mXAL4AFEXEd8BfAtyVdOWZbJK2XVJdUbzQaE3sGE+AxUzNLXZFv2RwF5ueW+7h4+GYdMAAQEU9JmgXMjYgTwCtZ+35JLwDXAPX8xhGxHdgOUKvVLtmgpsdMzSx1Ra70h4B+SYskzQDWAINj+vwM+CCApMXALKAhqTd7IxhJ7wL6gZGyijczs4lpeaUfEWckbQD2AD3Azog4JGkLUI+IQeCzwN9Kuovm0M8dERGS/hDYIukMcBb4s4g4ecmejZmZvSG123BHrVaLer3euuMkSGrb4Z12rKuVdq27XetqpV3rbte6WmnHui9lTZL2R0StVT9/ItfMLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhBSZOatrxOYr4d6rqi7jIrH5ohkkO4L3Z7m8P206FPo+fUkDwF/TnETlwYi4f8z6BcA3gLdmfTZGxPeydXfTnE7xLPDpiNjzRn/L36ffOdq17natq5V2rbtd62qlHevuiO/Tz6Y73AbcDCwB1kpaMqbbPcBD2QToa4CvZtsuyZZ/h+Ycul89P32imV0ajd80uOOxO3jx5RerLsXaUJEx/WXAcESMRMSrwG5g9Zg+AZz/P+BVvDZx+mpgd0S8EhE/AYaz32dml8gDzz7Aj/7nRzxw8IGqS7E2VCT05wHHcsujWVvevcDHJY0C3wPunMC2HcdXUtauGr9p8OjwowTBI8OP+Bidom4814uEvsZpGzsotRb4ekT0AbcA35J0WcFtkbReUl1SvdFoFCipWr6SKlc3nlhVeeDZBzgX5wA4F+d8jE5RN57rRUJ/FJifW+7jteGb89YBDwFExFPALGBuwW2JiO0RUYuIWm9vb/HqK+ArqfJ144lVhfPH5ulzpwE4fe60j9Ep6NZzvUjoDwH9khZJmkHzjdnBMX1+BnwQQNJimqHfyPqtkTRT0iKgH/hhWcVXwVdS5erWE6sK+WPzPB+jk9et53rL0I+IM8AGYA9whOZdOockbZG0Kuv2WeCTkg4Cu4A7oukQzf8BHAYeAz4VEWcvxROZDr6SKl+3nlhVOHji4G+PzfNOnzvNgRMHKqqoc3XzuV7oPv3p1M736d/39H189/nvXnBiXXHZFdzafyv3vP+eyuqqylTrbvymwc3fuZlXzr7y27aZPTN57LbHmPumuZXVVZV2rbtd62plKnV34rle2n369hpfSZXLwxHWrrr5XE/qaxim6uFVD1ddQlfp5hPLOls3n+sOfatMN59YZu3KwztmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWkOS+e0cabwbHas2ePbvqEqxN+P
gsT2y+Eu69quoyLhCbr6y6hLRCv8zvse7U7xi39uXjs1z6wq/bbh9IIu6ttoZCwzuSBiQdlTQsaeM4678s6UD2eE7SS7l1Z3Prxk6zaGZm06jllb6kHmAbcCPNic6HJA1GxOHzfSLirlz/O4Hrcr/i5Yi4trySzcxssooM7ywDhiNiBEDSbmA1zXlvx7MW2FxOedbuPAZt7azdjs92ODaLhP484FhueRR433gdJV0NLAKeyDXPklQHzgD3R8Qjk6zV2ozHoK2dlXU8dduxWST0x3upfL09sAZ4OCLO5toWRMRxSe8CnpD044h44YI/IK0H1gMsWLCgQElmZjYZRd7IHQXm55b7gOOv03cNsCvfEBHHs39HgCe5cLz/fJ/tEVGLiFpvb2+BkszMbDKKhP4Q0C9pkaQZNIP9ortwJL0HmA08lWubLWlm9vNcYDmv/16AmZldYi2HdyLijKQNwB6gB9gZEYckbQHqEXH+BWAtsDsuHPxaDHxN0jmaLzD35+/6MTOz6aV2e4OiVqtFvV6vuoyWuu3Nnap5f5bL+7M8nbIvJe2PiFqrfv7uHTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCGFQl/SgKSjkoYlbRxn/ZclHcgez0l6KbfudknPZ4/byyzezMwmpuV0iZJ6gG3AjTQnSR+SNJif9jAi7sr1v5Ns8nNJc4DNQA0IYH+27alSn4WZmRVS5Ep/GTAcESMR8SqwG1j9Bv3XAruynz8EPB4RJ7OgfxwYmErBZmY2eUVCfx5wLLc8mrVdRNLVwCLgiYlsK2m9pLqkeqPRKFK3mZlNQpHQ1zhtrzdL8Brg4Yg4O5FtI2J7RNQiotbb21ugJDMzm4wioT8KzM8t9wHHX6fvGl4b2pnotmZmdokVCf0hoF/SIkkzaAb74NhOkt4DzAaeyjXvAW6SNFvSbOCmrM3MzCrQ8u6diDgjaQPNsO4BdkbEIUlbgHpEnH8BWAvsjojIbXtS0n00XzgAtkTEyXKfgpmZFaVcRreFWq0W9Xq96jJakkS77btO5v1ZLu/P8nTKvpS0PyJqrfr5E7lmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWkJZfuGY2VdJ40ypMrl8nfAeKdY4yj03ojOPToW+XXCecCJamFI9ND++YmSXEoW9mlpBCoS9pQNJRScOSNr5On49KOizpkKRv59rPSjqQPS6accvMzKZPyzF9ST3ANuBGmnPeDkkajIjDuT79wN3A8og4JentuV/xckRcW3LdZmY2CUWu9JcBwxExEhGvAruB1WP6fBLYFhGnACLiRLllmplZGYqE/jzgWG55NGvLuwa4RtIPJD0taSC3bpaketb+4SnWa2ZmU1Dkls3xblAde5/T5UA/cAPQB/y7pKUR8RKwICKOS3oX8ISkH0fECxf8AWk9sB5gwYIFE3wK5Uvx3l3rDEWPuaJ9fWymp8iV/igwP7fcBxwfp8+jEXE6In4CHKX5IkBEHM/+HQGeBK4b+wciYntE1CKi1tvbO+EnUbaIKPVhVhYfmzZVRUJ/COiXtEjSDGANMPYunEeAlQCS5tIc7hmRNFvSzFz7cuAwZmZWiZbDOxFxRtIGYA/QA+yMiEOStgD1iBjM1t0k6TBwFvh8RPxS0u8DX5N0juYLzP35u37MzGx6qd3+i1er1aJer1ddhplZR5G0PyJqrfr5E7lmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCCoW+pAFJRyUNS9r4On0+KumwpEOSvp1rv13S89nj9rIKNzOziWs5XaKkHmAbcCPNCdCHJA3mpz2U1A/cDSyPiFOS3p61zwE2AzUggP3ZtqfKfypmZtZKkSv9ZcBwRIxExKvAbmD1mD6fBLadD/OIOJG1fwh4PCJOZuseBwbKKd3MzCaqSOjPA47llkeztrxrgGsk/UDS05IGJrAtktZLqkuqNxqN4tWbmdmEFAl9jdM2djb1y4F+4AZgLfCgpLcW3JaI2B4RtYio9fb2FijJzMwmo0jojwLzc8t9wPFx+jwaEacj4ifAUZovAkW2NTOzaVIk9IeAfkmLJM0A1gCDY/o8AqwEkDSX5nDPCLAHuEnSbEmzgZuyNjMzq0DLu3ci4oykDTTDugfYGRGHJG0B6hExyGvhfhg4C3w+In4JIOk+mi8cAFsi4uSleCJmZtaaIi4aYq9UrVaLer1edRlmZh1F0v6IqLXq50/kmpklxKFvZpYQh76ZWUIc+mZmCXHom5klxKFvZpYQh76ZWUIc+mZmCXHom5klxKFvZpYQh76ZWUIc+mZmCXHom5klxKFvZpYQh76ZWUIKhb6kAUlHJQ1L2jjO+jskNSQdyB5/mlt3Ntc+dsYtMzObRi1DX1IPsA24GVgCrJW0ZJyu/xAR12aPB3PtL+faV5VTtnWLXbt2sXTpUnp6eli6dCm7du2quiSzrtZyukRgGTAcESMAknYDq4HDl7Iw6367du1i06ZN7NixgxUrVrBv3z7WrVsHwNq1ayuuzqw7FRnemQccyy2PZm1j3SbpWUkPS5qfa58lqS7paUkfnkqx1l22bt3Kjh07WLlyJVdccQUrV65kx44dbN26terSzLpWkdDXOG1jJ9b9J2BhRLwX+BfgG7l1C7J5Gz8GfEXSuy/6A9L67IWh3mg0CpZune7IkSOsWLHigrYVK1Zw5MiRiioy635FQn8UyF+59wHH8x0i4pcR8Uq2+LfA7+XWHc/+HQGeBK4b+wciYntE1CKi1tvbO6EnYJ1r8eLF7Nu374K2ffv2sXjx4ooqMut+RUJ/COiXtEjSDGANcMFdOJLemVtcBRzJ2mdLmpn9PBdYjt8LsMymTZtYt24de/fu5fTp0+zdu5d169axadOmqksz61ot38iNiDOSNgB7gB5gZ0QckrQFqEfEIPBpSauAM8BJ4I5s88XA1ySdo/kCc39EOPQNeO3N2jvvvJMjR46wePFitm7d6jdxzS4hRYwdnq9WrVaLer1edRlmZh1F0v7s/dM35E/kmpklxKFvZpYQh76ZWUIc+mZmCXHom5klpO3u3pHUAH5adR0FzAVerLqILuL9WS7vz/J0yr68OiJafrq17UK/U0iqF7k9yorx/iyX92d5um1fenjHzCwhDn0zs4Q49Cdve9UFdBnvz3J5f5anq/alx/TNzBLiK30zs4Q49CdI0k5JJyT9Z9W1dANJ8yXtlXRE0iFJn6m6pk4laZakH0o6mO3LL1RdUzeQ1CPpGUn/XHUtZXDoT9zXgYGqi+giZ4DPRsRi4P3ApyQtqbimTvUK8IGI+F3gWmBA0vsrrqkbfIZsjpBu4NCfoIj4N5pzBlgJIuIXEfGj7Of/pXlyjTcHs7UQTf+XL
V6RPfym3RRI6gP+GHiw6lrK4tC3tiFpIc3pNP+j2ko6VzYUcQA4ATweEd6XU/MV4C+Bc1UXUhaHvrUFSW8G/hH484j4ddX1dKqIOBsR19Kcy3qZpKVV19SpJP0JcCIi9lddS5kc+lY5SVfQDPy/j4jvVF1PN4iIl4An8ftPU7EcWCXpv4HdwAck/V21JU2dQ98qJUnADuBIRPxV1fV0Mkm9kt6a/fwm4I+A/6q2qs4VEXdHRF9ELATWAE9ExMcrLmvKHPoTJGkX8BTwHkmjktZVXVOHWw58guZV1IHscUvVRXWodwJ7JT0LDNEc0++K2wytPP5ErplZQnylb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJeT/Af6izf0XxIBPAAAAAElFTkSuQmCC\n", 233 | "text/plain": [ 234 | "" 235 | ] 236 | }, 237 | "metadata": { 238 | "needs_background": "light" 239 | }, 240 | "output_type": "display_data" 241 | } 242 | ], 243 | "source": [ 244 | "# explore the effect of degree on accuracy for the polynomial features transform\n", 245 | "from numpy import mean\n", 246 | "from numpy import std\n", 247 | "from pandas import read_csv\n", 248 | "from sklearn.model_selection import cross_val_score\n", 249 | "from sklearn.model_selection import RepeatedStratifiedKFold\n", 250 | "from sklearn.neighbors import KNeighborsClassifier\n", 251 | "from sklearn.preprocessing import PolynomialFeatures\n", 252 | "from sklearn.preprocessing import LabelEncoder\n", 253 | "from sklearn.pipeline import Pipeline\n", 254 | "from matplotlib import pyplot\n", 255 | " \n", 256 | "# get the dataset\n", 257 | "def get_dataset():\n", 258 | "\t# load dataset\n", 259 | "\turl = \"https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv\"\n", 260 | "\tdataset = read_csv(url, header=None)\n", 261 | "\tdata = dataset.values\n", 262 | "\t# separate into input and output columns\n", 263 | "\tX, y = data[:, :-1], data[:, -1]\n", 264 | "\t# ensure inputs are floats and output is an integer label\n", 265 | "\tX = X.astype('float32')\n", 266 | "\ty = LabelEncoder().fit_transform(y.astype('str'))\n", 267 | "\treturn X, y\n", 268 | " \n", 269 | "# get a list of models to evaluate\n", 270 | "def get_models():\n", 271 | "\tmodels = dict()\n", 272 | "\tfor d in range(1,5):\n", 273 | "\t\t# define the pipeline\n", 274 | "\t\ttrans = PolynomialFeatures(degree=d)\n", 275 | "\t\tmodel = KNeighborsClassifier()\n", 276 | "\t\tmodels[str(d)] = Pipeline(steps=[('t', trans), ('m', model)])\n", 277 | "\treturn models\n", 278 | " \n", 279 | "# evaluate a give model using cross-validation\n", 280 | "def evaluate_model(model, X, y):\n", 281 | "\tcv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n", 282 | "\tscores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')\n", 283 | "\treturn scores\n", 284 | " \n", 285 | "# define dataset\n", 286 | "X, y = get_dataset()\n", 287 | "# get the models to evaluate\n", 288 | "models = get_models()\n", 289 | "# evaluate the models and store results\n", 290 | "results, names = list(), list()\n", 291 | "for name, model in models.items():\n", 292 | "\tscores = evaluate_model(model, X, y)\n", 293 | "\tresults.append(scores)\n", 294 | "\tnames.append(name)\n", 295 | "\tprint('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))\n", 296 | "# plot model performance for comparison\n", 297 | "pyplot.boxplot(results, labels=names, showmeans=True)\n", 298 | "pyplot.show()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [] 307 | } 308 | ], 309 | "metadata": { 310 | "kernelspec": { 311 | "display_name": "Python 3", 312 | "language": "python", 313 | "name": "python3" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 3 319 | }, 320 | 
"file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython3", 325 | "version": "3.6.4" 326 | } 327 | }, 328 | "nbformat": 4, 329 | "nbformat_minor": 2 330 | } 331 | -------------------------------------------------------------------------------- /Categorical Feature Selection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Train (191, 9) (191,)\n", 13 | "Test (95, 9) (95,)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "# load and summarize the dataset\n", 19 | "from pandas import read_csv\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "\n", 22 | "# load the dataset\n", 23 | "def load_dataset(filename):\n", 24 | "\t# load the dataset\n", 25 | "\tdata = read_csv(filename, header=None)\n", 26 | "\t# retrieve array\n", 27 | "\tdataset = data.values\n", 28 | "\t# split into input and output variables\n", 29 | "\tX = dataset[:, :-1]\n", 30 | "\ty = dataset[:,-1]\n", 31 | "\t# format all fields as string\n", 32 | "\tX = X.astype(str)\n", 33 | "\treturn X, y\n", 34 | "\n", 35 | "# load the dataset\n", 36 | "X, y = load_dataset('breast-cancer.csv')\n", 37 | "\n", 38 | "# split into train and test sets\n", 39 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)\n", 40 | "\n", 41 | "# summarize\n", 42 | "print('Train', X_train.shape, y_train.shape)\n", 43 | "print('Test', X_test.shape, y_test.shape)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "Train (191, 9) (191,)\n", 56 | "Test (95, 9) (95,)\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "# example of loading and preparing the breast cancer dataset\n", 62 | "from pandas import read_csv\n", 63 | "from sklearn.model_selection import train_test_split\n", 64 | "from sklearn.preprocessing import LabelEncoder\n", 65 | "from sklearn.preprocessing import OrdinalEncoder\n", 66 | "\n", 67 | "# load the dataset\n", 68 | "def load_dataset(filename):\n", 69 | "\t# load the dataset\n", 70 | "\tdata = read_csv(filename, header=None)\n", 71 | "\t# retrieve array\n", 72 | "\tdataset = data.values\n", 73 | "\t# split into input and output variables\n", 74 | "\tX = dataset[:, :-1]\n", 75 | "\ty = dataset[:,-1]\n", 76 | "\t# format all fields as string\n", 77 | "\tX = X.astype(str)\n", 78 | "\treturn X, y\n", 79 | "\n", 80 | "# prepare input data\n", 81 | "def prepare_inputs(X_train, X_test):\n", 82 | "\toe = OrdinalEncoder()\n", 83 | "\toe.fit(X_train)\n", 84 | "\tX_train_enc = oe.transform(X_train)\n", 85 | "\tX_test_enc = oe.transform(X_test)\n", 86 | "\treturn X_train_enc, X_test_enc\n", 87 | "\n", 88 | "# prepare target\n", 89 | "def prepare_targets(y_train, y_test):\n", 90 | "\tle = LabelEncoder()\n", 91 | "\tle.fit(y_train)\n", 92 | "\ty_train_enc = le.transform(y_train)\n", 93 | "\ty_test_enc = le.transform(y_test)\n", 94 | "\treturn y_train_enc, y_test_enc\n", 95 | "\n", 96 | "# load the dataset\n", 97 | "X, y = load_dataset('breast-cancer.csv')\n", 98 | "# split into train and test sets\n", 99 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)\n", 100 | "# prepare input data\n", 
101 | "X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)\n", 102 | "# prepare output data\n", 103 | "y_train_enc, y_test_enc = prepare_targets(y_train, y_test)\n", 104 | "# summarize\n", 105 | "print('Train', X_train_enc.shape, y_train_enc.shape)\n", 106 | "print('Test', X_test_enc.shape, y_test_enc.shape)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Feature 0: 0.472553\n", 119 | "Feature 1: 0.029193\n", 120 | "Feature 2: 2.137658\n", 121 | "Feature 3: 29.381059\n", 122 | "Feature 4: 8.222601\n", 123 | "Feature 5: 8.100183\n", 124 | "Feature 6: 1.273822\n", 125 | "Feature 7: 0.950682\n", 126 | "Feature 8: 3.699989\n" 127 | ] 128 | }, 129 | { 130 | "data": { 131 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD8CAYAAABn919SAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAADJNJREFUeJzt3W+IZYV5x/Hvr64hiaZocJStfzo2iFUKrmVYbIWSatLaWKqBBiJUpFg2L2KqRShb3zSFvthCos2LImzUZqHWNKhBiZJGrEUCxXZWt7pmE0zt1qxu3ZHUavui6erTF3Mty86M986dO3Nnn/1+YJh7z5y75+Gw892zZ865k6pCknTi+6lpDyBJmgyDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpiS0bubGzzjqrZmdnN3KTknTC27t37xtVNTNsvQ0N+uzsLPPz8xu5SUk64SX5t1HW85SLJDUxNOhJPpjkH5P8c5IXk/zJYPmFSZ5J8lKSv0nygfUfV5K0klGO0P8HuKqqLgO2AdckuQL4M+CuqroI+A/g5vUbU5I0zNCg16L/Gjw9dfBRwFXAg4Ple4Dr12VCSdJIRjqHnuSUJPuAI8ATwL8Ab1bV0cEqh4BzV3jtjiTzSeYXFhYmMbMkaRkjBb2q3qmqbcB5wHbgkuVWW+G1u6tqrqrmZmaGXnUjSRrTqq5yqao3gb8HrgDOSPLeZY/nAa9NdjRJ0mqMcpXLTJIzBo8/BHwCOAA8Bfz2YLWbgEfWa0hJ0nCj3Fi0FdiT5BQW/wH4RlV9K8n3gK8n+VPgOeDedZxTkjTE0KBX1fPA5cssf5nF8+kSszsf29DtHdx17YZuTzoReKeoJDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpiaFBT3J+kqeSHEjyYpJbB8u/mOTVJPsGH59a/3ElSSvZMsI6R4Hbq+rZJB8B9iZ5YvC1u6rqS+s3niRpVEODXlWHgcODx28nOQCcu96DSZJWZ1Xn0JPMApcDzwwW3ZLk+ST3JTlzwrNJklZh5KAnOR14CLitqt4C7gY+Bmxj8Qj+yyu8bkeS+STzCwsLExhZkrSckYKe5FQWY35/VT0MUFWvV9U7VfUu8FVg+3KvrardVTVXVXMzMzOTmluSdJxRrnIJcC9woKruPGb51mNW+zSwf/LjSZJGNcpVLlcCNwIvJNk3WHYHcEOSbUABB4HPrcuEkqSRjHKVy3eBLPOlxyc/jiRpXN4pKklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJamJo0JOcn+SpJAeSvJjk1sHyjyZ5IslLg89nrv+4kqSVjHKEfhS4vaouAa4APp/kUmAn8GRVXQQ8OXguSZqSoUGvqsNV9ezg8dvAAeBc4Dpgz2C1PcD16zWkJGm4VZ1DTzILXA48A5xTVYdhMfrA2ZMeTpI0upGDnuR04CHgtqp6axWv25FkPsn8wsLCODNKkkYwUtCTnMpizO+vqocHi19PsnXw9a3AkeVeW1W7q2ququZmZmYmMbMkaRmjXOUS4F7gQFXdecyXHgVuGjy+CXhk8uNJkka1ZYR1rgRuBF5Ism+w7A5gF/CNJDcDrwCfWZ8RJUmjGBr0qvoukBW+fPVkx5Ekjcs7RSWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0MDXqS+5IcSbL/mGVfTPJqkn2Dj0+t75iSpGFGOUL/GnDNMsvvqqptg4/HJzuWJGm1hga9qp4GfrwBs0iS1mAt59BvSfL84JTMmRObSJI0lnGDfjfwMWAbcBj48korJtmRZD7J/MLCwpibkyQNM1bQq+r1qnqnqt4Fvgpsf591d1fVXFXNzczMjDunJGmIsYKeZOsxTz8N7F9pXUnSxtgybIUkDwAfB85Kcgj4Y+DjSbYBBRwEPreOM0qSRjA06FV1wzKL712HWSRJa+CdopLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1MTToSe5LciTJ/mOWfTTJE0leGnw+c33HlCQNM8oR+teAa45bthN4sqouAp4cPJckTdHQoFfV08CPj1t8HbBn8HgPcP2E55IkrdK459DPqarDAIPPZ6+0YpIdSeaTzC8sLIy5OUnSMOv+Q9Gq2l1Vc1U1NzMzs96bk6ST1rhBfz3JVoDB5yOTG0mSNI5xg/4ocNPg8U3AI5MZR5I0rlEuW3wA+Afg4iSHktwM7A
I+meQl4JOD55KkKdoybIWqumGFL1094VkkSWvgnaKS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSE0OvQ5dONLM7H9uwbR3cde2GbUsaxqBL62Qj/2EB/3GRp1wkqQ2DLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJamJNf3GoiQHgbeBd4CjVTU3iaEkSas3iV9B96tV9cYE/hxJ0hp4ykWSmlhr0Av4TpK9SXZMYiBJ0njWesrlyqp6LcnZwBNJvl9VTx+7wiD0OwAuuOCCNW5OkrSSNR2hV9Vrg89HgG8C25dZZ3dVzVXV3MzMzFo2J0l6H2MHPclpST7y3mPg14D9kxpMkrQ6aznlcg7wzSTv/Tl/XVXfnshUkqRVGzvoVfUycNkEZ5EkrYGXLUpSEwZdkpow6JLUhEGXpCYm8V4uknTCmd352IZu7+Cua9d9Gx6hS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKa8Nb/E1jHW5cljc8jdElqwqBLUhMGXZKaMOiS1IRBl6QmvMpFOgls5BVRXg01PR6hS1ITHqFL2jDeO7G+PEKXpCYMuiQ1saZTLkmuAb4CnALcU1W7JjLVMvyvmiS9v7GP0JOcAvwF8BvApcANSS6d1GCSpNVZyymX7cAPq+rlqvoJ8HXgusmMJUlarbUE/VzgR8c8PzRYJkmaglTVeC9MPgP8elX93uD5jcD2qvrCcevtAHYMnl4M/GD8ccdyFvDGBm9zs3OfLOU+WZ77Zalp7JOfraqZYSut5Yeih4Dzj3l+HvDa8StV1W5g9xq2syZJ5qtqblrb34zcJ0u5T5bnfllqM++TtZxy+SfgoiQXJvkA8Fng0cmMJUlarbGP0KvqaJJbgL9l8bLF+6rqxYlNJklalTVdh15VjwOPT2iW9TK10z2bmPtkKffJ8twvS23afTL2D0UlSZuLt/5LUhOtg57kmiQ/SPLDJDunPc+0JTk/yVNJDiR5Mcmt055ps0hySpLnknxr2rNsBknOSPJgku8P/r780rRnmrYkfzD4vtmf5IEkH5z2TMdrG3TfmmBZR4Hbq+oS4Arg8+6T/3crcGDaQ2wiXwG+XVU/D1zGSb5vkpwL/D4wV1W/wOKFIJ+d7lRLtQ06vjXBElV1uKqeHTx+m8Vv0pP+7t4k5wHXAvdMe5bNIMlPA78C3AtQVT+pqjenO9WmsAX4UJItwIdZ5r6baescdN+a4H0kmQUuB56Z7iSbwp8Dfwi8O+1BNomfAxaAvxychronyWnTHmqaqupV4EvAK8Bh4D+r6jvTnWqpzkHPMsu8pAdIcjrwEHBbVb017XmmKclvAkeqau+0Z9lEtgC/CNxdVZcD/w2c1D+DSnImi//DvxD4GeC0JL8z3amW6hz0kd6a4GST5FQWY35/VT087Xk2gSuB30pykMXTclcl+avpjjR1h4BDVfXe/94eZDHwJ7NPAP9aVQtV9b/Aw8AvT3mmJToH3bcmOE6SsHhe9EBV3TnteTaDqvqjqjqvqmZZ/Dvyd1W16Y68NlJV/TvwoyQXDxZdDXxviiNtBq8AVyT58OD76Go24Q+K2/6SaN+aYFlXAjcCLyTZN1h2x+COX+lYXwDuHxwMvQz87pTnmaqqeibJg8CzLF4t9hyb8I5R7xSVpCY6n3KRpJOKQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKa+D8Ax+cHtITPNAAAAABJRU5ErkJggg==\n", 132 | "text/plain": [ 133 | "" 134 | ] 135 | }, 136 | "metadata": { 137 | "needs_background": "light" 138 | }, 139 | "output_type": "display_data" 140 | } 141 | ], 142 | "source": [ 143 | "# example of chi squared feature selection for categorical data\n", 144 | "from pandas import read_csv\n", 145 | "from sklearn.model_selection import train_test_split\n", 146 | "from sklearn.preprocessing import LabelEncoder\n", 147 | "from sklearn.preprocessing import OrdinalEncoder\n", 148 | "from sklearn.feature_selection import SelectKBest\n", 149 | "from sklearn.feature_selection import chi2\n", 150 | "from matplotlib import pyplot\n", 151 | " \n", 152 | "# load the dataset\n", 153 | "def load_dataset(filename):\n", 154 | "\t# load the dataset as a pandas DataFrame\n", 155 | "\tdata = read_csv(filename, header=None)\n", 156 | "\t# retrieve numpy array\n", 157 | "\tdataset = data.values\n", 158 | "\t# split into input (X) and output (y) variables\n", 159 | "\tX = dataset[:, :-1]\n", 160 | "\ty = dataset[:,-1]\n", 161 | "\t# format all fields as string\n", 162 | "\tX = X.astype(str)\n", 163 | "\treturn X, y\n", 164 | " \n", 165 | "# prepare input data\n", 166 | "def prepare_inputs(X_train, X_test):\n", 167 | "\toe = OrdinalEncoder()\n", 168 | "\toe.fit(X_train)\n", 169 | "\tX_train_enc = oe.transform(X_train)\n", 170 | "\tX_test_enc = oe.transform(X_test)\n", 171 | "\treturn X_train_enc, X_test_enc\n", 172 | " \n", 173 | "# prepare target\n", 174 | "def prepare_targets(y_train, y_test):\n", 175 | "\tle = LabelEncoder()\n", 176 | "\tle.fit(y_train)\n", 177 | "\ty_train_enc = le.transform(y_train)\n", 178 | "\ty_test_enc = le.transform(y_test)\n", 179 | "\treturn y_train_enc, y_test_enc\n", 180 | " \n", 181 | "# feature 
181 | "# feature selection\n", 182 | "def select_features(X_train, y_train, X_test):\n", 183 | "\tfs = SelectKBest(score_func=chi2, k='all')\n", 184 | "\tfs.fit(X_train, y_train)\n", 185 | "\tX_train_fs = fs.transform(X_train)\n", 186 | "\tX_test_fs = fs.transform(X_test)\n", 187 | "\treturn X_train_fs, X_test_fs, fs\n", 188 | " \n",
189 | "# load the dataset\n", 190 | "X, y = load_dataset('breast-cancer.csv')\n", 191 | "# split into train and test sets\n", 192 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)\n", 193 | "# prepare input data\n", 194 | "X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)\n", 195 | "# prepare output data\n", 196 | "y_train_enc, y_test_enc = prepare_targets(y_train, y_test)\n", 197 | "# feature selection\n", 198 | "X_train_fs, X_test_fs, fs = select_features(X_train_enc, y_train_enc, X_test_enc)\n", 199 | "# what are scores for the features\n", 200 | "for i in range(len(fs.scores_)):\n", 201 | "\tprint('Feature %d: %f' % (i, fs.scores_[i]))\n", 202 | "# plot the scores\n", 203 | "pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)\n", 204 | "pyplot.show()" 205 | ] 206 | }, 207 | {
208 | "cell_type": "code", 209 | "execution_count": 5, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "Feature 0: 0.000000\n", 217 | "Feature 1: 0.000000\n", 218 | "Feature 2: 0.064215\n", 219 | "Feature 3: 0.000000\n", 220 | "Feature 4: 0.000000\n", 221 | "Feature 5: 0.035942\n", 222 | "Feature 6: 0.000208\n", 223 | "Feature 7: 0.041381\n", 224 | "Feature 8: 0.066756\n" 225 | ] 226 | }, 227 | {
228 | "data": { 229 | "image/png": "... [base64-encoded PNG output truncated: bar chart of the mutual information feature scores] ...\n", 230 | "text/plain": [ 231 | "" 232 | ] 233 | }, 234 | "metadata": { 235 | "needs_background": "light" 236 | }, 237 | "output_type": "display_data" 238 | } 239 | ], 240 | "source": [
241 | "# example of mutual information feature selection for categorical data\n", 242 | "from pandas import read_csv\n", 243 | "from sklearn.model_selection import train_test_split\n", 244 | "from sklearn.preprocessing import LabelEncoder\n", 245 | "from sklearn.preprocessing import OrdinalEncoder\n", 246 | "from sklearn.feature_selection import SelectKBest\n", 247 | "from sklearn.feature_selection import mutual_info_classif\n", 248 | "from matplotlib import pyplot\n", 249 | " \n",
250 | "# load the dataset\n", 251 | "def load_dataset(filename):\n", 252 | "\t# load the dataset as a pandas DataFrame\n", 253 | "\tdata = read_csv(filename, header=None)\n", 254 | "\t# retrieve numpy array\n", 255 | "\tdataset = data.values\n", 256 | "\t# split into input (X) and output (y) variables\n", 257 | "\tX = dataset[:, :-1]\n", 258 | "\ty = dataset[:,-1]\n", 259 | "\t# format all fields as string\n", 260 | "\tX = X.astype(str)\n", 261 | "\treturn X, y\n", 262 | " \n",
263 | "# prepare input data\n", 264 | "def prepare_inputs(X_train, X_test):\n", 265 | "\toe = OrdinalEncoder()\n", 266 | "\toe.fit(X_train)\n", 267 | "\tX_train_enc = oe.transform(X_train)\n", 268 | "\tX_test_enc = oe.transform(X_test)\n", 269 | "\treturn X_train_enc, X_test_enc\n", 270 | " \n",
271 | "# prepare target\n", 272 | "def prepare_targets(y_train, y_test):\n", 273 | "\tle = LabelEncoder()\n", 274 | "\tle.fit(y_train)\n", 275 | "\ty_train_enc = le.transform(y_train)\n", 276 | "\ty_test_enc = le.transform(y_test)\n", 277 | "\treturn y_train_enc, y_test_enc\n", 278 | " \n",
279 | "# feature selection\n", 280 | "def select_features(X_train, y_train, X_test):\n", 281 | "\tfs = SelectKBest(score_func=mutual_info_classif, k='all')\n", 282 | "\tfs.fit(X_train, y_train)\n", 283 | "\tX_train_fs = fs.transform(X_train)\n", 284 | "\tX_test_fs = fs.transform(X_test)\n", 285 | "\treturn X_train_fs, X_test_fs, fs\n", 286 | " \n",
287 | "# load the dataset\n", 288 | "X, y = load_dataset('breast-cancer.csv')\n", 289 | "# split into train and test sets\n", 290 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)\n", 291 | "# prepare input data\n", 292 | "X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)\n", 293 | "# prepare output data\n", 294 | "y_train_enc, y_test_enc = prepare_targets(y_train, y_test)\n", 295 | "# feature selection\n", 296 | "X_train_fs, X_test_fs, fs = select_features(X_train_enc, y_train_enc, X_test_enc)\n", 297 | "# what are scores for the features\n", 298 | "for i in range(len(fs.scores_)):\n", 299 | "\tprint('Feature %d: %f' % (i, 
fs.scores_[i]))\n", 300 | "# plot the scores\n", 301 | "pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)\n", 302 | "pyplot.show()" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 6, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "Accuracy: 75.79\n" 315 | ] 316 | } 317 | ], 318 | "source": [ 319 | "# evaluation of a model using all input features\n", 320 | "from pandas import read_csv\n", 321 | "from sklearn.preprocessing import LabelEncoder\n", 322 | "from sklearn.preprocessing import OrdinalEncoder\n", 323 | "from sklearn.model_selection import train_test_split\n", 324 | "from sklearn.linear_model import LogisticRegression\n", 325 | "from sklearn.metrics import accuracy_score\n", 326 | " \n", 327 | "# load the dataset\n", 328 | "def load_dataset(filename):\n", 329 | "\t# load the dataset as a pandas DataFrame\n", 330 | "\tdata = read_csv(filename, header=None)\n", 331 | "\t# retrieve numpy array\n", 332 | "\tdataset = data.values\n", 333 | "\t# split into input (X) and output (y) variables\n", 334 | "\tX = dataset[:, :-1]\n", 335 | "\ty = dataset[:,-1]\n", 336 | "\t# format all fields as string\n", 337 | "\tX = X.astype(str)\n", 338 | "\treturn X, y\n", 339 | " \n", 340 | "# prepare input data\n", 341 | "def prepare_inputs(X_train, X_test):\n", 342 | "\toe = OrdinalEncoder()\n", 343 | "\toe.fit(X_train)\n", 344 | "\tX_train_enc = oe.transform(X_train)\n", 345 | "\tX_test_enc = oe.transform(X_test)\n", 346 | "\treturn X_train_enc, X_test_enc\n", 347 | " \n", 348 | "# prepare target\n", 349 | "def prepare_targets(y_train, y_test):\n", 350 | "\tle = LabelEncoder()\n", 351 | "\tle.fit(y_train)\n", 352 | "\ty_train_enc = le.transform(y_train)\n", 353 | "\ty_test_enc = le.transform(y_test)\n", 354 | "\treturn y_train_enc, y_test_enc\n", 355 | " \n", 356 | "# load the dataset\n", 357 | "X, y = load_dataset('breast-cancer.csv')\n", 358 | "# split into train and test sets\n", 359 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)\n", 360 | "# prepare input data\n", 361 | "X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)\n", 362 | "# prepare output data\n", 363 | "y_train_enc, y_test_enc = prepare_targets(y_train, y_test)\n", 364 | "# fit the model\n", 365 | "model = LogisticRegression(solver='lbfgs')\n", 366 | "model.fit(X_train_enc, y_train_enc)\n", 367 | "# evaluate the model\n", 368 | "yhat = model.predict(X_test_enc)\n", 369 | "# evaluate predictions\n", 370 | "accuracy = accuracy_score(y_test_enc, yhat)\n", 371 | "print('Accuracy: %.2f' % (accuracy*100))" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 7, 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "name": "stdout", 381 | "output_type": "stream", 382 | "text": [ 383 | "Accuracy: 74.74\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "# evaluation of a model fit using chi squared input features\n", 389 | "from pandas import read_csv\n", 390 | "from sklearn.preprocessing import LabelEncoder\n", 391 | "from sklearn.preprocessing import OrdinalEncoder\n", 392 | "from sklearn.feature_selection import SelectKBest\n", 393 | "from sklearn.feature_selection import chi2\n", 394 | "from sklearn.model_selection import train_test_split\n", 395 | "from sklearn.linear_model import LogisticRegression\n", 396 | "from sklearn.metrics import accuracy_score\n", 397 | " \n", 398 | "# load the dataset\n", 399 | "def load_dataset(filename):\n", 400 | "\t# 
load the dataset as a pandas DataFrame\n", 401 | "\tdata = read_csv(filename, header=None)\n", 402 | "\t# retrieve numpy array\n", 403 | "\tdataset = data.values\n", 404 | "\t# split into input (X) and output (y) variables\n", 405 | "\tX = dataset[:, :-1]\n", 406 | "\ty = dataset[:,-1]\n", 407 | "\t# format all fields as string\n", 408 | "\tX = X.astype(str)\n", 409 | "\treturn X, y\n", 410 | " \n", 411 | "# prepare input data\n", 412 | "def prepare_inputs(X_train, X_test):\n", 413 | "\toe = OrdinalEncoder()\n", 414 | "\toe.fit(X_train)\n", 415 | "\tX_train_enc = oe.transform(X_train)\n", 416 | "\tX_test_enc = oe.transform(X_test)\n", 417 | "\treturn X_train_enc, X_test_enc\n", 418 | " \n", 419 | "# prepare target\n", 420 | "def prepare_targets(y_train, y_test):\n", 421 | "\tle = LabelEncoder()\n", 422 | "\tle.fit(y_train)\n", 423 | "\ty_train_enc = le.transform(y_train)\n", 424 | "\ty_test_enc = le.transform(y_test)\n", 425 | "\treturn y_train_enc, y_test_enc\n", 426 | " \n", 427 | "# feature selection\n", 428 | "def select_features(X_train, y_train, X_test):\n", 429 | "\tfs = SelectKBest(score_func=chi2, k=4)\n", 430 | "\tfs.fit(X_train, y_train)\n", 431 | "\tX_train_fs = fs.transform(X_train)\n", 432 | "\tX_test_fs = fs.transform(X_test)\n", 433 | "\treturn X_train_fs, X_test_fs\n", 434 | " \n", 435 | "# load the dataset\n", 436 | "X, y = load_dataset('breast-cancer.csv')\n", 437 | "# split into train and test sets\n", 438 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)\n", 439 | "# prepare input data\n", 440 | "X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)\n", 441 | "# prepare output data\n", 442 | "y_train_enc, y_test_enc = prepare_targets(y_train, y_test)\n", 443 | "# feature selection\n", 444 | "X_train_fs, X_test_fs = select_features(X_train_enc, y_train_enc, X_test_enc)\n", 445 | "# fit the model\n", 446 | "model = LogisticRegression(solver='lbfgs')\n", 447 | "model.fit(X_train_fs, y_train_enc)\n", 448 | "# evaluate the model\n", 449 | "yhat = model.predict(X_test_fs)\n", 450 | "# evaluate predictions\n", 451 | "accuracy = accuracy_score(y_test_enc, yhat)\n", 452 | "print('Accuracy: %.2f' % (accuracy*100))" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 9, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 | "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "Accuracy: 77.89\n" 465 | ] 466 | } 467 | ], 468 | "source": [ 469 | "# evaluation of a model fit using mutual information input features\n", 470 | "from pandas import read_csv\n", 471 | "from sklearn.preprocessing import LabelEncoder\n", 472 | "from sklearn.preprocessing import OrdinalEncoder\n", 473 | "from sklearn.feature_selection import SelectKBest\n", 474 | "from sklearn.feature_selection import mutual_info_classif\n", 475 | "from sklearn.model_selection import train_test_split\n", 476 | "from sklearn.linear_model import LogisticRegression\n", 477 | "from sklearn.metrics import accuracy_score\n", 478 | " \n", 479 | "# load the dataset\n", 480 | "def load_dataset(filename):\n", 481 | "\t# load the dataset as a pandas DataFrame\n", 482 | "\tdata = read_csv(filename, header=None)\n", 483 | "\t# retrieve numpy array\n", 484 | "\tdataset = data.values\n", 485 | "\t# split into input (X) and output (y) variables\n", 486 | "\tX = dataset[:, :-1]\n", 487 | "\ty = dataset[:,-1]\n", 488 | "\t# format all fields as string\n", 489 | "\tX = X.astype(str)\n", 490 | "\treturn X, y\n", 491 | " \n", 492 | "# prepare input data\n", 
493 | "def prepare_inputs(X_train, X_test):\n", 494 | "\toe = OrdinalEncoder()\n", 495 | "\toe.fit(X_train)\n", 496 | "\tX_train_enc = oe.transform(X_train)\n", 497 | "\tX_test_enc = oe.transform(X_test)\n", 498 | "\treturn X_train_enc, X_test_enc\n", 499 | " \n", 500 | "# prepare target\n", 501 | "def prepare_targets(y_train, y_test):\n", 502 | "\tle = LabelEncoder()\n", 503 | "\tle.fit(y_train)\n", 504 | "\ty_train_enc = le.transform(y_train)\n", 505 | "\ty_test_enc = le.transform(y_test)\n", 506 | "\treturn y_train_enc, y_test_enc\n", 507 | " \n", 508 | "# feature selection\n", 509 | "def select_features(X_train, y_train, X_test):\n", 510 | "\tfs = SelectKBest(score_func=mutual_info_classif, k=4)\n", 511 | "\tfs.fit(X_train, y_train)\n", 512 | "\tX_train_fs = fs.transform(X_train)\n", 513 | "\tX_test_fs = fs.transform(X_test)\n", 514 | "\treturn X_train_fs, X_test_fs\n", 515 | " \n", 516 | "# load the dataset\n", 517 | "X, y = load_dataset('breast-cancer.csv')\n", 518 | "# split into train and test sets\n", 519 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)\n", 520 | "# prepare input data\n", 521 | "X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)\n", 522 | "# prepare output data\n", 523 | "y_train_enc, y_test_enc = prepare_targets(y_train, y_test)\n", 524 | "# feature selection\n", 525 | "X_train_fs, X_test_fs = select_features(X_train_enc, y_train_enc, X_test_enc)\n", 526 | "# fit the model\n", 527 | "model = LogisticRegression(solver='lbfgs')\n", 528 | "model.fit(X_train_fs, y_train_enc)\n", 529 | "# evaluate the model\n", 530 | "yhat = model.predict(X_test_fs)\n", 531 | "# evaluate predictions\n", 532 | "accuracy = accuracy_score(y_test_enc, yhat)\n", 533 | "print('Accuracy: %.2f' % (accuracy*100))" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [] 542 | } 543 | ], 544 | "metadata": { 545 | "kernelspec": { 546 | "display_name": "Python 3", 547 | "language": "python", 548 | "name": "python3" 549 | }, 550 | "language_info": { 551 | "codemirror_mode": { 552 | "name": "ipython", 553 | "version": 3 554 | }, 555 | "file_extension": ".py", 556 | "mimetype": "text/x-python", 557 | "name": "python", 558 | "nbconvert_exporter": "python", 559 | "pygments_lexer": "ipython3", 560 | "version": "3.6.4" 561 | } 562 | }, 563 | "nbformat": 4, 564 | "nbformat_minor": 2 565 | } 566 | --------------------------------------------------------------------------------
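
Taken together, the cells in Categorical Feature Selection.ipynb fit the same logistic regression on all nine ordinal-encoded inputs (75.79% accuracy), on the four inputs ranked highest by chi-squared (74.74%), and on the four ranked highest by mutual information (77.89%). What follows is a minimal consolidated sketch of that comparison as a standalone script, not one of the repository notebooks; it assumes the same breast-cancer.csv file read by the cells is available in the working directory, and the evaluate() helper is introduced here purely for illustration.

# consolidated sketch: all features vs chi2 (k=4) vs mutual information (k=4)
# assumes 'breast-cancer.csv' (the file read by the notebook cells) is in the working directory
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# load the categorical dataset and keep every field as a string label
values = read_csv('breast-cancer.csv', header=None).values
X, y = values[:, :-1].astype(str), values[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# encode inputs and target, fitting the encoders on the training split only
oe = OrdinalEncoder().fit(X_train)
X_train_enc, X_test_enc = oe.transform(X_train), oe.transform(X_test)
le = LabelEncoder().fit(y_train)
y_train_enc, y_test_enc = le.transform(y_train), le.transform(y_test)

def evaluate(X_tr, X_te, label):
    # fit logistic regression on one feature set and report held-out accuracy
    model = LogisticRegression(solver='lbfgs')
    model.fit(X_tr, y_train_enc)
    acc = accuracy_score(y_test_enc, model.predict(X_te))
    print('%s: %.2f' % (label, acc * 100))

# baseline: all nine encoded features
evaluate(X_train_enc, X_test_enc, 'all features')

# four best features under each scoring function
for label, score_func in [('chi2, k=4', chi2), ('mutual_info, k=4', mutual_info_classif)]:
    fs = SelectKBest(score_func=score_func, k=4)
    X_train_fs = fs.fit_transform(X_train_enc, y_train_enc)
    X_test_fs = fs.transform(X_test_enc)
    evaluate(X_train_fs, X_test_fs, label)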