├── BigMartTrain.csv
├── OnlineRetail.csv.zip
├── README.md
├── Superstore.xls
├── Tutorial_1.ipynb
├── Tutorial_2.ipynb
├── Tutorial_3.ipynb
├── Tutorial_4.ipynb
├── Tutorial_5.ipynb
├── Tutorial_6.ipynb
├── Tutorial_7.ipynb
└── Tutorial_8.ipynb
/OnlineRetail.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sauravsingla/Outlier_Detection_Tutorials/425ec7bb2ffb971c634b7bcc404ab3234ef0ff0c/OnlineRetail.csv.zip
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Outlier_Detection_Tutorials
--------------------------------------------------------------------------------
/Superstore.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sauravsingla/Outlier_Detection_Tutorials/425ec7bb2ffb971c634b7bcc404ab3234ef0ff0c/Superstore.xls
--------------------------------------------------------------------------------
/Tutorial_1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "Tutorial_1.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": [],
9 | "authorship_tag": "ABX9TyP3r04a+D8qzfOtGm719dlZ",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {
32 | "id": "NC6mQFXKojRK",
33 | "colab_type": "text"
34 | },
35 | "source": [
36 | "Extreme Value Analysis:\n",
37 | "\n",
38 | "* Standard Deviation\n",
39 | "* Interquartile Range\n",
40 | "\n"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "metadata": {
46 | "id": "6tDMdu52EvYO",
47 | "colab_type": "code",
48 | "colab": {
49 | "base_uri": "https://localhost:8080/",
50 | "height": 52
51 | },
52 | "outputId": "f70106d1-575f-4def-aff4-a88e7a563e31"
53 | },
54 | "source": [
55 | "from numpy.random import seed\n",
56 | "from numpy.random import randn\n",
57 | "from numpy import mean\n",
58 | "from numpy import std\n",
59 | "\n",
60 | "seed(1)\n",
61 | "data = 5 * randn(10000) + 50\n",
62 | "data_mean, data_std = mean(data), std(data)\n",
63 | "# outlier cutoff: three standard deviations either side of the mean\n",
64 | "cut_off = data_std * 3\n",
65 | "lower, upper = data_mean - cut_off, data_mean + cut_off\n",
66 | "# outliers: boolean-mask selection instead of a Python-level loop\n",
67 | "outliers = data[(data < lower) | (data > upper)]\n",
68 | "print('Identified outliers: %d' % len(outliers))\n",
69 | "# remove outliers: keep everything inside the closed [lower, upper] interval\n",
70 | "outliers_removed = data[(data >= lower) & (data <= upper)]\n",
71 | "print('Non-outlier observations: %d' % len(outliers_removed))"
72 | ],
73 | "execution_count": null,
74 | "outputs": [
75 | {
76 | "output_type": "stream",
77 | "text": [
78 | "Identified outliers: 29\n",
79 | "Non-outlier observations: 9971\n"
80 | ],
81 | "name": "stdout"
82 | }
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {
88 | "id": "U9cPE_BBGVDh",
89 | "colab_type": "text"
90 | },
91 | "source": [
92 | "Interquartile Range"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "metadata": {
98 | "id": "TZMKPowHFP11",
99 | "colab_type": "code",
100 | "colab": {
101 | "base_uri": "https://localhost:8080/",
102 | "height": 69
103 | },
104 | "outputId": "637bbf4a-6aef-4091-edba-d5c115565393"
105 | },
106 | "source": [
107 | "from numpy.random import seed\n",
108 | "from numpy.random import randn\n",
109 | "from numpy import percentile\n",
110 | "\n",
111 | "seed(1)\n",
112 | "data = 5 * randn(10000) + 50\n",
113 | "# interquartile range (a single percentile call returns both quartiles)\n",
114 | "q25, q75 = percentile(data, [25, 75])\n",
115 | "iqr = q75 - q25\n",
116 | "print('Percentiles: 25th=%.3f, 75th=%.3f, IQR=%.3f' % (q25, q75, iqr))\n",
117 | "# outlier cutoff: 1.5 * IQR beyond each quartile\n",
118 | "cut_off = iqr * 1.5\n",
119 | "lower, upper = q25 - cut_off, q75 + cut_off\n",
120 | "# outliers: boolean-mask selection instead of a Python-level loop\n",
121 | "outliers = data[(data < lower) | (data > upper)]\n",
122 | "print('Identified outliers: %d' % len(outliers))\n",
123 | "# remove outliers: keep everything inside the closed [lower, upper] interval\n",
124 | "outliers_removed = data[(data >= lower) & (data <= upper)]\n",
125 | "print('Non-outlier observations: %d' % len(outliers_removed))"
126 | ],
127 | "execution_count": null,
128 | "outputs": [
129 | {
130 | "output_type": "stream",
131 | "text": [
132 | "Percentiles: 25th=46.685, 75th=53.359, IQR=6.674\n",
133 | "Identified outliers: 81\n",
134 | "Non-outlier observations: 9919\n"
135 | ],
136 | "name": "stdout"
137 | }
138 | ]
139 | }
140 | ]
141 | }
--------------------------------------------------------------------------------
/Tutorial_2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "Tutorial_2.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": [],
9 | "authorship_tag": "ABX9TyP51jOe19eCU0gsuWdRgtyR",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {
32 | "id": "NM85nagYo5Xa",
33 | "colab_type": "text"
34 | },
35 | "source": [
36 | "\n",
37 | "\n",
38 | "* Visualise Graph through Boxplot and Scatterplot\n",
39 | "* Z Score\n",
40 | "* Interquartile range (IQR)\n"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "metadata": {
46 | "id": "PQU5n-Wk2zxn",
47 | "colab_type": "code",
48 | "colab": {}
49 | },
50 | "source": [
51 | "#Import libraries\n",
52 | "import numpy as np\n",
53 | "import pandas as pd\n",
54 | "from sklearn.datasets import load_boston"
55 | ],
56 | "execution_count": null,
57 | "outputs": []
58 | },
59 | {
60 | "cell_type": "code",
61 | "metadata": {
62 | "id": "Wa2uSZA03Qz7",
63 | "colab_type": "code",
64 | "colab": {
65 | "base_uri": "https://localhost:8080/",
66 | "height": 202
67 | },
68 | "outputId": "15020468-9e0c-44fb-8a25-e062d2f8b336"
69 | },
70 | "source": [
71 | "#Load the Boston housing dataset bundled with scikit-learn\n",
72 | "boston = load_boston()\n",
73 | "x = boston.data\n",
74 | "y = boston.target\n",
75 | "columns = boston.feature_names\n",
76 | "\n",
77 | "#build the feature dataframe, naming the columns at construction time\n",
78 | "boston_df = pd.DataFrame(x, columns=columns)\n",
79 | "\n",
80 | "boston_df.head()"
81 | ],
82 | "execution_count": null,
83 | "outputs": [
84 | {
85 | "output_type": "execute_result",
86 | "data": {
87 | "text/html": [
88 | "
\n",
89 | "\n",
102 | "
\n",
103 | " \n",
104 | " \n",
105 | " | \n",
106 | " CRIM | \n",
107 | " ZN | \n",
108 | " INDUS | \n",
109 | " CHAS | \n",
110 | " NOX | \n",
111 | " RM | \n",
112 | " AGE | \n",
113 | " DIS | \n",
114 | " RAD | \n",
115 | " TAX | \n",
116 | " PTRATIO | \n",
117 | " B | \n",
118 | " LSTAT | \n",
119 | "
\n",
120 | " \n",
121 | " \n",
122 | " \n",
123 | " 0 | \n",
124 | " 0.00632 | \n",
125 | " 18.0 | \n",
126 | " 2.31 | \n",
127 | " 0.0 | \n",
128 | " 0.538 | \n",
129 | " 6.575 | \n",
130 | " 65.2 | \n",
131 | " 4.0900 | \n",
132 | " 1.0 | \n",
133 | " 296.0 | \n",
134 | " 15.3 | \n",
135 | " 396.90 | \n",
136 | " 4.98 | \n",
137 | "
\n",
138 | " \n",
139 | " 1 | \n",
140 | " 0.02731 | \n",
141 | " 0.0 | \n",
142 | " 7.07 | \n",
143 | " 0.0 | \n",
144 | " 0.469 | \n",
145 | " 6.421 | \n",
146 | " 78.9 | \n",
147 | " 4.9671 | \n",
148 | " 2.0 | \n",
149 | " 242.0 | \n",
150 | " 17.8 | \n",
151 | " 396.90 | \n",
152 | " 9.14 | \n",
153 | "
\n",
154 | " \n",
155 | " 2 | \n",
156 | " 0.02729 | \n",
157 | " 0.0 | \n",
158 | " 7.07 | \n",
159 | " 0.0 | \n",
160 | " 0.469 | \n",
161 | " 7.185 | \n",
162 | " 61.1 | \n",
163 | " 4.9671 | \n",
164 | " 2.0 | \n",
165 | " 242.0 | \n",
166 | " 17.8 | \n",
167 | " 392.83 | \n",
168 | " 4.03 | \n",
169 | "
\n",
170 | " \n",
171 | " 3 | \n",
172 | " 0.03237 | \n",
173 | " 0.0 | \n",
174 | " 2.18 | \n",
175 | " 0.0 | \n",
176 | " 0.458 | \n",
177 | " 6.998 | \n",
178 | " 45.8 | \n",
179 | " 6.0622 | \n",
180 | " 3.0 | \n",
181 | " 222.0 | \n",
182 | " 18.7 | \n",
183 | " 394.63 | \n",
184 | " 2.94 | \n",
185 | "
\n",
186 | " \n",
187 | " 4 | \n",
188 | " 0.06905 | \n",
189 | " 0.0 | \n",
190 | " 2.18 | \n",
191 | " 0.0 | \n",
192 | " 0.458 | \n",
193 | " 7.147 | \n",
194 | " 54.2 | \n",
195 | " 6.0622 | \n",
196 | " 3.0 | \n",
197 | " 222.0 | \n",
198 | " 18.7 | \n",
199 | " 396.90 | \n",
200 | " 5.33 | \n",
201 | "
\n",
202 | " \n",
203 | "
\n",
204 | "
"
205 | ],
206 | "text/plain": [
207 | " CRIM ZN INDUS CHAS NOX ... RAD TAX PTRATIO B LSTAT\n",
208 | "0 0.00632 18.0 2.31 0.0 0.538 ... 1.0 296.0 15.3 396.90 4.98\n",
209 | "1 0.02731 0.0 7.07 0.0 0.469 ... 2.0 242.0 17.8 396.90 9.14\n",
210 | "2 0.02729 0.0 7.07 0.0 0.469 ... 2.0 242.0 17.8 392.83 4.03\n",
211 | "3 0.03237 0.0 2.18 0.0 0.458 ... 3.0 222.0 18.7 394.63 2.94\n",
212 | "4 0.06905 0.0 2.18 0.0 0.458 ... 3.0 222.0 18.7 396.90 5.33\n",
213 | "\n",
214 | "[5 rows x 13 columns]"
215 | ]
216 | },
217 | "metadata": {
218 | "tags": []
219 | },
220 | "execution_count": 3
221 | }
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {
227 | "id": "fBpH8BwuEEWL",
228 | "colab_type": "text"
229 | },
230 | "source": [
231 | "Boxplot"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "metadata": {
237 | "id": "kBu527LS3XDB",
238 | "colab_type": "code",
239 | "colab": {
240 | "base_uri": "https://localhost:8080/",
241 | "height": 351
242 | },
243 | "outputId": "042db6e6-b0d2-450f-d184-5df99ecb2973"
244 | },
245 | "source": [
246 | "import seaborn as sns\n",
247 | "sns.boxplot(x=boston_df['DIS'])"
248 | ],
249 | "execution_count": null,
250 | "outputs": [
251 | {
252 | "output_type": "stream",
253 | "text": [
254 | "/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n",
255 | " import pandas.util.testing as tm\n"
256 | ],
257 | "name": "stderr"
258 | },
259 | {
260 | "output_type": "execute_result",
261 | "data": {
262 | "text/plain": [
263 | ""
264 | ]
265 | },
266 | "metadata": {
267 | "tags": []
268 | },
269 | "execution_count": 4
270 | },
271 | {
272 | "output_type": "display_data",
273 | "data": {
274 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAEGCAYAAABbzE8LAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAALF0lEQVR4nO3dYazd9V3H8c+3vUtWcHNSWLMV4xWvGVnG3JQHUxNjNkiasWw+NNFRo8meaKlkiXFZExPTmCUaIxTjgkxplc0HOKMZtVuZJj5Rs3ZDYEDcyewmFUZX4rYAOm/5+eAeFtoVaOHc87338HolpP977uH/+/7be97877/n/qkxRgCYvy3dAwC8WgkwQBMBBmgiwABNBBigydLFPPnyyy8fy8vL6zQKwGI6fvz4N8cYV5z7+EUFeHl5OceOHZvdVACvAlX1tfM97hIEQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0EWCAJgIM0ESAAZoIMEATAQZoIsAATS7q/wm3GRw4cCCTyWTu6548eTJJsnPnzrmvPQ8rKyvZs2dP9xiwUBYuwJPJJPc9+HDOXHLZXNfd+vS3kiSP/+/C/ZZm69NPdo8AC2nxapHkzCWX5Zmr3zvXNbc9cjhJ5r7uPDx3bMBsuQYM0ESAAZoIMEATAQZoIsAATQQYoIkAAzQRYIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0EWCAJgIM0ESAAZoIMEATAQZoIsAATQQYoIkAAzQRYIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBkLgE+cOBADhw4MI+lgHN4/W1cS/NYZDKZzGMZ4Dy8/jYulyAAmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0EWCAJgIM0ESAAZoIMEATAQZoIsAATQQYoIkAAzQRYIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0EWCgxWQyyQ033JDJZHLW9kZz+vTp3HTTTTl9+vTM9y3AQIv9+/fnqaeeyv79+8/a3mgOHjyYBx54IIcOHZr5vgUYmLvJZJITJ04kSU6cOHHW9kY6Cz59+nSOHDmSMUaOHDky87PgpZnu7QWcPHkyzzzzTPbu3bvua00mk2z57lj3dV5NtvzPtzOZfGcuf37M3mQyybZt27rHOMuLnenu378/d9555/yGeREHDx7Ms88+myQ5c+ZMDh06lJtvvnlm+3/JM+Cq+lBVHauqY6dOnZrZwsCr13NnvBf7uXm79957s7q6miRZXV3N0aNHZ7r/lzwDHmPcnuT2JLn22mtf1qnlzp07kyS33HLLy/nXL8revXtz/KvfWPd1Xk2efe3rs3LVjrn8+TF7G/E7l+Xl5RcM7fLy8lxneTHXXXddDh8+nNXV1SwtLeX666+f6f5dAwbmbt++fS/rc/O2e/fubNmylsmtW7fmxhtvnOn+BRiYu5WVle+d6S4vL5+1vbKy0jfYObZv355du3alqrJr165s3759pvsXYKDFvn37cumll2bfvn1nbW80u3fvzjXXXDPzs99kTu+CADjXyspK7rnnnu99/PztjWT79u259dZb12XfzoABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0EWCAJgIM0ESAAZoIMEATAQZoIsAATQQYoIkAAzQRYIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0WZrHIisrK/NYBjgPr7+Nay4B3rNnzzyWAc7D62/jcgkCoIkAAzQR
YIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0EWCAJgIM0ESAAZoIMEATAQZoIsAATQQYoIkAAzQRYIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE2WugdYD1uffjLbHjk85zVPJ8nc152HrU8/mWRH9xiwcBYuwCsrKy3rnjy5miTZuXMRQ7Wj7fcVFtnCBXjPnj3dIwBcENeAAZoIMEATAQZoIsAATQQYoIkAAzQRYIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE1qjHHhT646leRr6zfOK3J5km92D7FOFvnYksU+Pse2ec3y+H5kjHHFuQ9eVIA3sqo6Nsa4tnuO9bDIx5Ys9vE5ts1rHsfnEgRAEwEGaLJIAb69e4B1tMjHliz28Tm2zWvdj29hrgEDbDaLdAYMsKkIMECTTR/gqvrhqvrHqnqoqr5cVXu7Z5q1qtpaVV+qqs90zzJLVfWGqrq7qh6pqoer6qe7Z5qlqrp5+jX5YFV9qqpe2z3Ty1VVf1ZVT1TVg8977LKqOlpVX5n++kOdM74SL3B8vz/92ry/qv6mqt4w63U3fYCTrCb58BjjrUneleTXq+qtzTPN2t4kD3cPsQ5uSXJkjHF1kp/IAh1jVe1MclOSa8cYb0uyNckv9k71ityZZNc5j/12ks+PMX48yeenH29Wd+b7j+9okreNMd6e5N+TfGTWi276AI8xHhtjfHG6/Z2svYh39k41O1V1ZZIbktzRPcssVdUPJvm5JJ9IkjHGd8cY/9071cwtJdlWVUtJLknyX83zvGxjjH9K8uQ5D38gycHp9sEkvzDXoWbofMc3xvjcGGN1+uG/JLly1utu+gA/X1UtJ3lnkn/tnWSm/ijJbyV5tnuQGfvRJKeS/Pn08sodVXVp91CzMsY4meQPknw9yWNJvjXG+FzvVDO3Y4zx2HT78SQ7OodZZ7+a5O9nvdOFCXBV/UCSv07ym2OMb3fPMwtV9b4kT4wxjnfPsg6Wkvxkkj8ZY7wzyVPZ3N/CnmV6PfQDWfsPzZuTXFpVv9w71foZa+9nXcj3tFbVR7N2qfOuWe97IQJcVa/JWnzvGmN8unueGfrZJO+vqhNJ/irJu6vqL3tHmplHkzw6xnjuu5W7sxbkRXFdkv8YY5waY/xfkk8n+ZnmmWbtG1X1piSZ/vpE8zwzV1W/kuR9SX5prMMPTWz6AFdVZe064sNjjD/snmeWxhgfGWNcOcZYztpf4PzDGGMhzqLGGI8n+c+qesv0ofckeahxpFn7epJ3VdUl06/R92SB/pJx6u+S7J5u707yt42zzFxV7cra5b/3jzGeXo81Nn2As3aW+MGsnR3eN/3nvd1DcUH2JLmrqu5P8o4kv9c8z8xMz+zvTvLFJA9k7bW2aX90t6o+leSfk7ylqh6tql9L8rEk11fVV7J2xv+xzhlfiRc4vtuSvC7J0WlXPj7zdf0oMkCPRTgDBtiUBBigiQADNBFggCYCDNBEgNk0qurM9O1AX66qf6uqD1fVlunnfv65u8VV1Y6q+sz0OQ9V1eHeyeH8lroHgIvwzBjjHUlSVW9M8skkr0/yO+c873eTHB1j3DJ97tvnOiVcIGfAbEpjjCeSfCjJb0x/0uz53pS1H3V+7rn3z3M2uFACzKY1xvhq1u6z+8ZzPvXHST4xvVH/R6vqzfOfDl6aALNwxhifTXJVkj9NcnWSL1XVFb1TwfcTYDatqroqyZmc5y5cY4wnxxifHGN8MMkXsnbzd9hQBJhNaXpG+/Ekt517m8CqendVXTLdfl2SH8va3clgQ/EuCDaTbVV1X5LXZO0G2X+R5Hy3IP2pJLdV1WrWTjLuGGN8YX5jwoVxNzSAJi5BADQRYIAmAgzQRIABmggwQBMBBmgiwABN/h9PplhF
Nys9TwAAAABJRU5ErkJggg==\n",
275 | "text/plain": [
276 | ""
277 | ]
278 | },
279 | "metadata": {
280 | "tags": [],
281 | "needs_background": "light"
282 | }
283 | }
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "metadata": {
289 | "id": "OYy3PNL7ECMz",
290 | "colab_type": "text"
291 | },
292 | "source": [
293 | "Scatterplot"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "metadata": {
299 | "id": "Zb6xwAHs3Z5O",
300 | "colab_type": "code",
301 | "colab": {
302 | "base_uri": "https://localhost:8080/",
303 | "height": 497
304 | },
305 | "outputId": "90f5e1e4-b5be-44da-cfdb-dbde14c0e915"
306 | },
307 | "source": [
308 | "import matplotlib.pyplot as plt\n",
309 | "fig, ax = plt.subplots(figsize=(16,8))\n",
310 | "ax.scatter(boston_df['INDUS'], boston_df['TAX'])\n",
311 | "ax.set_xlabel('Proportion of non-retail business acres per town')\n",
312 | "ax.set_ylabel('Full-value property-tax rate per $10,000')\n",
313 | "plt.show()"
314 | ],
315 | "execution_count": null,
316 | "outputs": [
317 | {
318 | "output_type": "display_data",
319 | "data": {
320 | "image/png": "\n",
321 | "text/plain": [
322 | ""
323 | ]
324 | },
325 | "metadata": {
326 | "tags": [],
327 | "needs_background": "light"
328 | }
329 | }
330 | ]
331 | },
332 | {
333 | "cell_type": "markdown",
334 | "metadata": {
335 | "id": "sl251VIDEAQm",
336 | "colab_type": "text"
337 | },
338 | "source": [
339 | "Z score"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "metadata": {
345 | "id": "LJYijwie32B-",
346 | "colab_type": "code",
347 | "colab": {
348 | "base_uri": "https://localhost:8080/",
349 | "height": 139
350 | },
351 | "outputId": "4db0b977-93c1-4347-9268-069e5f4c3d8f"
352 | },
353 | "source": [
354 | "from scipy import stats\n",
355 | "import numpy as np\n",
356 | "z = np.abs(stats.zscore(boston_df))\n",
357 | "print(z)"
358 | ],
359 | "execution_count": null,
360 | "outputs": [
361 | {
362 | "output_type": "stream",
363 | "text": [
364 | "[[0.41978194 0.28482986 1.2879095 ... 1.45900038 0.44105193 1.0755623 ]\n",
365 | " [0.41733926 0.48772236 0.59338101 ... 0.30309415 0.44105193 0.49243937]\n",
366 | " [0.41734159 0.48772236 0.59338101 ... 0.30309415 0.39642699 1.2087274 ]\n",
367 | " ...\n",
368 | " [0.41344658 0.48772236 0.11573841 ... 1.17646583 0.44105193 0.98304761]\n",
369 | " [0.40776407 0.48772236 0.11573841 ... 1.17646583 0.4032249 0.86530163]\n",
370 | " [0.41500016 0.48772236 0.11573841 ... 1.17646583 0.44105193 0.66905833]]\n"
371 | ],
372 | "name": "stdout"
373 | }
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "metadata": {
379 | "id": "kNtOvRUG37GQ",
380 | "colab_type": "code",
381 | "colab": {
382 | "base_uri": "https://localhost:8080/",
383 | "height": 243
384 | },
385 | "outputId": "5eb110d8-3b14-4080-8762-7f4c6a89edd6"
386 | },
387 | "source": [
388 | "threshold = 3  # flag any value more than 3 standard deviations from its column mean\n",
389 | "print(np.where(z > threshold))"
390 | ],
391 | "execution_count": null,
392 | "outputs": [
393 | {
394 | "output_type": "stream",
395 | "text": [
396 | "(array([ 55, 56, 57, 102, 141, 142, 152, 154, 155, 160, 162, 163, 199,\n",
397 | " 200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 216, 218, 219,\n",
398 | " 220, 221, 222, 225, 234, 236, 256, 257, 262, 269, 273, 274, 276,\n",
399 | " 277, 282, 283, 283, 284, 347, 351, 352, 353, 353, 354, 355, 356,\n",
400 | " 357, 358, 363, 364, 364, 365, 367, 369, 370, 372, 373, 374, 374,\n",
401 | " 380, 398, 404, 405, 406, 410, 410, 411, 412, 412, 414, 414, 415,\n",
402 | " 416, 418, 418, 419, 423, 424, 425, 426, 427, 427, 429, 431, 436,\n",
403 | " 437, 438, 445, 450, 454, 455, 456, 457, 466]), array([ 1, 1, 1, 11, 12, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,\n",
404 | " 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 3, 3, 1, 5,\n",
405 | " 5, 3, 3, 3, 3, 3, 3, 1, 3, 1, 1, 7, 7, 1, 7, 7, 7,\n",
406 | " 3, 3, 3, 3, 3, 5, 5, 5, 3, 3, 3, 12, 5, 12, 0, 0, 0,\n",
407 | " 0, 5, 0, 11, 11, 11, 12, 0, 12, 11, 11, 0, 11, 11, 11, 11, 11,\n",
408 | " 11, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]))\n"
409 | ],
410 | "name": "stdout"
411 | }
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "metadata": {
417 | "id": "DkAMWiEA3_8V",
418 | "colab_type": "code",
419 | "colab": {
420 | "base_uri": "https://localhost:8080/",
421 | "height": 35
422 | },
423 | "outputId": "dabd30bf-1b13-40b0-9465-cb4753104612"
424 | },
425 | "source": [
426 | "print(z[55][1])"
427 | ],
428 | "execution_count": null,
429 | "outputs": [
430 | {
431 | "output_type": "stream",
432 | "text": [
433 | "3.375038763517309\n"
434 | ],
435 | "name": "stdout"
436 | }
437 | ]
438 | },
439 | {
440 | "cell_type": "markdown",
441 | "metadata": {
442 | "id": "JTFQVAMLES-a",
443 | "colab_type": "text"
444 | },
445 | "source": [
446 | "Interquartile range (IQR)"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "metadata": {
452 | "id": "aVdMHz8u4CyV",
453 | "colab_type": "code",
454 | "colab": {
455 | "base_uri": "https://localhost:8080/",
456 | "height": 260
457 | },
458 | "outputId": "4e619f06-76a2-44d7-f1d0-d6274d8f584e"
459 | },
460 | "source": [
461 | "boston_df_o1 = boston_df\n",
462 | "Q1 = boston_df_o1.quantile(0.25)\n",
463 | "Q3 = boston_df_o1.quantile(0.75)\n",
464 | "IQR = Q3 - Q1\n",
465 | "print(IQR)"
466 | ],
467 | "execution_count": null,
468 | "outputs": [
469 | {
470 | "output_type": "stream",
471 | "text": [
472 | "CRIM 3.595038\n",
473 | "ZN 12.500000\n",
474 | "INDUS 12.910000\n",
475 | "CHAS 0.000000\n",
476 | "NOX 0.175000\n",
477 | "RM 0.738000\n",
478 | "AGE 49.050000\n",
479 | "DIS 3.088250\n",
480 | "RAD 20.000000\n",
481 | "TAX 387.000000\n",
482 | "PTRATIO 2.800000\n",
483 | "B 20.847500\n",
484 | "LSTAT 10.005000\n",
485 | "dtype: float64\n"
486 | ],
487 | "name": "stdout"
488 | }
489 | ]
490 | },
491 | {
492 | "cell_type": "markdown",
493 | "metadata": {
494 | "id": "TngApIhvDYs9",
495 | "colab_type": "text"
496 | },
497 | "source": [
498 | "Remove outliers (Z-score method)"
499 | ]
500 | },
501 | {
502 | "cell_type": "code",
503 | "metadata": {
504 | "id": "k446ZEoQ4Ovq",
505 | "colab_type": "code",
506 | "colab": {}
507 | },
508 | "source": [
509 | "# keep rows where every feature is within the threshold (exact complement of z > threshold)\n",
510 | "boston_df_o = boston_df[(z <= threshold).all(axis=1)]"
511 | ],
512 | "execution_count": null,
513 | "outputs": []
514 | },
515 | {
516 | "cell_type": "code",
517 | "metadata": {
518 | "id": "6kekp8nVDfp9",
519 | "colab_type": "code",
520 | "colab": {
521 | "base_uri": "https://localhost:8080/",
522 | "height": 35
523 | },
524 | "outputId": "9d5fcdec-1679-46ee-c6bc-53cc1f5b9882"
525 | },
526 | "source": [
527 | "boston_df.shape"
528 | ],
529 | "execution_count": null,
530 | "outputs": [
531 | {
532 | "output_type": "execute_result",
533 | "data": {
534 | "text/plain": [
535 | "(506, 13)"
536 | ]
537 | },
538 | "metadata": {
539 | "tags": []
540 | },
541 | "execution_count": 12
542 | }
543 | ]
544 | },
545 | {
546 | "cell_type": "code",
547 | "metadata": {
548 | "id": "nTti644HDkpi",
549 | "colab_type": "code",
550 | "colab": {
551 | "base_uri": "https://localhost:8080/",
552 | "height": 35
553 | },
554 | "outputId": "93986b11-5be7-4a24-d3e3-109ed1cb03a0"
555 | },
556 | "source": [
557 | "boston_df_o.shape"
558 | ],
559 | "execution_count": null,
560 | "outputs": [
561 | {
562 | "output_type": "execute_result",
563 | "data": {
564 | "text/plain": [
565 | "(415, 13)"
566 | ]
567 | },
568 | "metadata": {
569 | "tags": []
570 | },
571 | "execution_count": 14
572 | }
573 | ]
574 | },
575 | {
576 | "cell_type": "markdown",
577 | "metadata": {
578 | "id": "wMm_ZyQJDyu2",
579 | "colab_type": "text"
580 | },
581 | "source": [
582 | "Another method to remove outliers: the IQR rule"
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "metadata": {
588 | "id": "yjR35vnN4oVC",
589 | "colab_type": "code",
590 | "colab": {
591 | "base_uri": "https://localhost:8080/",
592 | "height": 35
593 | },
594 | "outputId": "8a103665-02c0-40b7-dc3c-f787e3287fa6"
595 | },
596 | "source": [
597 | "boston_df_out = boston_df_o1[~((boston_df_o1 < (Q1 - 1.5 * IQR)) |(boston_df_o1 > (Q3 + 1.5 * IQR))).any(axis=1)]\n",
598 | "boston_df_out.shape"
599 | ],
600 | "execution_count": null,
601 | "outputs": [
602 | {
603 | "output_type": "execute_result",
604 | "data": {
605 | "text/plain": [
606 | "(274, 13)"
607 | ]
608 | },
609 | "metadata": {
610 | "tags": []
611 | },
612 | "execution_count": 11
613 | }
614 | ]
615 | }
616 | ]
617 | }
--------------------------------------------------------------------------------
/Tutorial_3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "Tutorial_3.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": [],
9 | "authorship_tag": "ABX9TyOHfXMB3stfPHuGAycj3A3r",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {
32 | "id": "KwPXNeOjqPP0",
33 | "colab_type": "text"
34 | },
35 | "source": [
36 | "KNN\n",
37 | "\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "metadata": {
43 | "id": "iVGka0sYPZLh",
44 | "colab_type": "code",
45 | "colab": {
46 | "base_uri": "https://localhost:8080/",
47 | "height": 1000
48 | },
49 | "outputId": "ccb48f21-a275-4969-b9d5-92fbe2f543d4"
50 | },
51 | "source": [
52 | "# one command is enough: --upgrade installs pyod when missing and upgrades it otherwise; %pip targets the running kernel\n",
53 | "%pip install -q --upgrade pyod"
54 | ],
55 | "execution_count": null,
56 | "outputs": [
57 | {
58 | "output_type": "stream",
59 | "text": [
60 | "Collecting pyod\n",
61 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/b5/69/1dfc42299d9bed2fb78fc48819897c84aae19e22fb4d1d28cc43cec8c780/pyod-0.8.2.tar.gz (96kB)\n",
62 | "\r\u001b[K |███▍ | 10kB 27.5MB/s eta 0:00:01\r\u001b[K |██████▉ | 20kB 32.9MB/s eta 0:00:01\r\u001b[K |██████████▏ | 30kB 38.4MB/s eta 0:00:01\r\u001b[K |█████████████▋ | 40kB 19.9MB/s eta 0:00:01\r\u001b[K |█████████████████ | 51kB 14.4MB/s eta 0:00:01\r\u001b[K |████████████████████▍ | 61kB 14.5MB/s eta 0:00:01\r\u001b[K |███████████████████████▊ | 71kB 13.1MB/s eta 0:00:01\r\u001b[K |███████████████████████████▏ | 81kB 11.5MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▋ | 92kB 11.7MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 102kB 7.4MB/s \n",
63 | "\u001b[?25hCollecting combo\n",
64 | " Downloading https://files.pythonhosted.org/packages/0a/2a/61b6ac584e75d8df16dc27962aa5fe99d76b09da5b6710e83d4862c84001/combo-0.1.1.tar.gz\n",
65 | "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from pyod) (0.16.0)\n",
66 | "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from pyod) (3.2.2)\n",
67 | "Requirement already satisfied: numpy>=1.13 in /usr/local/lib/python3.6/dist-packages (from pyod) (1.18.5)\n",
68 | "Requirement already satisfied: numba>=0.35 in /usr/local/lib/python3.6/dist-packages (from pyod) (0.48.0)\n",
69 | "Requirement already satisfied: pandas>=0.25 in /usr/local/lib/python3.6/dist-packages (from pyod) (1.0.5)\n",
70 | "Requirement already satisfied: scipy>=0.19.1 in /usr/local/lib/python3.6/dist-packages (from pyod) (1.4.1)\n",
71 | "Requirement already satisfied: scikit_learn>=0.19.1 in /usr/local/lib/python3.6/dist-packages (from pyod) (0.22.2.post1)\n",
72 | "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from pyod) (1.15.0)\n",
73 | "Requirement already satisfied: statsmodels in /usr/local/lib/python3.6/dist-packages (from pyod) (0.10.2)\n",
74 | "Collecting suod\n",
75 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/a1/87/9170cabe1b5e10a7d095c0e28f2e30e7c1886a13f063de85d3cfacc06f4b/suod-0.0.4.tar.gz (2.1MB)\n",
76 | "\u001b[K |████████████████████████████████| 2.1MB 19.2MB/s \n",
77 | "\u001b[?25hRequirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pyod) (2.4.7)\n",
78 | "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pyod) (2.8.1)\n",
79 | "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pyod) (0.10.0)\n",
80 | "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pyod) (1.2.0)\n",
81 | "Requirement already satisfied: llvmlite<0.32.0,>=0.31.0dev0 in /usr/local/lib/python3.6/dist-packages (from numba>=0.35->pyod) (0.31.0)\n",
82 | "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from numba>=0.35->pyod) (49.6.0)\n",
83 | "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.25->pyod) (2018.9)\n",
84 | "Requirement already satisfied: patsy>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from statsmodels->pyod) (0.5.1)\n",
85 | "Building wheels for collected packages: pyod, combo, suod\n",
86 | " Building wheel for pyod (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
87 | " Created wheel for pyod: filename=pyod-0.8.2-cp36-none-any.whl size=110198 sha256=9fbdee4f02e7fe7e8443258c74bcb20ddbcfb9200a8d877269635684cc22e1e5\n",
88 | " Stored in directory: /root/.cache/pip/wheels/3a/ea/04/dbd99df8826a3d22139f44404cd14641615cd47ec2171cfe60\n",
89 | " Building wheel for combo (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
90 | " Created wheel for combo: filename=combo-0.1.1-cp36-none-any.whl size=42113 sha256=773054177cd3a3b01f52975a8d8dad002d6763acd508b432ae12ffac9007a77a\n",
91 | " Stored in directory: /root/.cache/pip/wheels/55/ec/e5/a2331372c676c467e70c6646e646edf6997d5c4905b8c0f5e6\n",
92 | " Building wheel for suod (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
93 | " Created wheel for suod: filename=suod-0.0.4-cp36-none-any.whl size=2167158 sha256=8948f151d7cc26d9fb7476831d8a263e8093258b24913514d525e648a1be6a16\n",
94 | " Stored in directory: /root/.cache/pip/wheels/57/55/e5/a4fca65bba231f6d0115059b589148774b41faea25b3f2aa27\n",
95 | "Successfully built pyod combo suod\n",
96 | "Installing collected packages: combo, suod, pyod\n",
97 | "Successfully installed combo-0.1.1 pyod-0.8.2 suod-0.0.4\n",
98 | "Requirement already up-to-date: pyod in /usr/local/lib/python3.6/dist-packages (0.8.2)\n",
99 | "Requirement already satisfied, skipping upgrade: combo in /usr/local/lib/python3.6/dist-packages (from pyod) (0.1.1)\n",
100 | "Requirement already satisfied, skipping upgrade: six in /usr/local/lib/python3.6/dist-packages (from pyod) (1.15.0)\n",
101 | "Requirement already satisfied, skipping upgrade: suod in /usr/local/lib/python3.6/dist-packages (from pyod) (0.0.4)\n",
102 | "Requirement already satisfied, skipping upgrade: scipy>=0.19.1 in /usr/local/lib/python3.6/dist-packages (from pyod) (1.4.1)\n",
103 | "Requirement already satisfied, skipping upgrade: pandas>=0.25 in /usr/local/lib/python3.6/dist-packages (from pyod) (1.0.5)\n",
104 | "Requirement already satisfied, skipping upgrade: joblib in /usr/local/lib/python3.6/dist-packages (from pyod) (0.16.0)\n",
105 | "Requirement already satisfied, skipping upgrade: matplotlib in /usr/local/lib/python3.6/dist-packages (from pyod) (3.2.2)\n",
106 | "Requirement already satisfied, skipping upgrade: numba>=0.35 in /usr/local/lib/python3.6/dist-packages (from pyod) (0.48.0)\n",
107 | "Requirement already satisfied, skipping upgrade: numpy>=1.13 in /usr/local/lib/python3.6/dist-packages (from pyod) (1.18.5)\n",
108 | "Requirement already satisfied, skipping upgrade: statsmodels in /usr/local/lib/python3.6/dist-packages (from pyod) (0.10.2)\n",
109 | "Requirement already satisfied, skipping upgrade: scikit-learn>=0.19.1 in /usr/local/lib/python3.6/dist-packages (from pyod) (0.22.2.post1)\n",
110 | "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.25->pyod) (2018.9)\n",
111 | "Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.25->pyod) (2.8.1)\n",
112 | "Requirement already satisfied, skipping upgrade: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pyod) (1.2.0)\n",
113 | "Requirement already satisfied, skipping upgrade: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pyod) (2.4.7)\n",
114 | "Requirement already satisfied, skipping upgrade: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pyod) (0.10.0)\n",
115 | "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from numba>=0.35->pyod) (49.6.0)\n",
116 | "Requirement already satisfied, skipping upgrade: llvmlite<0.32.0,>=0.31.0dev0 in /usr/local/lib/python3.6/dist-packages (from numba>=0.35->pyod) (0.31.0)\n",
117 | "Requirement already satisfied, skipping upgrade: patsy>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from statsmodels->pyod) (0.5.1)\n"
118 | ],
119 | "name": "stdout"
120 | }
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {
126 | "id": "NMo2RuHlQgY7",
127 | "colab_type": "text"
128 | },
129 | "source": [
130 | "Use PyOD's built-in `generate_data` utility to create a random dataset that contains a known fraction of outliers."
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "metadata": {
136 | "id": "PRgN7UtqO0Et",
137 | "colab_type": "code",
138 | "colab": {
139 | "base_uri": "https://localhost:8080/",
140 | "height": 352
141 | },
142 | "outputId": "1201f7b6-2292-4714-d1e3-863a6a3d27ec"
143 | },
144 | "source": [
145 | "import numpy as np\n",
146 | "from scipy import stats\n",
147 | "import matplotlib.pyplot as plt\n",
148 | "from pyod.models.knn import KNN\n",
149 | "from pyod.utils.data import generate_data\n",
150 | "outlier_fraction = 0.1  # fraction of outliers requested from generate_data\n",
151 | "n_train, n_test = 200, 100  # sizes of the training and test sets\n",
152 | "# behaviour='new' returns X_train, X_test, y_train, y_test (the legacy order is deprecated); random_state makes the data reproducible\n",
153 | "X_train, X_test, y_train, y_test = generate_data(n_train=n_train, n_test=n_test, contamination=outlier_fraction, behaviour='new', random_state=42)\n",
154 | "# split each set into its two feature columns for plotting\n",
155 | "feature_1_train = X_train[:,[0]].reshape(-1,1)\n",
156 | "feature_2_train = X_train[:,[1]].reshape(-1,1)\n",
157 | "feature_1_test = X_test[:,[0]].reshape(-1,1)\n",
158 | "feature_2_test = X_test[:,[1]].reshape(-1,1)\n",
159 | "# scatter plot of the training and test points on the same axes\n",
160 | "plt.scatter(feature_1_train,feature_2_train)\n",
161 | "plt.scatter(feature_1_test,feature_2_test)\n",
162 | "plt.xlabel('feature_1')\n",
163 | "plt.ylabel('feature_2')"
164 | ],
165 | "execution_count": null,
166 | "outputs": [
167 | {
168 | "output_type": "stream",
169 | "text": [
170 | "/usr/local/lib/python3.6/dist-packages/pyod/utils/data.py:190: FutureWarning: behaviour=\"old\" is deprecated and will be removed in version 0.8.0. Please use behaviour=\"new\", which makes the returned datasets in the order of X_train, X_test, y_train, y_test.\n",
171 | " FutureWarning)\n"
172 | ],
173 | "name": "stderr"
174 | },
175 | {
176 | "output_type": "execute_result",
177 | "data": {
178 | "text/plain": [
179 | "Text(0, 0.5, 'feature_2')"
180 | ]
181 | },
182 | "metadata": {
183 | "tags": []
184 | },
185 | "execution_count": 7
186 | },
187 | {
188 | "output_type": "display_data",
189 | "data": {
190 | "image/png": "\n",
191 | "text/plain": [
192 | ""
193 | ]
194 | },
195 | "metadata": {
196 | "tags": [],
197 | "needs_background": "light"
198 | }
199 | }
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {
205 | "id": "U_i4TUPZP2y_",
206 | "colab_type": "text"
207 | },
208 | "source": [
209 | "Train and make predictions with KNN model"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "metadata": {
215 | "id": "1PPSZ1xDPl0h",
216 | "colab_type": "code",
217 | "colab": {
218 | "base_uri": "https://localhost:8080/",
219 | "height": 52
220 | },
221 | "outputId": "a529b8d4-23d2-4246-cfdd-85ffbf56ea46"
222 | },
223 | "source": [
224 | "# Fit the kNN detector, passing the outlier fraction as its contamination setting.\n",
225 | "knn = KNN(contamination=outlier_fraction)\n",
226 | "knn.fit(X_train)\n",
227 | "# Training data: prediction labels and outlier scores stored on the fitted model.\n",
228 | "y_train_pred, y_train_scores = knn.labels_, knn.decision_scores_\n",
229 | "# Test data: predicted labels and outlier scores.\n",
230 | "y_test_pred, y_test_scores = knn.predict(X_test), knn.decision_function(X_test)\n",
231 | "# Count the test points whose predicted label differs from the true label.\n",
232 | "n_errors = np.sum(y_test_pred != y_test)\n",
233 | "print('No of Errors in test set: {}'.format(n_errors))\n",
234 | "# Accuracy is the share of test points labelled correctly.\n",
235 | "n_correct = n_test - n_errors\n",
236 | "print('Accuracy in test set: {}'.format(n_correct / n_test))"
237 | ],
238 | "execution_count": null,
239 | "outputs": [
240 | {
241 | "output_type": "stream",
242 | "text": [
243 | "No of Errors in test set: 0\n",
244 | "Accuracy in test set: 1.0\n"
245 | ],
246 | "name": "stdout"
247 | }
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {
253 | "id": "Z18WqKf5QMV7",
254 | "colab_type": "text"
255 | },
256 | "source": [
257 | "In the run shown above, the KNN algorithm classified every test point correctly (0 errors, accuracy 1.0)."
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "metadata": {
263 | "id": "1zwrqVDRPtNO",
264 | "colab_type": "code",
265 | "colab": {
266 | "base_uri": "https://localhost:8080/",
267 | "height": 650
268 | },
269 | "outputId": "9eb5626f-7c8f-461e-8d88-f4cf1e9047e1"
270 | },
271 | "source": [
272 | "from pyod.utils import example  # visualize() expects the detector's display name (used in the figure title) as its first argument\n",
273 | "example.visualize('KNN', X_train, y_train, X_test, y_test, y_train_pred, y_test_pred, show_figure=True, save_figure=False)"
274 | ],
275 | "execution_count": null,
276 | "outputs": [
277 | {
278 | "output_type": "display_data",
279 | "data": {
280 | "image/png": "\n",
281 | "text/plain": [
282 | ""
283 | ]
284 | },
285 | "metadata": {
286 | "tags": []
287 | }
288 | }
289 | ]
290 | }
291 | ]
292 | }
--------------------------------------------------------------------------------