├── mapper.py
├── reducer.py
├── README.md
├── pima.py
├── bigmart2.py
├── eda-of-iris-dataset.ipynb
└── basic-ml-best-of-10-classifiers.ipynb
/mapper.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys

#--- get all lines from stdin ---
for line in sys.stdin:
    #--- remove leading and trailing whitespace ---
    line = line.strip()

    #--- split the line into words ---
    words = line.split()

    #--- output tuples [word, 1] in tab-delimited format ---
    for word in words:
        print('%s\t%s' % (word, "1"))
--------------------------------------------------------------------------------
/reducer.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys

# maps words to their counts
word2count = {}

# input comes from STDIN
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()

    # parse the input we got from mapper.py
    word, count = line.split('\t', 1)

    # convert count (currently a string) to int
    try:
        count = int(count)
    except ValueError:
        continue

    # accumulate the count for this word
    word2count[word] = word2count.get(word, 0) + count

# write the tuples to stdout
# Note: they are unsorted
for word, count in word2count.items():
    print('%s\t%s' % (word, count))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 410246-Laboratory-Practice-I-410243-Data-Analytics-
410241:: High Performance Computing
410242:: Artificial Intelligence and Robotics
410243:: Data Analytics

For Hadoop Word Count Assignment:
Make sure both scripts are executable:

chmod +x mapper.py reducer.py

## Testing
Make sure your two programs work. Here's a simple series of tests you can run:
cat mapper.py | ./mapper.py
This will make mapper.py output all the words that make up its code.
cat mapper.py | ./mapper.py | sort | ./reducer.py
This will generate the (unsorted) frequencies of all the unique words (punctuated or not) in mapper.py.
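For a quick end-to-end sanity check on a tiny, made-up input (assuming both scripts are executable and in the current directory), you can also pipe a short string through the full map-sort-reduce chain:
echo "the quick fox the" | ./mapper.py | sort | ./reducer.py
This should print one tab-separated line per unique word, e.g. "the 2", "quick 1" and "fox 1" (in no particular order).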
## Running on the Hadoop Cluster
Let's run the Python code on the ulysses.txt file.
We'll assume that the Python code is stored in ~hadoop/352/dft/python
We'll assume that the streaming java library is in ~hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar
We'll also assume that ulysses.txt is in dft and that we want the output in dft-output:
cd
cd 352/dft/python
hadoop dfs -rmr dft-output
hadoop jar /home/hadoop/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -file ./mapper.py \
-mapper ./mapper.py -file ./reducer.py -reducer ./reducer.py -input dft -output dft-output
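Once the job finishes, the word counts are left in dft-output on HDFS (assuming the paths above). To take a quick look at the output of the first reducer:
hadoop dfs -cat dft-output/part-00000
You can also pull the whole result directory back to the local filesystem with hadoop dfs -copyToLocal dft-output dft-output if you prefer to inspect it locally.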
## Changing the number of Reducers
To change the number of reducers, simply add the switch -jobconf mapred.reduce.tasks=16 to the command line:
cd
cd 352/dft/python
hadoop dfs -rmr dft-output
hadoop jar /home/hadoop/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar \
-jobconf mapred.reduce.tasks=16 \
-file ./mapper.py \
-mapper ./mapper.py \
-file ./reducer.py \
-reducer ./reducer.py \
-input dft -output dft-output
--------------------------------------------------------------------------------
/pima.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 2 15:02:42 2019

@author: Acer
"""

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Load Data
#------------------------------------------------------------------------------
df = pd.read_csv('diabetes.csv')
#------------------------------------------------------------------------------
# dataset info
print(df.info())
print(df.describe())
print(df.isnull().sum())   # number of missing values per column
#------------------------------------------------------------------------------
# X and y for train_test_split
X = df.iloc[:, :-1]   # X = df.drop(['Outcome'], axis=1)
y = df.iloc[:, -1]    # y = df['Outcome']

#------------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
#------------------------------------------------------------------------------
from sklearn.naive_bayes import GaussianNB
# Model
gb = GaussianNB()

# fitting the model
gb.fit(X_train, y_train)

# prediction
y_pred = gb.predict(X_test)

# Accuracy, Precision, Recall, F1
print("Accuracy ", gb.score(X_test, y_test) * 100)
print('Precision score: ', precision_score(y_test, y_pred))
print('Recall score: ', recall_score(y_test, y_pred))
print('F1 score: ', f1_score(y_test, y_pred))

# Plot the confusion matrix
sns.set(font_scale=1.5)
cm = confusion_matrix(y_test, y_pred)   # (true labels, predictions)
sns.heatmap(cm, annot=True, fmt='g')
plt.show()
#------------------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression
# Model
LR = LogisticRegression()

# fitting the model
LR.fit(X_train, y_train)

# prediction
y_pred = LR.predict(X_test)

# Accuracy, Precision, Recall, F1
print("Accuracy ", LR.score(X_test, y_test) * 100)
print('Precision score: ', precision_score(y_test, y_pred))
print('Recall score: ', recall_score(y_test, y_pred))
print('F1 score: ', f1_score(y_test, y_pred))

# Plot the confusion matrix
sns.set(font_scale=1.5)
cm = confusion_matrix(y_test, y_pred)   # (true labels, predictions)
sns.heatmap(cm, annot=True, fmt='g')
plt.show()
--------------------------------------------------------------------------------
/bigmart2.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 4 00:37:00 2019

@author: Acer
"""

# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

train_data = pd.read_csv('bigmarttrain.csv')
test_data = pd.read_csv('bigmarttest.csv')

print(train_data.head())

# number of missing values per column in the test set
print(test_data.apply(lambda x: sum(x.isnull())))

print(test_data['Item_Fat_Content'].unique())

# combining misspelled Item_Fat_Content categories
train_data['Item_Fat_Content'].replace(['low fat', 'LF', 'reg'], ['Low Fat', 'Low Fat', 'Regular'], inplace=True)
test_data['Item_Fat_Content'].replace(['low fat', 'LF', 'reg'], ['Low Fat', 'Low Fat', 'Regular'], inplace=True)

# creating new column num_years (years since establishment, relative to 2013)
train_data['num_years'] = train_data['Outlet_Establishment_Year'].apply(lambda x: 2013 - x)
test_data['num_years'] = test_data['Outlet_Establishment_Year'].apply(lambda x: 2013 - x)

print(train_data['Item_Type'].unique())

sns.factorplot('Outlet_Location_Type', data=train_data, hue='Outlet_Size', kind='count')
sns.factorplot('Outlet_Location_Type', data=test_data, hue='Outlet_Size', kind='count')
sns.factorplot('Outlet_Type', 'num_years', data=test_data, hue='Outlet_Size')
sns.factorplot('Outlet_Type', 'num_years', data=train_data, hue='Outlet_Size')
plt.show()

full_data = [train_data, test_data]

# filling null values
for data in full_data:
    data['Item_Weight'].fillna(data['Item_Weight'].mean(), inplace=True)
    data['Outlet_Size'].fillna('Medium', inplace=True)

col = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

# handling categorical variables
train_datar = pd.get_dummies(train_data, columns=col, drop_first=True)
test_datar = pd.get_dummies(test_data, columns=col, drop_first=True)

feat_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'num_years',
             'Item_Fat_Content_Regular', 'Item_Type_Breads', 'Item_Type_Breakfast',
             'Item_Type_Canned', 'Item_Type_Dairy', 'Item_Type_Frozen Foods',
             'Item_Type_Fruits and Vegetables', 'Item_Type_Hard Drinks',
             'Item_Type_Health and Hygiene', 'Item_Type_Household', 'Item_Type_Meat',
             'Item_Type_Others', 'Item_Type_Seafood', 'Item_Type_Snack Foods',
             'Item_Type_Soft Drinks', 'Item_Type_Starchy Foods',
             'Outlet_Size_Medium', 'Outlet_Size_Small',
             'Outlet_Location_Type_Tier 2', 'Outlet_Location_Type_Tier 3',
             'Outlet_Type_Supermarket Type1', 'Outlet_Type_Supermarket Type2',
             'Outlet_Type_Supermarket Type3']

X = train_datar[feat_cols]
y = train_datar['Item_Outlet_Sales']

# splitting data as X_train and X_test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# calculating RMSE
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse = sqrt(mean_squared_error(y_test, y_pred))

print('RMSE', rmse)

# predicting on actual test data
X_t = test_datar[feat_cols]
y_result = lr.predict(X_t)

# creating results.csv file
result = pd.DataFrame()
result['Item_Identifier'] = test_datar['Item_Identifier']
result['Outlet_Identifier'] = test_datar['Outlet_Identifier']
result['Item_Outlet_Sales'] = y_result
result = result.sort_index()
result.to_csv('results.csv', index=False)
--------------------------------------------------------------------------------
/eda-of-iris-dataset.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"metadata":{"trusted":true,"_uuid":"9b924fa56f9a3dda9323e8420f202329661968f3"},"cell_type":"code","source":"#EDA Iris dataset\n\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\niris=pd.read_csv(\"../input/IRIS.csv\")\n\n\n#size of iris\niris.shape","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"b1478d62f9d67d6f1da59de8584c715ac6dbd618"},"cell_type":"code","source":"#coloumns name\nprint(iris.columns) ","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"dc31699b5b048edbcd8609121d3ad34792d89279"},"cell_type":"code","source":"iris.head()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"606bef1774aa849cfac381ccb051fbc8fcc56666"},"cell_type":"code","source":"#how many datapoints for each classes are present\n\niris[\"species\"].value_counts()\n\n#this is a balanced dataset as each class has equal no of values or almost similar values is also balanced dataset(not imbalanced dataset)","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"43dc829a5bccf33230959a1c46463c9f6b72d3fb"},"cell_type":"code","source":"#2-d scatter plot\n\niris.plot(kind='scatter',x='sepal_length',y='sepal_width')\nplt.show()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"389615615af9aea295c1ba1160db48c9debe1d06"},"cell_type":"code","source":"import seaborn as sns\n\nsns.set_style(\"whitegrid\")\nsns.FacetGrid(iris,hue=\"species\",size=4)\\\n .map(plt.scatter,\"sepal_length\",\"sepal_width\")\\\n .add_legend()\nplt.show()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"8bb0b04804223b85051d36802fa3851fe5518724"},"cell_type":"code","source":"#plot on bases of petal\n\n\nsns.set_style(\"whitegrid\")\nsns.FacetGrid(iris,hue=\"species\",size=4)\\\n .map(plt.scatter,\"petal_length\",\"petal_width\")\\\n .add_legend()\nplt.show()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"c6a6ffd6ead7709996537d5ff8688582697b45e1"},"cell_type":"code","source":"#Pair plot\n\nplt.close();\nsns.set_style(\"whitegrid\")\nsns.pairplot(iris,hue='species',size=3);\nplt.show()","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"6f217423ef51fa102c631c97af62bf912d2bcc81"},"cell_type":"markdown","source":"Petal length and petal width are most important features.\nSetosa are easy differnetiable or linearly seperable but there is little overlap between Versicolor and verginica.\nWe can create model using simple if-else condition.\n\nLimitations of pair plot: can be used for only less features.. like max 6 feautures. 
as total plots will be like nc2 where n is no of features.\n"},{"metadata":{"trusted":false,"_uuid":"49543f21f61b029af80b1589d5d66afa48f3458c"},"cell_type":"code","source":"#Histogram(1-D sctter plot kind of)\n\nsns.FacetGrid(iris,hue=\"species\",size=5)\\\n .map(sns.distplot, \"petal_length\")\\\n .add_legend();\nplt.show();","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"2d6d37f154344bb8489976687ccc9664be995ca3"},"cell_type":"code","source":"#pdf and cdf\n\niris_setosa=iris[iris['species']=='Iris-setosa']\nprint(iris_setosa.head())\ncounts, bin_edges= np.histogram(iris_setosa['petal_length'],bins=10, density= True)\npdf=counts/(sum(counts))\nprint(pdf)\n\nprint(bin_edges)\n\ncdf=np.cumsum(pdf)\nprint(cdf)\n\nplt.plot(bin_edges[1:],pdf)\nplt.plot(bin_edges[1:],cdf)\nplt.show();\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"bf0db346e43b0e978ad088644c44d9d7019fec01"},"cell_type":"code","source":"#pdf and cdf\n\niris_setosa=iris[iris['species']=='Iris-setosa']\nprint(iris_setosa.head())\ncounts, bin_edges= np.histogram(iris_setosa['petal_length'],bins=10, density= True)\npdf=counts/(sum(counts))\nprint(pdf)\n\nprint(bin_edges)\n\ncdf=np.cumsum(pdf)\nprint(cdf)\n\nplt.plot(bin_edges[1:],pdf)\nplt.plot(bin_edges[1:],cdf)\n\n\n\niris_versicolor=iris[iris['species']=='Iris-versicolor']\n\ncounts_ve, bin_edges_ve= np.histogram(iris_versicolor['petal_length'],bins=10, density= True)\npdf_ve=counts_ve/(sum(counts_ve))\nprint(pdf_ve)\n\nprint(bin_edges_ve)\n\ncdf_ve=np.cumsum(pdf_ve)\nprint(cdf_ve)\n\nplt.plot(bin_edges_ve[1:],pdf_ve)\nplt.plot(bin_edges_ve[1:],cdf_ve)\n\n\niris_virginica=iris[iris['species']=='Iris-virginica']\n\ncounts_vi, bin_edges_vi= np.histogram(iris_virginica['petal_length'],bins=10, density= True)\npdf_vi=counts_vi/(sum(counts_vi))\nprint(pdf_vi)\n\nprint(bin_edges_vi)\n\ncdf_vi=np.cumsum(pdf_vi)\nprint(cdf_vi)\n\nplt.plot(bin_edges_vi[1:],pdf_vi)\nplt.plot(bin_edges_vi[1:],cdf_vi)\n\n\nplt.show();\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"a029880b8e6a2ff0710871057071ed45917d1d35"},"cell_type":"code","source":"iris_virginica.describe()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"03f5fc89dc97e4bdb65cb454e4a679138b5a338f"},"cell_type":"code","source":"sns.boxplot(x='species', y='petal_length',data=iris)\nplt.show()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"199838d6bd83031323775fde0be8cce68c65c086"},"cell_type":"code","source":"sns.violinplot(x='species', y='petal_length',data=iris)\nplt.show()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"e31fa7386938110d1f9012893b96bd05f8b07d7d"},"cell_type":"code","source":"#2D Density plot, contors-plot\nsns.jointplot(x=\"petal_length\", y=\"petal_width\", data=iris_setosa, kind=\"kde\");\nplt.show();","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"05da958f26565c024d03854c3b727aa06c92d1e1"},"cell_type":"code","source":"","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.6.6"}},"nbformat":4,"nbformat_minor":1}
--------------------------------------------------------------------------------
/basic-ml-best-of-10-classifiers.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "_cell_guid": "3ebb8e8b-4526-24f5-208f-ee2cb9f1596b"
7 | },
8 | "source": [
9 | "Hugues Fontenelle\n",
10 | "7 October 2016\n",
11 | "\n",
12 | "# Pima Indians Diabetes Database\n",
13 | "## Predict the onset of diabetes based on diagnostic measures\n",
14 | "\n",
15 | "Hi folks. I'm new to this, so let me try out what I've learned so far. Your comments are welcome!"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "_cell_guid": "47666c75-d504-bf1d-dfef-12bce1cbe8eb"
22 | },
23 | "source": [
24 | "First, let's load the data, and split it in four. It is the fold used the authors of the original paper."
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 1,
30 | "metadata": {
31 | "_cell_guid": "9dbddda0-7005-5721-48d4-5a794ffc3dec"
32 | },
33 | "outputs": [],
34 | "source": [
35 | "import numpy as np\n",
36 | "\n",
37 | "f = open(\"../input/diabetes.csv\")\n",
38 | "f.readline() # skip the header\n",
39 | "data = np.loadtxt(f, delimiter = ',')\n",
40 | "X = data[:, :-1]\n",
41 | "y = data[:, -1]\n",
42 | "from sklearn.model_selection import train_test_split\n",
43 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {
49 | "_cell_guid": "aadbd4f8-b043-0899-0c2b-17be3c6a7a94"
50 | },
51 | "source": [
52 | "Let's try out a bunch of classifiers, all with default parameters."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 2,
58 | "metadata": {
59 | "_cell_guid": "24637d51-c27f-9876-7c9d-d2259ef91a15"
60 | },
61 | "outputs": [],
62 | "source": [
63 | "from sklearn.neural_network import MLPClassifier\n",
64 | "from sklearn.neighbors import KNeighborsClassifier\n",
65 | "from sklearn.svm import SVC\n",
66 | "from sklearn.gaussian_process import GaussianProcessClassifier\n",
67 | "from sklearn.gaussian_process.kernels import RBF\n",
68 | "from sklearn.tree import DecisionTreeClassifier\n",
69 | "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n",
70 | "from sklearn.naive_bayes import GaussianNB\n",
71 | "from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\n",
72 | "\n",
73 | "names = [\"Nearest Neighbors\", \"Linear SVM\", \"RBF SVM\", \"Gaussian Process\",\n",
74 | " \"Decision Tree\", \"Random Forest\", \"Neural Net\", \"AdaBoost\",\n",
75 | " \"Naive Bayes\", \"QDA\"\n",
76 | " ]\n",
77 | "\n",
78 | "classifiers = [\n",
79 | " KNeighborsClassifier(),\n",
80 | " SVC(kernel=\"linear\"),\n",
81 | " SVC(kernel=\"rbf\"),\n",
82 | " GaussianProcessClassifier(),\n",
83 | " DecisionTreeClassifier(),\n",
84 | " RandomForestClassifier(),\n",
85 | " MLPClassifier(),\n",
86 | " AdaBoostClassifier(),\n",
87 | " GaussianNB(),\n",
88 | " QuadraticDiscriminantAnalysis()\n",
89 | "]"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {
95 | "_cell_guid": "f8bc9411-67fa-c204-db43-bc45a5704d4c"
96 | },
97 | "source": [
98 | "Now run all the classifiers, using 5-fold cross validation."
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 3,
104 | "metadata": {
105 | "_cell_guid": "8890a677-3e71-9ea2-ee9b-ac94446151d4"
106 | },
107 | "outputs": [],
108 | "source": [
109 | "from sklearn.model_selection import cross_val_score\n",
110 | "\n",
111 | "# iterate over classifiers\n",
112 | "results = {}\n",
113 | "for name, clf in zip(names, classifiers):\n",
114 | " scores = cross_val_score(clf, X_train, y_train, cv=5)\n",
115 | " results[name] = scores"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {
121 | "_cell_guid": "af9d636c-9704-7425-01dd-d6db580d4c4e"
122 | },
123 | "source": [
124 | "Here are the results:"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 4,
130 | "metadata": {
131 | "_cell_guid": "458d31e6-b1f7-8c9f-ecc2-4729f8a3d617"
132 | },
133 | "outputs": [
134 | {
135 | "name": "stdout",
136 | "output_type": "stream",
137 | "text": [
138 | " RBF SVM | Accuracy: 64.24% (+/- 0.44%)\n",
139 | " Linear SVM | Accuracy: 76.04% (+/- 5.58%)\n",
140 | " Neural Net | Accuracy: 60.06% (+/- 23.16%)\n",
141 | " Decision Tree | Accuracy: 66.85% (+/- 4.62%)\n",
142 | " Gaussian Process | Accuracy: 68.58% (+/- 6.14%)\n",
143 | " Nearest Neighbors | Accuracy: 71.18% (+/- 7.56%)\n",
144 | " QDA | Accuracy: 73.97% (+/- 8.84%)\n",
145 | " AdaBoost | Accuracy: 72.57% (+/- 8.32%)\n",
146 | " Naive Bayes | Accuracy: 73.62% (+/- 5.78%)\n",
147 | " Random Forest | Accuracy: 73.44% (+/- 3.69%)\n"
148 | ]
149 | }
150 | ],
151 | "source": [
152 | "for name, scores in results.items():\n",
153 | " print(\"%20s | Accuracy: %0.2f%% (+/- %0.2f%%)\" % (name, 100*scores.mean(), 100*scores.std() * 2))"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {
159 | "_cell_guid": "90315be6-17cf-0c9d-5b3f-cb2b826181a1"
160 | },
161 | "source": [
162 | "Seems like a Linear SVM performs best.\n",
163 | "Let's try some parameter optimization."
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 5,
169 | "metadata": {
170 | "_cell_guid": "114b0360-f423-9d8e-cc99-e1ad84759169"
171 | },
172 | "outputs": [
173 | {
174 | "name": "stderr",
175 | "output_type": "stream",
176 | "text": [
177 | "/opt/conda/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
178 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n",
179 | "/opt/conda/lib/python3.5/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.\n",
180 | " DeprecationWarning)\n"
181 | ]
182 | },
183 | {
184 | "name": "stdout",
185 | "output_type": "stream",
186 | "text": [
187 | "GridSearchCV(cv=None, error_score='raise',\n",
188 | " estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
189 | " decision_function_shape=None, degree=3, gamma='auto', kernel='linear',\n",
190 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
191 | " tol=0.001, verbose=False),\n",
192 | " fit_params={}, iid=True, n_jobs=1,\n",
193 | " param_grid=[{'C': [0.01, 0.1, 1, 10], 'kernel': ['linear']}],\n",
194 | " pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "from sklearn.grid_search import GridSearchCV\n",
200 | "\n",
201 | "clf = SVC(kernel=\"linear\")\n",
202 | "\n",
203 | "# prepare a range of values to test\n",
204 | "param_grid = [\n",
205 | " {'C': [.01, .1, 1, 10], 'kernel': ['linear']},\n",
206 | " ]\n",
207 | "\n",
208 | "grid = GridSearchCV(estimator=clf, param_grid=param_grid)\n",
209 | "grid.fit(X_train, y_train)\n",
210 | "print(grid)"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 6,
216 | "metadata": {
217 | "_cell_guid": "da7a2382-e12d-1c61-ef87-684907f3dbaf"
218 | },
219 | "outputs": [
220 | {
221 | "name": "stdout",
222 | "output_type": "stream",
223 | "text": [
224 | "Best score: 76.39%\n",
225 | "Best estimator for parameter C: 0.100000\n"
226 | ]
227 | }
228 | ],
229 | "source": [
230 | "# summarize the results of the grid search\n",
231 | "print(\"Best score: %0.2f%%\" % (100*grid.best_score_))\n",
232 | "print(\"Best estimator for parameter C: %f\" % (grid.best_estimator_.C))"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {
238 | "_cell_guid": "b49c3baf-9dba-417f-9d37-5f6ea30652df"
239 | },
240 | "source": [
241 | "Finaly, train the Linear SVM (with param `C=0.1`) on the whole train set, and evaluate on the test set"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 7,
247 | "metadata": {
248 | "_cell_guid": "fa723561-4feb-c20c-971e-55b5fcc1cebc"
249 | },
250 | "outputs": [],
251 | "source": [
252 | "clf = SVC(kernel=\"linear\", C=0.1)\n",
253 | "clf.fit(X_train, y_train)\n",
254 | "y_eval = clf.predict(X_test)"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 8,
260 | "metadata": {
261 | "_cell_guid": "1898e1cd-08b7-c044-86d3-77155ab7a818"
262 | },
263 | "outputs": [
264 | {
265 | "name": "stdout",
266 | "output_type": "stream",
267 | "text": [
268 | "Accuracy: 80.21%\n"
269 | ]
270 | }
271 | ],
272 | "source": [
273 | "acc = sum(y_eval == y_test) / float(len(y_test))\n",
274 | "print(\"Accuracy: %.2f%%\" % (100*acc))"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {
280 | "_cell_guid": "1c626fa2-0c33-69cc-54e8-b6868c907ad9"
281 | },
282 | "source": [
283 | "We did it :-)"
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "metadata": {
289 | "_cell_guid": "569c56fb-822a-1d19-38bd-92270a8a7989"
290 | },
291 | "source": [
292 | "**edit**\n",
293 | "\n",
294 | "I was _probably_ a bit lucky for this particular fold (`random_state=0`). Why would the accuracy on the test be higher than on the optimized trained set? Let's re-run a 5-fold cv on the whole data:"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 9,
300 | "metadata": {
301 | "_cell_guid": "d80d926e-fa4d-4f34-0836-0594168a6601"
302 | },
303 | "outputs": [],
304 | "source": [
305 | "clf = SVC(kernel=\"linear\", C=0.1)\n",
306 | "scores_final = cross_val_score(clf, X, y, cv=5)"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": 10,
312 | "metadata": {
313 | "_cell_guid": "cdbcd59f-97ee-0fce-a25c-d63e11e2bf90"
314 | },
315 | "outputs": [
316 | {
317 | "name": "stdout",
318 | "output_type": "stream",
319 | "text": [
320 | "Final model | Accuracy: 76.83% (+/- 4.31%)\n"
321 | ]
322 | }
323 | ],
324 | "source": [
325 | "scores_final.mean(), scores_final.std()\n",
326 | "print(\"Final model | Accuracy: %0.2f%% (+/- %0.2f%%)\" % (100*scores_final.mean(), 100*scores_final.std() * 2))"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "metadata": {
332 | "_cell_guid": "9bf33d09-cc9b-b5c3-49e5-9da5f49b4c71"
333 | },
334 | "source": [
335 | "..which is more realistic!\n",
336 | "\n",
337 | "I am wondering, at which stage do I then use this test set?"
338 | ]
339 | }
340 | ],
341 | "metadata": {
342 | "_change_revision": 12,
343 | "_is_fork": false,
344 | "kernelspec": {
345 | "display_name": "Python 3",
346 | "language": "python",
347 | "name": "python3"
348 | },
349 | "language_info": {
350 | "codemirror_mode": {
351 | "name": "ipython",
352 | "version": 3
353 | },
354 | "file_extension": ".py",
355 | "mimetype": "text/x-python",
356 | "name": "python",
357 | "nbconvert_exporter": "python",
358 | "pygments_lexer": "ipython3",
359 | "version": "3.5.2"
360 | }
361 | },
362 | "nbformat": 4,
363 | "nbformat_minor": 0
364 | }
365 |
--------------------------------------------------------------------------------