├── mapper.py
├── reducer.py
├── README.md
├── pima.py
├── bigmart2.py
├── eda-of-iris-dataset.ipynb
└── basic-ml-best-of-10-classifiers.ipynb
/mapper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | 
4 | #--- get all lines from stdin ---
5 | for line in sys.stdin:
6 |     #--- remove leading and trailing whitespace ---
7 |     line = line.strip()
8 | 
9 |     #--- split the line into words ---
10 |     words = line.split()
11 | 
12 |     #--- output tuples [word, 1] in tab-delimited format ---
13 |     for word in words:
14 |         print('%s\t%s' % (word, "1"))
--------------------------------------------------------------------------------
/reducer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | 
4 | # maps words to their counts
5 | word2count = {}
6 | 
7 | # input comes from STDIN (the tab-delimited pairs emitted by mapper.py)
8 | for line in sys.stdin:
9 |     # remove leading and trailing whitespace
10 |     line = line.strip()
11 | 
12 |     # parse the input we got from mapper.py
13 |     word, count = line.split('\t', 1)
14 | 
15 |     # convert count (currently a string) to int
16 |     try:
17 |         count = int(count)
18 |     except ValueError:
19 |         # skip lines where count is not a number
20 |         continue
21 | 
22 |     # accumulate the count for this word
23 |     word2count[word] = word2count.get(word, 0) + count
24 | 
25 | # write the (word, count) tuples to stdout
26 | # Note: they are unsorted
27 | for word in word2count:
28 |     print('%s\t%s' % (word, word2count[word]))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 410246-Laboratory-Practice-I-410243-Data-Analytics-
2 | 410241:: High Performance Computing<br>
3 | 410242:: Artificial Intelligence and Robotics
4 | 410243:: Data Analytics
5 | 6 | For the Hadoop Word Count Assignment:<br>
7 | Make sure both files are executable:<br>
8 | 9 | chmod +x mapper.py reducer.py<br>
10 | 11 | Testing
12 | Make sure your two programs work. Here's a simple series of tests you can run:<br>
13 | cat mapper.py | ./mapper.py<br>
14 | This will make mapper.py output all the words that make up its code.
15 | cat mapper.py | ./mapper.py | sort | ./reducer.py<br>
16 | This will generate the (unsorted) frequencies of all the unique words (punctuated or not) in mapper.py.
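For example, assuming both scripts have been made executable as above, a quick sanity check on a tiny hand-made input is:<br>
echo "the quick brown fox jumps over the lazy dog the" | ./mapper.py | sort | ./reducer.py<br>
This should print one tab-separated count per distinct word (e.g. "the 3", "fox 1"), although the order of the lines may vary because the reducer prints its dictionary without sorting.<br>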
17 | Running on the Hadoop Cluster
18 | Let's run the Python code on the ulysses.txt file.<br>
19 | We'll assume that the Python code is stored in ~hadoop/352/dft/python
20 | We'll assume that the streaming Java library is in ~hadoop/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar<br>
21 | We'll also assume that ulysses.txt is in dft and that we want the output in dft-output:
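(If ulysses.txt is not yet on HDFS, it can be uploaded first; assuming a local copy sits in the current directory, something like the following should work:)<br>
hadoop dfs -mkdir dft<br>
hadoop dfs -put ulysses.txt dft<br>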
22 | cd
23 | cd 352/dft/python
24 | hadoop dfs -rmr dft-output<br>
25 | hadoop jar /home/hadoop/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -file ./mapper.py \
26 | -mapper ./mapper.py -file ./reducer.py -reducer ./reducer.py -input dft -output dft-output
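When the job finishes, the counts are written to dft-output on HDFS, typically as files named part-00000, part-00001, and so on. One way to inspect them:<br>
hadoop dfs -ls dft-output<br>
hadoop dfs -cat dft-output/part-00000 | less<br>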
27 | Changing the number of Reducers
28 | To change the number of reducers, simply add this switch -jobconf mapred.reduce.tasks=16 to the command line:
29 | cd
30 | cd 352/dft/python
31 | hadoop dfs -rmr dft-output<br>
32 | hadoop jar /home/hadoop/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar \
33 | -jobconf mapred.reduce.tasks=16 \
34 | -file ./mapper.py \
35 | -mapper ./mapper.py \
36 | -file ./reducer.py \
37 | -reducer ./reducer.py \
38 | -input dft -output dft-output
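With mapred.reduce.tasks=16 the counts are spread across 16 part-xxxxx files, each one unsorted. If a single local file is wanted, one option is to merge them, for example:<br>
hadoop dfs -getmerge dft-output ./ulysses-wordcounts.txt<br>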
39 | 
--------------------------------------------------------------------------------
/pima.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Nov 2 15:02:42 2019
4 | 
5 | @author: Acer
6 | """
7 | 
8 | import pandas as pd
9 | import numpy as np
10 | 
11 | import matplotlib.pyplot as plt
12 | import seaborn as sns
13 | 
14 | from sklearn.metrics import precision_score, recall_score, f1_score
15 | from sklearn.metrics import confusion_matrix
16 | from sklearn.model_selection import train_test_split
17 | 
18 | # Load Data
19 | #------------------------------------------------------------------------------
20 | df = pd.read_csv('diabetes.csv')
21 | #------------------------------------------------------------------------------
22 | # dataset info
23 | print(df.info())
24 | print(df.describe())
25 | print(df.isnull().sum())   # number of missing values per column
26 | #------------------------------------------------------------------------------
27 | # X and y for train_test_split
28 | X = df.iloc[:, :-1]   # X = df.drop(['Outcome'], axis=1)
29 | y = df.iloc[:, -1]    # y = df['Outcome']
30 | 
31 | #------------------------------------------------------------------------------
32 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
33 | #------------------------------------------------------------------------------
34 | from sklearn.naive_bayes import GaussianNB
35 | # Model
36 | gb = GaussianNB()
37 | 
38 | # fitting the model
39 | gb.fit(X_train, y_train)
40 | 
41 | # prediction
42 | y_pred = gb.predict(X_test)
43 | 
44 | # Accuracy, Precision, Recall, F1
45 | print("Accuracy ", gb.score(X_test, y_test)*100)
46 | print('Precision score: ', precision_score(y_test, y_pred))
47 | print('Recall score: ', recall_score(y_test, y_pred))
48 | print('F1 score: ', f1_score(y_test, y_pred))
49 | 
50 | # Plot the confusion matrix
51 | sns.set(font_scale=1.5)
52 | cm = confusion_matrix(y_test, y_pred)
53 | sns.heatmap(cm, annot=True, fmt='g')
54 | plt.show()
55 | #------------------------------------------------------------------------------
56 | from sklearn.linear_model import LogisticRegression
57 | # Model
58 | LR = LogisticRegression()
59 | 
60 | # fitting the model
61 | LR.fit(X_train, y_train)
62 | 
63 | # prediction
64 | y_pred = LR.predict(X_test)
65 | 
66 | # Accuracy, Precision, Recall, F1
67 | print("Accuracy ", LR.score(X_test, y_test)*100)
68 | print('Precision score: ', precision_score(y_test, y_pred))
69 | print('Recall score: ', recall_score(y_test, y_pred))
70 | print('F1 score: ', f1_score(y_test, y_pred))
71 | # Plot the confusion matrix
72 | sns.set(font_scale=1.5)
73 | cm = confusion_matrix(y_test, y_pred)
74 | sns.heatmap(cm, annot=True, fmt='g')
75 | plt.show()
--------------------------------------------------------------------------------
/bigmart2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Nov 4 00:37:00 2019
4 | 
5 | @author: Acer
6 | """
7 | 
8 | # importing required libraries
9 | import pandas as pd
10 | import numpy as np
11 | import matplotlib.pyplot as plt
12 | import seaborn as sns
13 | 
14 | train_data = pd.read_csv('bigmarttrain.csv')
15 | test_data = pd.read_csv('bigmarttest.csv')
16 | 
17 | print(train_data.head())
18 | 
19 | test_data.apply(lambda x: sum(x.isnull()))   # missing values per column
20 | 
21 | test_data['Item_Fat_Content'].unique()
22 | 
23 | # combining the misspelled Item_Fat_Content categories
24 | train_data['Item_Fat_Content'].replace(['low fat','LF','reg'], ['Low Fat','Low Fat','Regular'], inplace=True)
25 | test_data['Item_Fat_Content'].replace(['low fat','LF','reg'], ['Low Fat','Low Fat','Regular'], inplace=True)
26 | 
27 | # creating new column num_years
28 | train_data['num_years'] = train_data['Outlet_Establishment_Year'].apply(lambda x: 2013 - x)
29 | test_data['num_years'] = test_data['Outlet_Establishment_Year'].apply(lambda x: 2013 - x)
30 | 
31 | train_data['Item_Type'].unique()
32 | 
33 | sns.factorplot('Outlet_Location_Type', data=train_data, hue='Outlet_Size', kind='count')
34 | 
35 | sns.factorplot('Outlet_Location_Type', data=test_data, hue='Outlet_Size', kind='count')
36 | 
37 | sns.factorplot('Outlet_Type', 'num_years', data=test_data, hue='Outlet_Size')
38 | 
39 | sns.factorplot('Outlet_Type', 'num_years', data=train_data, hue='Outlet_Size')
40 | 
41 | full_data = [train_data, test_data]
42 | 
43 | # filling null values
44 | for data in full_data:
45 |     data['Item_Weight'].fillna(data['Item_Weight'].mean(), inplace=True)
46 |     data['Outlet_Size'].fillna('Medium', inplace=True)
47 | 
48 | col = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
49 | 
50 | # handling categorical variables
51 | train_datar = pd.get_dummies(train_data, columns=col, drop_first=True)
52 | test_datar = pd.get_dummies(test_data, columns=col, drop_first=True)
53 | 
54 | feat_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'num_years',
55 |              'Item_Fat_Content_Regular', 'Item_Type_Breads', 'Item_Type_Breakfast',
56 |              'Item_Type_Canned', 'Item_Type_Dairy', 'Item_Type_Frozen Foods',
57 |              'Item_Type_Fruits and Vegetables', 'Item_Type_Hard Drinks',
58 |              'Item_Type_Health and Hygiene', 'Item_Type_Household', 'Item_Type_Meat',
59 |              'Item_Type_Others', 'Item_Type_Seafood', 'Item_Type_Snack Foods',
60 |              'Item_Type_Soft Drinks', 'Item_Type_Starchy Foods',
61 |              'Outlet_Size_Medium', 'Outlet_Size_Small',
62 |              'Outlet_Location_Type_Tier 2', 'Outlet_Location_Type_Tier 3',
63 |              'Outlet_Type_Supermarket Type1', 'Outlet_Type_Supermarket Type2',
64 |              'Outlet_Type_Supermarket Type3']
65 | 
66 | X = train_datar[feat_cols]
67 | y = train_datar['Item_Outlet_Sales']
68 | 
69 | # splitting data as X_train and X_test
70 | from sklearn.model_selection import train_test_split
71 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
72 | 
73 | from sklearn.linear_model import LinearRegression
74 | lr = LinearRegression()
75 | lr.fit(X_train, y_train)
76 | y_pred = lr.predict(X_test)
77 | 
78 | # calculating RMSE
79 | from sklearn.metrics import mean_squared_error
80 | from math import sqrt
81 | rmse = sqrt(mean_squared_error(y_test, y_pred))
82 | 
83 | print('RMSE', rmse)
84 | 
85 | # predicting on actual test data
86 | X_t = test_datar[feat_cols]
87 | y_result = lr.predict(X_t)
88 | 
89 | # creating results.csv file
90 | result = pd.DataFrame()
91 | result['Item_Identifier'] = test_datar['Item_Identifier']
92 | result['Outlet_Identifier'] = test_datar['Outlet_Identifier']
93 | 
94 | result["Item_Outlet_Sales"] = y_result
95 | result =
result.sort_index() -------------------------------------------------------------------------------- /eda-of-iris-dataset.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{"trusted":true,"_uuid":"9b924fa56f9a3dda9323e8420f202329661968f3"},"cell_type":"code","source":"#EDA Iris dataset\n\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\niris=pd.read_csv(\"../input/IRIS.csv\")\n\n\n#size of iris\niris.shape","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"b1478d62f9d67d6f1da59de8584c715ac6dbd618"},"cell_type":"code","source":"#coloumns name\nprint(iris.columns) ","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"dc31699b5b048edbcd8609121d3ad34792d89279"},"cell_type":"code","source":"iris.head()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"606bef1774aa849cfac381ccb051fbc8fcc56666"},"cell_type":"code","source":"#how many datapoints for each classes are present\n\niris[\"species\"].value_counts()\n\n#this is a balanced dataset as each class has equal no of values or almost similar values is also balanced dataset(not imbalanced dataset)","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"43dc829a5bccf33230959a1c46463c9f6b72d3fb"},"cell_type":"code","source":"#2-d scatter plot\n\niris.plot(kind='scatter',x='sepal_length',y='sepal_width')\nplt.show()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"389615615af9aea295c1ba1160db48c9debe1d06"},"cell_type":"code","source":"import seaborn as sns\n\nsns.set_style(\"whitegrid\")\nsns.FacetGrid(iris,hue=\"species\",size=4)\\\n .map(plt.scatter,\"sepal_length\",\"sepal_width\")\\\n .add_legend()\nplt.show()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"8bb0b04804223b85051d36802fa3851fe5518724"},"cell_type":"code","source":"#plot on bases of petal\n\n\nsns.set_style(\"whitegrid\")\nsns.FacetGrid(iris,hue=\"species\",size=4)\\\n .map(plt.scatter,\"petal_length\",\"petal_width\")\\\n .add_legend()\nplt.show()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"c6a6ffd6ead7709996537d5ff8688582697b45e1"},"cell_type":"code","source":"#Pair plot\n\nplt.close();\nsns.set_style(\"whitegrid\")\nsns.pairplot(iris,hue='species',size=3);\nplt.show()","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"6f217423ef51fa102c631c97af62bf912d2bcc81"},"cell_type":"markdown","source":"Petal length and petal width are most important features.\nSetosa are easy differnetiable or linearly seperable but there is little overlap between Versicolor and verginica.\nWe can create model using simple if-else condition.\n\nLimitations of pair plot: can be used for only less features.. like max 6 feautures. 
as total plots will be like nc2 where n is no of features.\n"},{"metadata":{"trusted":false,"_uuid":"49543f21f61b029af80b1589d5d66afa48f3458c"},"cell_type":"code","source":"#Histogram(1-D sctter plot kind of)\n\nsns.FacetGrid(iris,hue=\"species\",size=5)\\\n .map(sns.distplot, \"petal_length\")\\\n .add_legend();\nplt.show();","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"2d6d37f154344bb8489976687ccc9664be995ca3"},"cell_type":"code","source":"#pdf and cdf\n\niris_setosa=iris[iris['species']=='Iris-setosa']\nprint(iris_setosa.head())\ncounts, bin_edges= np.histogram(iris_setosa['petal_length'],bins=10, density= True)\npdf=counts/(sum(counts))\nprint(pdf)\n\nprint(bin_edges)\n\ncdf=np.cumsum(pdf)\nprint(cdf)\n\nplt.plot(bin_edges[1:],pdf)\nplt.plot(bin_edges[1:],cdf)\nplt.show();\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"bf0db346e43b0e978ad088644c44d9d7019fec01"},"cell_type":"code","source":"#pdf and cdf\n\niris_setosa=iris[iris['species']=='Iris-setosa']\nprint(iris_setosa.head())\ncounts, bin_edges= np.histogram(iris_setosa['petal_length'],bins=10, density= True)\npdf=counts/(sum(counts))\nprint(pdf)\n\nprint(bin_edges)\n\ncdf=np.cumsum(pdf)\nprint(cdf)\n\nplt.plot(bin_edges[1:],pdf)\nplt.plot(bin_edges[1:],cdf)\n\n\n\niris_versicolor=iris[iris['species']=='Iris-versicolor']\n\ncounts_ve, bin_edges_ve= np.histogram(iris_versicolor['petal_length'],bins=10, density= True)\npdf_ve=counts_ve/(sum(counts_ve))\nprint(pdf_ve)\n\nprint(bin_edges_ve)\n\ncdf_ve=np.cumsum(pdf_ve)\nprint(cdf_ve)\n\nplt.plot(bin_edges_ve[1:],pdf_ve)\nplt.plot(bin_edges_ve[1:],cdf_ve)\n\n\niris_virginica=iris[iris['species']=='Iris-virginica']\n\ncounts_vi, bin_edges_vi= np.histogram(iris_virginica['petal_length'],bins=10, density= True)\npdf_vi=counts_vi/(sum(counts_vi))\nprint(pdf_vi)\n\nprint(bin_edges_vi)\n\ncdf_vi=np.cumsum(pdf_vi)\nprint(cdf_vi)\n\nplt.plot(bin_edges_vi[1:],pdf_vi)\nplt.plot(bin_edges_vi[1:],cdf_vi)\n\n\nplt.show();\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"a029880b8e6a2ff0710871057071ed45917d1d35"},"cell_type":"code","source":"iris_virginica.describe()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"03f5fc89dc97e4bdb65cb454e4a679138b5a338f"},"cell_type":"code","source":"sns.boxplot(x='species', y='petal_length',data=iris)\nplt.show()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"199838d6bd83031323775fde0be8cce68c65c086"},"cell_type":"code","source":"sns.violinplot(x='species', y='petal_length',data=iris)\nplt.show()","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"e31fa7386938110d1f9012893b96bd05f8b07d7d"},"cell_type":"code","source":"#2D Density plot, contors-plot\nsns.jointplot(x=\"petal_length\", y=\"petal_width\", data=iris_setosa, kind=\"kde\");\nplt.show();","execution_count":null,"outputs":[]},{"metadata":{"trusted":false,"_uuid":"05da958f26565c024d03854c3b727aa06c92d1e1"},"cell_type":"code","source":"","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.6.6"}},"nbformat":4,"nbformat_minor":1} -------------------------------------------------------------------------------- /basic-ml-best-of-10-classifiers.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "_cell_guid": "3ebb8e8b-4526-24f5-208f-ee2cb9f1596b" 7 | }, 8 | "source": [ 9 | "Hugues Fontenelle\n", 10 | "7 October 2016\n", 11 | "\n", 12 | "# Pima Indians Diabetes Database\n", 13 | "## Predict the onset of diabetes based on diagnostic measures\n", 14 | "\n", 15 | "Hi folks. I'm new to this, so let me try out what I've learned so far. Your comments are welcome!" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "_cell_guid": "47666c75-d504-bf1d-dfef-12bce1cbe8eb" 22 | }, 23 | "source": [ 24 | "First, let's load the data, and split it in four. It is the fold used the authors of the original paper." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": { 31 | "_cell_guid": "9dbddda0-7005-5721-48d4-5a794ffc3dec" 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "import numpy as np\n", 36 | "\n", 37 | "f = open(\"../input/diabetes.csv\")\n", 38 | "f.readline() # skip the header\n", 39 | "data = np.loadtxt(f, delimiter = ',')\n", 40 | "X = data[:, :-1]\n", 41 | "y = data[:, -1]\n", 42 | "from sklearn.model_selection import train_test_split\n", 43 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": { 49 | "_cell_guid": "aadbd4f8-b043-0899-0c2b-17be3c6a7a94" 50 | }, 51 | "source": [ 52 | "Let's try out a bunch of classifiers, all with default parameters." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": { 59 | "_cell_guid": "24637d51-c27f-9876-7c9d-d2259ef91a15" 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "from sklearn.neural_network import MLPClassifier\n", 64 | "from sklearn.neighbors import KNeighborsClassifier\n", 65 | "from sklearn.svm import SVC\n", 66 | "from sklearn.gaussian_process import GaussianProcessClassifier\n", 67 | "from sklearn.gaussian_process.kernels import RBF\n", 68 | "from sklearn.tree import DecisionTreeClassifier\n", 69 | "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", 70 | "from sklearn.naive_bayes import GaussianNB\n", 71 | "from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\n", 72 | "\n", 73 | "names = [\"Nearest Neighbors\", \"Linear SVM\", \"RBF SVM\", \"Gaussian Process\",\n", 74 | " \"Decision Tree\", \"Random Forest\", \"Neural Net\", \"AdaBoost\",\n", 75 | " \"Naive Bayes\", \"QDA\"\n", 76 | " ]\n", 77 | "\n", 78 | "classifiers = [\n", 79 | " KNeighborsClassifier(),\n", 80 | " SVC(kernel=\"linear\"),\n", 81 | " SVC(kernel=\"rbf\"),\n", 82 | " GaussianProcessClassifier(),\n", 83 | " DecisionTreeClassifier(),\n", 84 | " RandomForestClassifier(),\n", 85 | " MLPClassifier(),\n", 86 | " AdaBoostClassifier(),\n", 87 | " GaussianNB(),\n", 88 | " QuadraticDiscriminantAnalysis()\n", 89 | "]" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "_cell_guid": "f8bc9411-67fa-c204-db43-bc45a5704d4c" 96 | }, 97 | "source": [ 98 | "Now run all the classifiers, using 5-fold cross validation." 
99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 3, 104 | "metadata": { 105 | "_cell_guid": "8890a677-3e71-9ea2-ee9b-ac94446151d4" 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "from sklearn.model_selection import cross_val_score\n", 110 | "\n", 111 | "# iterate over classifiers\n", 112 | "results = {}\n", 113 | "for name, clf in zip(names, classifiers):\n", 114 | " scores = cross_val_score(clf, X_train, y_train, cv=5)\n", 115 | " results[name] = scores" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "_cell_guid": "af9d636c-9704-7425-01dd-d6db580d4c4e" 122 | }, 123 | "source": [ 124 | "Here are the results:" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 4, 130 | "metadata": { 131 | "_cell_guid": "458d31e6-b1f7-8c9f-ecc2-4729f8a3d617" 132 | }, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | " RBF SVM | Accuracy: 64.24% (+/- 0.44%)\n", 139 | " Linear SVM | Accuracy: 76.04% (+/- 5.58%)\n", 140 | " Neural Net | Accuracy: 60.06% (+/- 23.16%)\n", 141 | " Decision Tree | Accuracy: 66.85% (+/- 4.62%)\n", 142 | " Gaussian Process | Accuracy: 68.58% (+/- 6.14%)\n", 143 | " Nearest Neighbors | Accuracy: 71.18% (+/- 7.56%)\n", 144 | " QDA | Accuracy: 73.97% (+/- 8.84%)\n", 145 | " AdaBoost | Accuracy: 72.57% (+/- 8.32%)\n", 146 | " Naive Bayes | Accuracy: 73.62% (+/- 5.78%)\n", 147 | " Random Forest | Accuracy: 73.44% (+/- 3.69%)\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "for name, scores in results.items():\n", 153 | " print(\"%20s | Accuracy: %0.2f%% (+/- %0.2f%%)\" % (name, 100*scores.mean(), 100*scores.std() * 2))" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "_cell_guid": "90315be6-17cf-0c9d-5b3f-cb2b826181a1" 160 | }, 161 | "source": [ 162 | "Seems like a Linear SVM performs best.\n", 163 | "Let's try some parameter optimization." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 5, 169 | "metadata": { 170 | "_cell_guid": "114b0360-f423-9d8e-cc99-e1ad84759169" 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stderr", 175 | "output_type": "stream", 176 | "text": [ 177 | "/opt/conda/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", 178 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n", 179 | "/opt/conda/lib/python3.5/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. 
This module will be removed in 0.20.\n", 180 | " DeprecationWarning)\n" 181 | ] 182 | }, 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "GridSearchCV(cv=None, error_score='raise',\n", 188 | " estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 189 | " decision_function_shape=None, degree=3, gamma='auto', kernel='linear',\n", 190 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 191 | " tol=0.001, verbose=False),\n", 192 | " fit_params={}, iid=True, n_jobs=1,\n", 193 | " param_grid=[{'C': [0.01, 0.1, 1, 10], 'kernel': ['linear']}],\n", 194 | " pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "from sklearn.grid_search import GridSearchCV\n", 200 | "\n", 201 | "clf = SVC(kernel=\"linear\")\n", 202 | "\n", 203 | "# prepare a range of values to test\n", 204 | "param_grid = [\n", 205 | " {'C': [.01, .1, 1, 10], 'kernel': ['linear']},\n", 206 | " ]\n", 207 | "\n", 208 | "grid = GridSearchCV(estimator=clf, param_grid=param_grid)\n", 209 | "grid.fit(X_train, y_train)\n", 210 | "print(grid)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 6, 216 | "metadata": { 217 | "_cell_guid": "da7a2382-e12d-1c61-ef87-684907f3dbaf" 218 | }, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "Best score: 76.39%\n", 225 | "Best estimator for parameter C: 0.100000\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "# summarize the results of the grid search\n", 231 | "print(\"Best score: %0.2f%%\" % (100*grid.best_score_))\n", 232 | "print(\"Best estimator for parameter C: %f\" % (grid.best_estimator_.C))" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "_cell_guid": "b49c3baf-9dba-417f-9d37-5f6ea30652df" 239 | }, 240 | "source": [ 241 | "Finaly, train the Linear SVM (with param `C=0.1`) on the whole train set, and evaluate on the test set" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 7, 247 | "metadata": { 248 | "_cell_guid": "fa723561-4feb-c20c-971e-55b5fcc1cebc" 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "clf = SVC(kernel=\"linear\", C=0.1)\n", 253 | "clf.fit(X_train, y_train)\n", 254 | "y_eval = clf.predict(X_test)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 8, 260 | "metadata": { 261 | "_cell_guid": "1898e1cd-08b7-c044-86d3-77155ab7a818" 262 | }, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "Accuracy: 80.21%\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "acc = sum(y_eval == y_test) / float(len(y_test))\n", 274 | "print(\"Accuracy: %.2f%%\" % (100*acc))" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": { 280 | "_cell_guid": "1c626fa2-0c33-69cc-54e8-b6868c907ad9" 281 | }, 282 | "source": [ 283 | "We did it :-)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": { 289 | "_cell_guid": "569c56fb-822a-1d19-38bd-92270a8a7989" 290 | }, 291 | "source": [ 292 | "**edit**\n", 293 | "\n", 294 | "I was _probably_ a bit lucky for this particular fold (`random_state=0`). Why would the accuracy on the test be higher than on the optimized trained set? 
Let's re-run a 5-fold cv on the whole data:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 9, 300 | "metadata": { 301 | "_cell_guid": "d80d926e-fa4d-4f34-0836-0594168a6601" 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "clf = SVC(kernel=\"linear\", C=0.1)\n", 306 | "scores_final = cross_val_score(clf, X, y, cv=5)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 10, 312 | "metadata": { 313 | "_cell_guid": "cdbcd59f-97ee-0fce-a25c-d63e11e2bf90" 314 | }, 315 | "outputs": [ 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "Final model | Accuracy: 76.83% (+/- 4.31%)\n" 321 | ] 322 | } 323 | ], 324 | "source": [ 325 | "scores_final.mean(), scores_final.std()\n", 326 | "print(\"Final model | Accuracy: %0.2f%% (+/- %0.2f%%)\" % (100*scores_final.mean(), 100*scores_final.std() * 2))" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": { 332 | "_cell_guid": "9bf33d09-cc9b-b5c3-49e5-9da5f49b4c71" 333 | }, 334 | "source": [ 335 | "..which is more realistic!\n", 336 | "\n", 337 | "I am wondering, at which stage do I then use this test set?" 338 | ] 339 | } 340 | ], 341 | "metadata": { 342 | "_change_revision": 12, 343 | "_is_fork": false, 344 | "kernelspec": { 345 | "display_name": "Python 3", 346 | "language": "python", 347 | "name": "python3" 348 | }, 349 | "language_info": { 350 | "codemirror_mode": { 351 | "name": "ipython", 352 | "version": 3 353 | }, 354 | "file_extension": ".py", 355 | "mimetype": "text/x-python", 356 | "name": "python", 357 | "nbconvert_exporter": "python", 358 | "pygments_lexer": "ipython3", 359 | "version": "3.5.2" 360 | } 361 | }, 362 | "nbformat": 4, 363 | "nbformat_minor": 0 364 | } 365 | --------------------------------------------------------------------------------