├── h.js ├── hello.js ├── a.py ├── Problem Statement └── Train data │ ├── .md │ └── AttributeInformation.pdf ├── Spam-Email-detection-system-main ├── _config.yml ├── spam_model.pkl ├── static │ ├── img │ │ ├── logo-w.png │ │ ├── safe.png │ │ ├── spam-2.png │ │ ├── spam.png │ │ ├── insurance-protected.png │ │ └── pngkey.com-scam-alert-png-4321853.png │ ├── audio │ │ ├── safe.mpeg │ │ └── warning.mpeg │ └── css │ │ └── style.css ├── app.py ├── templates │ └── index.html └── Untitled2.ipynb ├── a-hands-on-discussion-on-hyperparameter-optimization-techniques.pdf ├── id 3 algorithum (1).ipynb ├── Feature Selection ├── Embedded method.ipynb ├── Filter method.ipynb ├── Wrapper method .ipynb └── feature-selection-technique-in-machine-learning.ipynb └── regularization-in-machine-learning └── regularization-in-machine-learning.ipynb /h.js: -------------------------------------------------------------------------------- 1 | console.log("print") 2 | -------------------------------------------------------------------------------- /hello.js: -------------------------------------------------------------------------------- 1 | console.log("hello") 2 | -------------------------------------------------------------------------------- /a.py: -------------------------------------------------------------------------------- 1 | print("Hello. 
Developer") 2 | -------------------------------------------------------------------------------- /Problem Statement/Train data/.md: -------------------------------------------------------------------------------- 1 | .md 2 | -------------------------------------------------------------------------------- /Spam-Email-detection-system-main/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /Spam-Email-detection-system-main/spam_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/spam_model.pkl -------------------------------------------------------------------------------- /Problem Statement/Train data/AttributeInformation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Problem Statement/Train data/AttributeInformation.pdf -------------------------------------------------------------------------------- /Spam-Email-detection-system-main/static/img/logo-w.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/img/logo-w.png -------------------------------------------------------------------------------- /Spam-Email-detection-system-main/static/img/safe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/img/safe.png -------------------------------------------------------------------------------- /Spam-Email-detection-system-main/static/img/spam-2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/img/spam-2.png -------------------------------------------------------------------------------- /Spam-Email-detection-system-main/static/img/spam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/img/spam.png -------------------------------------------------------------------------------- /Spam-Email-detection-system-main/static/audio/safe.mpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/audio/safe.mpeg -------------------------------------------------------------------------------- /Spam-Email-detection-system-main/static/audio/warning.mpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/audio/warning.mpeg -------------------------------------------------------------------------------- /Spam-Email-detection-system-main/static/img/insurance-protected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/img/insurance-protected.png -------------------------------------------------------------------------------- /a-hands-on-discussion-on-hyperparameter-optimization-techniques.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/a-hands-on-discussion-on-hyperparameter-optimization-techniques.pdf 
-------------------------------------------------------------------------------- /Spam-Email-detection-system-main/static/img/pngkey.com-scam-alert-png-4321853.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/img/pngkey.com-scam-alert-png-4321853.png -------------------------------------------------------------------------------- /Spam-Email-detection-system-main/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask,render_template,request,jsonify 2 | import pandas as pd 3 | import numpy as np 4 | import joblib 5 | 6 | app = Flask(__name__) 7 | 8 | model = joblib.load('spam_model.pkl') 9 | 10 | @app.route('/',methods=['GET', 'POST']) 11 | def index(): 12 | if request.method == 'POST': 13 | message = request.form.get('message') 14 | output = model.predict([message]) 15 | if output == [0]: 16 | result = "This Message is Not a SPAM Message." 17 | else: 18 | result = "This Message is a SPAM Message." 19 | return render_template('index.html', result=result,message=message) 20 | 21 | else: 22 | return render_template('index.html') 23 | 24 | 25 | if __name__ == '__main__': 26 | app.run(debug=True) -------------------------------------------------------------------------------- /Spam-Email-detection-system-main/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | SPAM Detector Website 12 | 13 | 14 | 15 |
16 |
17 |

Disclaimer: Please click on the three dots and select "Add to Home screen" for a better view and quick access

18 |
19 |
20 | Spam Detector 21 |
22 |
23 |
24 | 25 | 26 |
27 |
28 |
{% if message %} 29 |
30 |
31 |

{{message}}

32 |
33 |
{% if result=='This Message is Not a SPAM Message.' %} 34 | 37 | {% endif %} {% if result=='This Message is Not a SPAM Message.' %} 38 |
39 | 42 |
43 | {% endif %} {% if result=='This Message is a SPAM Message.' %} 44 | 47 | {% endif %} {% if result=='This Message is Not a SPAM Message.' %} 48 |
{{result}}
49 | {% endif %} {% if result=='This Message is a SPAM Message.' %} 50 |
{{result}}
51 | {% endif %} {% if result=='This Message is a SPAM Message.' %} 52 |
53 | 56 |
57 | {% endif %} 58 |
59 | {% else %} 60 |

Enter a message to check whether it is SPAM or NOT-SPAM.

61 | {% endif %} 62 |
63 |
64 |
65 |
66 | 67 | 68 | -------------------------------------------------------------------------------- /Spam-Email-detection-system-main/static/css/style.css: -------------------------------------------------------------------------------- 1 | 2 | * { 3 | border: 0; 4 | box-sizing: border-box; 5 | margin: 0; 6 | } 7 | .container{ 8 | height: content; 9 | width: 100%; 10 | justify-content: center; 11 | align-items: center; 12 | display: flex; 13 | flex-flow: column; 14 | } 15 | .head-nav { 16 | height:30px; 17 | width:100%; 18 | background-color: green; 19 | display:flex; 20 | justify-content: center; 21 | align-items: center; 22 | padding: 8px; 23 | } 24 | .logo-heading { 25 | height: 250px; 26 | width: 250px; 27 | margin-top: -20px; 28 | justify-content: center; 29 | align-items: center; 30 | display: flex; 31 | } 32 | .logo-heading img { 33 | height: 100%; 34 | width: 100%; 35 | } 36 | .container-data { 37 | height: 200px; 38 | width: 100%; 39 | justify-content: center; 40 | display: flex; 41 | align-items: center; 42 | background-color:rgb(236, 214, 214); 43 | flex-flow: column; 44 | margin-top: -40px; 45 | padding-left: 10px; 46 | padding-right: 10px; 47 | } 48 | .text-box { 49 | height: 65px; 50 | width:90%; 51 | border-radius: 5px; 52 | border: 2px solid green; 53 | background-color: rgb(252, 248, 248); 54 | margin-top: 15px; 55 | font-size: 14px; 56 | font-weight: bold; 57 | } 58 | .btn { 59 | height: 35px; 60 | width:100px; 61 | border-radius: 5px; 62 | border: 2px solid black; 63 | background-color: green; 64 | color: white; 65 | margin-top: 15px; 66 | } 67 | 68 | .btn:hover { 69 | color: white; 70 | background-color:red; 71 | cursor: pointer; 72 | } 73 | .show-result { 74 | height: content; 75 | width: 100%; 76 | display: flex; 77 | justify-content: center; 78 | flex-flow: column wrap; 79 | background-color: rgb(252, 243, 243); 80 | margin-bottom: 50px; 81 | margin-top: -30px; 82 | padding: 35px; 83 | } 84 | .output { 85 | height: content; 86 | width: 100%; 87 | 
display: flex; 88 | justify-content: center; 89 | align-items: center; 90 | font-size: 19px; 91 | font-weight: 300; 92 | font-family:'Lucida Sans', 'Lucida Sans Regular', 'Lucida Grande', 'Lucida Sans Unicode', Geneva, Verdana, sans-serif; 93 | color: red; 94 | margin-top: 5px; 95 | padding: 30px; 96 | } 97 | .output-not { 98 | height: content; 99 | width: 100%; 100 | display: flex; 101 | justify-content: center; 102 | align-items: center; 103 | font-size: 19px; 104 | font-weight: 300; 105 | font-family:'Lucida Sans', 'Lucida Sans Regular', 'Lucida Grande', 'Lucida Sans Unicode', Geneva, Verdana, sans-serif; 106 | color: green; 107 | margin-top: 3px; 108 | padding: 30px; 109 | } 110 | .output-logo { 111 | height: 100px; 112 | width: 100%; 113 | margin-top:20px; 114 | justify-content: center; 115 | display: flex; 116 | align-items: center; 117 | } 118 | .output-logo img { 119 | height: 100%; 120 | width: 100px; 121 | } 122 | 123 | .head{ 124 | height: 40px; 125 | width: 100%; 126 | justify-content: center; 127 | align-items: center; 128 | border-top: 2px solid black; 129 | background-color: yellow; 130 | display: flex; 131 | flex-flow: column; 132 | } 133 | .head h2 { 134 | font-size: 15px; 135 | font-family: 'Gill Sans', 'Gill Sans MT', Calibri, 'Trebuchet MS', sans-serif; 136 | color:black; 137 | } 138 | .head-git{ 139 | height: 40px; 140 | width: 100%; 141 | justify-content: center; 142 | align-items: center; 143 | background-color: rgb(250, 244, 244); 144 | display: flex; 145 | flex-flow: row; 146 | margin-bottom: 5px; 147 | } 148 | .head-git h2 { 149 | font-size: 18px; 150 | font-family: 'Gill Sans', 'Gill Sans MT', Calibri, 'Trebuchet MS', sans-serif; 151 | color:black; 152 | } 153 | .head-git h2 a { 154 | text-decoration: none; 155 | color:blue; 156 | font-size: 23px; 157 | } 158 | .head-git h2 a:hover { 159 | color:rgb(211, 47, 41); 160 | background-color: beige; 161 | } 162 | .alert { 163 | visibility: hidden; 164 | } 
-------------------------------------------------------------------------------- /id 3 algorithum (1).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "bc63810d", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "Enter Outlook (Sunny/Overcast/Rain): Overcast\n", 14 | "Enter Temperature (Hot/Mild/Cool): Mild\n", 15 | "Enter Humidity (High/Normal): Normal\n", 16 | "Enter Wind (Weak/Strong): Strong\n", 17 | "\n", 18 | "Predicted PlayTennis for the new instance: No\n" 19 | ] 20 | } 21 | ], 22 | "source": [ 23 | "import pandas as pd\n", 24 | "from sklearn.model_selection import train_test_split\n", 25 | "from sklearn.tree import DecisionTreeClassifier\n", 26 | "from sklearn.metrics import accuracy_score, classification_report\n", 27 | "\n", 28 | "# Sample dataset: PlayTennis\n", 29 | "data = {\n", 30 | " 'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain'],\n", 31 | " 'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild'],\n", 32 | " 'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal'],\n", 33 | " 'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak'],\n", 34 | " 'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No']\n", 35 | "}\n", 36 | "\n", 37 | "df = pd.DataFrame(data)\n", 38 | "\n", 39 | "# Extract features and target variable\n", 40 | "X = pd.get_dummies(df.drop('PlayTennis', axis=1)) # Convert categorical variables to numerical\n", 41 | "y = df['PlayTennis']\n", 42 | "\n", 43 | "# Get user input for new instance\n", 44 | "new_outlook = input(\"Enter Outlook (Sunny/Overcast/Rain): \")\n", 45 | "new_temperature = input(\"Enter Temperature (Hot/Mild/Cool): \")\n", 46 | 
"new_humidity = input(\"Enter Humidity (High/Normal): \")\n", 47 | "new_wind = input(\"Enter Wind (Weak/Strong): \")\n", 48 | "\n", 49 | "# Create a new DataFrame for user input\n", 50 | "new_instance = pd.DataFrame({\n", 51 | " 'Outlook_Sunny': [1 if new_outlook == 'Sunny' else 0],\n", 52 | " 'Outlook_Overcast': [1 if new_outlook == 'Overcast' else 0],\n", 53 | " 'Outlook_Rain': [1 if new_outlook == 'Rain' else 0],\n", 54 | " 'Temperature_Hot': [1 if new_temperature == 'Hot' else 0],\n", 55 | " 'Temperature_Mild': [1 if new_temperature == 'Mild' else 0],\n", 56 | " 'Temperature_Cool': [1 if new_temperature == 'Cool' else 0],\n", 57 | " 'Humidity_High': [1 if new_humidity == 'High' else 0],\n", 58 | " 'Humidity_Normal': [1 if new_humidity == 'Normal' else 0],\n", 59 | " 'Wind_Weak': [1 if new_wind == 'Weak' else 0],\n", 60 | " 'Wind_Strong': [1 if new_wind == 'Strong' else 0],\n", 61 | "})\n", 62 | "\n", 63 | "# Train-test split\n", 64 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", 65 | "\n", 66 | "# Build a Decision Tree classifier using scikit-learn\n", 67 | "dt_classifier = DecisionTreeClassifier(random_state=42)\n", 68 | "dt_classifier.fit(X_train, y_train)\n", 69 | "\n", 70 | "new_instance_pred1= dt_classifier.predict(X_test)\n", 71 | "\n", 72 | "# Make predictions on the new instance\n", 73 | "new_instance_pred = dt_classifier.predict(new_instance)\n", 74 | "print(f\"\\nPredicted PlayTennis for the new instance: {new_instance_pred[0]}\")\n" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "0213d788", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "https://towardsdatascience.com/decision-trees-for-classification-complete-example-d0bc17fcf1c2" 85 | ] 86 | } 87 | ], 88 | "metadata": { 89 | "kernelspec": { 90 | "display_name": "Python 3 (ipykernel)", 91 | "language": "python", 92 | "name": "python3" 93 | }, 94 | "language_info": { 95 | "codemirror_mode": { 96 
| "name": "ipython", 97 | "version": 3 98 | }, 99 | "file_extension": ".py", 100 | "mimetype": "text/x-python", 101 | "name": "python", 102 | "nbconvert_exporter": "python", 103 | "pygments_lexer": "ipython3", 104 | "version": "3.9.7" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 5 109 | } 110 | -------------------------------------------------------------------------------- /Feature Selection/Embedded method.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "921d9e05", 6 | "metadata": {}, 7 | "source": [ 8 | "
Embedded method\n", 9 | "-➖📝
" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "id": "5b983074", 15 | "metadata": {}, 16 | "source": [ 17 | " \n", 18 | "
Importing Necessary Libraries 📈:
" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 5, 24 | "id": "e7e6b63c", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import numpy as np\n", 29 | "from sklearn.datasets import load_breast_cancer\n", 30 | "from sklearn.model_selection import train_test_split\n", 31 | "from sklearn.linear_model import LogisticRegression\n", 32 | "from sklearn.metrics import accuracy_score" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "a4a9b75c", 38 | "metadata": {}, 39 | "source": [ 40 | " \n", 41 | "
Setting the random seed and loading the dataset 📈:
" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 6, 47 | "id": "19efbfb2", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# Set seed for reproducibility\n", 52 | "seed = 42\n", 53 | "np.random.seed(seed)\n", 54 | "\n", 55 | "# Load the Breast Cancer dataset\n", 56 | "cancer = load_breast_cancer()\n", 57 | "X = cancer.data\n", 58 | "y = cancer.target" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "e043d5b1", 64 | "metadata": {}, 65 | "source": [ 66 | " \n", 67 | "
Split the dataset into training and testing sets (20% test, 80% train):
" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 7, 73 | "id": "461f5655", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# Split the dataset into training and testing sets\n", 78 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 8, 84 | "id": "20de70b8", 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stderr", 89 | "output_type": "stream", 90 | "text": [ 91 | "C:\\Users\\LENOVO\\anaconda3\\lib\\site-packages\\sklearn\\svm\\_base.py:985: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", 92 | " warnings.warn(\"Liblinear failed to converge, increase \"\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "# Embedded method with LASSO (Logistic Regression with L1 regularization)\n", 98 | "lasso_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=seed)\n", 99 | "lasso_model.fit(X_train, y_train)\n", 100 | "\n", 101 | "# Extract selected features and their coefficients\n", 102 | "selected_indices_lasso = np.where(lasso_model.coef_[0] != 0)[0]\n", 103 | "selected_features_lasso = cancer.feature_names[selected_indices_lasso]\n", 104 | "coefficients_lasso = lasso_model.coef_[0, selected_indices_lasso]" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "c9d4246f", 110 | "metadata": {}, 111 | "source": [ 112 | " \n", 113 | "
Selecting features using the embedded method (L1 regularization)
" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 9, 119 | "id": "fa025ee2", 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "\n", 127 | "Embedded Method with LASSO (Logistic Regression with L1 regularization)\n", 128 | "Selected Features (LASSO): ['mean radius' 'mean texture' 'mean perimeter' 'mean area' 'texture error'\n", 129 | " 'area error' 'worst radius' 'worst texture' 'worst perimeter'\n", 130 | " 'worst area' 'worst concavity']\n", 131 | "Coefficients (LASSO): [ 4.25893726 0.13813487 -0.2624774 -0.01633754 1.69950795 -0.09940568\n", 132 | " 0.04768624 -0.42417917 -0.02965423 -0.01518975 -3.63866352]\n", 133 | "Accuracy (LASSO): 0.9561\n" 134 | ] 135 | }, 136 | { 137 | "name": "stderr", 138 | "output_type": "stream", 139 | "text": [ 140 | "C:\\Users\\LENOVO\\anaconda3\\lib\\site-packages\\sklearn\\svm\\_base.py:985: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", 141 | " warnings.warn(\"Liblinear failed to converge, increase \"\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "\n", 147 | "# Function to train and evaluate a model\n", 148 | "def train_and_evaluate(X_train, X_test, y_train, y_test):\n", 149 | " model = LogisticRegression(penalty='l1', solver='liblinear', random_state=seed)\n", 150 | " model.fit(X_train, y_train)\n", 151 | " y_pred = model.predict(X_test)\n", 152 | " accuracy = accuracy_score(y_test, y_pred)\n", 153 | " return model, accuracy\n", 154 | "\n", 155 | "# Train and evaluate the model with selected features\n", 156 | "lasso_model, accuracy_lasso = train_and_evaluate(X_train[:, selected_indices_lasso], X_test[:, selected_indices_lasso], y_train, y_test)\n", 157 | "\n", 158 | "# Print results\n", 159 | "print(\"\\nEmbedded Method with LASSO (Logistic Regression with L1 regularization)\")\n", 160 | "print(f\"Selected Features (LASSO): {selected_features_lasso}\")\n", 161 | 
"print(f\"Coefficients (LASSO): {coefficients_lasso}\")\n", 162 | "print(f\"Accuracy (LASSO): {accuracy_lasso:.4f}\")\n" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "66fb004d", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "216cbea4", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 3 (ipykernel)", 185 | "language": "python", 186 | "name": "python3" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.9.7" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 5 203 | } 204 | -------------------------------------------------------------------------------- /Feature Selection/Filter method.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "880546ed", 6 | "metadata": {}, 7 | "source": [ 8 | "

Filter Method
" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "200dfba8", 14 | "metadata": {}, 15 | "source": [ 16 | "
Importing Necessary Libraries 📈:
" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 13, 22 | "id": "645a17c6", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "\n", 27 | "import numpy as np\n", 28 | "from sklearn.datasets import load_breast_cancer\n", 29 | "from sklearn.model_selection import train_test_split\n", 30 | "from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold\n", 31 | "from sklearn.ensemble import RandomForestClassifier\n", 32 | "from sklearn.metrics import accuracy_score" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "1445759e", 38 | "metadata": {}, 39 | "source": [ 40 | " \n", 41 | "
Setting the random seed and loading the dataset 📈:
" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 14, 47 | "id": "a5251b89", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# Set seed for reproducibility\n", 52 | "seed = 42\n", 53 | "np.random.seed(seed)\n", 54 | "\n", 55 | "# Load the Breast Cancer dataset\n", 56 | "cancer = load_breast_cancer()\n", 57 | "X = cancer.data\n", 58 | "y = cancer.target" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "da8dd7d2", 64 | "metadata": {}, 65 | "source": [ 66 | " \n", 67 | "
Split the dataset into training and testing sets (20% test, 80% train):
" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 15, 73 | "id": "4817be16", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# Split the dataset into training and testing sets\n", 78 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "329de26e", 84 | "metadata": {}, 85 | "source": [ 86 | "
Filter method with ANOVA
" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 16, 92 | "id": "2046d2f3", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# Filter method with ANOVA\n", 97 | "k_best_features = 10\n", 98 | "anova_selector = SelectKBest(f_classif, k=k_best_features)\n", 99 | "X_train_anova = anova_selector.fit_transform(X_train, y_train)\n", 100 | "X_test_anova = anova_selector.transform(X_test)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "8632271b", 106 | "metadata": {}, 107 | "source": [ 108 | "
Filter method with Variance Threshold
" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 17, 114 | "id": "a38a24ca", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Filter method with Variance Threshold\n", 119 | "variance_threshold_value = 0.01\n", 120 | "variance_selector = VarianceThreshold(threshold=variance_threshold_value)\n", 121 | "X_train_filtered = variance_selector.fit_transform(X_train_anova)\n", 122 | "X_test_filtered = variance_selector.transform(X_test_anova)\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "573f43a2", 128 | "metadata": {}, 129 | "source": [ 130 | "
Function to train and evaluate a model
" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 18, 136 | "id": "6b3e9548", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "# Function to train and evaluate a model\n", 141 | "def train_and_evaluate(X_train, X_test, y_train, y_test):\n", 142 | " model = RandomForestClassifier(random_state=seed)\n", 143 | " model.fit(X_train, y_train)\n", 144 | " y_pred = model.predict(X_test)\n", 145 | " accuracy = accuracy_score(y_test, y_pred)\n", 146 | " return accuracy\n", 147 | "\n", 148 | "# Train and evaluate the model with ANOVA and Variance Threshold\n", 149 | "accuracy_anova = train_and_evaluate(X_train_anova, X_test_anova, y_train, y_test)\n", 150 | "accuracy_variance = train_and_evaluate(X_train_filtered, X_test_filtered, y_train, y_test)\n", 151 | "\n", 152 | "# Get selected feature indices\n", 153 | "selected_indices_variance = np.where(variance_selector.get_support())[0]" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 19, 159 | "id": "96ac34ec", 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "\n", 167 | "Filter Method with ANOVA\n", 168 | "Number of Features Selected (ANOVA): 10\n", 169 | "Selected Feature Indices (ANOVA): [ 0 2 3 6 7 20 22 23 26 27]\n", 170 | "Selected Feature Names (ANOVA): ['mean radius' 'mean perimeter' 'mean area' 'mean concavity'\n", 171 | " 'mean concave points' 'worst radius' 'worst perimeter' 'worst area'\n", 172 | " 'worst concavity' 'worst concave points']\n", 173 | "Accuracy (ANOVA): 0.9561\n", 174 | "\n", 175 | "Filter Method with Variance Threshold\n", 176 | "Number of Features Selected (Variance Threshold): 7\n", 177 | "Selected Feature Indices (Variance Threshold): [0 1 2 5 6 7 8]\n", 178 | "Selected Feature Names (Variance Threshold): ['mean radius' 'mean texture' 'mean perimeter' 'mean compactness'\n", 179 | " 'mean concavity' 'mean concave points' 'mean symmetry']\n", 180 | 
"Accuracy (Variance Threshold): 0.9737\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "# Print results\n", 186 | "print(\"\\nFilter Method with ANOVA\")\n", 187 | "print(f\"Number of Features Selected (ANOVA): {k_best_features}\")\n", 188 | "print(f\"Selected Feature Indices (ANOVA): {np.where(anova_selector.get_support())[0]}\")\n", 189 | "print(f\"Selected Feature Names (ANOVA): {cancer.feature_names[anova_selector.get_support()]}\")\n", 190 | "print(f\"Accuracy (ANOVA): {accuracy_anova:.4f}\")\n", 191 | "\n", 192 | "print(\"\\nFilter Method with Variance Threshold\")\n", 193 | "print(f\"Number of Features Selected (Variance Threshold): {len(selected_indices_variance)}\")\n", 194 | "print(f\"Selected Feature Indices (Variance Threshold): {selected_indices_variance}\")\n", 195 | "print(f\"Selected Feature Names (Variance Threshold): {cancer.feature_names[selected_indices_variance]}\")\n", 196 | "print(f\"Accuracy (Variance Threshold): {accuracy_variance:.4f}\")\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "id": "4403d6f9", 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "fad55cab", 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [] 214 | } 215 | ], 216 | "metadata": { 217 | "kernelspec": { 218 | "display_name": "Python 3 (ipykernel)", 219 | "language": "python", 220 | "name": "python3" 221 | }, 222 | "language_info": { 223 | "codemirror_mode": { 224 | "name": "ipython", 225 | "version": 3 226 | }, 227 | "file_extension": ".py", 228 | "mimetype": "text/x-python", 229 | "name": "python", 230 | "nbconvert_exporter": "python", 231 | "pygments_lexer": "ipython3", 232 | "version": "3.9.7" 233 | } 234 | }, 235 | "nbformat": 4, 236 | "nbformat_minor": 5 237 | } 238 | -------------------------------------------------------------------------------- /Feature Selection/Wrapper method .ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "9b15f1ec", 6 | "metadata": {}, 7 | "source": [ 8 | "

1. | Importing Necessary Libraries 🌟 📚

" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 5, 14 | "id": "1e09463d", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "from sklearn.datasets import load_breast_cancer\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn.ensemble import RandomForestClassifier\n", 22 | "from sklearn.feature_selection import RFE\n", 23 | "from sklearn.metrics import accuracy_score" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "82180422", 29 | "metadata": {}, 30 | "source": [ 31 | " \n", 32 | "
Setting the random seed and loading the dataset 📈:
" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 6, 38 | "id": "34dc6a7c", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Set seed for reproducibility\n", 43 | "seed = 42\n", 44 | "np.random.seed(seed)\n", 45 | "\n", 46 | "# Load the Breast Cancer dataset\n", 47 | "cancer = load_breast_cancer()\n", 48 | "X = cancer.data\n", 49 | "y = cancer.target" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "3f999bc5", 55 | "metadata": {}, 56 | "source": [ 57 | " \n", 58 | "
Split the dataset into training and testing sets (20% test, 80% train):
" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 11, 64 | "id": "1ce7b251", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "\n", 69 | "# Split the dataset into training and testing sets\n", 70 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "9d80703c", 76 | "metadata": {}, 77 | "source": [ 78 | " \n", 79 | "
Wrapper method using Recursive Feature Elimination (RFE) :
" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 15, 85 | "id": "43327b06", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "\n", 90 | "\n", 91 | "\n", 92 | "# Wrapper method using Recursive Feature Elimination (RFE) with RandomForestClassifier\n", 93 | "def wrapper_method_rfe(X_train, X_test, y_train, estimator, num_features, method):\n", 94 | " model = estimator\n", 95 | " if method == \"Forward\":\n", 96 | " selector = RFE(model, n_features_to_select=num_features, step=1)\n", 97 | " selector.fit(X_train, y_train)\n", 98 | " elif method == \"Backward\":\n", 99 | " selector = RFE(model, n_features_to_select=num_features, step=1)\n", 100 | " selector.fit(X_train, y_train)\n", 101 | " # Since RFE performs backward elimination by default, we need to reverse the selected indices for backward elimination\n", 102 | " selected_indices = np.flip(np.where(selector.support_)[0])\n", 103 | " X_train_selected = selector.transform(X_train)\n", 104 | " X_test_selected = selector.transform(X_test)\n", 105 | " else:\n", 106 | " raise ValueError(\"Invalid method specified\")\n", 107 | "\n", 108 | " if method != \"Backward\":\n", 109 | " X_train_selected = selector.transform(X_train)\n", 110 | " X_test_selected = selector.transform(X_test)\n", 111 | " selected_indices = np.where(selector.support_)[0]" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "5202b583", 117 | "metadata": {}, 118 | "source": [ 119 | " \n", 120 | "
Wrapper method using Recursive Feature Elimination (RFE) :
" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 19, 126 | "id": "fe96f1d4", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "\n", 131 | "# Wrapper method using Recursive Feature Elimination (RFE) with RandomForestClassifier\n", 132 | "def wrapper_method_rfe(X_train, X_test, y_train, estimator, num_features, method):\n", 133 | " model = estimator\n", 134 | " if method == \"Forward\":\n", 135 | " selector = RFE(model, n_features_to_select=num_features, step=1)\n", 136 | " selector.fit(X_train, y_train)\n", 137 | " elif method == \"Backward\":\n", 138 | " selector = RFE(model, n_features_to_select=num_features, step=1)\n", 139 | " selector.fit(X_train, y_train)\n", 140 | " # Since RFE performs backward elimination by default, we need to reverse the selected indices for backward elimination\n", 141 | " selected_indices = np.flip(np.where(selector.support_)[0])\n", 142 | " X_train_selected = selector.transform(X_train)\n", 143 | " X_test_selected = selector.transform(X_test)\n", 144 | " else:\n", 145 | " raise ValueError(\"Invalid method specified\")\n", 146 | "\n", 147 | " if method != \"Backward\":\n", 148 | " X_train_selected = selector.transform(X_train)\n", 149 | " X_test_selected = selector.transform(X_test)\n", 150 | " selected_indices = np.where(selector.support_)[0]\n", 151 | "\n", 152 | " # Print results\n", 153 | " print(f\"\\nWrapper Method with RFE ({method})\")\n", 154 | " print(f\"Selected Features (RFE): {selected_indices}\")\n", 155 | " print(f\"Number of Features Selected (RFE): {len(selected_indices)}\")\n", 156 | "\n", 157 | " return X_train_selected, X_test_selected\n", 158 | "\n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "id": "3ab76798", 164 | "metadata": {}, 165 | "source": [ 166 | " \n", 167 | "
Wrapper method using RFE (Forward Selection and Backward Elimination):
" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 20, 173 | "id": "be46adab", 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "\n", 181 | "Wrapper Method with RFE (Forward)\n", 182 | "Selected Features (RFE): [ 1 2 6 7 20 21 22 23 26 27]\n", 183 | "Number of Features Selected (RFE): 10\n", 184 | "Accuracy (RFE): 0.9649\n", 185 | "\n", 186 | "Wrapper Method with RFE (Backward)\n", 187 | "Selected Features (RFE): [27 26 23 22 21 20 7 6 2 1]\n", 188 | "Number of Features Selected (RFE): 10\n", 189 | "Accuracy (RFE): 0.9649\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "# Function to train and evaluate a model\n", 195 | "def train_and_evaluate(X_train, X_test, y_train, y_test):\n", 196 | " model = RandomForestClassifier(random_state=seed)\n", 197 | " model.fit(X_train, y_train)\n", 198 | " y_pred = model.predict(X_test)\n", 199 | " accuracy = accuracy_score(y_test, y_pred)\n", 200 | "\n", 201 | " # Print accuracy\n", 202 | " print(f\"Accuracy (RFE): {accuracy:.4f}\")\n", 203 | "\n", 204 | " return accuracy\n", 205 | "\n", 206 | "# Wrapper method using RFE (Forward Selection)\n", 207 | "num_features_rfe_forward = 10\n", 208 | "X_train_rfe_forward, X_test_rfe_forward = wrapper_method_rfe(\n", 209 | " X_train, X_test, y_train, RandomForestClassifier(random_state=seed),\n", 210 | " num_features_rfe_forward, \"Forward\"\n", 211 | ")\n", 212 | "accuracy_rfe_forward = train_and_evaluate(X_train_rfe_forward, X_test_rfe_forward, y_train, y_test)\n", 213 | "\n", 214 | "# Wrapper method using RFE (Backward Elimination)\n", 215 | "num_features_rfe_backward = 10\n", 216 | "X_train_rfe_backward, X_test_rfe_backward = wrapper_method_rfe(\n", 217 | " X_train, X_test, y_train, RandomForestClassifier(random_state=seed),\n", 218 | " num_features_rfe_backward, \"Backward\"\n", 219 | ")\n", 220 | "accuracy_rfe_backward = train_and_evaluate(X_train_rfe_backward, 
X_test_rfe_backward, y_train, y_test)\n" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "id": "52de39a0", 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [] 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "Python 3 (ipykernel)", 235 | "language": "python", 236 | "name": "python3" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 3 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython3", 248 | "version": "3.9.7" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 5 253 | } 254 | -------------------------------------------------------------------------------- /Feature Selection/feature-selection-technique-in-machine-learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 7 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", 8 | "collapsed": true 9 | }, 10 | "source": [ 11 | "

Feature Selection

\n", 12 | "Feature Selection is one of the most important techniques for a great predictive model. It helps us to know the most important features of the data set." 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "_uuid": "8710c1c227abfd06369844ebc57af6fca32b4632" 19 | }, 20 | "source": [ 21 | "

1. | I will cover the below points : 🌟 📚

\n", 22 | "\n", 23 | "1. What is Feature Selection?\n", 24 | "2. Why is it one of the most important techniques to learn for a Data Scientist?\n", 25 | "3. What are the different types of Feature Selection?" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "_uuid": "c755f87fc7ca150d89268ee9dba94b2720d69657" 32 | }, 33 | "source": [ 34 | "
1.1 | 1.Feature Selection: 🌍:
\n", 35 | "\n", 36 | "The process of selecting a subset of relevant features for use in model construction, which helps to increase the model's prediction accuracy and decrease the error rate. \n", 37 | "In other words, it is a process of identifying and removing as much irrelevant and redundant information as possible.\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "_uuid": "8779a7d4886a84ff86204ac95a0f6eba11876b58" 44 | }, 45 | "source": [ 46 | "
1.1 | 2. Importance of Feature Selection:🌍:
\n", 47 | "\n", 48 | "* Improve the accuracy of the model.\n", 49 | "* Reduce overfitting.\n", 50 | "* Shorter training time.\n", 51 | "* Reduce the complexity of the model.\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "_uuid": "1411d4d61851c8e1f404cc3275aec9fbc46b3ba8", 58 | "collapsed": true 59 | }, 60 | "source": [ 61 | "\n", 62 | "
Type of Feature Selection
\n", 63 | "\n", 64 | "* ***Wrapper Method***\n", 65 | "* ***Filter Method***\n", 66 | "* ***Embedded Method***\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "_uuid": "79ba39eefedbbc4a5ee444d9b88f953d261663fd" 73 | }, 74 | "source": [ 75 | "\n", 76 | "

Wrapper Method 🌟 📚

\n", 77 | "\n", 78 | "\n", 79 | "In this method a subset of features are selected and train a model using them. Based on the inference that we draw from the previous model, we decide to add or remove features from subset.\n", 80 | "[For indepth details](https://en.wikipedia.org/wiki/Feature_selection)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "_uuid": "503925da9749631ebb3a0e07be5587a8583060a1" 87 | }, 88 | "source": [ 89 | "**Image from wiki**\n", 90 | "

\"Feature

" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "_uuid": "e3a24d030c50a3cc50559c03f586bf2288c221f4", 97 | "collapsed": true 98 | }, 99 | "source": [ 100 | "\n", 101 | "
Type of Wrapper Method
\n", 102 | "\n", 103 | "\n", 104 | "* Forward Selection\n", 105 | "* Backward Elimination\n", 106 | "* Exhaustive Feature Selection " 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "_uuid": "2fa60b67f8e4805bcd497bf464d925b64b1900b5" 113 | }, 114 | "source": [ 115 | "
Forward Selection
\n", 116 | "\n", 117 | "It is an iterative method in which we keep adding the feature which best improves our model until the addition of a new feature no longer improves the model performance.

\n", 118 | "
Backward Elimination
\n", 119 | "In this method we start with all features and remove the least significant feature at each iteration, which improves the model performance. We repeat this until no improvement is observed on removal of a feature.

\n", 120 | "\n", 121 | "
Exhaustive Feature Selection
\n", 122 | "\n", 123 | "In this the best subset of feature is selected, over all possible feature subsets. For example, if a dataset contains 4 features, the algorithm will evaluate all the feature combinations as follows:\n", 124 | "* All possible combinations of 1 feature\n", 125 | "* All possible combinations of 2 features\n", 126 | "* All possible combinations of 3 features\n", 127 | "* All possible combinations of 4 features\n", 128 | " " 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": { 134 | "_uuid": "eafcc42656451003439a83ee28839b1408d274cb" 135 | }, 136 | "source": [ 137 | "\n", 138 | "
Pros
\n", 139 | "\n", 140 | "\n", 141 | "* Aims to find the best possible feature combination.\n", 142 | "* Better results than filter methods.\n", 143 | "* Can be used for small datasets having few features." 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "_uuid": "9fad7d1d69b81bb7e9157860890f028ff8483c82" 150 | }, 151 | "source": [ 152 | "\n", 153 | "\n", 154 | "\n", 155 | "
Cons
\n", 156 | "\n", 157 | "* Computationally expensive\n", 158 | "* Often impracticable for large dataset having more features." 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": { 164 | "_uuid": "1c6e339fd12bf741b1d3e00edd2c2ee3c136ea7d" 165 | }, 166 | "source": [ 167 | "

Filter Method 📚

" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": { 173 | "_uuid": "7d104eeabb0bd4ad80a59a9c66d88f5487a88937" 174 | }, 175 | "source": [ 176 | "Filter methods are generally used as a preprocessing step. The selection of features is independent of any machine learning algorithms. Instead, features are selected on the basis of their scores in various statistical tests for their correlation with the outcome variable." 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": { 182 | "_uuid": "0913e455d8c335346b3fe9099d00105e8a65c1b8", 183 | "collapsed": true 184 | }, 185 | "source": [ 186 | "**Image from wiki**\n", 187 | "

\"Filter

" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": { 193 | "_uuid": "211f9335e79a2ae0ca5a30ec96945f39e7481d07" 194 | }, 195 | "source": [ 196 | "\n", 197 | "\n", 198 | "
Basic Methods
\n", 199 | "\n", 200 | "We should consider the below filter methods as a data pre processing steps.\n", 201 | "* Constant features - Constant features are those that show the same value for all the observations of the dataset. Remove constant features from dataset.\n", 202 | "* Quasi-constant features - The column which contain 99% of same data is called Quasi constant column. Remove Quasi constant features from dataset.\n", 203 | "* Duplicated features - Remove duplicated features from dataset." 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": { 209 | "_uuid": "fce63551ebc4a8d6b671c9a0c4f10ac89d4fdff4" 210 | }, 211 | "source": [ 212 | "\n", 213 | "
Correlation
\n", 214 | "\n", 215 | "* Correlation is measure of the linear relationship of 2 or more variables.\n", 216 | "* Through correlation we can predict one variable from other.\n", 217 | " * Good variables are highly correlated with the target but uncorrelated among themselves.\n", 218 | "* If two variables are highly correlated with each other, then we should remove one of them. \n", 219 | " " 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": { 225 | "_uuid": "d007d540b45ed89c084a571db241329c03bd30b3" 226 | }, 227 | "source": [ 228 | "\n", 229 | "
Fisher Score
\n", 230 | "\n", 231 | "* Measures the dependence of 2 variables\n", 232 | "* Suited for categorical variables.\n", 233 | "* Target should be binary.\n", 234 | "* Variable values should be non negative, typically Boolean or counts.\n" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": { 240 | "_uuid": "59c38c2392896e016e2ab499b2ac829475d6cc19" 241 | }, 242 | "source": [ 243 | "\n", 244 | "\n", 245 | "
ANOVA (Analysis Of Variance)
\n", 246 | "\n", 247 | "* Measures the dependency of two variables.\n", 248 | "* Suited for continuous variables.\n", 249 | "* Requires a binary target.\n", 250 | "* Assumes linear relationship between variable and target.\n", 251 | "* Assumes variables are normally distributed.\n", 252 | "* Sensitive to sample size\n" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": { 258 | "_uuid": "987b7c270f4b43373b3d70fa216de2f866e64c03" 259 | }, 260 | "source": [ 261 | "
ROC-AUC / RMSE
\n", 262 | "\n", 263 | "* Measures the dependency of two variables.\n", 264 | "* Suited for all type of variables.\n", 265 | "* Makes no assumption on the distribution of the variables." 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": { 271 | "_uuid": "d817284e30d6bdb48e9508e3ecdf4fb015d087f2" 272 | }, 273 | "source": [ 274 | "\n", 275 | "
Steps to select features
\n", 276 | "\n", 277 | "* Rank features according to a certain criteria (like correlation).\n", 278 | " * Each feature is ranked independently of the feature space.\n", 279 | "* Select highest ranking features. " 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": { 285 | "_uuid": "e4ddc34ce1df6366a605729d04ca48094ade7009" 286 | }, 287 | "source": [ 288 | "\n", 289 | "
Basic Pros
\n", 290 | "\n", 291 | "* Fast computation.\n", 292 | "* Simple yet powerful to quickly remove irrelevant and redundant feature.\n", 293 | "* Better choice for large dataset over wrapper methods." 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "_uuid": "def3fa408a109f6d51bb1663a3bbcf5e66a231fd" 300 | }, 301 | "source": [ 302 | "
Basic Cons
\n", 303 | "\n", 304 | "* It may select redundant variables because they do not consider the relationships between features.\n", 305 | "* The prediction accuracy is lesser than wrapper methods." 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "_uuid": "87d76ab632364c86325f8b7749409bb77bf20432" 312 | }, 313 | "source": [ 314 | "\n", 315 | "

Embedded Method 🌟 📚

\n", 316 | "\n", 317 | "\n", 318 | "Embedded methods combine the features of Filter and Wrapper methods. A learning algorithm takes advantage of its own variable selection process and performs feature selection and classification simultaneously." 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": { 324 | "_uuid": "b61d42f474e71dec29b9ff15752f106a21459c7e" 325 | }, 326 | "source": [ 327 | "**Image from wiki**\n", 328 | "

\"Feature

" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": { 334 | "_uuid": "17807564445dd5b351e939e8309c60501785b693" 335 | }, 336 | "source": [ 337 | "

REGULARISATION 🌟 📚

\n", 338 | "\n", 339 | "\n", 340 | "\n", 341 | "Regularization consists in adding a penalty on the different parameters of the model to reduce the freedom of the model. Hence, the model will be less likely to fit the noise of the training data and will improve the generalization abilities of the model. For linear models there are in general 3 types of regularisation:\n", 342 | "* The L1 regularization (also called Lasso)\n", 343 | "* The L2 regularization (also called Ridge)\n", 344 | "* The L1/L2 regularization (also called Elastic net)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "_uuid": "10c4a1d2541d1bfcbc3a6c378624be3f4ca9879a" 351 | }, 352 | "source": [ 353 | "**Image from Scikit learn**\n", 354 | "

" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": { 360 | "_uuid": "9a0bf904c39487ae6b3cc26a9053100526224d72", 361 | "collapsed": true 362 | }, 363 | "source": [] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": { 368 | "_uuid": "6675c83145ea22d7780e15a19b074c90884eebd4", 369 | "collapsed": true 370 | }, 371 | "source": [] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "_uuid": "c1e5227338ce91fb45514340eca51cbd4130c6a4", 378 | "collapsed": true 379 | }, 380 | "outputs": [], 381 | "source": [] 382 | } 383 | ], 384 | "metadata": { 385 | "kernelspec": { 386 | "display_name": "Python 3 (ipykernel)", 387 | "language": "python", 388 | "name": "python3" 389 | }, 390 | "language_info": { 391 | "codemirror_mode": { 392 | "name": "ipython", 393 | "version": 3 394 | }, 395 | "file_extension": ".py", 396 | "mimetype": "text/x-python", 397 | "name": "python", 398 | "nbconvert_exporter": "python", 399 | "pygments_lexer": "ipython3", 400 | "version": "3.9.7" 401 | } 402 | }, 403 | "nbformat": 4, 404 | "nbformat_minor": 1 405 | } 406 | -------------------------------------------------------------------------------- /Spam-Email-detection-system-main/Untitled2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "executionInfo": { 8 | "elapsed": 1401, 9 | "status": "ok", 10 | "timestamp": 1640778596418, 11 | "user": { 12 | "displayName": "bibek sah", 13 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 14 | "userId": "08792417367200435838" 15 | }, 16 | "user_tz": -345 17 | }, 18 | "id": "-9boQqt09xM6" 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import seaborn as sns" 26 | ] 27 | }, 28 | { 29 | 
"cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "executionInfo": { 33 | "elapsed": 28, 34 | "status": "ok", 35 | "timestamp": 1640778210470, 36 | "user": { 37 | "displayName": "bibek sah", 38 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 39 | "userId": "08792417367200435838" 40 | }, 41 | "user_tz": -345 42 | }, 43 | "id": "Yzk3k6Y890vh" 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "df=pd.read_csv('spam.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": { 54 | "colab": { 55 | "base_uri": "https://localhost:8080/", 56 | "height": 206 57 | }, 58 | "executionInfo": { 59 | "elapsed": 27, 60 | "status": "ok", 61 | "timestamp": 1640778210471, 62 | "user": { 63 | "displayName": "bibek sah", 64 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 65 | "userId": "08792417367200435838" 66 | }, 67 | "user_tz": -345 68 | }, 69 | "id": "QfNUzF-z-Qzy", 70 | "outputId": "51f730e2-032e-49dd-e93d-6902b4ba8ebc" 71 | }, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/html": [ 76 | "
\n", 77 | "\n", 90 | "\n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | "
CategoryMessage
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
\n", 126 | "
" 127 | ], 128 | "text/plain": [ 129 | " Category Message\n", 130 | "0 ham Go until jurong point, crazy.. Available only ...\n", 131 | "1 ham Ok lar... Joking wif u oni...\n", 132 | "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", 133 | "3 ham U dun say so early hor... U c already then say...\n", 134 | "4 ham Nah I don't think he goes to usf, he lives aro..." 135 | ] 136 | }, 137 | "execution_count": 4, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "df.head()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 5, 149 | "metadata": { 150 | "colab": { 151 | "base_uri": "https://localhost:8080/" 152 | }, 153 | "executionInfo": { 154 | "elapsed": 17, 155 | "status": "ok", 156 | "timestamp": 1640778210472, 157 | "user": { 158 | "displayName": "bibek sah", 159 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 160 | "userId": "08792417367200435838" 161 | }, 162 | "user_tz": -345 163 | }, 164 | "id": "06nXuOFv_cWx", 165 | "outputId": "019c195a-c7ef-4b65-8c6e-963f4324c3dc" 166 | }, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "array(['ham', 'spam'], dtype=object)" 172 | ] 173 | }, 174 | "execution_count": 5, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "df.Category.unique()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 6, 186 | "metadata": { 187 | "executionInfo": { 188 | "elapsed": 831, 189 | "status": "ok", 190 | "timestamp": 1640778295983, 191 | "user": { 192 | "displayName": "bibek sah", 193 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 194 | "userId": "08792417367200435838" 195 | }, 196 | "user_tz": -345 197 | }, 198 | "id": "v4a1QjCV_jKC" 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' 
else 0)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 7, 208 | "metadata": { 209 | "colab": { 210 | "base_uri": "https://localhost:8080/", 211 | "height": 206 212 | }, 213 | "executionInfo": { 214 | "elapsed": 722, 215 | "status": "ok", 216 | "timestamp": 1640778655207, 217 | "user": { 218 | "displayName": "bibek sah", 219 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 220 | "userId": "08792417367200435838" 221 | }, 222 | "user_tz": -345 223 | }, 224 | "id": "wFhq-4sBAeTK", 225 | "outputId": "76771c6a-0589-45fb-8975-32a7a76ea055" 226 | }, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/html": [ 231 | "
\n", 232 | "\n", 245 | "\n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | "
CategoryMessagespam
0hamGo until jurong point, crazy.. Available only ...0
1hamOk lar... Joking wif u oni...0
2spamFree entry in 2 a wkly comp to win FA Cup fina...1
3hamU dun say so early hor... U c already then say...0
4hamNah I don't think he goes to usf, he lives aro...0
\n", 287 | "
" 288 | ], 289 | "text/plain": [ 290 | " Category Message spam\n", 291 | "0 ham Go until jurong point, crazy.. Available only ... 0\n", 292 | "1 ham Ok lar... Joking wif u oni... 0\n", 293 | "2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1\n", 294 | "3 ham U dun say so early hor... U c already then say... 0\n", 295 | "4 ham Nah I don't think he goes to usf, he lives aro... 0" 296 | ] 297 | }, 298 | "execution_count": 7, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "df.head(5)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 8, 310 | "metadata": { 311 | "executionInfo": { 312 | "elapsed": 679, 313 | "status": "ok", 314 | "timestamp": 1640778804504, 315 | "user": { 316 | "displayName": "bibek sah", 317 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 318 | "userId": "08792417367200435838" 319 | }, 320 | "user_tz": -345 321 | }, 322 | "id": "K9RIT364B2Bm" 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "from sklearn.model_selection import train_test_split\n", 327 | "x_train,x_test,y_train,y_test=train_test_split(df.Message,df.spam,test_size=0.2,random_state=42)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 9, 333 | "metadata": { 334 | "colab": { 335 | "base_uri": "https://localhost:8080/" 336 | }, 337 | "executionInfo": { 338 | "elapsed": 455, 339 | "status": "ok", 340 | "timestamp": 1640778875501, 341 | "user": { 342 | "displayName": "bibek sah", 343 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 344 | "userId": "08792417367200435838" 345 | }, 346 | "user_tz": -345 347 | }, 348 | "id": "GhSkE8R8CafK", 349 | "outputId": "690bd42a-2be3-4a5a-e8c1-0706cb4ce2ef" 350 | }, 351 | "outputs": [ 352 | { 353 | "data": { 354 | "text/plain": [ 355 | "4457" 356 | ] 357 | }, 358 | "execution_count": 9, 359 | "metadata": {}, 360 | 
"output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | "len(x_train)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 10, 370 | "metadata": { 371 | "colab": { 372 | "base_uri": "https://localhost:8080/" 373 | }, 374 | "executionInfo": { 375 | "elapsed": 693, 376 | "status": "ok", 377 | "timestamp": 1640779197055, 378 | "user": { 379 | "displayName": "bibek sah", 380 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 381 | "userId": "08792417367200435838" 382 | }, 383 | "user_tz": -345 384 | }, 385 | "id": "d8RKpTA4Cr2d", 386 | "outputId": "4c0deb38-26c5-409e-fee1-27de285b2b0e" 387 | }, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/plain": [ 392 | "1115" 393 | ] 394 | }, 395 | "execution_count": 10, 396 | "metadata": {}, 397 | "output_type": "execute_result" 398 | } 399 | ], 400 | "source": [ 401 | "len(x_test)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 11, 407 | "metadata": { 408 | "colab": { 409 | "base_uri": "https://localhost:8080/" 410 | }, 411 | "executionInfo": { 412 | "elapsed": 723, 413 | "status": "ok", 414 | "timestamp": 1640779312565, 415 | "user": { 416 | "displayName": "bibek sah", 417 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 418 | "userId": "08792417367200435838" 419 | }, 420 | "user_tz": -345 421 | }, 422 | "id": "6Lb2BpL5D6Tw", 423 | "outputId": "d9c43a76-f4cd-403e-a5b7-d0c99bc48183" 424 | }, 425 | "outputs": [ 426 | { 427 | "data": { 428 | "text/plain": [ 429 | "array([[0, 0, 0, ..., 0, 0, 0],\n", 430 | " [0, 0, 0, ..., 0, 0, 0],\n", 431 | " [0, 0, 0, ..., 0, 0, 0],\n", 432 | " [0, 0, 0, ..., 0, 0, 0],\n", 433 | " [0, 0, 0, ..., 0, 0, 0]], dtype=int64)" 434 | ] 435 | }, 436 | "execution_count": 11, 437 | "metadata": {}, 438 | "output_type": "execute_result" 439 | } 440 | ], 441 | "source": [ 442 | "from sklearn.feature_extraction.text import 
CountVectorizer\n", 443 | "v=CountVectorizer()\n", 444 | "cv_messages = v.fit_transform(x_train.values)\n", 445 | "cv_messages.toarray()[0:5]" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 12, 451 | "metadata": { 452 | "executionInfo": { 453 | "elapsed": 7, 454 | "status": "ok", 455 | "timestamp": 1640779609411, 456 | "user": { 457 | "displayName": "bibek sah", 458 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 459 | "userId": "08792417367200435838" 460 | }, 461 | "user_tz": -345 462 | }, 463 | "id": "LUFmKWvVEWgO" 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "from sklearn.naive_bayes import MultinomialNB\n", 468 | "model=MultinomialNB()" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 13, 474 | "metadata": { 475 | "colab": { 476 | "base_uri": "https://localhost:8080/" 477 | }, 478 | "executionInfo": { 479 | "elapsed": 1578, 480 | "status": "ok", 481 | "timestamp": 1640779640258, 482 | "user": { 483 | "displayName": "bibek sah", 484 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 485 | "userId": "08792417367200435838" 486 | }, 487 | "user_tz": -345 488 | }, 489 | "id": "icy7RxTrFfAm", 490 | "outputId": "0aee8aff-a9c0-4169-da8a-533f5a49e193" 491 | }, 492 | "outputs": [ 493 | { 494 | "data": { 495 | "text/plain": [ 496 | "MultinomialNB()" 497 | ] 498 | }, 499 | "execution_count": 13, 500 | "metadata": {}, 501 | "output_type": "execute_result" 502 | } 503 | ], 504 | "source": [ 505 | "model.fit(cv_messages,y_train)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 14, 511 | "metadata": { 512 | "colab": { 513 | "base_uri": "https://localhost:8080/" 514 | }, 515 | "executionInfo": { 516 | "elapsed": 772, 517 | "status": "ok", 518 | "timestamp": 1640780294984, 519 | "user": { 520 | "displayName": "bibek sah", 521 | "photoUrl": 
"https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 522 | "userId": "08792417367200435838" 523 | }, 524 | "user_tz": -345 525 | }, 526 | "id": "sfdn1y7PFmSX", 527 | "outputId": "6dc6eb0e-db56-48cd-8109-3072c098f6a6" 528 | }, 529 | "outputs": [ 530 | { 531 | "data": { 532 | "text/plain": [ 533 | "array([1, 0], dtype=int64)" 534 | ] 535 | }, 536 | "execution_count": 14, 537 | "metadata": {}, 538 | "output_type": "execute_result" 539 | } 540 | ], 541 | "source": [ 542 | "email = [\n", 543 | " 'Upto 30% discount on parking, exclusive offer just for yoy. Dont miss thi reward!',\n", 544 | " 'Ok lar...joking wif u oni...'\n", 545 | "]\n", 546 | "email_count= v.transform(email)\n", 547 | "model.predict(email_count)" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 15, 553 | "metadata": { 554 | "colab": { 555 | "base_uri": "https://localhost:8080/" 556 | }, 557 | "executionInfo": { 558 | "elapsed": 731, 559 | "status": "ok", 560 | "timestamp": 1640780362896, 561 | "user": { 562 | "displayName": "bibek sah", 563 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 564 | "userId": "08792417367200435838" 565 | }, 566 | "user_tz": -345 567 | }, 568 | "id": "I0i5fFZ8IGVJ", 569 | "outputId": "d4c46fa1-af4c-42c7-93d2-11a425b14a48" 570 | }, 571 | "outputs": [ 572 | { 573 | "data": { 574 | "text/plain": [ 575 | "0.9919282511210762" 576 | ] 577 | }, 578 | "execution_count": 15, 579 | "metadata": {}, 580 | "output_type": "execute_result" 581 | } 582 | ], 583 | "source": [ 584 | "x_test_count=v.transform(x_test)\n", 585 | "model.score(x_test_count,y_test)\n" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 16, 591 | "metadata": { 592 | "executionInfo": { 593 | "elapsed": 15, 594 | "status": "ok", 595 | "timestamp": 1640780413260, 596 | "user": { 597 | "displayName": "bibek sah", 598 | "photoUrl": 
"https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 599 | "userId": "08792417367200435838" 600 | }, 601 | "user_tz": -345 602 | }, 603 | "id": "v-ArF0cZIW7x" 604 | }, 605 | "outputs": [], 606 | "source": [ 607 | "# sklearn pipeline" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": 17, 613 | "metadata": { 614 | "colab": { 615 | "base_uri": "https://localhost:8080/" 616 | }, 617 | "executionInfo": { 618 | "elapsed": 494, 619 | "status": "ok", 620 | "timestamp": 1640780722337, 621 | "user": { 622 | "displayName": "bibek sah", 623 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 624 | "userId": "08792417367200435838" 625 | }, 626 | "user_tz": -345 627 | }, 628 | "id": "Sj-eM9hgIjOB", 629 | "outputId": "160dfbdd-303b-4ecc-c2b9-a4fd8dac9510" 630 | }, 631 | "outputs": [ 632 | { 633 | "data": { 634 | "text/plain": [ 635 | "Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])" 636 | ] 637 | }, 638 | "execution_count": 17, 639 | "metadata": {}, 640 | "output_type": "execute_result" 641 | } 642 | ], 643 | "source": [ 644 | "from sklearn.pipeline import Pipeline\n", 645 | "clf = Pipeline([\n", 646 | " ('vectorizer', CountVectorizer()),\n", 647 | " ('nb', MultinomialNB()) \n", 648 | "]\n", 649 | ")\n", 650 | "clf.fit(x_train,y_train)" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": 18, 656 | "metadata": { 657 | "colab": { 658 | "base_uri": "https://localhost:8080/" 659 | }, 660 | "executionInfo": { 661 | "elapsed": 697, 662 | "status": "ok", 663 | "timestamp": 1640780793192, 664 | "user": { 665 | "displayName": "bibek sah", 666 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 667 | "userId": "08792417367200435838" 668 | }, 669 | "user_tz": -345 670 | }, 671 | "id": "c4oNWSVmJuzd", 672 | "outputId": "4799d618-2345-4c43-f284-fbaca872b976" 673 | }, 674 
| "outputs": [ 675 | { 676 | "data": { 677 | "text/plain": [ 678 | "array([1, 0], dtype=int64)" 679 | ] 680 | }, 681 | "execution_count": 18, 682 | "metadata": {}, 683 | "output_type": "execute_result" 684 | } 685 | ], 686 | "source": [ 687 | "email = [\n", 688 | " 'Upto 30% discount on parking, exclusive offer just for yoy. Dont miss thi reward!',\n", 689 | " 'Ok lar...joking wif u oni...' \n", 690 | "]\n", 691 | "clf.predict(email)" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": 19, 697 | "metadata": { 698 | "colab": { 699 | "base_uri": "https://localhost:8080/" 700 | }, 701 | "executionInfo": { 702 | "elapsed": 1829, 703 | "status": "ok", 704 | "timestamp": 1640780963050, 705 | "user": { 706 | "displayName": "bibek sah", 707 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 708 | "userId": "08792417367200435838" 709 | }, 710 | "user_tz": -345 711 | }, 712 | "id": "o752BL8PJ_-5", 713 | "outputId": "2361abbd-386a-4078-b145-1e0aab7c3254" 714 | }, 715 | "outputs": [ 716 | { 717 | "data": { 718 | "text/plain": [ 719 | "0.9919282511210762" 720 | ] 721 | }, 722 | "execution_count": 19, 723 | "metadata": {}, 724 | "output_type": "execute_result" 725 | } 726 | ], 727 | "source": [ 728 | "clf.score(x_test,y_test)" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 20, 734 | "metadata": { 735 | "colab": { 736 | "base_uri": "https://localhost:8080/" 737 | }, 738 | "executionInfo": { 739 | "elapsed": 692, 740 | "status": "ok", 741 | "timestamp": 1640781005830, 742 | "user": { 743 | "displayName": "bibek sah", 744 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 745 | "userId": "08792417367200435838" 746 | }, 747 | "user_tz": -345 748 | }, 749 | "id": "Wb5rbwIVKo1S", 750 | "outputId": "4cb35f90-0e81-49c9-8379-0b477420d462" 751 | }, 752 | "outputs": [ 753 | { 754 | "data": { 755 | "text/plain": [ 756 | 
"['spam_model.pkl']" 757 | ] 758 | }, 759 | "execution_count": 20, 760 | "metadata": {}, 761 | "output_type": "execute_result" 762 | } 763 | ], 764 | "source": [ 765 | "import joblib\n", 766 | "joblib.dump(clf,'spam_model.pkl')" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": 21, 772 | "metadata": { 773 | "executionInfo": { 774 | "elapsed": 11, 775 | "status": "ok", 776 | "timestamp": 1640781042860, 777 | "user": { 778 | "displayName": "bibek sah", 779 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64", 780 | "userId": "08792417367200435838" 781 | }, 782 | "user_tz": -345 783 | }, 784 | "id": "pP9Ji7YHKz4h" 785 | }, 786 | "outputs": [], 787 | "source": [ 788 | "# model is completed" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": null, 794 | "metadata": { 795 | "id": "DegaoHEFK87R" 796 | }, 797 | "outputs": [], 798 | "source": [] 799 | } 800 | ], 801 | "metadata": { 802 | "colab": { 803 | "authorship_tag": "ABX9TyOu3xR/1JAiPy608KPO62Wq", 804 | "collapsed_sections": [], 805 | "mount_file_id": "1XRYrPikxSuVab8l-DhT_L5o44vnz3fDE", 806 | "name": "Untitled2.ipynb", 807 | "provenance": [] 808 | }, 809 | "kernelspec": { 810 | "display_name": "Python 3 (ipykernel)", 811 | "language": "python", 812 | "name": "python3" 813 | }, 814 | "language_info": { 815 | "codemirror_mode": { 816 | "name": "ipython", 817 | "version": 3 818 | }, 819 | "file_extension": ".py", 820 | "mimetype": "text/x-python", 821 | "name": "python", 822 | "nbconvert_exporter": "python", 823 | "pygments_lexer": "ipython3", 824 | "version": "3.9.7" 825 | } 826 | }, 827 | "nbformat": 4, 828 | "nbformat_minor": 1 829 | } 830 | -------------------------------------------------------------------------------- /regularization-in-machine-learning/regularization-in-machine-learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
"cell_type": "markdown", 5 | "id": "85a85782", 6 | "metadata": { 7 | "papermill": { 8 | "duration": 0.042361, 9 | "end_time": "2021-12-22T19:07:37.877879", 10 | "exception": false, 11 | "start_time": "2021-12-22T19:07:37.835518", 12 | "status": "completed" 13 | }, 14 | "tags": [] 15 | }, 16 | "source": [ 17 | "### Regularization in Machine Learning" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "11910490", 23 | "metadata": { 24 | "papermill": { 25 | "duration": 0.035764, 26 | "end_time": "2021-12-22T19:07:37.950117", 27 | "exception": false, 28 | "start_time": "2021-12-22T19:07:37.914353", 29 | "status": "completed" 30 | }, 31 | "tags": [] 32 | }, 33 | "source": [ 34 | "# what is regularization in ML\n", 35 | "\n", 36 | "- a technique to prevent the model from overfitting by adding extra information to it.\n", 37 | "- it maintain all variables or features in the model by reducing the magnitude of the variables. \n", 38 | "- Hence, it maintains accuracy as well as a generalization of the model.\n", 39 | "- In simple words, \"In regularization technique, we reduce the magnitude of the features by keeping the same number of features.\"\n", 40 | "- mainly regularizes or reduces the coefficient of features toward zero" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 1, 46 | "id": "646c8325", 47 | "metadata": { 48 | "execution": { 49 | "iopub.execute_input": "2021-12-22T19:07:38.029150Z", 50 | "iopub.status.busy": "2021-12-22T19:07:38.027353Z", 51 | "iopub.status.idle": "2021-12-22T19:07:38.032283Z", 52 | "shell.execute_reply": "2021-12-22T19:07:38.032799Z" 53 | }, 54 | "papermill": { 55 | "duration": 0.046738, 56 | "end_time": "2021-12-22T19:07:38.033119", 57 | "exception": false, 58 | "start_time": "2021-12-22T19:07:37.986381", 59 | "status": "completed" 60 | }, 61 | "tags": [] 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "# Basics of regularization" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "id": "c3676748", 
71 | "metadata": { 72 | "papermill": { 73 | "duration": 0.036146, 74 | "end_time": "2021-12-22T19:07:38.108836", 75 | "exception": false, 76 | "start_time": "2021-12-22T19:07:38.072690", 77 | "status": "completed" 78 | }, 79 | "tags": [] 80 | }, 81 | "source": [ 82 | "- a technique to prevent the model from overfitting by adding extra information to it.\n", 83 | "- maintains accuracy as well as the generalization of the model\n", 84 | "- reduces the magnitude of the variables, hence maintains all variables or features\n", 85 | "- In simple words, \"In regularization technique, we reduce the magnitude of the features by keeping the same number of features\"\n", 86 | "- works by adding a penalty or complexity term to the complex model" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 2, 92 | "id": "275c826f", 93 | "metadata": { 94 | "execution": { 95 | "iopub.execute_input": "2021-12-22T19:07:38.185704Z", 96 | "iopub.status.busy": "2021-12-22T19:07:38.184715Z", 97 | "iopub.status.idle": "2021-12-22T19:07:38.188117Z", 98 | "shell.execute_reply": "2021-12-22T19:07:38.188755Z" 99 | }, 100 | "papermill": { 101 | "duration": 0.043564, 102 | "end_time": "2021-12-22T19:07:38.188919", 103 | "exception": false, 104 | "start_time": "2021-12-22T19:07:38.145355", 105 | "status": "completed" 106 | }, 107 | "tags": [] 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "# How does Regularization Work?" 
112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "b8aea070", 117 | "metadata": { 118 | "papermill": { 119 | "duration": 0.036276, 120 | "end_time": "2021-12-22T19:07:38.261804", 121 | "exception": false, 122 | "start_time": "2021-12-22T19:07:38.225528", 123 | "status": "completed" 124 | }, 125 | "tags": [] 126 | }, 127 | "source": [ 128 | "Let's consider the simple linear regression equation:\n", 129 | "$y = \\beta_1 x_1 + \\beta_2 x_2 + \\beta_3 x_3 + \\cdots + \\beta_n x_n + b$\n", 130 | "\n", 131 | "$y$ represents the value to be predicted.\n", 132 | "$x_1, x_2, \\ldots, x_n$ are the features used to predict $y$.\n", 133 | "\n", 134 | "$\\beta_1, \\beta_2, \\ldots, \\beta_n$ are the weights (coefficient magnitudes) of the features.\n", 135 | "$b$ represents the intercept.\n", 136 | "\n", 137 | "The loss function for linear regression is called the RSS, or residual sum of squares.\n", 138 | "\n", 139 | "Techniques of Regularization:\n", 140 | "• Ridge Regression\n", 141 | "• Lasso Regression" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 3, 147 | "id": "dfac9cb6", 148 | "metadata": { 149 | "execution": { 150 | "iopub.execute_input": "2021-12-22T19:07:38.338525Z", 151 | "iopub.status.busy": "2021-12-22T19:07:38.337603Z", 152 | "iopub.status.idle": "2021-12-22T19:07:38.342423Z", 153 | "shell.execute_reply": "2021-12-22T19:07:38.343040Z" 154 | }, 155 | "papermill": { 156 | "duration": 0.04495, 157 | "end_time": "2021-12-22T19:07:38.343227", 158 | "exception": false, 159 | "start_time": "2021-12-22T19:07:38.298277", 160 | "status": "completed" 161 | }, 162 | "tags": [] 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "# Ridge regression:" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "id": "8246b5e3", 172 | "metadata": { 173 | "papermill": { 174 | "duration": 0.036138, 175 | "end_time": "2021-12-22T19:07:38.416009", 176 | "exception": false, 177 | "start_time": "2021-12-22T19:07:38.379871", 178 | "status": "completed" 179 | }, 180 | "tags": [] 181 | }, 182 | "source": [ 183 | "- a small amount of bias is added\n", 184 | "- 
reduces the complexity of the model\n", 185 | "- also called L2 regularization\n", 186 | "- cost function is altered by adding the penalty term to it\n", 187 | "- the amount of bias added to the model is called the Ridge Regression penalty." 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "id": "c40b82bb", 193 | "metadata": { 194 | "papermill": { 195 | "duration": 0.036398, 196 | "end_time": "2021-12-22T19:07:38.488992", 197 | "exception": false, 198 | "start_time": "2021-12-22T19:07:38.452594", 199 | "status": "completed" 200 | }, 201 | "tags": [] 202 | }, 203 | "source": [ 204 | "From the cost function of Ridge Regression, $\\text{Cost} = \\text{RSS} + \\lambda \\sum_{j=1}^{n} \\beta_j^2$, we can see that as the value of λ tends to zero, the equation becomes the cost function of the linear regression model.\n", 205 | "\n", 206 | "A general linear or polynomial regression becomes unreliable if there is high collinearity between the independent variables, so to solve such problems, Ridge regression can be used." 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 4, 212 | "id": "91722b0d", 213 | "metadata": { 214 | "execution": { 215 | "iopub.execute_input": "2021-12-22T19:07:38.566612Z", 216 | "iopub.status.busy": "2021-12-22T19:07:38.565649Z", 217 | "iopub.status.idle": "2021-12-22T19:07:38.568855Z", 218 | "shell.execute_reply": "2021-12-22T19:07:38.569391Z" 219 | }, 220 | "papermill": { 221 | "duration": 0.043271, 222 | "end_time": "2021-12-22T19:07:38.569551", 223 | "exception": false, 224 | "start_time": "2021-12-22T19:07:38.526280", 225 | "status": "completed" 226 | }, 227 | "tags": [] 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "# Lasso regression" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "id": "e18ae18a", 237 | "metadata": { 238 | "papermill": { 239 | "duration": 0.036278, 240 | "end_time": "2021-12-22T19:07:38.642318", 241 | "exception": false, 242 | "start_time": "2021-12-22T19:07:38.606040", 243 | "status": "completed" 244 | }, 245 | "tags": [] 246 | }, 247 | 
"source": [ 248 | "Lasso Regression:\n", 249 | "- stands for Least Absolute Shrinkage and Selection Operator\n", 250 | "- also called L1 regularization\n", 251 | "- reduces the complexity of the model\n", 252 | "- similar to Ridge Regression, except that the penalty term contains the absolute values of the weights instead of their squares\n", 253 | "- because it takes absolute values, it can shrink the slope exactly to zero\n", 254 | "- whereas Ridge Regression can only shrink it close to zero.\n", 255 | "- some of the features are therefore completely neglected in the model\n", 256 | "- hence Lasso helps with both reducing overfitting and feature selection" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "id": "1e22ea3d", 262 | "metadata": { 263 | "papermill": { 264 | "duration": 0.035978, 265 | "end_time": "2021-12-22T19:07:38.714585", 266 | "exception": false, 267 | "start_time": "2021-12-22T19:07:38.678607", 268 | "status": "completed" 269 | }, 270 | "tags": [] 271 | }, 272 | "source": [ 273 | "Lasso Regression adds the “absolute value of magnitude” of the coefficients as a penalty term to the loss function (L). \n", 274 | "Ridge Regression adds the “squared magnitude” of the coefficients as a penalty term to the loss function (L)." 
275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 5, 280 | "id": "5dfc333a", 281 | "metadata": { 282 | "execution": { 283 | "iopub.execute_input": "2021-12-22T19:07:38.791141Z", 284 | "iopub.status.busy": "2021-12-22T19:07:38.790203Z", 285 | "iopub.status.idle": "2021-12-22T19:07:38.793467Z", 286 | "shell.execute_reply": "2021-12-22T19:07:38.794295Z" 287 | }, 288 | "papermill": { 289 | "duration": 0.043469, 290 | "end_time": "2021-12-22T19:07:38.794464", 291 | "exception": false, 292 | "start_time": "2021-12-22T19:07:38.750995", 293 | "status": "completed" 294 | }, 295 | "tags": [] 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "### Implementation of Lasso Regression" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 6, 305 | "id": "f1be8d74", 306 | "metadata": { 307 | "execution": { 308 | "iopub.execute_input": "2021-12-22T19:07:38.871418Z", 309 | "iopub.status.busy": "2021-12-22T19:07:38.870473Z", 310 | "iopub.status.idle": "2021-12-22T19:07:38.879330Z", 311 | "shell.execute_reply": "2021-12-22T19:07:38.879908Z" 312 | }, 313 | "papermill": { 314 | "duration": 0.048782, 315 | "end_time": "2021-12-22T19:07:38.880083", 316 | "exception": false, 317 | "start_time": "2021-12-22T19:07:38.831301", 318 | "status": "completed" 319 | }, 320 | "tags": [] 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "import pandas as pd\n", 325 | "import numpy as np\n", 326 | "import matplotlib.pyplot as plt" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "id": "ab060d3e", 332 | "metadata": { 333 | "papermill": { 334 | "duration": 0.036029, 335 | "end_time": "2021-12-22T19:07:38.952464", 336 | "exception": false, 337 | "start_time": "2021-12-22T19:07:38.916435", 338 | "status": "completed" 339 | }, 340 | "tags": [] 341 | }, 342 | "source": [ 343 | "We are going to use the Boston house prediction dataset, that is an inbuilt dataset in sklearn" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | 
"execution_count": 7, 349 | "id": "e06717e2", 350 | "metadata": { 351 | "execution": { 352 | "iopub.execute_input": "2021-12-22T19:07:39.030301Z", 353 | "iopub.status.busy": "2021-12-22T19:07:39.029298Z", 354 | "iopub.status.idle": "2021-12-22T19:07:40.031602Z", 355 | "shell.execute_reply": "2021-12-22T19:07:40.032128Z" 356 | }, 357 | "papermill": { 358 | "duration": 1.043508, 359 | "end_time": "2021-12-22T19:07:40.032329", 360 | "exception": false, 361 | "start_time": "2021-12-22T19:07:38.988821", 362 | "status": "completed" 363 | }, 364 | "tags": [] 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "from sklearn.datasets import load_boston\n", 369 | "boston=load_boston()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 8, 375 | "id": "9e81cdf1", 376 | "metadata": { 377 | "execution": { 378 | "iopub.execute_input": "2021-12-22T19:07:40.116864Z", 379 | "iopub.status.busy": "2021-12-22T19:07:40.116015Z", 380 | "iopub.status.idle": "2021-12-22T19:07:40.119134Z", 381 | "shell.execute_reply": "2021-12-22T19:07:40.119605Z" 382 | }, 383 | "papermill": { 384 | "duration": 0.048494, 385 | "end_time": "2021-12-22T19:07:40.119782", 386 | "exception": false, 387 | "start_time": "2021-12-22T19:07:40.071288", 388 | "status": "completed" 389 | }, 390 | "tags": [] 391 | }, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/plain": [ 396 | "['DESCR', 'data', 'feature_names', 'filename', 'target']" 397 | ] 398 | }, 399 | "execution_count": 8, 400 | "metadata": {}, 401 | "output_type": "execute_result" 402 | } 403 | ], 404 | "source": [ 405 | "# Getting attributes of boston\n", 406 | "dir(boston)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 9, 412 | "id": "25ce3d97", 413 | "metadata": { 414 | "execution": { 415 | "iopub.execute_input": "2021-12-22T19:07:40.198755Z", 416 | "iopub.status.busy": "2021-12-22T19:07:40.198029Z", 417 | "iopub.status.idle": "2021-12-22T19:07:40.202579Z", 418 | "shell.execute_reply": 
"2021-12-22T19:07:40.203158Z" 419 | }, 420 | "papermill": { 421 | "duration": 0.045441, 422 | "end_time": "2021-12-22T19:07:40.203322", 423 | "exception": false, 424 | "start_time": "2021-12-22T19:07:40.157881", 425 | "status": "completed" 426 | }, 427 | "tags": [] 428 | }, 429 | "outputs": [ 430 | { 431 | "data": { 432 | "text/plain": [ 433 | "\".. _boston_dataset:\\n\\nBoston house prices dataset\\n---------------------------\\n\\n**Data Set Characteristics:** \\n\\n :Number of Instances: 506 \\n\\n :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\\n\\n :Attribute Information (in order):\\n - CRIM per capita crime rate by town\\n - ZN proportion of residential land zoned for lots over 25,000 sq.ft.\\n - INDUS proportion of non-retail business acres per town\\n - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\\n - NOX nitric oxides concentration (parts per 10 million)\\n - RM average number of rooms per dwelling\\n - AGE proportion of owner-occupied units built prior to 1940\\n - DIS weighted distances to five Boston employment centres\\n - RAD index of accessibility to radial highways\\n - TAX full-value property-tax rate per $10,000\\n - PTRATIO pupil-teacher ratio by town\\n - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\\n - LSTAT % lower status of the population\\n - MEDV Median value of owner-occupied homes in $1000's\\n\\n :Missing Attribute Values: None\\n\\n :Creator: Harrison, D. and Rubinfeld, D.L.\\n\\nThis is a copy of UCI ML housing dataset.\\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\\n\\n\\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\\n\\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\\nprices and the demand for clean air', J. Environ. Economics & Management,\\nvol.5, 81-102, 1978. 
Used in Belsley, Kuh & Welsch, 'Regression diagnostics\\n...', Wiley, 1980. N.B. Various transformations are used in the table on\\npages 244-261 of the latter.\\n\\nThe Boston house-price data has been used in many machine learning papers that address regression\\nproblems. \\n \\n.. topic:: References\\n\\n - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\\n - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\\n\"" 434 | ] 435 | }, 436 | "execution_count": 9, 437 | "metadata": {}, 438 | "output_type": "execute_result" 439 | } 440 | ], 441 | "source": [ 442 | "# printing description\n", 443 | "boston.DESCR" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 10, 449 | "id": "8ebb5684", 450 | "metadata": { 451 | "execution": { 452 | "iopub.execute_input": "2021-12-22T19:07:40.288364Z", 453 | "iopub.status.busy": "2021-12-22T19:07:40.287693Z", 454 | "iopub.status.idle": "2021-12-22T19:07:40.290236Z", 455 | "shell.execute_reply": "2021-12-22T19:07:40.290736Z" 456 | }, 457 | "papermill": { 458 | "duration": 0.048769, 459 | "end_time": "2021-12-22T19:07:40.290894", 460 | "exception": false, 461 | "start_time": "2021-12-22T19:07:40.242125", 462 | "status": "completed" 463 | }, 464 | "tags": [] 465 | }, 466 | "outputs": [ 467 | { 468 | "data": { 469 | "text/plain": [ 470 | "array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,\n", 471 | " 4.9800e+00],\n", 472 | " [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,\n", 473 | " 9.1400e+00],\n", 474 | " [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,\n", 475 | " 4.0300e+00],\n", 476 | " ...,\n", 477 | " [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,\n", 478 | " 5.6400e+00],\n", 479 | " 
[1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,\n", 480 | " 6.4800e+00],\n", 481 | " [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,\n", 482 | " 7.8800e+00]])" 483 | ] 484 | }, 485 | "execution_count": 10, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "# Printing \"data\" attributes of the dataset, its our input \n", 492 | "boston.data" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 11, 498 | "id": "2931de15", 499 | "metadata": { 500 | "execution": { 501 | "iopub.execute_input": "2021-12-22T19:07:40.371336Z", 502 | "iopub.status.busy": "2021-12-22T19:07:40.370379Z", 503 | "iopub.status.idle": "2021-12-22T19:07:40.376451Z", 504 | "shell.execute_reply": "2021-12-22T19:07:40.375892Z" 505 | }, 506 | "papermill": { 507 | "duration": 0.047497, 508 | "end_time": "2021-12-22T19:07:40.376583", 509 | "exception": false, 510 | "start_time": "2021-12-22T19:07:40.329086", 511 | "status": "completed" 512 | }, 513 | "tags": [] 514 | }, 515 | "outputs": [ 516 | { 517 | "data": { 518 | "text/plain": [ 519 | "array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',\n", 520 | " 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='\n", 621 | "\n", 634 | "\n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.94.98
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.99.14
\n", 688 | "
" 689 | ], 690 | "text/plain": [ 691 | " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", 692 | "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", 693 | "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", 694 | "\n", 695 | " PTRATIO B LSTAT \n", 696 | "0 15.3 396.9 4.98 \n", 697 | "1 17.8 396.9 9.14 " 698 | ] 699 | }, 700 | "execution_count": 14, 701 | "metadata": {}, 702 | "output_type": "execute_result" 703 | } 704 | ], 705 | "source": [ 706 | "# Printing first 2 rows of the dataframe 'df'\n", 707 | "df.head(2)" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": 15, 713 | "id": "941a7867", 714 | "metadata": { 715 | "execution": { 716 | "iopub.execute_input": "2021-12-22T19:07:40.746594Z", 717 | "iopub.status.busy": "2021-12-22T19:07:40.741140Z", 718 | "iopub.status.idle": "2021-12-22T19:07:40.749244Z", 719 | "shell.execute_reply": "2021-12-22T19:07:40.749813Z" 720 | }, 721 | "papermill": { 722 | "duration": 0.052214, 723 | "end_time": "2021-12-22T19:07:40.749987", 724 | "exception": false, 725 | "start_time": "2021-12-22T19:07:40.697773", 726 | "status": "completed" 727 | }, 728 | "tags": [] 729 | }, 730 | "outputs": [], 731 | "source": [ 732 | "# adding a new column 'target' from boston.target\n", 733 | "df['target']=boston.target" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": 16, 739 | "id": "313f6168", 740 | "metadata": { 741 | "execution": { 742 | "iopub.execute_input": "2021-12-22T19:07:40.835505Z", 743 | "iopub.status.busy": "2021-12-22T19:07:40.834548Z", 744 | "iopub.status.idle": "2021-12-22T19:07:40.852884Z", 745 | "shell.execute_reply": "2021-12-22T19:07:40.852218Z" 746 | }, 747 | "papermill": { 748 | "duration": 0.062957, 749 | "end_time": "2021-12-22T19:07:40.853023", 750 | "exception": false, 751 | "start_time": "2021-12-22T19:07:40.790066", 752 | "status": "completed" 753 | }, 754 | "tags": [] 755 | }, 756 | "outputs": [ 757 | { 758 | "data": { 759 | "text/html": [ 760 
| "
\n", 761 | "\n", 774 | "\n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATtarget
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.94.9824.0
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.99.1421.6
\n", 831 | "
" 832 | ], 833 | "text/plain": [ 834 | " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", 835 | "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", 836 | "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", 837 | "\n", 838 | " PTRATIO B LSTAT target \n", 839 | "0 15.3 396.9 4.98 24.0 \n", 840 | "1 17.8 396.9 9.14 21.6 " 841 | ] 842 | }, 843 | "execution_count": 16, 844 | "metadata": {}, 845 | "output_type": "execute_result" 846 | } 847 | ], 848 | "source": [ 849 | "df.head(2)" 850 | ] 851 | }, 852 | { 853 | "cell_type": "code", 854 | "execution_count": 17, 855 | "id": "be488e24", 856 | "metadata": { 857 | "execution": { 858 | "iopub.execute_input": "2021-12-22T19:07:40.957263Z", 859 | "iopub.status.busy": "2021-12-22T19:07:40.956587Z", 860 | "iopub.status.idle": "2021-12-22T19:07:40.959429Z", 861 | "shell.execute_reply": "2021-12-22T19:07:40.960162Z" 862 | }, 863 | "papermill": { 864 | "duration": 0.065797, 865 | "end_time": "2021-12-22T19:07:40.960381", 866 | "exception": false, 867 | "start_time": "2021-12-22T19:07:40.894584", 868 | "status": "completed" 869 | }, 870 | "tags": [] 871 | }, 872 | "outputs": [ 873 | { 874 | "name": "stdout", 875 | "output_type": "stream", 876 | "text": [ 877 | "\n", 878 | "RangeIndex: 506 entries, 0 to 505\n", 879 | "Data columns (total 14 columns):\n", 880 | " # Column Non-Null Count Dtype \n", 881 | "--- ------ -------------- ----- \n", 882 | " 0 CRIM 506 non-null float64\n", 883 | " 1 ZN 506 non-null float64\n", 884 | " 2 INDUS 506 non-null float64\n", 885 | " 3 CHAS 506 non-null float64\n", 886 | " 4 NOX 506 non-null float64\n", 887 | " 5 RM 506 non-null float64\n", 888 | " 6 AGE 506 non-null float64\n", 889 | " 7 DIS 506 non-null float64\n", 890 | " 8 RAD 506 non-null float64\n", 891 | " 9 TAX 506 non-null float64\n", 892 | " 10 PTRATIO 506 non-null float64\n", 893 | " 11 B 506 non-null float64\n", 894 | " 12 LSTAT 506 non-null float64\n", 895 | " 13 target 506 non-null float64\n", 896 | "dtypes: 
float64(14)\n", 897 | "memory usage: 55.5 KB\n" 898 | ] 899 | } 900 | ], 901 | "source": [ 902 | "# Printing a concise summary of the dataset\n", 903 | "df.info()" 904 | ] 905 | }, 906 | { 907 | "cell_type": "markdown", 908 | "id": "550c0ff9", 909 | "metadata": { 910 | "papermill": { 911 | "duration": 0.04135, 912 | "end_time": "2021-12-22T19:07:41.044965", 913 | "exception": false, 914 | "start_time": "2021-12-22T19:07:41.003615", 915 | "status": "completed" 916 | }, 917 | "tags": [] 918 | }, 919 | "source": [ 920 | "- we have 13 independent variables and one dependent variable (house price)" 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": 18, 926 | "id": "5c923e03", 927 | "metadata": { 928 | "execution": { 929 | "iopub.execute_input": "2021-12-22T19:07:41.135908Z", 930 | "iopub.status.busy": "2021-12-22T19:07:41.135229Z", 931 | "iopub.status.idle": "2021-12-22T19:07:41.137601Z", 932 | "shell.execute_reply": "2021-12-22T19:07:41.138114Z" 933 | }, 934 | "papermill": { 935 | "duration": 0.050966, 936 | "end_time": "2021-12-22T19:07:41.138293", 937 | "exception": false, 938 | "start_time": "2021-12-22T19:07:41.087327", 939 | "status": "completed" 940 | }, 941 | "tags": [] 942 | }, 943 | "outputs": [], 944 | "source": [ 945 | "X=df.iloc[:,:-1].values\n", 946 | "y=df.iloc[:,-1].values" 947 | ] 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": 19, 952 | "id": "0dd9ce88", 953 | "metadata": { 954 | "execution": { 955 | "iopub.execute_input": "2021-12-22T19:07:41.224776Z", 956 | "iopub.status.busy": "2021-12-22T19:07:41.224113Z", 957 | "iopub.status.idle": "2021-12-22T19:07:41.275596Z", 958 | "shell.execute_reply": "2021-12-22T19:07:41.275042Z" 959 | }, 960 | "papermill": { 961 | "duration": 0.095782, 962 | "end_time": "2021-12-22T19:07:41.275750", 963 | "exception": false, 964 | "start_time": "2021-12-22T19:07:41.179968", 965 | "status": "completed" 966 | }, 967 | "tags": [] 968 | }, 969 | "outputs": [], 970 | "source": [ 
971 | "from sklearn.model_selection import train_test_split" 972 | ] 973 | }, 974 | { 975 | "cell_type": "code", 976 | "execution_count": 20, 977 | "id": "1126063d", 978 | "metadata": { 979 | "execution": { 980 | "iopub.execute_input": "2021-12-22T19:07:41.361843Z", 981 | "iopub.status.busy": "2021-12-22T19:07:41.361257Z", 982 | "iopub.status.idle": "2021-12-22T19:07:41.366540Z", 983 | "shell.execute_reply": "2021-12-22T19:07:41.367029Z" 984 | }, 985 | "papermill": { 986 | "duration": 0.05011, 987 | "end_time": "2021-12-22T19:07:41.367189", 988 | "exception": false, 989 | "start_time": "2021-12-22T19:07:41.317079", 990 | "status": "completed" 991 | }, 992 | "tags": [] 993 | }, 994 | "outputs": [], 995 | "source": [ 996 | "X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25)" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": 21, 1002 | "id": "1a913510", 1003 | "metadata": { 1004 | "execution": { 1005 | "iopub.execute_input": "2021-12-22T19:07:41.455019Z", 1006 | "iopub.status.busy": "2021-12-22T19:07:41.454401Z", 1007 | "iopub.status.idle": "2021-12-22T19:07:41.459162Z", 1008 | "shell.execute_reply": "2021-12-22T19:07:41.459617Z" 1009 | }, 1010 | "papermill": { 1011 | "duration": 0.051016, 1012 | "end_time": "2021-12-22T19:07:41.459791", 1013 | "exception": false, 1014 | "start_time": "2021-12-22T19:07:41.408775", 1015 | "status": "completed" 1016 | }, 1017 | "tags": [] 1018 | }, 1019 | "outputs": [ 1020 | { 1021 | "name": "stdout", 1022 | "output_type": "stream", 1023 | "text": [ 1024 | "(379, 13) (379,)\n" 1025 | ] 1026 | } 1027 | ], 1028 | "source": [ 1029 | "print(X_train.shape,y_train.shape)" 1030 | ] 1031 | }, 1032 | { 1033 | "cell_type": "code", 1034 | "execution_count": 22, 1035 | "id": "821c4385", 1036 | "metadata": { 1037 | "execution": { 1038 | "iopub.execute_input": "2021-12-22T19:07:41.547471Z", 1039 | "iopub.status.busy": "2021-12-22T19:07:41.546860Z", 1040 | "iopub.status.idle": "2021-12-22T19:07:41.552895Z", 
1041 | "shell.execute_reply": "2021-12-22T19:07:41.552335Z" 1042 | }, 1043 | "papermill": { 1044 | "duration": 0.05103, 1045 | "end_time": "2021-12-22T19:07:41.553036", 1046 | "exception": false, 1047 | "start_time": "2021-12-22T19:07:41.502006", 1048 | "status": "completed" 1049 | }, 1050 | "tags": [] 1051 | }, 1052 | "outputs": [ 1053 | { 1054 | "name": "stdout", 1055 | "output_type": "stream", 1056 | "text": [ 1057 | "(127, 13) (127,)\n" 1058 | ] 1059 | } 1060 | ], 1061 | "source": [ 1062 | "print(X_test.shape,y_test.shape)" 1063 | ] 1064 | }, 1065 | { 1066 | "cell_type": "code", 1067 | "execution_count": 23, 1068 | "id": "e1df2e09", 1069 | "metadata": { 1070 | "execution": { 1071 | "iopub.execute_input": "2021-12-22T19:07:41.643135Z", 1072 | "iopub.status.busy": "2021-12-22T19:07:41.642472Z", 1073 | "iopub.status.idle": "2021-12-22T19:07:41.720937Z", 1074 | "shell.execute_reply": "2021-12-22T19:07:41.720254Z" 1075 | }, 1076 | "papermill": { 1077 | "duration": 0.125329, 1078 | "end_time": "2021-12-22T19:07:41.721071", 1079 | "exception": false, 1080 | "start_time": "2021-12-22T19:07:41.595742", 1081 | "status": "completed" 1082 | }, 1083 | "tags": [] 1084 | }, 1085 | "outputs": [], 1086 | "source": [ 1087 | "# now we will start training of the model on multiple regression\n", 1088 | "from sklearn.linear_model import LinearRegression\n", 1089 | "lr=LinearRegression()" 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "code", 1094 | "execution_count": 24, 1095 | "id": "0cf58a86", 1096 | "metadata": { 1097 | "execution": { 1098 | "iopub.execute_input": "2021-12-22T19:07:41.811926Z", 1099 | "iopub.status.busy": "2021-12-22T19:07:41.811262Z", 1100 | "iopub.status.idle": "2021-12-22T19:07:41.832465Z", 1101 | "shell.execute_reply": "2021-12-22T19:07:41.831844Z" 1102 | }, 1103 | "papermill": { 1104 | "duration": 0.06874, 1105 | "end_time": "2021-12-22T19:07:41.832601", 1106 | "exception": false, 1107 | "start_time": "2021-12-22T19:07:41.763861", 1108 | "status": 
"completed" 1109 | }, 1110 | "tags": [] 1111 | }, 1112 | "outputs": [ 1113 | { 1114 | "data": { 1115 | "text/plain": [ 1116 | "LinearRegression()" 1117 | ] 1118 | }, 1119 | "execution_count": 24, 1120 | "metadata": {}, 1121 | "output_type": "execute_result" 1122 | } 1123 | ], 1124 | "source": [ 1125 | "lr.fit(X_train, y_train)" 1126 | ] 1127 | }, 1128 | { 1129 | "cell_type": "code", 1130 | "execution_count": 25, 1131 | "id": "9d2d414a", 1132 | "metadata": { 1133 | "execution": { 1134 | "iopub.execute_input": "2021-12-22T19:07:41.924444Z", 1135 | "iopub.status.busy": "2021-12-22T19:07:41.922358Z", 1136 | "iopub.status.idle": "2021-12-22T19:07:41.927036Z", 1137 | "shell.execute_reply": "2021-12-22T19:07:41.926431Z" 1138 | }, 1139 | "papermill": { 1140 | "duration": 0.051007, 1141 | "end_time": "2021-12-22T19:07:41.927167", 1142 | "exception": false, 1143 | "start_time": "2021-12-22T19:07:41.876160", 1144 | "status": "completed" 1145 | }, 1146 | "tags": [] 1147 | }, 1148 | "outputs": [], 1149 | "source": [ 1150 | "lr_pred=lr.predict(X_test)" 1151 | ] 1152 | }, 1153 | { 1154 | "cell_type": "code", 1155 | "execution_count": 26, 1156 | "id": "ea2fddf5", 1157 | "metadata": { 1158 | "execution": { 1159 | "iopub.execute_input": "2021-12-22T19:07:42.021312Z", 1160 | "iopub.status.busy": "2021-12-22T19:07:42.020599Z", 1161 | "iopub.status.idle": "2021-12-22T19:07:42.024191Z", 1162 | "shell.execute_reply": "2021-12-22T19:07:42.024795Z" 1163 | }, 1164 | "papermill": { 1165 | "duration": 0.053355, 1166 | "end_time": "2021-12-22T19:07:42.024956", 1167 | "exception": false, 1168 | "start_time": "2021-12-22T19:07:41.971601", 1169 | "status": "completed" 1170 | }, 1171 | "tags": [] 1172 | }, 1173 | "outputs": [ 1174 | { 1175 | "data": { 1176 | "text/plain": [ 1177 | "19.73771080470582" 1178 | ] 1179 | }, 1180 | "execution_count": 26, 1181 | "metadata": {}, 1182 | "output_type": "execute_result" 1183 | } 1184 | ], 1185 | "source": [ 1186 | "# calculation mean squared error\n", 1187 | 
"mse=np.mean((lr_pred-y_test)**2)\n", 1188 | "mse" 1189 | ] 1190 | }, 1191 | { 1192 | "cell_type": "code", 1193 | "execution_count": 27, 1194 | "id": "393d46ab", 1195 | "metadata": { 1196 | "execution": { 1197 | "iopub.execute_input": "2021-12-22T19:07:42.116953Z", 1198 | "iopub.status.busy": "2021-12-22T19:07:42.116337Z", 1199 | "iopub.status.idle": "2021-12-22T19:07:42.126336Z", 1200 | "shell.execute_reply": "2021-12-22T19:07:42.126853Z" 1201 | }, 1202 | "papermill": { 1203 | "duration": 0.058001, 1204 | "end_time": "2021-12-22T19:07:42.127035", 1205 | "exception": false, 1206 | "start_time": "2021-12-22T19:07:42.069034", 1207 | "status": "completed" 1208 | }, 1209 | "tags": [] 1210 | }, 1211 | "outputs": [ 1212 | { 1213 | "name": "stdout", 1214 | "output_type": "stream", 1215 | "text": [ 1216 | " Columns Coefficient Values\n", 1217 | "0 CRIM -0.066498\n", 1218 | "1 ZN 0.053051\n", 1219 | "2 INDUS 0.041127\n", 1220 | "3 CHAS 3.502430\n", 1221 | "4 NOX -18.380600\n", 1222 | "5 RM 3.456135\n", 1223 | "6 AGE 0.012149\n", 1224 | "7 DIS -1.543379\n", 1225 | "8 RAD 0.296151\n", 1226 | "9 TAX -0.012449\n", 1227 | "10 PTRATIO -0.890911\n", 1228 | "11 B 0.011632\n", 1229 | "12 LSTAT -0.606322\n", 1230 | "13 target NaN\n" 1231 | ] 1232 | } 1233 | ], 1234 | "source": [ 1235 | "# Putting together the coefficient and their columns\n", 1236 | "\n", 1237 | "lr_coeff=pd.DataFrame()\n", 1238 | "lr_coeff['Columns']=df.columns\n", 1239 | "lr_coeff['Coefficient Values']=pd.Series(lr.coef_)\n", 1240 | "\n", 1241 | "print(lr_coeff)" 1242 | ] 1243 | }, 1244 | { 1245 | "cell_type": "markdown", 1246 | "id": "749d3787", 1247 | "metadata": { 1248 | "papermill": { 1249 | "duration": 0.044005, 1250 | "end_time": "2021-12-22T19:07:42.215203", 1251 | "exception": false, 1252 | "start_time": "2021-12-22T19:07:42.171198", 1253 | "status": "completed" 1254 | }, 1255 | "tags": [] 1256 | }, 1257 | "source": [ 1258 | "- We can see that most of the columns do not have significant coefficients and hence 
they do not contribute much in model performance,\n", 1259 | "- we need to regularize the model" 1260 | ] 1261 | }, 1262 | { 1263 | "cell_type": "code", 1264 | "execution_count": 28, 1265 | "id": "2cc429b5", 1266 | "metadata": { 1267 | "execution": { 1268 | "iopub.execute_input": "2021-12-22T19:07:42.308891Z", 1269 | "iopub.status.busy": "2021-12-22T19:07:42.308216Z", 1270 | "iopub.status.idle": "2021-12-22T19:07:42.310474Z", 1271 | "shell.execute_reply": "2021-12-22T19:07:42.311006Z" 1272 | }, 1273 | "papermill": { 1274 | "duration": 0.051247, 1275 | "end_time": "2021-12-22T19:07:42.311167", 1276 | "exception": false, 1277 | "start_time": "2021-12-22T19:07:42.259920", 1278 | "status": "completed" 1279 | }, 1280 | "tags": [] 1281 | }, 1282 | "outputs": [], 1283 | "source": [ 1284 | "# Regularizing using ridge regression\n", 1285 | "from sklearn.linear_model import Ridge" 1286 | ] 1287 | }, 1288 | { 1289 | "cell_type": "code", 1290 | "execution_count": 29, 1291 | "id": "4ffa3dfc", 1292 | "metadata": { 1293 | "execution": { 1294 | "iopub.execute_input": "2021-12-22T19:07:42.403533Z", 1295 | "iopub.status.busy": "2021-12-22T19:07:42.402930Z", 1296 | "iopub.status.idle": "2021-12-22T19:07:42.405692Z", 1297 | "shell.execute_reply": "2021-12-22T19:07:42.406297Z" 1298 | }, 1299 | "papermill": { 1300 | "duration": 0.050914, 1301 | "end_time": "2021-12-22T19:07:42.406456", 1302 | "exception": false, 1303 | "start_time": "2021-12-22T19:07:42.355542", 1304 | "status": "completed" 1305 | }, 1306 | "tags": [] 1307 | }, 1308 | "outputs": [], 1309 | "source": [ 1310 | "ridge_reg=Ridge(alpha=1)\n", 1311 | "# here alpha parameter indicates Regularization strength; it must be a positive floating number" 1312 | ] 1313 | }, 1314 | { 1315 | "cell_type": "code", 1316 | "execution_count": 30, 1317 | "id": "a4b5a566", 1318 | "metadata": { 1319 | "execution": { 1320 | "iopub.execute_input": "2021-12-22T19:07:42.498921Z", 1321 | "iopub.status.busy": "2021-12-22T19:07:42.498315Z", 1322 | 
"iopub.status.idle": "2021-12-22T19:07:42.508619Z", 1323 | "shell.execute_reply": "2021-12-22T19:07:42.509115Z" 1324 | }, 1325 | "papermill": { 1326 | "duration": 0.058383, 1327 | "end_time": "2021-12-22T19:07:42.509276", 1328 | "exception": false, 1329 | "start_time": "2021-12-22T19:07:42.450893", 1330 | "status": "completed" 1331 | }, 1332 | "tags": [] 1333 | }, 1334 | "outputs": [ 1335 | { 1336 | "data": { 1337 | "text/plain": [ 1338 | "Ridge(alpha=1)" 1339 | ] 1340 | }, 1341 | "execution_count": 30, 1342 | "metadata": {}, 1343 | "output_type": "execute_result" 1344 | } 1345 | ], 1346 | "source": [ 1347 | "ridge_reg.fit(X_train,y_train)" 1348 | ] 1349 | }, 1350 | { 1351 | "cell_type": "code", 1352 | "execution_count": 31, 1353 | "id": "e42249cc", 1354 | "metadata": { 1355 | "execution": { 1356 | "iopub.execute_input": "2021-12-22T19:07:42.602566Z", 1357 | "iopub.status.busy": "2021-12-22T19:07:42.601969Z", 1358 | "iopub.status.idle": "2021-12-22T19:07:42.605585Z", 1359 | "shell.execute_reply": "2021-12-22T19:07:42.606182Z" 1360 | }, 1361 | "papermill": { 1362 | "duration": 0.052095, 1363 | "end_time": "2021-12-22T19:07:42.606354", 1364 | "exception": false, 1365 | "start_time": "2021-12-22T19:07:42.554259", 1366 | "status": "completed" 1367 | }, 1368 | "tags": [] 1369 | }, 1370 | "outputs": [], 1371 | "source": [ 1372 | "y_pred=ridge_reg.predict(X_test)" 1373 | ] 1374 | }, 1375 | { 1376 | "cell_type": "code", 1377 | "execution_count": 32, 1378 | "id": "07b3d94d", 1379 | "metadata": { 1380 | "execution": { 1381 | "iopub.execute_input": "2021-12-22T19:07:42.701798Z", 1382 | "iopub.status.busy": "2021-12-22T19:07:42.701151Z", 1383 | "iopub.status.idle": "2021-12-22T19:07:42.711342Z", 1384 | "shell.execute_reply": "2021-12-22T19:07:42.711796Z" 1385 | }, 1386 | "papermill": { 1387 | "duration": 0.059316, 1388 | "end_time": "2021-12-22T19:07:42.711960", 1389 | "exception": false, 1390 | "start_time": "2021-12-22T19:07:42.652644", 1391 | "status": "completed" 1392 | }, 
1393 | "tags": [] 1394 | }, 1395 | "outputs": [ 1396 | { 1397 | "name": "stdout", 1398 | "output_type": "stream", 1399 | "text": [ 1400 | " columns Coefficient estimates\n", 1401 | "0 CRIM -0.059764\n", 1402 | "1 ZN 0.053677\n", 1403 | "2 INDUS 0.004674\n", 1404 | "3 CHAS 3.309944\n", 1405 | "4 NOX -9.918291\n", 1406 | "5 RM 3.558169\n", 1407 | "6 AGE 0.003945\n", 1408 | "7 DIS -1.419434\n", 1409 | "8 RAD 0.273208\n", 1410 | "9 TAX -0.012888\n", 1411 | "10 PTRATIO -0.790406\n", 1412 | "11 B 0.012675\n", 1413 | "12 LSTAT -0.614542\n", 1414 | "13 target NaN\n" 1415 | ] 1416 | } 1417 | ], 1418 | "source": [ 1419 | "ridge_coeff=pd.DataFrame()\n", 1420 | "ridge_coeff['columns']=df.columns\n", 1421 | "ridge_coeff['Coefficient estimates']=pd.Series(ridge_reg.coef_)\n", 1422 | "print(ridge_coeff)" 1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "markdown", 1427 | "id": "c0c94de5", 1428 | "metadata": { 1429 | "papermill": { 1430 | "duration": 0.046599, 1431 | "end_time": "2021-12-22T19:07:42.804542", 1432 | "exception": false, 1433 | "start_time": "2021-12-22T19:07:42.757943", 1434 | "status": "completed" 1435 | }, 1436 | "tags": [] 1437 | }, 1438 | "source": [ 1439 | "- As we can observe from the above plots that alpha helps in regularizing the coefficient and make them converge faster. \n", 1440 | "- it shows some of the coefficients become zero. In Ridge Regularization, the coefficients can never be 0, they are just too small to observe in above plots. 
" 1441 | ] 1442 | }, 1443 | { 1444 | "cell_type": "markdown", 1445 | "id": "2547f3b9", 1446 | "metadata": { 1447 | "papermill": { 1448 | "duration": 0.045682, 1449 | "end_time": "2021-12-22T19:07:42.896858", 1450 | "exception": false, 1451 | "start_time": "2021-12-22T19:07:42.851176", 1452 | "status": "completed" 1453 | }, 1454 | "tags": [] 1455 | }, 1456 | "source": [ 1457 | "### Implementation of lasso regression using sklearn" 1458 | ] 1459 | }, 1460 | { 1461 | "cell_type": "markdown", 1462 | "id": "66b93314", 1463 | "metadata": { 1464 | "papermill": { 1465 | "duration": 0.045735, 1466 | "end_time": "2021-12-22T19:07:42.988699", 1467 | "exception": false, 1468 | "start_time": "2021-12-22T19:07:42.942964", 1469 | "status": "completed" 1470 | }, 1471 | "tags": [] 1472 | }, 1473 | "source": [ 1474 | "- In Lasso regression the penalty term is the sum of the absolute values of the coefficients (L1 penalty), in place of the sum of their squared values (L2 penalty) used by Ridge regression\n", 1475 | "- Unlike Ridge Regression, Lasso regression can completely eliminate a variable by reducing its coefficient value to 0." 
1476 | ] 1477 | }, 1478 | { 1479 | "cell_type": "code", 1480 | "execution_count": 33, 1481 | "id": "3b51c8d7", 1482 | "metadata": { 1483 | "execution": { 1484 | "iopub.execute_input": "2021-12-22T19:07:43.084868Z", 1485 | "iopub.status.busy": "2021-12-22T19:07:43.084217Z", 1486 | "iopub.status.idle": "2021-12-22T19:07:43.088349Z", 1487 | "shell.execute_reply": "2021-12-22T19:07:43.087847Z" 1488 | }, 1489 | "papermill": { 1490 | "duration": 0.054006, 1491 | "end_time": "2021-12-22T19:07:43.088502", 1492 | "exception": false, 1493 | "start_time": "2021-12-22T19:07:43.034496", 1494 | "status": "completed" 1495 | }, 1496 | "tags": [] 1497 | }, 1498 | "outputs": [], 1499 | "source": [ 1500 | "from sklearn.linear_model import Lasso\n", 1501 | "lasso=Lasso(alpha=1)" 1502 | ] 1503 | }, 1504 | { 1505 | "cell_type": "code", 1506 | "execution_count": 34, 1507 | "id": "00805f88", 1508 | "metadata": { 1509 | "execution": { 1510 | "iopub.execute_input": "2021-12-22T19:07:43.183361Z", 1511 | "iopub.status.busy": "2021-12-22T19:07:43.182730Z", 1512 | "iopub.status.idle": "2021-12-22T19:07:43.190670Z", 1513 | "shell.execute_reply": "2021-12-22T19:07:43.191217Z" 1514 | }, 1515 | "papermill": { 1516 | "duration": 0.057067, 1517 | "end_time": "2021-12-22T19:07:43.191394", 1518 | "exception": false, 1519 | "start_time": "2021-12-22T19:07:43.134327", 1520 | "status": "completed" 1521 | }, 1522 | "tags": [] 1523 | }, 1524 | "outputs": [], 1525 | "source": [ 1526 | "lasso.fit(X_train,y_train)\n", 1527 | "y_pred1=lasso.predict(X_test)" 1528 | ] 1529 | }, 1530 | { 1531 | "cell_type": "code", 1532 | "execution_count": 35, 1533 | "id": "816b5ade", 1534 | "metadata": { 1535 | "execution": { 1536 | "iopub.execute_input": "2021-12-22T19:07:43.286534Z", 1537 | "iopub.status.busy": "2021-12-22T19:07:43.285930Z", 1538 | "iopub.status.idle": "2021-12-22T19:07:43.289250Z", 1539 | "shell.execute_reply": "2021-12-22T19:07:43.289839Z" 1540 | }, 1541 | "papermill": { 1542 | "duration": 0.052542, 1543 | 
"end_time": "2021-12-22T19:07:43.290000", 1544 | "exception": false, 1545 | "start_time": "2021-12-22T19:07:43.237458", 1546 | "status": "completed" 1547 | }, 1548 | "tags": [] 1549 | }, 1550 | "outputs": [], 1551 | "source": [ 1552 | "lasso_mse=np.mean((y_pred1-y_test)**2)" 1553 | ] 1554 | }, 1555 | { 1556 | "cell_type": "code", 1557 | "execution_count": 36, 1558 | "id": "b275aa1f", 1559 | "metadata": { 1560 | "execution": { 1561 | "iopub.execute_input": "2021-12-22T19:07:43.385343Z", 1562 | "iopub.status.busy": "2021-12-22T19:07:43.384744Z", 1563 | "iopub.status.idle": "2021-12-22T19:07:43.389077Z", 1564 | "shell.execute_reply": "2021-12-22T19:07:43.389546Z" 1565 | }, 1566 | "papermill": { 1567 | "duration": 0.053477, 1568 | "end_time": "2021-12-22T19:07:43.389750", 1569 | "exception": false, 1570 | "start_time": "2021-12-22T19:07:43.336273", 1571 | "status": "completed" 1572 | }, 1573 | "tags": [] 1574 | }, 1575 | "outputs": [ 1576 | { 1577 | "name": "stdout", 1578 | "output_type": "stream", 1579 | "text": [ 1580 | "25.283708842642042\n" 1581 | ] 1582 | } 1583 | ], 1584 | "source": [ 1585 | "print(lasso_mse)" 1586 | ] 1587 | }, 1588 | { 1589 | "cell_type": "code", 1590 | "execution_count": 37, 1591 | "id": "ac5b3180", 1592 | "metadata": { 1593 | "execution": { 1594 | "iopub.execute_input": "2021-12-22T19:07:43.487451Z", 1595 | "iopub.status.busy": "2021-12-22T19:07:43.486800Z", 1596 | "iopub.status.idle": "2021-12-22T19:07:43.493024Z", 1597 | "shell.execute_reply": "2021-12-22T19:07:43.493570Z" 1598 | }, 1599 | "papermill": { 1600 | "duration": 0.056385, 1601 | "end_time": "2021-12-22T19:07:43.493760", 1602 | "exception": false, 1603 | "start_time": "2021-12-22T19:07:43.437375", 1604 | "status": "completed" 1605 | }, 1606 | "tags": [] 1607 | }, 1608 | "outputs": [], 1609 | "source": [ 1610 | "lasso_coef=pd.DataFrame()\n", 1611 | "lasso_coef['columns']=df.columns\n", 1612 | "lasso_coef['coeffienct values']=pd.Series(lasso.coef_)" 1613 | ] 1614 | }, 1615 | { 1616 
| "cell_type": "code", 1617 | "execution_count": 38, 1618 | "id": "322c1e1a", 1619 | "metadata": { 1620 | "execution": { 1621 | "iopub.execute_input": "2021-12-22T19:07:43.591109Z", 1622 | "iopub.status.busy": "2021-12-22T19:07:43.590431Z", 1623 | "iopub.status.idle": "2021-12-22T19:07:43.599134Z", 1624 | "shell.execute_reply": "2021-12-22T19:07:43.599675Z" 1625 | }, 1626 | "papermill": { 1627 | "duration": 0.058995, 1628 | "end_time": "2021-12-22T19:07:43.599844", 1629 | "exception": false, 1630 | "start_time": "2021-12-22T19:07:43.540849", 1631 | "status": "completed" 1632 | }, 1633 | "tags": [] 1634 | }, 1635 | "outputs": [ 1636 | { 1637 | "data": { 1638 | "text/html": [ 1639 | "
\n", 1640 | "\n", 1653 | "\n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | "
columnscoeffienct values
0CRIM-0.000000
1ZN0.052337
2INDUS-0.000000
3CHAS0.000000
4NOX-0.000000
5RM0.905588
6AGE0.030446
7DIS-0.743750
8RAD0.219849
9TAX-0.014176
10PTRATIO-0.601588
11B0.011533
12LSTAT-0.831386
13targetNaN
\n", 1734 | "
" 1735 | ], 1736 | "text/plain": [ 1737 | " columns coeffienct values\n", 1738 | "0 CRIM -0.000000\n", 1739 | "1 ZN 0.052337\n", 1740 | "2 INDUS -0.000000\n", 1741 | "3 CHAS 0.000000\n", 1742 | "4 NOX -0.000000\n", 1743 | "5 RM 0.905588\n", 1744 | "6 AGE 0.030446\n", 1745 | "7 DIS -0.743750\n", 1746 | "8 RAD 0.219849\n", 1747 | "9 TAX -0.014176\n", 1748 | "10 PTRATIO -0.601588\n", 1749 | "11 B 0.011533\n", 1750 | "12 LSTAT -0.831386\n", 1751 | "13 target NaN" 1752 | ] 1753 | }, 1754 | "execution_count": 38, 1755 | "metadata": {}, 1756 | "output_type": "execute_result" 1757 | } 1758 | ], 1759 | "source": [ 1760 | "lasso_coef" 1761 | ] 1762 | }, 1763 | { 1764 | "cell_type": "code", 1765 | "execution_count": 39, 1766 | "id": "50658745", 1767 | "metadata": { 1768 | "execution": { 1769 | "iopub.execute_input": "2021-12-22T19:07:43.697914Z", 1770 | "iopub.status.busy": "2021-12-22T19:07:43.697194Z", 1771 | "iopub.status.idle": "2021-12-22T19:07:43.701581Z", 1772 | "shell.execute_reply": "2021-12-22T19:07:43.702185Z" 1773 | }, 1774 | "papermill": { 1775 | "duration": 0.054965, 1776 | "end_time": "2021-12-22T19:07:43.702345", 1777 | "exception": false, 1778 | "start_time": "2021-12-22T19:07:43.647380", 1779 | "status": "completed" 1780 | }, 1781 | "tags": [] 1782 | }, 1783 | "outputs": [ 1784 | { 1785 | "data": { 1786 | "text/plain": [ 1787 | "pandas.core.frame.DataFrame" 1788 | ] 1789 | }, 1790 | "execution_count": 39, 1791 | "metadata": {}, 1792 | "output_type": "execute_result" 1793 | } 1794 | ], 1795 | "source": [ 1796 | "type(lasso_coef)" 1797 | ] 1798 | }, 1799 | { 1800 | "cell_type": "markdown", 1801 | "id": "5108bea9", 1802 | "metadata": { 1803 | "papermill": { 1804 | "duration": 0.047194, 1805 | "end_time": "2021-12-22T19:07:43.797052", 1806 | "exception": false, 1807 | "start_time": "2021-12-22T19:07:43.749858", 1808 | "status": "completed" 1809 | }, 1810 | "tags": [] 1811 | }, 1812 | "source": [ 1813 | "### Python implementation of Elastic Net " 1814 | ] 1815 | 
}, 1816 | { 1817 | "cell_type": "code", 1818 | "execution_count": 40, 1819 | "id": "f9e0ea19", 1820 | "metadata": { 1821 | "execution": { 1822 | "iopub.execute_input": "2021-12-22T19:07:43.896026Z", 1823 | "iopub.status.busy": "2021-12-22T19:07:43.895359Z", 1824 | "iopub.status.idle": "2021-12-22T19:07:43.898529Z", 1825 | "shell.execute_reply": "2021-12-22T19:07:43.899059Z" 1826 | }, 1827 | "papermill": { 1828 | "duration": 0.05412, 1829 | "end_time": "2021-12-22T19:07:43.899277", 1830 | "exception": false, 1831 | "start_time": "2021-12-22T19:07:43.845157", 1832 | "status": "completed" 1833 | }, 1834 | "tags": [] 1835 | }, 1836 | "outputs": [], 1837 | "source": [ 1838 | "from sklearn.linear_model import ElasticNet\n", 1839 | "elastic=ElasticNet(alpha=1)" 1840 | ] 1841 | }, 1842 | { 1843 | "cell_type": "code", 1844 | "execution_count": 41, 1845 | "id": "514ea47a", 1846 | "metadata": { 1847 | "execution": { 1848 | "iopub.execute_input": "2021-12-22T19:07:43.998460Z", 1849 | "iopub.status.busy": "2021-12-22T19:07:43.997850Z", 1850 | "iopub.status.idle": "2021-12-22T19:07:44.004364Z", 1851 | "shell.execute_reply": "2021-12-22T19:07:44.004936Z" 1852 | }, 1853 | "papermill": { 1854 | "duration": 0.057813, 1855 | "end_time": "2021-12-22T19:07:44.005105", 1856 | "exception": false, 1857 | "start_time": "2021-12-22T19:07:43.947292", 1858 | "status": "completed" 1859 | }, 1860 | "tags": [] 1861 | }, 1862 | "outputs": [ 1863 | { 1864 | "data": { 1865 | "text/plain": [ 1866 | "ElasticNet(alpha=1)" 1867 | ] 1868 | }, 1869 | "execution_count": 41, 1870 | "metadata": {}, 1871 | "output_type": "execute_result" 1872 | } 1873 | ], 1874 | "source": [ 1875 | "elastic.fit(X_train,y_train)" 1876 | ] 1877 | }, 1878 | { 1879 | "cell_type": "code", 1880 | "execution_count": 42, 1881 | "id": "6ca049fb", 1882 | "metadata": { 1883 | "execution": { 1884 | "iopub.execute_input": "2021-12-22T19:07:44.105947Z", 1885 | "iopub.status.busy": "2021-12-22T19:07:44.105288Z", 1886 | "iopub.status.idle": 
"2021-12-22T19:07:44.108773Z", 1887 | "shell.execute_reply": "2021-12-22T19:07:44.109289Z" 1888 | }, 1889 | "papermill": { 1890 | "duration": 0.055931, 1891 | "end_time": "2021-12-22T19:07:44.109453", 1892 | "exception": false, 1893 | "start_time": "2021-12-22T19:07:44.053522", 1894 | "status": "completed" 1895 | }, 1896 | "tags": [] 1897 | }, 1898 | "outputs": [], 1899 | "source": [ 1900 | "y_pred2=elastic.predict(X_test)" 1901 | ] 1902 | }, 1903 | { 1904 | "cell_type": "code", 1905 | "execution_count": 43, 1906 | "id": "40aeee4e", 1907 | "metadata": { 1908 | "execution": { 1909 | "iopub.execute_input": "2021-12-22T19:07:44.209854Z", 1910 | "iopub.status.busy": "2021-12-22T19:07:44.209204Z", 1911 | "iopub.status.idle": "2021-12-22T19:07:44.214732Z", 1912 | "shell.execute_reply": "2021-12-22T19:07:44.214113Z" 1913 | }, 1914 | "papermill": { 1915 | "duration": 0.056737, 1916 | "end_time": "2021-12-22T19:07:44.214870", 1917 | "exception": false, 1918 | "start_time": "2021-12-22T19:07:44.158133", 1919 | "status": "completed" 1920 | }, 1921 | "tags": [] 1922 | }, 1923 | "outputs": [ 1924 | { 1925 | "name": "stdout", 1926 | "output_type": "stream", 1927 | "text": [ 1928 | "24.422988143894155\n" 1929 | ] 1930 | } 1931 | ], 1932 | "source": [ 1933 | "elastic_mse=np.mean((y_pred2-y_test)**2)\n", 1934 | "# Here for reminding, mean squared error is the mean of sqaure of diffrence in y_predicted and y_test\n", 1935 | "\n", 1936 | "print(elastic_mse)" 1937 | ] 1938 | }, 1939 | { 1940 | "cell_type": "code", 1941 | "execution_count": 44, 1942 | "id": "76811e85", 1943 | "metadata": { 1944 | "execution": { 1945 | "iopub.execute_input": "2021-12-22T19:07:44.316624Z", 1946 | "iopub.status.busy": "2021-12-22T19:07:44.315981Z", 1947 | "iopub.status.idle": "2021-12-22T19:07:44.325120Z", 1948 | "shell.execute_reply": "2021-12-22T19:07:44.325614Z" 1949 | }, 1950 | "papermill": { 1951 | "duration": 0.061949, 1952 | "end_time": "2021-12-22T19:07:44.325803", 1953 | "exception": false, 1954 
| "start_time": "2021-12-22T19:07:44.263854", 1955 | "status": "completed" 1956 | }, 1957 | "tags": [] 1958 | }, 1959 | "outputs": [ 1960 | { 1961 | "name": "stdout", 1962 | "output_type": "stream", 1963 | "text": [ 1964 | " columns coeff values\n", 1965 | "0 CRIM -0.022867\n", 1966 | "1 ZN 0.055481\n", 1967 | "2 INDUS -0.000000\n", 1968 | "3 CHAS 0.000000\n", 1969 | "4 NOX -0.000000\n", 1970 | "5 RM 0.926176\n", 1971 | "6 AGE 0.029873\n", 1972 | "7 DIS -0.802898\n", 1973 | "8 RAD 0.261508\n", 1974 | "9 TAX -0.015532\n", 1975 | "10 PTRATIO -0.648044\n", 1976 | "11 B 0.011629\n", 1977 | "12 LSTAT -0.823327\n", 1978 | "13 target NaN\n" 1979 | ] 1980 | } 1981 | ], 1982 | "source": [ 1983 | "# making dataframe of column wise coefficient of elasticnet\n", 1984 | "\n", 1985 | "elastic_coeff=pd.DataFrame()\n", 1986 | "elastic_coeff['columns']=df.columns\n", 1987 | "elastic_coeff['coeff values']=pd.Series(elastic.coef_)\n", 1988 | "\n", 1989 | "print(elastic_coeff)" 1990 | ] 1991 | }, 1992 | { 1993 | "cell_type": "code", 1994 | "execution_count": 45, 1995 | "id": "c45c3e50", 1996 | "metadata": { 1997 | "execution": { 1998 | "iopub.execute_input": "2021-12-22T19:07:44.428261Z", 1999 | "iopub.status.busy": "2021-12-22T19:07:44.427600Z", 2000 | "iopub.status.idle": "2021-12-22T19:07:44.431913Z", 2001 | "shell.execute_reply": "2021-12-22T19:07:44.432475Z" 2002 | }, 2003 | "papermill": { 2004 | "duration": 0.057216, 2005 | "end_time": "2021-12-22T19:07:44.432632", 2006 | "exception": false, 2007 | "start_time": "2021-12-22T19:07:44.375416", 2008 | "status": "completed" 2009 | }, 2010 | "tags": [] 2011 | }, 2012 | "outputs": [ 2013 | { 2014 | "data": { 2015 | "text/plain": [ 2016 | "pandas.core.frame.DataFrame" 2017 | ] 2018 | }, 2019 | "execution_count": 45, 2020 | "metadata": {}, 2021 | "output_type": "execute_result" 2022 | } 2023 | ], 2024 | "source": [ 2025 | "type(elastic_coeff)" 2026 | ] 2027 | }, 2028 | { 2029 | "cell_type": "markdown", 2030 | "id": "365d0b90", 2031 | 
"metadata": { 2032 | "papermill": { 2033 | "duration": 0.049713, 2034 | "end_time": "2021-12-22T19:07:44.532152", 2035 | "exception": false, 2036 | "start_time": "2021-12-22T19:07:44.482439", 2037 | "status": "completed" 2038 | }, 2039 | "tags": [] 2040 | }, 2041 | "source": [ 2042 | "- Elastic Net is a combination of both of the above regularization techniques: its penalty term contains both the L1 (Lasso) and L2 (Ridge) penalties. \n", 2043 | "- It often performs better than Ridge or Lasso Regression alone, although not in every case; in this run its test MSE (24.42) is lower than Lasso's (25.28) but higher than plain linear regression's (19.74)" 2044 | ] 2045 | } 2046 | ], 2047 | "metadata": { 2048 | "kernelspec": { 2049 | "display_name": "Python 3 (ipykernel)", 2050 | "language": "python", 2051 | "name": "python3" 2052 | }, 2053 | "language_info": { 2054 | "codemirror_mode": { 2055 | "name": "ipython", 2056 | "version": 3 2057 | }, 2058 | "file_extension": ".py", 2059 | "mimetype": "text/x-python", 2060 | "name": "python", 2061 | "nbconvert_exporter": "python", 2062 | "pygments_lexer": "ipython3", 2063 | "version": "3.9.7" 2064 | }, 2065 | "papermill": { 2066 | "default_parameters": {}, 2067 | "duration": 17.062862, 2068 | "end_time": "2021-12-22T19:07:45.291877", 2069 | "environment_variables": {}, 2070 | "exception": null, 2071 | "input_path": "__notebook__.ipynb", 2072 | "output_path": "__notebook__.ipynb", 2073 | "parameters": {}, 2074 | "start_time": "2021-12-22T19:07:28.229015", 2075 | "version": "2.3.3" 2076 | } 2077 | }, 2078 | "nbformat": 4, 2079 | "nbformat_minor": 5 2080 | } 2081 | --------------------------------------------------------------------------------