├── h.js
├── hello.js
├── a.py
├── Problem Statement
└── Train data
│ ├── .md
│ └── AttributeInformation.pdf
├── Spam-Email-detection-system-main
├── _config.yml
├── spam_model.pkl
├── static
│ ├── img
│ │ ├── logo-w.png
│ │ ├── safe.png
│ │ ├── spam-2.png
│ │ ├── spam.png
│ │ ├── insurance-protected.png
│ │ └── pngkey.com-scam-alert-png-4321853.png
│ ├── audio
│ │ ├── safe.mpeg
│ │ └── warning.mpeg
│ └── css
│ │ └── style.css
├── app.py
├── templates
│ └── index.html
└── Untitled2.ipynb
├── a-hands-on-discussion-on-hyperparameter-optimization-techniques.pdf
├── id 3 algorithum (1).ipynb
├── Feature Selection
├── Embedded method.ipynb
├── Filter method.ipynb
├── Wrapper method .ipynb
└── feature-selection-technique-in-machine-learning.ipynb
└── regularization-in-machine-learning
└── regularization-in-machine-learning.ipynb
/h.js:
--------------------------------------------------------------------------------
1 | console.log('print');
2 |
--------------------------------------------------------------------------------
/hello.js:
--------------------------------------------------------------------------------
1 | console.log('hello');
2 |
--------------------------------------------------------------------------------
/a.py:
--------------------------------------------------------------------------------
1 | print('Hello. Developer')
2 |
--------------------------------------------------------------------------------
/Problem Statement/Train data/.md:
--------------------------------------------------------------------------------
1 | .md
2 |
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/spam_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/spam_model.pkl
--------------------------------------------------------------------------------
/Problem Statement/Train data/AttributeInformation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Problem Statement/Train data/AttributeInformation.pdf
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/static/img/logo-w.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/img/logo-w.png
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/static/img/safe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/img/safe.png
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/static/img/spam-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/img/spam-2.png
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/static/img/spam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/img/spam.png
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/static/audio/safe.mpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/audio/safe.mpeg
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/static/audio/warning.mpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/audio/warning.mpeg
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/static/img/insurance-protected.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/img/insurance-protected.png
--------------------------------------------------------------------------------
/a-hands-on-discussion-on-hyperparameter-optimization-techniques.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/a-hands-on-discussion-on-hyperparameter-optimization-techniques.pdf
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/static/img/pngkey.com-scam-alert-png-4321853.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mouneshgouda/Machine_Learning/HEAD/Spam-Email-detection-system-main/static/img/pngkey.com-scam-alert-png-4321853.png
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask,render_template,request,jsonify
2 | import pandas as pd
3 | import numpy as np
4 | import joblib
5 |
6 | app = Flask(__name__)
7 |
8 | model = joblib.load('spam_model.pkl')
9 |
10 | @app.route('/',methods=['GET', 'POST'])
11 | def index():
12 | if request.method == 'POST':
13 | message = request.form.get('message')
14 | output = model.predict([message])
15 | if output == [0]:
16 | result = "This Message is Not a SPAM Message."
17 | else:
18 | result = "This Message is a SPAM Message."
19 | return render_template('index.html', result=result,message=message)
20 |
21 | else:
22 | return render_template('index.html')
23 |
24 |
25 | if __name__ == '__main__':
26 | app.run(debug=True)
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
16 |
17 |
Disclaimer: Please click on the three dots and choose "Add to Home screen" for a better view and quick access
18 |
19 |
20 |
}})
21 |
22 |
23 |
27 |
28 |
{% if message %}
29 |
30 |
31 |
{{message}}
32 |
33 |
{% if result=='This Message is Not a SPAM Message.' %}
34 |
35 |
}})
36 |
37 | {% endif %} {% if result=='This Message is Not a SPAM Message.' %}
38 |
43 | {% endif %} {% if result=='This Message is a SPAM Message.' %}
44 |
45 |
}})
46 |
47 | {% endif %} {% if result=='This Message is Not a SPAM Message.' %}
48 |
{{result}}
49 | {% endif %} {% if result=='This Message is a SPAM Message.' %}
50 |
{{result}}
51 | {% endif %} {% if result=='This Message is a SPAM Message.' %}
52 |
57 | {% endif %}
58 |
59 | {% else %}
60 |
.Enter A Message To Check The Message is SPAM or NOT-SPAM..
61 | {% endif %}
62 |
63 |
64 |
65 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/static/css/style.css:
--------------------------------------------------------------------------------
1 |
2 | * {
3 | border: 0;
4 | box-sizing: border-box;
5 | margin: 0;
6 | }
7 | .container{
8 | height: auto; /* was "content", which is not a valid height value and was ignored */
9 | width: 100%;
10 | justify-content: center;
11 | align-items: center;
12 | display: flex;
13 | flex-flow: column;
14 | }
15 | .head-nav {
16 | height:30px;
17 | width:100%;
18 | background-color: green;
19 | display:flex;
20 | justify-content: center;
21 | align-items: center;
22 | padding: 8px;
23 | }
24 | .logo-heading {
25 | height: 250px;
26 | width: 250px;
27 | margin-top: -20px;
28 | justify-content: center;
29 | align-items: center;
30 | display: flex;
31 | }
32 | .logo-heading img {
33 | height: 100%;
34 | width: 100%;
35 | }
36 | .container-data {
37 | height: 200px;
38 | width: 100%;
39 | justify-content: center;
40 | display: flex;
41 | align-items: center;
42 | background-color:rgb(236, 214, 214);
43 | flex-flow: column;
44 | margin-top: -40px;
45 | padding-left: 10px;
46 | padding-right: 10px;
47 | }
48 | .text-box {
49 | height: 65px;
50 | width:90%;
51 | border-radius: 5px;
52 | border: 2px solid green;
53 | background-color: rgb(252, 248, 248);
54 | margin-top: 15px;
55 | font-size: 14px;
56 | font-weight: bold;
57 | }
58 | .btn {
59 | height: 35px;
60 | width:100px;
61 | border-radius: 5px;
62 | border: 2px solid black;
63 | background-color: green;
64 | color: white;
65 | margin-top: 15px;
66 | }
67 |
68 | .btn:hover {
69 | color: white;
70 | background-color:red;
71 | cursor: pointer;
72 | }
73 | .show-result {
74 | height: auto; /* was "content", which is not a valid height value and was ignored */
75 | width: 100%;
76 | display: flex;
77 | justify-content: center;
78 | flex-flow: column wrap;
79 | background-color: rgb(252, 243, 243);
80 | margin-bottom: 50px;
81 | margin-top: -30px;
82 | padding: 35px;
83 | }
84 | .output {
85 | height: auto; /* was "content", which is not a valid height value and was ignored */
86 | width: 100%;
87 | display: flex;
88 | justify-content: center;
89 | align-items: center;
90 | font-size: 19px;
91 | font-weight: 300;
92 | font-family:'Lucida Sans', 'Lucida Sans Regular', 'Lucida Grande', 'Lucida Sans Unicode', Geneva, Verdana, sans-serif;
93 | color: red;
94 | margin-top: 5px;
95 | padding: 30px;
96 | }
97 | .output-not {
98 | height: auto; /* was "content", which is not a valid height value and was ignored */
99 | width: 100%;
100 | display: flex;
101 | justify-content: center;
102 | align-items: center;
103 | font-size: 19px;
104 | font-weight: 300;
105 | font-family:'Lucida Sans', 'Lucida Sans Regular', 'Lucida Grande', 'Lucida Sans Unicode', Geneva, Verdana, sans-serif;
106 | color: green;
107 | margin-top: 3px;
108 | padding: 30px;
109 | }
110 | .output-logo {
111 | height: 100px;
112 | width: 100%;
113 | margin-top:20px;
114 | justify-content: center;
115 | display: flex;
116 | align-items: center;
117 | }
118 | .output-logo img {
119 | height: 100%;
120 | width: 100px;
121 | }
122 |
123 | .head{
124 | height: 40px;
125 | width: 100%;
126 | justify-content: center;
127 | align-items: center;
128 | border-top: 2px solid black;
129 | background-color: yellow;
130 | display: flex;
131 | flex-flow: column;
132 | }
133 | .head h2 {
134 | font-size: 15px;
135 | font-family: 'Gill Sans', 'Gill Sans MT', Calibri, 'Trebuchet MS', sans-serif;
136 | color:black;
137 | }
138 | .head-git{
139 | height: 40px;
140 | width: 100%;
141 | justify-content: center;
142 | align-items: center;
143 | background-color: rgb(250, 244, 244);
144 | display: flex;
145 | flex-flow: row;
146 | margin-bottom: 5px;
147 | }
148 | .head-git h2 {
149 | font-size: 18px;
150 | font-family: 'Gill Sans', 'Gill Sans MT', Calibri, 'Trebuchet MS', sans-serif;
151 | color:black;
152 | }
153 | .head-git h2 a {
154 | text-decoration: none;
155 | color:blue;
156 | font-size: 23px;
157 | }
158 | .head-git h2 a:hover {
159 | color:rgb(211, 47, 41);
160 | background-color: beige;
161 | }
162 | .alert {
163 | visibility: hidden;
164 | }
--------------------------------------------------------------------------------
/id 3 algorithum (1).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "id": "bc63810d",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stdout",
11 | "output_type": "stream",
12 | "text": [
13 | "Enter Outlook (Sunny/Overcast/Rain): Overcast\n",
14 | "Enter Temperature (Hot/Mild/Cool): Mild\n",
15 | "Enter Humidity (High/Normal): Normal\n",
16 | "Enter Wind (Weak/Strong): Strong\n",
17 | "\n",
18 | "Predicted PlayTennis for the new instance: No\n"
19 | ]
20 | }
21 | ],
22 | "source": [
23 | "import pandas as pd\n",
24 | "from sklearn.model_selection import train_test_split\n",
25 | "from sklearn.tree import DecisionTreeClassifier\n",
26 | "from sklearn.metrics import accuracy_score, classification_report\n",
27 | "\n",
28 | "# Sample dataset: PlayTennis\n",
29 | "data = {\n",
30 | " 'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain'],\n",
31 | " 'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild'],\n",
32 | " 'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal'],\n",
33 | " 'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak'],\n",
34 | " 'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No']\n",
35 | "}\n",
36 | "\n",
37 | "df = pd.DataFrame(data)\n",
38 | "\n",
39 | "# Extract features and target variable\n",
40 | "X = pd.get_dummies(df.drop('PlayTennis', axis=1)) # Convert categorical variables to numerical\n",
41 | "y = df['PlayTennis']\n",
42 | "\n",
43 | "# Get user input for new instance; trim and capitalize so e.g. ' sunny' still matches 'Sunny'\n",
44 | "new_outlook = input(\"Enter Outlook (Sunny/Overcast/Rain): \").strip().capitalize()\n",
45 | "new_temperature = input(\"Enter Temperature (Hot/Mild/Cool): \").strip().capitalize()\n",
46 | "new_humidity = input(\"Enter Humidity (High/Normal): \").strip().capitalize()\n",
47 | "new_wind = input(\"Enter Wind (Weak/Strong): \").strip().capitalize()\n",
48 | "\n",
49 | "# One-hot encode the answers by hand, then reorder the columns to match X (the matrix the tree is fitted on)\n",
50 | "new_instance = pd.DataFrame({\n",
51 | "    'Outlook_Sunny': [1 if new_outlook == 'Sunny' else 0],\n",
52 | "    'Outlook_Overcast': [1 if new_outlook == 'Overcast' else 0],\n",
53 | "    'Outlook_Rain': [1 if new_outlook == 'Rain' else 0],\n",
54 | "    'Temperature_Hot': [1 if new_temperature == 'Hot' else 0],\n",
55 | "    'Temperature_Mild': [1 if new_temperature == 'Mild' else 0],\n",
56 | "    'Temperature_Cool': [1 if new_temperature == 'Cool' else 0],\n",
57 | "    'Humidity_High': [1 if new_humidity == 'High' else 0],\n",
58 | "    'Humidity_Normal': [1 if new_humidity == 'Normal' else 0],\n",
59 | "    'Wind_Weak': [1 if new_wind == 'Weak' else 0],\n",
60 | "    'Wind_Strong': [1 if new_wind == 'Strong' else 0],\n",
61 | "}).reindex(columns=X.columns, fill_value=0)  # without this the features reach the tree in the wrong positions\n",
62 | "\n",
63 | "# Train-test split\n",
64 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
65 | "\n",
66 | "# Build a Decision Tree classifier using scikit-learn\n",
67 | "dt_classifier = DecisionTreeClassifier(random_state=42)\n",
68 | "dt_classifier.fit(X_train, y_train)\n",
69 | "\n",
70 | "new_instance_pred1= dt_classifier.predict(X_test)\n",
71 | "\n",
72 | "# Make predictions on the new instance\n",
73 | "new_instance_pred = dt_classifier.predict(new_instance)\n",
74 | "print(f\"\\nPredicted PlayTennis for the new instance: {new_instance_pred[0]}\")\n"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "id": "0213d788",
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "https://towardsdatascience.com/decision-trees-for-classification-complete-example-d0bc17fcf1c2"
85 | ]
86 | }
87 | ],
88 | "metadata": {
89 | "kernelspec": {
90 | "display_name": "Python 3 (ipykernel)",
91 | "language": "python",
92 | "name": "python3"
93 | },
94 | "language_info": {
95 | "codemirror_mode": {
96 | "name": "ipython",
97 | "version": 3
98 | },
99 | "file_extension": ".py",
100 | "mimetype": "text/x-python",
101 | "name": "python",
102 | "nbconvert_exporter": "python",
103 | "pygments_lexer": "ipython3",
104 | "version": "3.9.7"
105 | }
106 | },
107 | "nbformat": 4,
108 | "nbformat_minor": 5
109 | }
110 |
--------------------------------------------------------------------------------
/Feature Selection/Embedded method.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "921d9e05",
6 | "metadata": {},
7 | "source": [
8 | "
Embedded method\n",
9 | "-➖📝
"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "id": "5b983074",
15 | "metadata": {},
16 | "source": [
17 | " \n",
18 | "
Importing Necessary Libraries 📈:
"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 5,
24 | "id": "e7e6b63c",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "import numpy as np\n",
29 | "from sklearn.datasets import load_breast_cancer\n",
30 | "from sklearn.model_selection import train_test_split\n",
31 | "from sklearn.linear_model import LogisticRegression\n",
32 | "from sklearn.metrics import accuracy_score"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "id": "a4a9b75c",
38 | "metadata": {},
39 | "source": [
40 | " \n",
41 | "
Loading seed dataset 📈:
"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 6,
47 | "id": "19efbfb2",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "# Set seed for reproducibility\n",
52 | "seed = 42\n",
53 | "np.random.seed(seed)\n",
54 | "\n",
55 | "# Load the Breast Cancer dataset\n",
56 | "cancer = load_breast_cancer()\n",
57 | "X = cancer.data\n",
58 | "y = cancer.target"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "id": "e043d5b1",
64 | "metadata": {},
65 | "source": [
66 | " \n",
67 | "
Split the dataset into training and testing sets: 20% for testing and 80% for training:
"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 7,
73 | "id": "461f5655",
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "# Split the dataset into training and testing sets\n",
78 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 8,
84 | "id": "20de70b8",
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stderr",
89 | "output_type": "stream",
90 | "text": [
91 | "C:\\Users\\LENOVO\\anaconda3\\lib\\site-packages\\sklearn\\svm\\_base.py:985: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
92 | " warnings.warn(\"Liblinear failed to converge, increase \"\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "# Embedded method with LASSO (Logistic Regression with L1 regularization)\n",
98 | "lasso_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=seed)\n",
99 | "lasso_model.fit(X_train, y_train)\n",
100 | "\n",
101 | "# Extract selected features and their coefficients\n",
102 | "selected_indices_lasso = np.where(lasso_model.coef_[0] != 0)[0]\n",
103 | "selected_features_lasso = cancer.feature_names[selected_indices_lasso]\n",
104 | "coefficients_lasso = lasso_model.coef_[0, selected_indices_lasso]"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "id": "c9d4246f",
110 | "metadata": {},
111 | "source": [
112 | " \n",
113 | "
Selecting features using the embedded method (L1 regularization)
"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 9,
119 | "id": "fa025ee2",
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "name": "stdout",
124 | "output_type": "stream",
125 | "text": [
126 | "\n",
127 | "Embedded Method with LASSO (Logistic Regression with L1 regularization)\n",
128 | "Selected Features (LASSO): ['mean radius' 'mean texture' 'mean perimeter' 'mean area' 'texture error'\n",
129 | " 'area error' 'worst radius' 'worst texture' 'worst perimeter'\n",
130 | " 'worst area' 'worst concavity']\n",
131 | "Coefficients (LASSO): [ 4.25893726 0.13813487 -0.2624774 -0.01633754 1.69950795 -0.09940568\n",
132 | " 0.04768624 -0.42417917 -0.02965423 -0.01518975 -3.63866352]\n",
133 | "Accuracy (LASSO): 0.9561\n"
134 | ]
135 | },
136 | {
137 | "name": "stderr",
138 | "output_type": "stream",
139 | "text": [
140 | "C:\\Users\\LENOVO\\anaconda3\\lib\\site-packages\\sklearn\\svm\\_base.py:985: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
141 | " warnings.warn(\"Liblinear failed to converge, increase \"\n"
142 | ]
143 | }
144 | ],
145 | "source": [
146 | "\n",
147 | "# Function to train and evaluate a model\n",
148 | "def train_and_evaluate(X_train, X_test, y_train, y_test):\n",
149 | "    \"\"\"Fit an L1-penalised liblinear logistic regression; return (model, accuracy on the test split).\"\"\"\n",
150 | "    classifier = LogisticRegression(penalty='l1', solver='liblinear', random_state=seed)\n",
151 | "    classifier.fit(X_train, y_train)\n",
152 | "    test_accuracy = accuracy_score(y_test, classifier.predict(X_test))\n",
153 | "    return classifier, test_accuracy\n",
154 | "\n",
155 | "# Train and evaluate the model with selected features\n",
156 | "lasso_model, accuracy_lasso = train_and_evaluate(X_train[:, selected_indices_lasso], X_test[:, selected_indices_lasso], y_train, y_test)\n",
157 | "\n",
158 | "# Print results\n",
159 | "print(\"\\nEmbedded Method with LASSO (Logistic Regression with L1 regularization)\")\n",
160 | "print(f\"Selected Features (LASSO): {selected_features_lasso}\")\n",
161 | "print(f\"Coefficients (LASSO): {coefficients_lasso}\")\n",
162 | "print(f\"Accuracy (LASSO): {accuracy_lasso:.4f}\")\n"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "id": "66fb004d",
169 | "metadata": {},
170 | "outputs": [],
171 | "source": []
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "id": "216cbea4",
177 | "metadata": {},
178 | "outputs": [],
179 | "source": []
180 | }
181 | ],
182 | "metadata": {
183 | "kernelspec": {
184 | "display_name": "Python 3 (ipykernel)",
185 | "language": "python",
186 | "name": "python3"
187 | },
188 | "language_info": {
189 | "codemirror_mode": {
190 | "name": "ipython",
191 | "version": 3
192 | },
193 | "file_extension": ".py",
194 | "mimetype": "text/x-python",
195 | "name": "python",
196 | "nbconvert_exporter": "python",
197 | "pygments_lexer": "ipython3",
198 | "version": "3.9.7"
199 | }
200 | },
201 | "nbformat": 4,
202 | "nbformat_minor": 5
203 | }
204 |
--------------------------------------------------------------------------------
/Feature Selection/Filter method.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "880546ed",
6 | "metadata": {},
7 | "source": [
8 | "
Filter Method
"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "200dfba8",
14 | "metadata": {},
15 | "source": [
16 | "Importing Necessary Libraries 📈:
"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 13,
22 | "id": "645a17c6",
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "\n",
27 | "import numpy as np\n",
28 | "from sklearn.datasets import load_breast_cancer\n",
29 | "from sklearn.model_selection import train_test_split\n",
30 | "from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold\n",
31 | "from sklearn.ensemble import RandomForestClassifier\n",
32 | "from sklearn.metrics import accuracy_score"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "id": "1445759e",
38 | "metadata": {},
39 | "source": [
40 | " \n",
41 | "Loading seed dataset 📈:
"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 14,
47 | "id": "a5251b89",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "# Set seed for reproducibility\n",
52 | "seed = 42\n",
53 | "np.random.seed(seed)\n",
54 | "\n",
55 | "# Load the Breast Cancer dataset\n",
56 | "cancer = load_breast_cancer()\n",
57 | "X = cancer.data\n",
58 | "y = cancer.target"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "id": "da8dd7d2",
64 | "metadata": {},
65 | "source": [
66 | " \n",
67 | "Split the dataset into training and testing sets: 20% for testing and 80% for training:
"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 15,
73 | "id": "4817be16",
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "# Split the dataset into training and testing sets\n",
78 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "id": "329de26e",
84 | "metadata": {},
85 | "source": [
86 | "Filter method with ANOVA
"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 16,
92 | "id": "2046d2f3",
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "# Filter method with ANOVA\n",
97 | "k_best_features = 10\n",
98 | "anova_selector = SelectKBest(f_classif, k=k_best_features)\n",
99 | "X_train_anova = anova_selector.fit_transform(X_train, y_train)\n",
100 | "X_test_anova = anova_selector.transform(X_test)"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "id": "8632271b",
106 | "metadata": {},
107 | "source": [
108 | "Filter method with Variance Threshold
"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 17,
114 | "id": "a38a24ca",
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "# Filter method with Variance Threshold\n",
119 | "variance_threshold_value = 0.01\n",
120 | "variance_selector = VarianceThreshold(threshold=variance_threshold_value)\n",
121 | "X_train_filtered = variance_selector.fit_transform(X_train_anova)\n",
122 | "X_test_filtered = variance_selector.transform(X_test_anova)\n"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "id": "573f43a2",
128 | "metadata": {},
129 | "source": [
130 | "Function to train and evaluate a model
"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 18,
136 | "id": "6b3e9548",
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "# Function to train and evaluate a model\n",
141 | "def train_and_evaluate(X_train, X_test, y_train, y_test):\n",
142 | "    \"\"\"Fit a seeded random forest on the training split and return its accuracy on the test split.\"\"\"\n",
143 | "    forest = RandomForestClassifier(random_state=seed)\n",
144 | "    forest.fit(X_train, y_train)\n",
145 | "    predictions = forest.predict(X_test)\n",
146 | "    return accuracy_score(y_test, predictions)\n",
147 | "\n",
148 | "# Score the ANOVA-only features, then the ANOVA + variance-threshold features\n",
149 | "accuracy_anova = train_and_evaluate(X_train_anova, X_test_anova, y_train, y_test)\n",
150 | "accuracy_variance = train_and_evaluate(X_train_filtered, X_test_filtered, y_train, y_test)\n",
151 | "# The variance selector was fitted on the ANOVA-reduced matrix, so its support mask is relative to the\n",
152 | "# ANOVA-selected columns; map it back to column indices of the original feature matrix before naming them.\n",
153 | "selected_indices_variance = np.where(anova_selector.get_support())[0][variance_selector.get_support()]"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 19,
159 | "id": "96ac34ec",
160 | "metadata": {},
161 | "outputs": [
162 | {
163 | "name": "stdout",
164 | "output_type": "stream",
165 | "text": [
166 | "\n",
167 | "Filter Method with ANOVA\n",
168 | "Number of Features Selected (ANOVA): 10\n",
169 | "Selected Feature Indices (ANOVA): [ 0 2 3 6 7 20 22 23 26 27]\n",
170 | "Selected Feature Names (ANOVA): ['mean radius' 'mean perimeter' 'mean area' 'mean concavity'\n",
171 | " 'mean concave points' 'worst radius' 'worst perimeter' 'worst area'\n",
172 | " 'worst concavity' 'worst concave points']\n",
173 | "Accuracy (ANOVA): 0.9561\n",
174 | "\n",
175 | "Filter Method with Variance Threshold\n",
176 | "Number of Features Selected (Variance Threshold): 7\n",
177 | "Selected Feature Indices (Variance Threshold): [0 1 2 5 6 7 8]\n",
178 | "Selected Feature Names (Variance Threshold): ['mean radius' 'mean texture' 'mean perimeter' 'mean compactness'\n",
179 | " 'mean concavity' 'mean concave points' 'mean symmetry']\n",
180 | "Accuracy (Variance Threshold): 0.9737\n"
181 | ]
182 | }
183 | ],
184 | "source": [
185 | "# Print results\n",
186 | "print(\"\\nFilter Method with ANOVA\")\n",
187 | "print(f\"Number of Features Selected (ANOVA): {k_best_features}\")\n",
188 | "print(f\"Selected Feature Indices (ANOVA): {np.where(anova_selector.get_support())[0]}\")\n",
189 | "print(f\"Selected Feature Names (ANOVA): {cancer.feature_names[anova_selector.get_support()]}\")\n",
190 | "print(f\"Accuracy (ANOVA): {accuracy_anova:.4f}\")\n",
191 | "\n",
192 | "print(\"\\nFilter Method with Variance Threshold\")\n",
193 | "print(f\"Number of Features Selected (Variance Threshold): {len(selected_indices_variance)}\")\n",
194 | "print(f\"Selected Feature Indices (Variance Threshold): {selected_indices_variance}\")\n",
195 | "print(f\"Selected Feature Names (Variance Threshold): {cancer.feature_names[selected_indices_variance]}\")\n",
196 | "print(f\"Accuracy (Variance Threshold): {accuracy_variance:.4f}\")\n"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "id": "4403d6f9",
203 | "metadata": {},
204 | "outputs": [],
205 | "source": []
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "id": "fad55cab",
211 | "metadata": {},
212 | "outputs": [],
213 | "source": []
214 | }
215 | ],
216 | "metadata": {
217 | "kernelspec": {
218 | "display_name": "Python 3 (ipykernel)",
219 | "language": "python",
220 | "name": "python3"
221 | },
222 | "language_info": {
223 | "codemirror_mode": {
224 | "name": "ipython",
225 | "version": 3
226 | },
227 | "file_extension": ".py",
228 | "mimetype": "text/x-python",
229 | "name": "python",
230 | "nbconvert_exporter": "python",
231 | "pygments_lexer": "ipython3",
232 | "version": "3.9.7"
233 | }
234 | },
235 | "nbformat": 4,
236 | "nbformat_minor": 5
237 | }
238 |
--------------------------------------------------------------------------------
/Feature Selection/Wrapper method .ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "9b15f1ec",
6 | "metadata": {},
7 | "source": [
8 | "1. | Importing Necessary Libraries 🌟 📚
"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 5,
14 | "id": "1e09463d",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "from sklearn.datasets import load_breast_cancer\n",
20 | "from sklearn.model_selection import train_test_split\n",
21 | "from sklearn.ensemble import RandomForestClassifier\n",
22 | "from sklearn.feature_selection import RFE\n",
23 | "from sklearn.metrics import accuracy_score"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "id": "82180422",
29 | "metadata": {},
30 | "source": [
31 | " \n",
32 | "Loading seed dataset 📈:
"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 6,
38 | "id": "34dc6a7c",
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "# Set seed for reproducibility\n",
43 | "seed = 42\n",
44 | "np.random.seed(seed)\n",
45 | "\n",
46 | "# Load the Breast Cancer dataset\n",
47 | "cancer = load_breast_cancer()\n",
48 | "X = cancer.data\n",
49 | "y = cancer.target"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "id": "3f999bc5",
55 | "metadata": {},
56 | "source": [
57 | " \n",
58 | "Split the dataset into training and testing sets: 20% for testing and 80% for training:
"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 11,
64 | "id": "1ce7b251",
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "\n",
69 | "# Split the dataset into training and testing sets\n",
70 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "id": "9d80703c",
76 | "metadata": {},
77 | "source": [
78 | " \n",
79 | "Wrapper method using Recursive Feature Elimination (RFE) :
"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 15,
85 | "id": "43327b06",
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "\n",
90 | "\n",
91 | "\n",
92 | "# Wrapper method using Recursive Feature Elimination (RFE) with RandomForestClassifier\n",
93 | "def wrapper_method_rfe(X_train, X_test, y_train, estimator, num_features, method):\n",
94 | "    \"\"\"Select num_features features with RFE and return the reduced train/test matrices.\n",
95 | "\n",
96 | "    method must be \"Forward\" or \"Backward\"; both run the same RFE fit, and\n",
97 | "    \"Backward\" only reverses the order of the reported indices.\n",
98 | "    Returns (X_train_selected, X_test_selected, selected_indices).\n",
99 | "    Raises ValueError for any other method.\n",
100 | "    \"\"\"\n",
101 | "    if method not in (\"Forward\", \"Backward\"):\n",
102 | "        raise ValueError(\"Invalid method specified\")\n",
103 | "    # RFE always works by recursive elimination, whichever method name was requested.\n",
104 | "    selector = RFE(estimator, n_features_to_select=num_features, step=1)\n",
105 | "    selector.fit(X_train, y_train)\n",
106 | "    selected_indices = np.where(selector.support_)[0]\n",
107 | "    if method == \"Backward\":\n",
108 | "        selected_indices = np.flip(selected_indices)\n",
109 | "    X_train_selected = selector.transform(X_train)\n",
110 | "    X_test_selected = selector.transform(X_test)\n",
111 | "    return X_train_selected, X_test_selected, selected_indices"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "id": "5202b583",
117 | "metadata": {},
118 | "source": [
119 | " \n",
120 | "Wrapper method using Recursive Feature Elimination (RFE) :
"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 19,
126 | "id": "fe96f1d4",
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "\n",
131 | "# Wrapper method using Recursive Feature Elimination (RFE) with RandomForestClassifier\n",
132 | "def wrapper_method_rfe(X_train, X_test, y_train, estimator, num_features, method):\n",
133 | " model = estimator\n",
134 | " if method == \"Forward\":\n",
135 | " selector = RFE(model, n_features_to_select=num_features, step=1)\n",
136 | " selector.fit(X_train, y_train)\n",
137 | " elif method == \"Backward\":\n",
138 | " selector = RFE(model, n_features_to_select=num_features, step=1)\n",
139 | " selector.fit(X_train, y_train)\n",
140 | " # Since RFE performs backward elimination by default, we need to reverse the selected indices for backward elimination\n",
141 | " selected_indices = np.flip(np.where(selector.support_)[0])\n",
142 | " X_train_selected = selector.transform(X_train)\n",
143 | " X_test_selected = selector.transform(X_test)\n",
144 | " else:\n",
145 | " raise ValueError(\"Invalid method specified\")\n",
146 | "\n",
147 | " if method != \"Backward\":\n",
148 | " X_train_selected = selector.transform(X_train)\n",
149 | " X_test_selected = selector.transform(X_test)\n",
150 | " selected_indices = np.where(selector.support_)[0]\n",
151 | "\n",
152 | " # Print results\n",
153 | " print(f\"\\nWrapper Method with RFE ({method})\")\n",
154 | " print(f\"Selected Features (RFE): {selected_indices}\")\n",
155 | " print(f\"Number of Features Selected (RFE): {len(selected_indices)}\")\n",
156 | "\n",
157 | " return X_train_selected, X_test_selected\n",
158 | "\n"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "id": "3ab76798",
164 | "metadata": {},
165 | "source": [
166 | " \n",
167 | "Wrapper method using RFE (Forward Selection and Backward Elimination):
"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 20,
173 | "id": "be46adab",
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "\n",
181 | "Wrapper Method with RFE (Forward)\n",
182 | "Selected Features (RFE): [ 1 2 6 7 20 21 22 23 26 27]\n",
183 | "Number of Features Selected (RFE): 10\n",
184 | "Accuracy (RFE): 0.9649\n",
185 | "\n",
186 | "Wrapper Method with RFE (Backward)\n",
187 | "Selected Features (RFE): [27 26 23 22 21 20 7 6 2 1]\n",
188 | "Number of Features Selected (RFE): 10\n",
189 | "Accuracy (RFE): 0.9649\n"
190 | ]
191 | }
192 | ],
193 | "source": [
194 | "# Function to train and evaluate a model\n",
195 | "def train_and_evaluate(X_train, X_test, y_train, y_test):\n",
196 | " model = RandomForestClassifier(random_state=seed)\n",
197 | " model.fit(X_train, y_train)\n",
198 | " y_pred = model.predict(X_test)\n",
199 | " accuracy = accuracy_score(y_test, y_pred)\n",
200 | "\n",
201 | " # Print accuracy\n",
202 | " print(f\"Accuracy (RFE): {accuracy:.4f}\")\n",
203 | "\n",
204 | " return accuracy\n",
205 | "\n",
206 | "# Wrapper method using RFE (Forward Selection)\n",
207 | "num_features_rfe_forward = 10\n",
208 | "X_train_rfe_forward, X_test_rfe_forward = wrapper_method_rfe(\n",
209 | " X_train, X_test, y_train, RandomForestClassifier(random_state=seed),\n",
210 | " num_features_rfe_forward, \"Forward\"\n",
211 | ")\n",
212 | "accuracy_rfe_forward = train_and_evaluate(X_train_rfe_forward, X_test_rfe_forward, y_train, y_test)\n",
213 | "\n",
214 | "# Wrapper method using RFE (Backward Elimination)\n",
215 | "num_features_rfe_backward = 10\n",
216 | "X_train_rfe_backward, X_test_rfe_backward = wrapper_method_rfe(\n",
217 | " X_train, X_test, y_train, RandomForestClassifier(random_state=seed),\n",
218 | " num_features_rfe_backward, \"Backward\"\n",
219 | ")\n",
220 | "accuracy_rfe_backward = train_and_evaluate(X_train_rfe_backward, X_test_rfe_backward, y_train, y_test)\n"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "id": "52de39a0",
227 | "metadata": {},
228 | "outputs": [],
229 | "source": []
230 | }
231 | ],
232 | "metadata": {
233 | "kernelspec": {
234 | "display_name": "Python 3 (ipykernel)",
235 | "language": "python",
236 | "name": "python3"
237 | },
238 | "language_info": {
239 | "codemirror_mode": {
240 | "name": "ipython",
241 | "version": 3
242 | },
243 | "file_extension": ".py",
244 | "mimetype": "text/x-python",
245 | "name": "python",
246 | "nbconvert_exporter": "python",
247 | "pygments_lexer": "ipython3",
248 | "version": "3.9.7"
249 | }
250 | },
251 | "nbformat": 4,
252 | "nbformat_minor": 5
253 | }
254 |
--------------------------------------------------------------------------------
/Feature Selection/feature-selection-technique-in-machine-learning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
7 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
8 | "collapsed": true
9 | },
10 | "source": [
11 | " Feature Selection
\n",
12 | "Feature selection is one of the most important techniques for building a good predictive model. It helps us identify the most important features of the data set."
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {
18 | "_uuid": "8710c1c227abfd06369844ebc57af6fca32b4632"
19 | },
20 | "source": [
21 | "1. | I will cover the below points : 🌟 📚
\n",
22 | "\n",
23 | "1. What is Feature Selection?\n",
24 | "2. Why is it one of the most important techniques for a Data Scientist to learn?\n",
25 | "3. What are the different types of Feature Selection?"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "_uuid": "c755f87fc7ca150d89268ee9dba94b2720d69657"
32 | },
33 | "source": [
34 | "1.1 | 1.Feature Selection: 🌍:
\n",
35 | "\n",
36 | "The process of selecting a subset of relevant features for use in model construction, which helps to improve the model's predictions and decrease the error rate. \n",
37 | "In other words, it is the process of identifying and removing as much irrelevant and redundant information as possible.\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {
43 | "_uuid": "8779a7d4886a84ff86204ac95a0f6eba11876b58"
44 | },
45 | "source": [
46 | "1.1 | 2. Importance of Feature Selection:🌍:
\n",
47 | "\n",
48 | "* Improve the accuracy of model.\n",
49 | "* Reduce overfitting.\n",
50 | "* Shorter training time.\n",
51 | "* Reduce complexity of model.\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {
57 | "_uuid": "1411d4d61851c8e1f404cc3275aec9fbc46b3ba8",
58 | "collapsed": true
59 | },
60 | "source": [
61 | "\n",
62 | " Type of Feature Selection
\n",
63 | "\n",
64 | "* ***Wrapper Method***\n",
65 | "* ***Filter Method***\n",
66 | "* ***Embedded Method***\n"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {
72 | "_uuid": "79ba39eefedbbc4a5ee444d9b88f953d261663fd"
73 | },
74 | "source": [
75 | "\n",
76 | " Wrapper Method 🌟 📚
\n",
77 | "\n",
78 | "\n",
79 | "In this method, a subset of features is selected and a model is trained using them. Based on the inferences that we draw from the previous model, we decide to add or remove features from the subset.\n",
80 | "[For indepth details](https://en.wikipedia.org/wiki/Feature_selection)"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {
86 | "_uuid": "503925da9749631ebb3a0e07be5587a8583060a1"
87 | },
88 | "source": [
89 | "**Image from wiki**\n",
90 | "
"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {
96 | "_uuid": "e3a24d030c50a3cc50559c03f586bf2288c221f4",
97 | "collapsed": true
98 | },
99 | "source": [
100 | "\n",
101 | "
Type of Wrapper Method
\n",
102 | "\n",
103 | "\n",
104 | "* Forward Selection\n",
105 | "* Backward Elimination\n",
106 | "* Exhaustive Feature Selection "
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {
112 | "_uuid": "2fa60b67f8e4805bcd497bf464d925b64b1900b5"
113 | },
114 | "source": [
115 | "
Forward Selection
\n",
116 | "\n",
117 | "It is an iterative method in which we keep adding the feature that best improves our model until the addition of a new feature no longer improves the model performance.
\n",
118 | "
Backward Elimination
\n",
119 | "In this method we start with all the features and remove the least significant feature at each iteration, which improves the model performance. We repeat this until no improvement is observed on removal of a feature.
\n",
120 | "\n",
121 | "
Exhaustive Feature Selection
\n",
122 | "\n",
123 | "In this the best subset of feature is selected, over all possible feature subsets. For example, if a dataset contains 4 features, the algorithm will evaluate all the feature combinations as follows:\n",
124 | "* All possible combinations of 1 feature\n",
125 | "* All possible combinations of 2 features\n",
126 | "* All possible combinations of 3 features\n",
127 | "* All possible combinations of 4 features\n",
128 | " "
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {
134 | "_uuid": "eafcc42656451003439a83ee28839b1408d274cb"
135 | },
136 | "source": [
137 | "\n",
138 | "
Pros
\n",
139 | "\n",
140 | "\n",
141 | "* Aim to find the best possible feature combination.\n",
142 | "* Better results than filter methods.\n",
143 | "* Can be used for small datasets having fewer features."
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {
149 | "_uuid": "9fad7d1d69b81bb7e9157860890f028ff8483c82"
150 | },
151 | "source": [
152 | "\n",
153 | "\n",
154 | "\n",
155 | "
Cons
\n",
156 | "\n",
157 | "* Computationally expensive\n",
158 | "* Often impractical for large datasets having many features."
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {
164 | "_uuid": "1c6e339fd12bf741b1d3e00edd2c2ee3c136ea7d"
165 | },
166 | "source": [
167 | "
Filter Method 📚
"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "metadata": {
173 | "_uuid": "7d104eeabb0bd4ad80a59a9c66d88f5487a88937"
174 | },
175 | "source": [
176 | "Filter methods are generally used as a preprocessing step. The selection of features is independent of any machine learning algorithms. Instead, features are selected on the basis of their scores in various statistical tests for their correlation with the outcome variable."
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {
182 | "_uuid": "0913e455d8c335346b3fe9099d00105e8a65c1b8",
183 | "collapsed": true
184 | },
185 | "source": [
186 | "**Image from wiki**\n",
187 | "

"
188 | ]
189 | },
190 | {
191 | "cell_type": "markdown",
192 | "metadata": {
193 | "_uuid": "211f9335e79a2ae0ca5a30ec96945f39e7481d07"
194 | },
195 | "source": [
196 | "\n",
197 | "\n",
198 | "
Basic Methods
\n",
199 | "\n",
200 | "We should consider the below filter methods as a data pre processing steps.\n",
201 | "* Constant features - Constant features are those that show the same value for all the observations of the dataset. Remove constant features from dataset.\n",
202 | "* Quasi-constant features - A column in which 99% of the values are the same is called a quasi-constant column. Remove quasi-constant features from the dataset.\n",
203 | "* Duplicated features - Remove duplicated features from dataset."
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {
209 | "_uuid": "fce63551ebc4a8d6b671c9a0c4f10ac89d4fdff4"
210 | },
211 | "source": [
212 | "\n",
213 | "
Correlation
\n",
214 | "\n",
215 | "* Correlation is a measure of the linear relationship between 2 or more variables.\n",
216 | "* Through correlation we can predict one variable from other.\n",
217 | " * Good variables are highly correlated with the target but uncorrelated among themselves.\n",
218 | "* If two variables are highly correlated with each other, then we should remove one of them. \n",
219 | " "
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {
225 | "_uuid": "d007d540b45ed89c084a571db241329c03bd30b3"
226 | },
227 | "source": [
228 | "\n",
229 | "
Fisher Score
\n",
230 | "\n",
231 | "* Measures the dependence of 2 variables\n",
232 | "* Suited for categorical variables.\n",
233 | "* Target should be binary.\n",
234 | "* Variable values should be non negative, typically Boolean or counts.\n"
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {
240 | "_uuid": "59c38c2392896e016e2ab499b2ac829475d6cc19"
241 | },
242 | "source": [
243 | "\n",
244 | "\n",
245 | "
ANOVA (Analysis Of Variance)
\n",
246 | "\n",
247 | "* Measures the dependency of two variables.\n",
248 | "* Suited for continuous variables.\n",
249 | "* Requires a binary target.\n",
250 | "* Assumes linear relationship between variable and target.\n",
251 | "* Assumes variables are normally distributed.\n",
252 | "* Sensitive to sample size\n"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {
258 | "_uuid": "987b7c270f4b43373b3d70fa216de2f866e64c03"
259 | },
260 | "source": [
261 | "
ROC-AUC / RMSE
\n",
262 | "\n",
263 | "* Measures the dependency of two variables.\n",
264 | "* Suited for all type of variables.\n",
265 | "* Makes no assumption on the distribution of the variables."
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {
271 | "_uuid": "d817284e30d6bdb48e9508e3ecdf4fb015d087f2"
272 | },
273 | "source": [
274 | "\n",
275 | "
Steps to select features
\n",
276 | "\n",
277 | "* Rank features according to a certain criteria (like correlation).\n",
278 | " * Each feature is ranked independently of the feature space.\n",
279 | "* Select highest ranking features. "
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {
285 | "_uuid": "e4ddc34ce1df6366a605729d04ca48094ade7009"
286 | },
287 | "source": [
288 | "\n",
289 | "
Basic Pros
\n",
290 | "\n",
291 | "* Fast computation.\n",
292 | "* Simple yet powerful to quickly remove irrelevant and redundant feature.\n",
293 | "* Better choice for large dataset over wrapper methods."
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {
299 | "_uuid": "def3fa408a109f6d51bb1663a3bbcf5e66a231fd"
300 | },
301 | "source": [
302 | "
Basic Cons
\n",
303 | "\n",
304 | "* It may select redundant variables because they do not consider the relationships between features.\n",
305 | "* The prediction accuracy is generally lower than that of wrapper methods."
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {
311 | "_uuid": "87d76ab632364c86325f8b7749409bb77bf20432"
312 | },
313 | "source": [
314 | "\n",
315 | "
Embedded Method 🌟 📚
\n",
316 | "\n",
317 | "\n",
318 | "Embedded methods combine the qualities of filter and wrapper methods. A learning algorithm takes advantage of its own variable selection process and performs feature selection and classification simultaneously."
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {
324 | "_uuid": "b61d42f474e71dec29b9ff15752f106a21459c7e"
325 | },
326 | "source": [
327 | "**Image from wiki**\n",
328 | "

"
329 | ]
330 | },
331 | {
332 | "cell_type": "markdown",
333 | "metadata": {
334 | "_uuid": "17807564445dd5b351e939e8309c60501785b693"
335 | },
336 | "source": [
337 | "
REGULARISATION 🌟 📚
\n",
338 | "\n",
339 | "\n",
340 | "\n",
341 | "Regularization consists in adding a penalty on the different parameters of the model to reduce the freedom of the model. Hence, the model will be less likely to fit the noise of the training data and will improve the generalization abilities of the model. For linear models there are in general 3 types of regularisation:\n",
342 | "* The L1 regularization (also called Lasso)\n",
343 | "* The L2 regularization (also called Ridge)\n",
344 | "* The L1/L2 regularization (also called Elastic net)"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {
350 | "_uuid": "10c4a1d2541d1bfcbc3a6c378624be3f4ca9879a"
351 | },
352 | "source": [
353 | "**Image from Scikit learn**\n",
354 | "

"
355 | ]
356 | },
357 | {
358 | "cell_type": "markdown",
359 | "metadata": {
360 | "_uuid": "9a0bf904c39487ae6b3cc26a9053100526224d72",
361 | "collapsed": true
362 | },
363 | "source": []
364 | },
365 | {
366 | "cell_type": "markdown",
367 | "metadata": {
368 | "_uuid": "6675c83145ea22d7780e15a19b074c90884eebd4",
369 | "collapsed": true
370 | },
371 | "source": []
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": null,
376 | "metadata": {
377 | "_uuid": "c1e5227338ce91fb45514340eca51cbd4130c6a4",
378 | "collapsed": true
379 | },
380 | "outputs": [],
381 | "source": []
382 | }
383 | ],
384 | "metadata": {
385 | "kernelspec": {
386 | "display_name": "Python 3 (ipykernel)",
387 | "language": "python",
388 | "name": "python3"
389 | },
390 | "language_info": {
391 | "codemirror_mode": {
392 | "name": "ipython",
393 | "version": 3
394 | },
395 | "file_extension": ".py",
396 | "mimetype": "text/x-python",
397 | "name": "python",
398 | "nbconvert_exporter": "python",
399 | "pygments_lexer": "ipython3",
400 | "version": "3.9.7"
401 | }
402 | },
403 | "nbformat": 4,
404 | "nbformat_minor": 1
405 | }
406 |
--------------------------------------------------------------------------------
/Spam-Email-detection-system-main/Untitled2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "executionInfo": {
8 | "elapsed": 1401,
9 | "status": "ok",
10 | "timestamp": 1640778596418,
11 | "user": {
12 | "displayName": "bibek sah",
13 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
14 | "userId": "08792417367200435838"
15 | },
16 | "user_tz": -345
17 | },
18 | "id": "-9boQqt09xM6"
19 | },
20 | "outputs": [],
21 | "source": [
22 | "import numpy as np\n",
23 | "import pandas as pd\n",
24 | "import matplotlib.pyplot as plt\n",
25 | "import seaborn as sns"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "executionInfo": {
33 | "elapsed": 28,
34 | "status": "ok",
35 | "timestamp": 1640778210470,
36 | "user": {
37 | "displayName": "bibek sah",
38 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
39 | "userId": "08792417367200435838"
40 | },
41 | "user_tz": -345
42 | },
43 | "id": "Yzk3k6Y890vh"
44 | },
45 | "outputs": [],
46 | "source": [
47 | "df=pd.read_csv('spam.csv')"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 4,
53 | "metadata": {
54 | "colab": {
55 | "base_uri": "https://localhost:8080/",
56 | "height": 206
57 | },
58 | "executionInfo": {
59 | "elapsed": 27,
60 | "status": "ok",
61 | "timestamp": 1640778210471,
62 | "user": {
63 | "displayName": "bibek sah",
64 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
65 | "userId": "08792417367200435838"
66 | },
67 | "user_tz": -345
68 | },
69 | "id": "QfNUzF-z-Qzy",
70 | "outputId": "51f730e2-032e-49dd-e93d-6902b4ba8ebc"
71 | },
72 | "outputs": [
73 | {
74 | "data": {
75 | "text/html": [
76 | "
\n",
77 | "\n",
90 | "
\n",
91 | " \n",
92 | " \n",
93 | " | \n",
94 | " Category | \n",
95 | " Message | \n",
96 | "
\n",
97 | " \n",
98 | " \n",
99 | " \n",
100 | " | 0 | \n",
101 | " ham | \n",
102 | " Go until jurong point, crazy.. Available only ... | \n",
103 | "
\n",
104 | " \n",
105 | " | 1 | \n",
106 | " ham | \n",
107 | " Ok lar... Joking wif u oni... | \n",
108 | "
\n",
109 | " \n",
110 | " | 2 | \n",
111 | " spam | \n",
112 | " Free entry in 2 a wkly comp to win FA Cup fina... | \n",
113 | "
\n",
114 | " \n",
115 | " | 3 | \n",
116 | " ham | \n",
117 | " U dun say so early hor... U c already then say... | \n",
118 | "
\n",
119 | " \n",
120 | " | 4 | \n",
121 | " ham | \n",
122 | " Nah I don't think he goes to usf, he lives aro... | \n",
123 | "
\n",
124 | " \n",
125 | "
\n",
126 | "
"
127 | ],
128 | "text/plain": [
129 | " Category Message\n",
130 | "0 ham Go until jurong point, crazy.. Available only ...\n",
131 | "1 ham Ok lar... Joking wif u oni...\n",
132 | "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
133 | "3 ham U dun say so early hor... U c already then say...\n",
134 | "4 ham Nah I don't think he goes to usf, he lives aro..."
135 | ]
136 | },
137 | "execution_count": 4,
138 | "metadata": {},
139 | "output_type": "execute_result"
140 | }
141 | ],
142 | "source": [
143 | "df.head()"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 5,
149 | "metadata": {
150 | "colab": {
151 | "base_uri": "https://localhost:8080/"
152 | },
153 | "executionInfo": {
154 | "elapsed": 17,
155 | "status": "ok",
156 | "timestamp": 1640778210472,
157 | "user": {
158 | "displayName": "bibek sah",
159 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
160 | "userId": "08792417367200435838"
161 | },
162 | "user_tz": -345
163 | },
164 | "id": "06nXuOFv_cWx",
165 | "outputId": "019c195a-c7ef-4b65-8c6e-963f4324c3dc"
166 | },
167 | "outputs": [
168 | {
169 | "data": {
170 | "text/plain": [
171 | "array(['ham', 'spam'], dtype=object)"
172 | ]
173 | },
174 | "execution_count": 5,
175 | "metadata": {},
176 | "output_type": "execute_result"
177 | }
178 | ],
179 | "source": [
180 | "df.Category.unique()"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 6,
186 | "metadata": {
187 | "executionInfo": {
188 | "elapsed": 831,
189 | "status": "ok",
190 | "timestamp": 1640778295983,
191 | "user": {
192 | "displayName": "bibek sah",
193 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
194 | "userId": "08792417367200435838"
195 | },
196 | "user_tz": -345
197 | },
198 | "id": "v4a1QjCV_jKC"
199 | },
200 | "outputs": [],
201 | "source": [
202 | "df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 7,
208 | "metadata": {
209 | "colab": {
210 | "base_uri": "https://localhost:8080/",
211 | "height": 206
212 | },
213 | "executionInfo": {
214 | "elapsed": 722,
215 | "status": "ok",
216 | "timestamp": 1640778655207,
217 | "user": {
218 | "displayName": "bibek sah",
219 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
220 | "userId": "08792417367200435838"
221 | },
222 | "user_tz": -345
223 | },
224 | "id": "wFhq-4sBAeTK",
225 | "outputId": "76771c6a-0589-45fb-8975-32a7a76ea055"
226 | },
227 | "outputs": [
228 | {
229 | "data": {
230 | "text/html": [
231 | "
\n",
232 | "\n",
245 | "
\n",
246 | " \n",
247 | " \n",
248 | " | \n",
249 | " Category | \n",
250 | " Message | \n",
251 | " spam | \n",
252 | "
\n",
253 | " \n",
254 | " \n",
255 | " \n",
256 | " | 0 | \n",
257 | " ham | \n",
258 | " Go until jurong point, crazy.. Available only ... | \n",
259 | " 0 | \n",
260 | "
\n",
261 | " \n",
262 | " | 1 | \n",
263 | " ham | \n",
264 | " Ok lar... Joking wif u oni... | \n",
265 | " 0 | \n",
266 | "
\n",
267 | " \n",
268 | " | 2 | \n",
269 | " spam | \n",
270 | " Free entry in 2 a wkly comp to win FA Cup fina... | \n",
271 | " 1 | \n",
272 | "
\n",
273 | " \n",
274 | " | 3 | \n",
275 | " ham | \n",
276 | " U dun say so early hor... U c already then say... | \n",
277 | " 0 | \n",
278 | "
\n",
279 | " \n",
280 | " | 4 | \n",
281 | " ham | \n",
282 | " Nah I don't think he goes to usf, he lives aro... | \n",
283 | " 0 | \n",
284 | "
\n",
285 | " \n",
286 | "
\n",
287 | "
"
288 | ],
289 | "text/plain": [
290 | " Category Message spam\n",
291 | "0 ham Go until jurong point, crazy.. Available only ... 0\n",
292 | "1 ham Ok lar... Joking wif u oni... 0\n",
293 | "2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1\n",
294 | "3 ham U dun say so early hor... U c already then say... 0\n",
295 | "4 ham Nah I don't think he goes to usf, he lives aro... 0"
296 | ]
297 | },
298 | "execution_count": 7,
299 | "metadata": {},
300 | "output_type": "execute_result"
301 | }
302 | ],
303 | "source": [
304 | "df.head(5)"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 8,
310 | "metadata": {
311 | "executionInfo": {
312 | "elapsed": 679,
313 | "status": "ok",
314 | "timestamp": 1640778804504,
315 | "user": {
316 | "displayName": "bibek sah",
317 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
318 | "userId": "08792417367200435838"
319 | },
320 | "user_tz": -345
321 | },
322 | "id": "K9RIT364B2Bm"
323 | },
324 | "outputs": [],
325 | "source": [
326 | "from sklearn.model_selection import train_test_split\n",
327 | "x_train,x_test,y_train,y_test=train_test_split(df.Message,df.spam,test_size=0.2,random_state=42)"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 9,
333 | "metadata": {
334 | "colab": {
335 | "base_uri": "https://localhost:8080/"
336 | },
337 | "executionInfo": {
338 | "elapsed": 455,
339 | "status": "ok",
340 | "timestamp": 1640778875501,
341 | "user": {
342 | "displayName": "bibek sah",
343 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
344 | "userId": "08792417367200435838"
345 | },
346 | "user_tz": -345
347 | },
348 | "id": "GhSkE8R8CafK",
349 | "outputId": "690bd42a-2be3-4a5a-e8c1-0706cb4ce2ef"
350 | },
351 | "outputs": [
352 | {
353 | "data": {
354 | "text/plain": [
355 | "4457"
356 | ]
357 | },
358 | "execution_count": 9,
359 | "metadata": {},
360 | "output_type": "execute_result"
361 | }
362 | ],
363 | "source": [
364 | "len(x_train)"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": 10,
370 | "metadata": {
371 | "colab": {
372 | "base_uri": "https://localhost:8080/"
373 | },
374 | "executionInfo": {
375 | "elapsed": 693,
376 | "status": "ok",
377 | "timestamp": 1640779197055,
378 | "user": {
379 | "displayName": "bibek sah",
380 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
381 | "userId": "08792417367200435838"
382 | },
383 | "user_tz": -345
384 | },
385 | "id": "d8RKpTA4Cr2d",
386 | "outputId": "4c0deb38-26c5-409e-fee1-27de285b2b0e"
387 | },
388 | "outputs": [
389 | {
390 | "data": {
391 | "text/plain": [
392 | "1115"
393 | ]
394 | },
395 | "execution_count": 10,
396 | "metadata": {},
397 | "output_type": "execute_result"
398 | }
399 | ],
400 | "source": [
401 | "len(x_test)"
402 | ]
403 | },
404 | {
405 | "cell_type": "code",
406 | "execution_count": 11,
407 | "metadata": {
408 | "colab": {
409 | "base_uri": "https://localhost:8080/"
410 | },
411 | "executionInfo": {
412 | "elapsed": 723,
413 | "status": "ok",
414 | "timestamp": 1640779312565,
415 | "user": {
416 | "displayName": "bibek sah",
417 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
418 | "userId": "08792417367200435838"
419 | },
420 | "user_tz": -345
421 | },
422 | "id": "6Lb2BpL5D6Tw",
423 | "outputId": "d9c43a76-f4cd-403e-a5b7-d0c99bc48183"
424 | },
425 | "outputs": [
426 | {
427 | "data": {
428 | "text/plain": [
429 | "array([[0, 0, 0, ..., 0, 0, 0],\n",
430 | " [0, 0, 0, ..., 0, 0, 0],\n",
431 | " [0, 0, 0, ..., 0, 0, 0],\n",
432 | " [0, 0, 0, ..., 0, 0, 0],\n",
433 | " [0, 0, 0, ..., 0, 0, 0]], dtype=int64)"
434 | ]
435 | },
436 | "execution_count": 11,
437 | "metadata": {},
438 | "output_type": "execute_result"
439 | }
440 | ],
441 | "source": [
442 | "from sklearn.feature_extraction.text import CountVectorizer\n",
443 | "v=CountVectorizer()\n",
444 | "cv_messages = v.fit_transform(x_train.values)\n",
445 | "cv_messages.toarray()[0:5]"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 12,
451 | "metadata": {
452 | "executionInfo": {
453 | "elapsed": 7,
454 | "status": "ok",
455 | "timestamp": 1640779609411,
456 | "user": {
457 | "displayName": "bibek sah",
458 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
459 | "userId": "08792417367200435838"
460 | },
461 | "user_tz": -345
462 | },
463 | "id": "LUFmKWvVEWgO"
464 | },
465 | "outputs": [],
466 | "source": [
467 | "from sklearn.naive_bayes import MultinomialNB\n",
468 | "model=MultinomialNB()"
469 | ]
470 | },
471 | {
472 | "cell_type": "code",
473 | "execution_count": 13,
474 | "metadata": {
475 | "colab": {
476 | "base_uri": "https://localhost:8080/"
477 | },
478 | "executionInfo": {
479 | "elapsed": 1578,
480 | "status": "ok",
481 | "timestamp": 1640779640258,
482 | "user": {
483 | "displayName": "bibek sah",
484 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
485 | "userId": "08792417367200435838"
486 | },
487 | "user_tz": -345
488 | },
489 | "id": "icy7RxTrFfAm",
490 | "outputId": "0aee8aff-a9c0-4169-da8a-533f5a49e193"
491 | },
492 | "outputs": [
493 | {
494 | "data": {
495 | "text/plain": [
496 | "MultinomialNB()"
497 | ]
498 | },
499 | "execution_count": 13,
500 | "metadata": {},
501 | "output_type": "execute_result"
502 | }
503 | ],
504 | "source": [
505 | "model.fit(cv_messages,y_train)"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": 14,
511 | "metadata": {
512 | "colab": {
513 | "base_uri": "https://localhost:8080/"
514 | },
515 | "executionInfo": {
516 | "elapsed": 772,
517 | "status": "ok",
518 | "timestamp": 1640780294984,
519 | "user": {
520 | "displayName": "bibek sah",
521 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
522 | "userId": "08792417367200435838"
523 | },
524 | "user_tz": -345
525 | },
526 | "id": "sfdn1y7PFmSX",
527 | "outputId": "6dc6eb0e-db56-48cd-8109-3072c098f6a6"
528 | },
529 | "outputs": [
530 | {
531 | "data": {
532 | "text/plain": [
533 | "array([1, 0], dtype=int64)"
534 | ]
535 | },
536 | "execution_count": 14,
537 | "metadata": {},
538 | "output_type": "execute_result"
539 | }
540 | ],
541 | "source": [
542 | "email = [\n",
543 | " 'Upto 30% discount on parking, exclusive offer just for yoy. Dont miss thi reward!',\n",
544 | " 'Ok lar...joking wif u oni...'\n",
545 | "]\n",
546 | "email_count= v.transform(email)\n",
547 | "model.predict(email_count)"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": 15,
553 | "metadata": {
554 | "colab": {
555 | "base_uri": "https://localhost:8080/"
556 | },
557 | "executionInfo": {
558 | "elapsed": 731,
559 | "status": "ok",
560 | "timestamp": 1640780362896,
561 | "user": {
562 | "displayName": "bibek sah",
563 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
564 | "userId": "08792417367200435838"
565 | },
566 | "user_tz": -345
567 | },
568 | "id": "I0i5fFZ8IGVJ",
569 | "outputId": "d4c46fa1-af4c-42c7-93d2-11a425b14a48"
570 | },
571 | "outputs": [
572 | {
573 | "data": {
574 | "text/plain": [
575 | "0.9919282511210762"
576 | ]
577 | },
578 | "execution_count": 15,
579 | "metadata": {},
580 | "output_type": "execute_result"
581 | }
582 | ],
583 | "source": [
584 | "x_test_count=v.transform(x_test)\n",
585 | "model.score(x_test_count,y_test)\n"
586 | ]
587 | },
588 | {
589 | "cell_type": "code",
590 | "execution_count": 16,
591 | "metadata": {
592 | "executionInfo": {
593 | "elapsed": 15,
594 | "status": "ok",
595 | "timestamp": 1640780413260,
596 | "user": {
597 | "displayName": "bibek sah",
598 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
599 | "userId": "08792417367200435838"
600 | },
601 | "user_tz": -345
602 | },
603 | "id": "v-ArF0cZIW7x"
604 | },
605 | "outputs": [],
606 | "source": [
607 | "# sklearn pipeline"
608 | ]
609 | },
610 | {
611 | "cell_type": "code",
612 | "execution_count": 17,
613 | "metadata": {
614 | "colab": {
615 | "base_uri": "https://localhost:8080/"
616 | },
617 | "executionInfo": {
618 | "elapsed": 494,
619 | "status": "ok",
620 | "timestamp": 1640780722337,
621 | "user": {
622 | "displayName": "bibek sah",
623 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
624 | "userId": "08792417367200435838"
625 | },
626 | "user_tz": -345
627 | },
628 | "id": "Sj-eM9hgIjOB",
629 | "outputId": "160dfbdd-303b-4ecc-c2b9-a4fd8dac9510"
630 | },
631 | "outputs": [
632 | {
633 | "data": {
634 | "text/plain": [
635 | "Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])"
636 | ]
637 | },
638 | "execution_count": 17,
639 | "metadata": {},
640 | "output_type": "execute_result"
641 | }
642 | ],
643 | "source": [
644 | "from sklearn.pipeline import Pipeline\n",
645 | "clf = Pipeline([\n",
646 | " ('vectorizer', CountVectorizer()),\n",
647 | " ('nb', MultinomialNB()) \n",
648 | "]\n",
649 | ")\n",
650 | "clf.fit(x_train,y_train)"
651 | ]
652 | },
653 | {
654 | "cell_type": "code",
655 | "execution_count": 18,
656 | "metadata": {
657 | "colab": {
658 | "base_uri": "https://localhost:8080/"
659 | },
660 | "executionInfo": {
661 | "elapsed": 697,
662 | "status": "ok",
663 | "timestamp": 1640780793192,
664 | "user": {
665 | "displayName": "bibek sah",
666 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
667 | "userId": "08792417367200435838"
668 | },
669 | "user_tz": -345
670 | },
671 | "id": "c4oNWSVmJuzd",
672 | "outputId": "4799d618-2345-4c43-f284-fbaca872b976"
673 | },
674 | "outputs": [
675 | {
676 | "data": {
677 | "text/plain": [
678 | "array([1, 0], dtype=int64)"
679 | ]
680 | },
681 | "execution_count": 18,
682 | "metadata": {},
683 | "output_type": "execute_result"
684 | }
685 | ],
686 | "source": [
687 | "email = [\n",
688 | " 'Upto 30% discount on parking, exclusive offer just for yoy. Dont miss thi reward!',\n",
689 | " 'Ok lar...joking wif u oni...' \n",
690 | "]\n",
691 | "clf.predict(email)"
692 | ]
693 | },
694 | {
695 | "cell_type": "code",
696 | "execution_count": 19,
697 | "metadata": {
698 | "colab": {
699 | "base_uri": "https://localhost:8080/"
700 | },
701 | "executionInfo": {
702 | "elapsed": 1829,
703 | "status": "ok",
704 | "timestamp": 1640780963050,
705 | "user": {
706 | "displayName": "bibek sah",
707 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
708 | "userId": "08792417367200435838"
709 | },
710 | "user_tz": -345
711 | },
712 | "id": "o752BL8PJ_-5",
713 | "outputId": "2361abbd-386a-4078-b145-1e0aab7c3254"
714 | },
715 | "outputs": [
716 | {
717 | "data": {
718 | "text/plain": [
719 | "0.9919282511210762"
720 | ]
721 | },
722 | "execution_count": 19,
723 | "metadata": {},
724 | "output_type": "execute_result"
725 | }
726 | ],
727 | "source": [
728 | "clf.score(x_test,y_test)"
729 | ]
730 | },
731 | {
732 | "cell_type": "code",
733 | "execution_count": 20,
734 | "metadata": {
735 | "colab": {
736 | "base_uri": "https://localhost:8080/"
737 | },
738 | "executionInfo": {
739 | "elapsed": 692,
740 | "status": "ok",
741 | "timestamp": 1640781005830,
742 | "user": {
743 | "displayName": "bibek sah",
744 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
745 | "userId": "08792417367200435838"
746 | },
747 | "user_tz": -345
748 | },
749 | "id": "Wb5rbwIVKo1S",
750 | "outputId": "4cb35f90-0e81-49c9-8379-0b477420d462"
751 | },
752 | "outputs": [
753 | {
754 | "data": {
755 | "text/plain": [
756 | "['spam_model.pkl']"
757 | ]
758 | },
759 | "execution_count": 20,
760 | "metadata": {},
761 | "output_type": "execute_result"
762 | }
763 | ],
764 | "source": [
765 | "import joblib\n",
766 | "joblib.dump(clf,'spam_model.pkl')"
767 | ]
768 | },
769 | {
770 | "cell_type": "code",
771 | "execution_count": 21,
772 | "metadata": {
773 | "executionInfo": {
774 | "elapsed": 11,
775 | "status": "ok",
776 | "timestamp": 1640781042860,
777 | "user": {
778 | "displayName": "bibek sah",
779 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjtY0YqqHlXep2Tt94P9dLVJx_J-lV4Nk1cBRPh7A=s64",
780 | "userId": "08792417367200435838"
781 | },
782 | "user_tz": -345
783 | },
784 | "id": "pP9Ji7YHKz4h"
785 | },
786 | "outputs": [],
787 | "source": [
788 | "# model is completed"
789 | ]
790 | },
791 | {
792 | "cell_type": "code",
793 | "execution_count": null,
794 | "metadata": {
795 | "id": "DegaoHEFK87R"
796 | },
797 | "outputs": [],
798 | "source": []
799 | }
800 | ],
801 | "metadata": {
802 | "colab": {
803 | "authorship_tag": "ABX9TyOu3xR/1JAiPy608KPO62Wq",
804 | "collapsed_sections": [],
805 | "mount_file_id": "1XRYrPikxSuVab8l-DhT_L5o44vnz3fDE",
806 | "name": "Untitled2.ipynb",
807 | "provenance": []
808 | },
809 | "kernelspec": {
810 | "display_name": "Python 3 (ipykernel)",
811 | "language": "python",
812 | "name": "python3"
813 | },
814 | "language_info": {
815 | "codemirror_mode": {
816 | "name": "ipython",
817 | "version": 3
818 | },
819 | "file_extension": ".py",
820 | "mimetype": "text/x-python",
821 | "name": "python",
822 | "nbconvert_exporter": "python",
823 | "pygments_lexer": "ipython3",
824 | "version": "3.9.7"
825 | }
826 | },
827 | "nbformat": 4,
828 | "nbformat_minor": 1
829 | }
830 |
--------------------------------------------------------------------------------
/regularization-in-machine-learning/regularization-in-machine-learning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "85a85782",
6 | "metadata": {
7 | "papermill": {
8 | "duration": 0.042361,
9 | "end_time": "2021-12-22T19:07:37.877879",
10 | "exception": false,
11 | "start_time": "2021-12-22T19:07:37.835518",
12 | "status": "completed"
13 | },
14 | "tags": []
15 | },
16 | "source": [
17 | "### Regularization in Machine Learning"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "id": "11910490",
23 | "metadata": {
24 | "papermill": {
25 | "duration": 0.035764,
26 | "end_time": "2021-12-22T19:07:37.950117",
27 | "exception": false,
28 | "start_time": "2021-12-22T19:07:37.914353",
29 | "status": "completed"
30 | },
31 | "tags": []
32 | },
33 | "source": [
34 | "# What is regularization in ML?\n",
35 | "\n",
36 | "- a technique to prevent the model from overfitting by adding extra information to it.\n",
37 | "- it maintains all variables or features in the model by reducing the magnitude of the variables. \n",
38 | "- Hence, it maintains accuracy as well as a generalization of the model.\n",
39 | "- In simple words, \"In regularization technique, we reduce the magnitude of the features by keeping the same number of features.\"\n",
40 | "- mainly regularizes or reduces the coefficient of features toward zero"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 1,
46 | "id": "646c8325",
47 | "metadata": {
48 | "execution": {
49 | "iopub.execute_input": "2021-12-22T19:07:38.029150Z",
50 | "iopub.status.busy": "2021-12-22T19:07:38.027353Z",
51 | "iopub.status.idle": "2021-12-22T19:07:38.032283Z",
52 | "shell.execute_reply": "2021-12-22T19:07:38.032799Z"
53 | },
54 | "papermill": {
55 | "duration": 0.046738,
56 | "end_time": "2021-12-22T19:07:38.033119",
57 | "exception": false,
58 | "start_time": "2021-12-22T19:07:37.986381",
59 | "status": "completed"
60 | },
61 | "tags": []
62 | },
63 | "outputs": [],
64 | "source": [
65 | "# Basics of regularization"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "id": "c3676748",
71 | "metadata": {
72 | "papermill": {
73 | "duration": 0.036146,
74 | "end_time": "2021-12-22T19:07:38.108836",
75 | "exception": false,
76 | "start_time": "2021-12-22T19:07:38.072690",
77 | "status": "completed"
78 | },
79 | "tags": []
80 | },
81 | "source": [
82 | "- a technique to prevent the model from overfitting by adding extra information to it.\n",
83 | "- maintains accuracy as well as a generalization of the model\n",
84 | "- reduces the magnitude of the variables, hence maintains all variables or features\n",
85 | "- In simple words, \"In regularization technique, we reduce the magnitude of the features by keeping the same number of features\"\n",
86 | "- by adding a penalty or complexity term to the complex model"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 2,
92 | "id": "275c826f",
93 | "metadata": {
94 | "execution": {
95 | "iopub.execute_input": "2021-12-22T19:07:38.185704Z",
96 | "iopub.status.busy": "2021-12-22T19:07:38.184715Z",
97 | "iopub.status.idle": "2021-12-22T19:07:38.188117Z",
98 | "shell.execute_reply": "2021-12-22T19:07:38.188755Z"
99 | },
100 | "papermill": {
101 | "duration": 0.043564,
102 | "end_time": "2021-12-22T19:07:38.188919",
103 | "exception": false,
104 | "start_time": "2021-12-22T19:07:38.145355",
105 | "status": "completed"
106 | },
107 | "tags": []
108 | },
109 | "outputs": [],
110 | "source": [
111 | "# How does Regularization Work?"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "id": "b8aea070",
117 | "metadata": {
118 | "papermill": {
119 | "duration": 0.036276,
120 | "end_time": "2021-12-22T19:07:38.261804",
121 | "exception": false,
122 | "start_time": "2021-12-22T19:07:38.225528",
123 | "status": "completed"
124 | },
125 | "tags": []
126 | },
127 | "source": [
128 | "Let's consider the simple linear regression equation:\n",
129 | "$$y = \\beta_0 + \\beta_1 x_1 + \\beta_2 x_2 + \\beta_3 x_3 + \\cdots + \\beta_n x_n + b$$\n",
130 | "\n",
131 | "$y$ represents the value to be predicted.\n",
132 | "$x_1, x_2, \\ldots, x_n$ are the features for $y$.\n",
133 | "\n",
134 | "$\\beta_0, \\beta_1, \\ldots, \\beta_n$ are the weights or magnitudes.\n",
135 | "$b$ represents the intercept.\n",
136 | "\n",
137 | "The loss function for linear regression is called the RSS, or residual sum of squares.\n",
138 | "\n",
139 | "Techniques of Regularization:\n",
140 | "• Ridge Regression\n",
141 | "• Lasso Regression"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 3,
147 | "id": "dfac9cb6",
148 | "metadata": {
149 | "execution": {
150 | "iopub.execute_input": "2021-12-22T19:07:38.338525Z",
151 | "iopub.status.busy": "2021-12-22T19:07:38.337603Z",
152 | "iopub.status.idle": "2021-12-22T19:07:38.342423Z",
153 | "shell.execute_reply": "2021-12-22T19:07:38.343040Z"
154 | },
155 | "papermill": {
156 | "duration": 0.04495,
157 | "end_time": "2021-12-22T19:07:38.343227",
158 | "exception": false,
159 | "start_time": "2021-12-22T19:07:38.298277",
160 | "status": "completed"
161 | },
162 | "tags": []
163 | },
164 | "outputs": [],
165 | "source": [
166 | "# Ridge regression:"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "id": "8246b5e3",
172 | "metadata": {
173 | "papermill": {
174 | "duration": 0.036138,
175 | "end_time": "2021-12-22T19:07:38.416009",
176 | "exception": false,
177 | "start_time": "2021-12-22T19:07:38.379871",
178 | "status": "completed"
179 | },
180 | "tags": []
181 | },
182 | "source": [
183 | "- a small amount of bias is added\n",
184 | "- reduces the complexity of the model\n",
185 | "- also called L2 regularization\n",
186 | "- cost function is altered by adding the penalty term to it\n",
187 | "- the amount of bias added to the model is called the Ridge Regression penalty."
188 | ]
189 | },
190 | {
191 | "cell_type": "markdown",
192 | "id": "c40b82bb",
193 | "metadata": {
194 | "papermill": {
195 | "duration": 0.036398,
196 | "end_time": "2021-12-22T19:07:38.488992",
197 | "exception": false,
198 | "start_time": "2021-12-22T19:07:38.452594",
199 | "status": "completed"
200 | },
201 | "tags": []
202 | },
203 | "source": [
204 | "From the cost function of Ridge Regression we can see that if the value of λ tends to zero, the equation becomes the cost function of the linear regression model.\n",
205 | "\n",
206 | "A general linear or polynomial regression will fail if there is high collinearity between the independent variables, so to solve such problems, Ridge regression can be used."
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 4,
212 | "id": "91722b0d",
213 | "metadata": {
214 | "execution": {
215 | "iopub.execute_input": "2021-12-22T19:07:38.566612Z",
216 | "iopub.status.busy": "2021-12-22T19:07:38.565649Z",
217 | "iopub.status.idle": "2021-12-22T19:07:38.568855Z",
218 | "shell.execute_reply": "2021-12-22T19:07:38.569391Z"
219 | },
220 | "papermill": {
221 | "duration": 0.043271,
222 | "end_time": "2021-12-22T19:07:38.569551",
223 | "exception": false,
224 | "start_time": "2021-12-22T19:07:38.526280",
225 | "status": "completed"
226 | },
227 | "tags": []
228 | },
229 | "outputs": [],
230 | "source": [
231 | "# Lasso regression"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "id": "e18ae18a",
237 | "metadata": {
238 | "papermill": {
239 | "duration": 0.036278,
240 | "end_time": "2021-12-22T19:07:38.642318",
241 | "exception": false,
242 | "start_time": "2021-12-22T19:07:38.606040",
243 | "status": "completed"
244 | },
245 | "tags": []
246 | },
247 | "source": [
248 | "Lasso Regression:\n",
249 | "- stands for Least Absolute Shrinkage and Selection Operator\n",
250 | "- also called L1 regularization\n",
251 | "- reduces the complexity of the model\n",
252 | "- similar to the Ridge Regression except that the penalty term contains only the absolute weights instead of a square of weights\n",
253 | "- Since it takes absolute values, it can shrink the slope to exactly zero\n",
254 | "- whereas Ridge Regression can only shrink it near to 0.\n",
255 | "- Some of the features are completely neglected for model evaluation\n",
256 | "- hence Lasso helps in reducing overfitting and also feature selection"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "id": "1e22ea3d",
262 | "metadata": {
263 | "papermill": {
264 | "duration": 0.035978,
265 | "end_time": "2021-12-22T19:07:38.714585",
266 | "exception": false,
267 | "start_time": "2021-12-22T19:07:38.678607",
268 | "status": "completed"
269 | },
270 | "tags": []
271 | },
272 | "source": [
273 | "Lasso Regression adds “absolute value of magnitude” of coefficient as penalty term to the loss function(L). \n",
274 | "Ridge regression adds “squared magnitude” of coefficient as penalty term to the loss function(L)."
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 5,
280 | "id": "5dfc333a",
281 | "metadata": {
282 | "execution": {
283 | "iopub.execute_input": "2021-12-22T19:07:38.791141Z",
284 | "iopub.status.busy": "2021-12-22T19:07:38.790203Z",
285 | "iopub.status.idle": "2021-12-22T19:07:38.793467Z",
286 | "shell.execute_reply": "2021-12-22T19:07:38.794295Z"
287 | },
288 | "papermill": {
289 | "duration": 0.043469,
290 | "end_time": "2021-12-22T19:07:38.794464",
291 | "exception": false,
292 | "start_time": "2021-12-22T19:07:38.750995",
293 | "status": "completed"
294 | },
295 | "tags": []
296 | },
297 | "outputs": [],
298 | "source": [
299 | "### Implementation of Lasso Regression"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 6,
305 | "id": "f1be8d74",
306 | "metadata": {
307 | "execution": {
308 | "iopub.execute_input": "2021-12-22T19:07:38.871418Z",
309 | "iopub.status.busy": "2021-12-22T19:07:38.870473Z",
310 | "iopub.status.idle": "2021-12-22T19:07:38.879330Z",
311 | "shell.execute_reply": "2021-12-22T19:07:38.879908Z"
312 | },
313 | "papermill": {
314 | "duration": 0.048782,
315 | "end_time": "2021-12-22T19:07:38.880083",
316 | "exception": false,
317 | "start_time": "2021-12-22T19:07:38.831301",
318 | "status": "completed"
319 | },
320 | "tags": []
321 | },
322 | "outputs": [],
323 | "source": [
324 | "import pandas as pd\n",
325 | "import numpy as np\n",
326 | "import matplotlib.pyplot as plt"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "id": "ab060d3e",
332 | "metadata": {
333 | "papermill": {
334 | "duration": 0.036029,
335 | "end_time": "2021-12-22T19:07:38.952464",
336 | "exception": false,
337 | "start_time": "2021-12-22T19:07:38.916435",
338 | "status": "completed"
339 | },
340 | "tags": []
341 | },
342 | "source": [
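343 | "We are going to use the Boston house-price dataset, which is a built-in dataset in sklearn. (Note: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so this notebook needs an older scikit-learn version to run as written.)"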
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 7,
349 | "id": "e06717e2",
350 | "metadata": {
351 | "execution": {
352 | "iopub.execute_input": "2021-12-22T19:07:39.030301Z",
353 | "iopub.status.busy": "2021-12-22T19:07:39.029298Z",
354 | "iopub.status.idle": "2021-12-22T19:07:40.031602Z",
355 | "shell.execute_reply": "2021-12-22T19:07:40.032128Z"
356 | },
357 | "papermill": {
358 | "duration": 1.043508,
359 | "end_time": "2021-12-22T19:07:40.032329",
360 | "exception": false,
361 | "start_time": "2021-12-22T19:07:38.988821",
362 | "status": "completed"
363 | },
364 | "tags": []
365 | },
366 | "outputs": [],
367 | "source": [
368 | "from sklearn.datasets import load_boston\n",
369 | "boston=load_boston()"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": 8,
375 | "id": "9e81cdf1",
376 | "metadata": {
377 | "execution": {
378 | "iopub.execute_input": "2021-12-22T19:07:40.116864Z",
379 | "iopub.status.busy": "2021-12-22T19:07:40.116015Z",
380 | "iopub.status.idle": "2021-12-22T19:07:40.119134Z",
381 | "shell.execute_reply": "2021-12-22T19:07:40.119605Z"
382 | },
383 | "papermill": {
384 | "duration": 0.048494,
385 | "end_time": "2021-12-22T19:07:40.119782",
386 | "exception": false,
387 | "start_time": "2021-12-22T19:07:40.071288",
388 | "status": "completed"
389 | },
390 | "tags": []
391 | },
392 | "outputs": [
393 | {
394 | "data": {
395 | "text/plain": [
396 | "['DESCR', 'data', 'feature_names', 'filename', 'target']"
397 | ]
398 | },
399 | "execution_count": 8,
400 | "metadata": {},
401 | "output_type": "execute_result"
402 | }
403 | ],
404 | "source": [
405 | "# Getting attributes of boston\n",
406 | "dir(boston)"
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": 9,
412 | "id": "25ce3d97",
413 | "metadata": {
414 | "execution": {
415 | "iopub.execute_input": "2021-12-22T19:07:40.198755Z",
416 | "iopub.status.busy": "2021-12-22T19:07:40.198029Z",
417 | "iopub.status.idle": "2021-12-22T19:07:40.202579Z",
418 | "shell.execute_reply": "2021-12-22T19:07:40.203158Z"
419 | },
420 | "papermill": {
421 | "duration": 0.045441,
422 | "end_time": "2021-12-22T19:07:40.203322",
423 | "exception": false,
424 | "start_time": "2021-12-22T19:07:40.157881",
425 | "status": "completed"
426 | },
427 | "tags": []
428 | },
429 | "outputs": [
430 | {
431 | "data": {
432 | "text/plain": [
433 | "\".. _boston_dataset:\\n\\nBoston house prices dataset\\n---------------------------\\n\\n**Data Set Characteristics:** \\n\\n :Number of Instances: 506 \\n\\n :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\\n\\n :Attribute Information (in order):\\n - CRIM per capita crime rate by town\\n - ZN proportion of residential land zoned for lots over 25,000 sq.ft.\\n - INDUS proportion of non-retail business acres per town\\n - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\\n - NOX nitric oxides concentration (parts per 10 million)\\n - RM average number of rooms per dwelling\\n - AGE proportion of owner-occupied units built prior to 1940\\n - DIS weighted distances to five Boston employment centres\\n - RAD index of accessibility to radial highways\\n - TAX full-value property-tax rate per $10,000\\n - PTRATIO pupil-teacher ratio by town\\n - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\\n - LSTAT % lower status of the population\\n - MEDV Median value of owner-occupied homes in $1000's\\n\\n :Missing Attribute Values: None\\n\\n :Creator: Harrison, D. and Rubinfeld, D.L.\\n\\nThis is a copy of UCI ML housing dataset.\\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\\n\\n\\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\\n\\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\\nprices and the demand for clean air', J. Environ. Economics & Management,\\nvol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics\\n...', Wiley, 1980. N.B. Various transformations are used in the table on\\npages 244-261 of the latter.\\n\\nThe Boston house-price data has been used in many machine learning papers that address regression\\nproblems. \\n \\n.. 
topic:: References\\n\\n - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\\n - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\\n\""
434 | ]
435 | },
436 | "execution_count": 9,
437 | "metadata": {},
438 | "output_type": "execute_result"
439 | }
440 | ],
441 | "source": [
442 | "# printing description\n",
443 | "boston.DESCR"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": 10,
449 | "id": "8ebb5684",
450 | "metadata": {
451 | "execution": {
452 | "iopub.execute_input": "2021-12-22T19:07:40.288364Z",
453 | "iopub.status.busy": "2021-12-22T19:07:40.287693Z",
454 | "iopub.status.idle": "2021-12-22T19:07:40.290236Z",
455 | "shell.execute_reply": "2021-12-22T19:07:40.290736Z"
456 | },
457 | "papermill": {
458 | "duration": 0.048769,
459 | "end_time": "2021-12-22T19:07:40.290894",
460 | "exception": false,
461 | "start_time": "2021-12-22T19:07:40.242125",
462 | "status": "completed"
463 | },
464 | "tags": []
465 | },
466 | "outputs": [
467 | {
468 | "data": {
469 | "text/plain": [
470 | "array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,\n",
471 | " 4.9800e+00],\n",
472 | " [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,\n",
473 | " 9.1400e+00],\n",
474 | " [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,\n",
475 | " 4.0300e+00],\n",
476 | " ...,\n",
477 | " [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,\n",
478 | " 5.6400e+00],\n",
479 | " [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,\n",
480 | " 6.4800e+00],\n",
481 | " [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,\n",
482 | " 7.8800e+00]])"
483 | ]
484 | },
485 | "execution_count": 10,
486 | "metadata": {},
487 | "output_type": "execute_result"
488 | }
489 | ],
490 | "source": [
491 | "# Printing the \"data\" attribute of the dataset; it is our input\n",
492 | "boston.data"
493 | ]
494 | },
495 | {
496 | "cell_type": "code",
497 | "execution_count": 11,
498 | "id": "2931de15",
499 | "metadata": {
500 | "execution": {
501 | "iopub.execute_input": "2021-12-22T19:07:40.371336Z",
502 | "iopub.status.busy": "2021-12-22T19:07:40.370379Z",
503 | "iopub.status.idle": "2021-12-22T19:07:40.376451Z",
504 | "shell.execute_reply": "2021-12-22T19:07:40.375892Z"
505 | },
506 | "papermill": {
507 | "duration": 0.047497,
508 | "end_time": "2021-12-22T19:07:40.376583",
509 | "exception": false,
510 | "start_time": "2021-12-22T19:07:40.329086",
511 | "status": "completed"
512 | },
513 | "tags": []
514 | },
515 | "outputs": [
516 | {
517 | "data": {
518 | "text/plain": [
519 | "array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',\n",
520 | " 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='
\n",
621 | "\n",
634 | "\n",
635 | " \n",
636 | " \n",
637 | " | \n",
638 | " CRIM | \n",
639 | " ZN | \n",
640 | " INDUS | \n",
641 | " CHAS | \n",
642 | " NOX | \n",
643 | " RM | \n",
644 | " AGE | \n",
645 | " DIS | \n",
646 | " RAD | \n",
647 | " TAX | \n",
648 | " PTRATIO | \n",
649 | " B | \n",
650 | " LSTAT | \n",
651 | "
\n",
652 | " \n",
653 | " \n",
654 | " \n",
655 | " | 0 | \n",
656 | " 0.00632 | \n",
657 | " 18.0 | \n",
658 | " 2.31 | \n",
659 | " 0.0 | \n",
660 | " 0.538 | \n",
661 | " 6.575 | \n",
662 | " 65.2 | \n",
663 | " 4.0900 | \n",
664 | " 1.0 | \n",
665 | " 296.0 | \n",
666 | " 15.3 | \n",
667 | " 396.9 | \n",
668 | " 4.98 | \n",
669 | "
\n",
670 | " \n",
671 | " | 1 | \n",
672 | " 0.02731 | \n",
673 | " 0.0 | \n",
674 | " 7.07 | \n",
675 | " 0.0 | \n",
676 | " 0.469 | \n",
677 | " 6.421 | \n",
678 | " 78.9 | \n",
679 | " 4.9671 | \n",
680 | " 2.0 | \n",
681 | " 242.0 | \n",
682 | " 17.8 | \n",
683 | " 396.9 | \n",
684 | " 9.14 | \n",
685 | "
\n",
686 | " \n",
687 | "
\n",
688 | " "
689 | ],
690 | "text/plain": [
691 | " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n",
692 | "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n",
693 | "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n",
694 | "\n",
695 | " PTRATIO B LSTAT \n",
696 | "0 15.3 396.9 4.98 \n",
697 | "1 17.8 396.9 9.14 "
698 | ]
699 | },
700 | "execution_count": 14,
701 | "metadata": {},
702 | "output_type": "execute_result"
703 | }
704 | ],
705 | "source": [
706 | "# Printing first 2 rows of the dataframe 'df'\n",
707 | "df.head(2)"
708 | ]
709 | },
710 | {
711 | "cell_type": "code",
712 | "execution_count": 15,
713 | "id": "941a7867",
714 | "metadata": {
715 | "execution": {
716 | "iopub.execute_input": "2021-12-22T19:07:40.746594Z",
717 | "iopub.status.busy": "2021-12-22T19:07:40.741140Z",
718 | "iopub.status.idle": "2021-12-22T19:07:40.749244Z",
719 | "shell.execute_reply": "2021-12-22T19:07:40.749813Z"
720 | },
721 | "papermill": {
722 | "duration": 0.052214,
723 | "end_time": "2021-12-22T19:07:40.749987",
724 | "exception": false,
725 | "start_time": "2021-12-22T19:07:40.697773",
726 | "status": "completed"
727 | },
728 | "tags": []
729 | },
730 | "outputs": [],
731 | "source": [
732 | "# adding a new column 'target' from boston.target\n",
733 | "df['target']=boston.target"
734 | ]
735 | },
736 | {
737 | "cell_type": "code",
738 | "execution_count": 16,
739 | "id": "313f6168",
740 | "metadata": {
741 | "execution": {
742 | "iopub.execute_input": "2021-12-22T19:07:40.835505Z",
743 | "iopub.status.busy": "2021-12-22T19:07:40.834548Z",
744 | "iopub.status.idle": "2021-12-22T19:07:40.852884Z",
745 | "shell.execute_reply": "2021-12-22T19:07:40.852218Z"
746 | },
747 | "papermill": {
748 | "duration": 0.062957,
749 | "end_time": "2021-12-22T19:07:40.853023",
750 | "exception": false,
751 | "start_time": "2021-12-22T19:07:40.790066",
752 | "status": "completed"
753 | },
754 | "tags": []
755 | },
756 | "outputs": [
757 | {
758 | "data": {
759 | "text/html": [
760 | "