├── Project_Paper
└── README.md


/Project_Paper:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sristi1123/Diabetes_Prediction/HEAD/Project_Paper


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Diabetes_Prediction
  2 | # 🚀 Diabetes Prediction using Machine Learning (KNN Model): this project uses K-Nearest Neighbors (KNN) to predict diabetes from health parameters. It includes data preprocessing, visualization, model training, hyperparameter tuning, and performance evaluation.
  3 | 
  4 | 
  5 | #import libraries
  6 | import numpy as np
  7 | import pandas as pd
  8 | import matplotlib.pyplot as plt
  9 | import seaborn as sns
 10 | sns.set()
 11 | 
 12 | from mlxtend.plotting import plot_decision_regions
 13 | import missingno as msno
 14 | from pandas.plotting import scatter_matrix
 15 | from sklearn.preprocessing import StandardScaler
 16 | from sklearn.model_selection import train_test_split
 17 | from sklearn.neighbors import KNeighborsClassifier
 18 | from sklearn.metrics import confusion_matrix
 19 | from sklearn import metrics
 20 | from sklearn.metrics import classification_report
 21 | from sklearn.metrics import roc_curve
 22 | from sklearn.metrics import roc_auc_score
 23 | from sklearn.model_selection import GridSearchCV
 24 | import warnings
 25 | warnings.filterwarnings('ignore')
 26 | %matplotlib inline
 27 | # Load dataset
 28 | df=pd.read_csv("/content/diabetes.csv")
 29 | df.head()
 30 | df.columns
 31 | df.info() # Print dataset information
 32 | df.describe()
 33 | df.describe().T  # Summary statistics
 34 | df.isnull()
 35 | df.isnull().sum()   # Check for missing values
 36 | # Create a deep copy of the dataset for processing
 37 | df_copy=df.copy(deep=True)
 38 | # Replace zero values with NaN in specific columns to handle missing data
 39 | df_copy[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']]=df_copy[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0,np.nan)
 40 | df_copy.isnull().sum()
 41 | # Plot histograms before data cleaning
 42 | p=df.hist(figsize=(20,20))
 43 | # Fill missing values with mean or median
 44 | df_copy['Glucose'].fillna(df_copy['Glucose'].mean(),inplace=True)
 45 | df_copy['BloodPressure'].fillna(df_copy['BloodPressure'].mean(),inplace=True)
 46 | df_copy['SkinThickness'].fillna(df_copy['SkinThickness'].median(),inplace=True)
 47 | df_copy['Insulin'].fillna(df_copy['Insulin'].median(),inplace=True)
 48 | df_copy['BMI'].fillna(df_copy['BMI'].median(),inplace=True)
 49 | # plot histograms after data cleaning
 50 | p=df_copy.hist(figsize=(20,20))
 51 | # Visualize missing values using MissingNo library
 52 | p=msno.bar(df_copy)
 53 | import matplotlib.pyplot as plt
 54 | 
 55 | # Define colors for the bars
 56 | color_wheel = {0: "#0392cf", 1: "#7bc043"}  # Blue for Non-Diabetic, Green for Diabetic
 57 | 
 58 | # Count occurrences of each Outcome (0 and 1)
 59 | outcome_counts = df.Outcome.value_counts()
 60 | 
 61 | # Rename the index (0 → Non-Diabetic, 1 → Diabetic)
 62 | outcome_counts.index = ["Non-Diabetic", "Diabetic"]
 63 | 
 64 | # Plot bar chart with updated labels
 65 | p = outcome_counts.plot(kind="bar", color=[color_wheel[0], color_wheel[1]])
 66 | 
 67 | # Set axis labels
 68 | p.set_xlabel("Outcome")
 69 | p.set_ylabel("Frequency")
 70 | 
 71 | # Keep labels straight
 72 | plt.xticks(rotation=0)
 73 | 
 74 | plt.show()
 75 |  # scatter matrix to show relationships between numerical features.
 76 | p=scatter_matrix(df,figsize=(20,20))
 77 | # Seaborn for better visualization
 78 | p=sns.pairplot(df_copy,hue='Outcome')
 79 | # Correlation heatmap befire data cleaning
 80 | plt.figure (figsize=(12,10))
 81 | p=sns.heatmap(df.corr(),annot=True,cmap="YlGnBu")
 82 | # Correlation heatmap after data cleaning
 83 | plt.figure(figsize=(12,10))
 84 | p=sns.heatmap(df_copy.corr(),annot=True,cmap="YlGnBu")
 85 | # print first 5 rows
 86 | df_copy.head()
 87 | # Standardize feature values for better KNN performance
 88 | sc_x=StandardScaler()
 89 | x=pd.DataFrame(sc_x.fit_transform(df_copy.drop(["Outcome"],axis=1)),columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
 90 |        'BMI', 'DiabetesPedigreeFunction', 'Age'])
 91 | x.head()
 92 | y=df_copy.Outcome
 93 | y
 94 | # Split data into training and testing sets
 95 | x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=1/3,random_state=42,stratify=y)
 96 | # Find the best K value using training and testing scores
 97 | test_score=[]
 98 | train_score=[]
 99 | for i in range(1,15):
100 |   knn=KNeighborsClassifier(n_neighbors=i)
101 |   knn.fit(x_train,y_train)
102 |   train_score.append(knn.score(x_train,y_train))
103 |   test_score.append(knn.score(x_test,y_test))
104 | train_score
105 | train_score
106 | # Print best K values
107 | max_train_score=max(train_score)
108 | train_score_ind=[i for i,v in enumerate(train_score) if v==max_train_score]
109 | print("Max train score {} % and k = {}".format(max_train_score*100,list(map(lambda x:x+1,train_score_ind))))
110 | max_test_score=max(test_score)
111 | test_score_ind=[i for i,v in enumerate(test_score) if v==max_test_score]
112 | print("Max test score {} % and k = {}".format(max_test_score*100,list(map(lambda x:x+1,test_score_ind))))
113 | # Plot accuracy vs number of neighbors (K)
114 | plt.figure(figsize=(12,5))
115 | plt.plot(range(1,15),test_score,color="blue",label="Testing Accuracy")
116 | plt.plot(range(1,15),train_score,color="red",label="Training Accuracy")
117 | plt.legend()
# Train KNN model with optimal K=11
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(x_train, y_train)
knn.score(x_test, y_test)

# Plot decision regions of the 8-feature model. Features 0 and 1 are drawn;
# features 2-7 are held at `value`, and samples within `width` of that value
# are shown.
# NOTE(review): the features are standardized, so 20000 lies far outside the
# data range - confirm this slice is what the plot is meant to show.
value = 20000
width = 20000

plot_decision_regions(x.values, y.values, clf=knn, legend=2,
                      filler_feature_values={2: value, 3: value, 4: value, 5: value, 6: value, 7: value},
                      filler_feature_ranges={2: width, 3: width, 4: width, 5: width, 6: width, 7: width},
                      X_highlight=x_test.values)

plt.title('KNN with Diabetes Dataset')
plt.show()

import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions

# Two-feature view (Glucose and BMI). This uses its own frame and its own
# classifier: rebinding `x` or refitting `knn` here would leave `knn`
# expecting two features and break the later 8-feature prediction, evaluation
# and model export.
x_2d = x[['Glucose', 'BMI']]
knn_2d = KNeighborsClassifier(n_neighbors=11)
knn_2d.fit(x_2d.values, y.values)

# Plot the decision regions for these two features
plot_decision_regions(X=x_2d.values, y=y.values, clf=knn_2d, legend=2)

# Title for the plot
plt.title('KNN with Diabetes Dataset')
plt.show()
# Score the held-out rows with the fitted classifier
y_pred = knn.predict(x_test)

# Confusion matrix (rows: actual class, columns: predicted class) as a heatmap
cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_frame = pd.DataFrame(cnf_matrix)
p = sns.heatmap(cnf_frame, annot=True, cmap="YlGnBu")
p.set_title("Confusion matrix", y=1.1)
p.set_ylabel("Actual label")
p.set_xlabel("Predicted label")

# Per-class precision / recall / F1 summary
report = classification_report(y_test, y_pred)
print(report)

# Persist the classifier and the scaler it was trained with
import joblib
joblib.dump(knn, "knn_model.pkl")
joblib.dump(sc_x, "scaler.pkl")
165 | 
# =============================================
# INTERACTIVE PREDICTION WIDGET
# =============================================
import ipywidgets as widgets
from IPython.display import display, clear_output

# One slider per model input; each starts at its `value` and is limited to
# the [min, max] range given.
pregnancies = widgets.IntSlider(description="Pregnancies", value=1, min=0, max=20)
glucose = widgets.IntSlider(description="Glucose", value=100, min=50, max=200)
bp = widgets.IntSlider(description="Blood Pressure", value=70, min=40, max=150)
skin = widgets.IntSlider(description="Skin Thickness", value=20, min=0, max=100)
insulin = widgets.IntSlider(description="Insulin", value=30, min=0, max=900)
bmi = widgets.FloatSlider(description="BMI", value=25.0, min=10, max=60)
dpf = widgets.FloatSlider(description="DPF", value=0.5, min=0.0, max=3.0)
age = widgets.IntSlider(description="Age", value=30, min=10, max=100)

# Area that receives the printed verdict and the probability chart
output = widgets.Output()
184 | 
185 | # Function to predict diabetes
def predict_diabetes(b):
    """Score the current slider values and render the risk read-out.

    Click handler for the prediction button. Reads the eight module-level
    slider widgets, scales the values with ``sc_x``, classifies them with
    ``knn`` and, inside the ``output`` widget, prints a verdict line and draws
    a single horizontal bar whose length is the predicted probability.

    Args:
        b: Argument supplied by ``button.on_click``; not used.
    """
    with output:
        clear_output(wait=True)

        # Values are listed in the same order as the training feature columns.
        user_data = np.array([[pregnancies.value, glucose.value, bp.value, skin.value,
                               insulin.value, bmi.value, dpf.value, age.value]])
        user_data_scaled = sc_x.transform(user_data)
        prediction = knn.predict(user_data_scaled)[0]
        # Column 1 of predict_proba is read as the probability of Outcome == 1.
        probability = knn.predict_proba(user_data_scaled)[0][1]

        # Set emoji-based feedback
        if prediction == 1:
            emoji, verdict, color = "⚠️", "High", "red"
        else:
            emoji, verdict, color = "🎉", "Low", "green"
        message = f"{emoji} {verdict} Diabetes Risk! ({probability:.2%})"

        # Show result
        print(message)

        # Visual feedback: one bar on a 0-1 axis, coloured from the
        # "coolwarm" palette according to the probability.
        fig, ax = plt.subplots(figsize=(6, 1.2))
        bar_color = sns.color_palette("coolwarm", as_cmap=True)(probability)
        ax.barh([""], [probability], color=bar_color, height=0.5)
        ax.set_xlim(0, 1)
        ax.set_xticks([])
        ax.set_yticks([])
        for side in ("top", "right", "bottom", "left"):
            ax.spines[side].set_visible(False)

        # Keep the percentage readable: a white label centred on the bar
        # disappears when the bar is very short (probability near 0) and has
        # poor contrast on the pale middle of the palette.
        if probability >= 0.15:
            red, green, blue = bar_color[:3]
            luminance = 0.299 * red + 0.587 * green + 0.114 * blue
            label_x, label_ha = probability / 2, "center"
            label_color = "black" if luminance > 0.6 else "white"
        else:
            # Bar too short to hold the text: place it just right of the bar.
            label_x, label_ha, label_color = probability + 0.02, "left", "black"
        ax.text(label_x, 0, f"{probability:.1%}", va="center", ha=label_ha,
                fontsize=12, color=label_color, weight="bold")

        ax.set_title("Diabetes Risk Probability", fontsize=14, weight="bold", color=color)
        plt.show()
224 | 
# Button that triggers a prediction when clicked
button = widgets.Button(description="🚀 Predict Now")
button.on_click(predict_diabetes)

# Render the tool: heading, the input sliders in feature order, then the
# button and the result area
print("\n\n=== Diabetes Risk Prediction Tool ===")
input_controls = (pregnancies, glucose, bp, skin, insulin, bmi, dpf, age)
display(*input_controls, button, output)
232 | 


--------------------------------------------------------------------------------