# Repository layout:
#   ├── Project_Paper
#   └── README.md
#
# /Project_Paper:
#   https://raw.githubusercontent.com/Sristi1123/Diabetes_Prediction/HEAD/Project_Paper
#
# /README.md:
#
# Diabetes_Prediction
# 🚀 Diabetes Prediction using Machine Learning (KNN Model)
#
# This project uses K-Nearest Neighbors (KNN) to predict diabetes based on
# health parameters. It includes data preprocessing, visualization, model
# training, hyperparameter tuning, and performance evaluation.

# Import libraries
import warnings

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.plotting import plot_decision_regions
from pandas.plotting import scatter_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

sns.set()
warnings.filterwarnings('ignore')

# `%matplotlib inline` is IPython-only syntax; request the same backend
# through the API so the file is also valid as a plain Python script.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    # Not running under IPython/Jupyter: keep matplotlib's default backend.
    pass

# Load dataset
df = pd.read_csv("/content/diabetes.csv")

# Quick inspection. These are bare expressions: in a notebook only the last
# expression of a cell is rendered, in a script they produce no output.
df.head()
df.columns
df.info()  # Print dataset information
df.describe()
df.describe().T  # Summary statistics
df.isnull()
df.isnull().sum()  # Check for missing values

# Create a deep copy of the dataset for processing
df_copy = df.copy(deep=True)

# Replace zero values with NaN in specific columns to handle missing data
ZERO_AS_MISSING = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df_copy[ZERO_AS_MISSING] = df_copy[ZERO_AS_MISSING].replace(0, np.nan)
df_copy.isnull().sum()

# Plot histograms before data cleaning
p = df.hist(figsize=(20, 20))

# Fill missing values with mean or median.
# Assign the result back instead of `df_copy[col].fillna(..., inplace=True)`:
# the chained in-place form operates on a temporary Series, which pandas
# warns about and which leaves `df_copy` untouched under Copy-on-Write.
for column in ('Glucose', 'BloodPressure'):
    df_copy[column] = df_copy[column].fillna(df_copy[column].mean())
for column in ('SkinThickness', 'Insulin', 'BMI'):
    df_copy[column] = df_copy[column].fillna(df_copy[column].median())

# Plot histograms after data cleaning
p = df_copy.hist(figsize=(20, 20))

# Visualize missing values using MissingNo library
p = msno.bar(df_copy)

# Define colors for the bars
color_wheel = {0: "#0392cf", 1: "#7bc043"}  # Blue for Non-Diabetic, Green for Diabetic
outcome_labels = {0: "Non-Diabetic", 1: "Diabetic"}

# Count occurrences of each Outcome (0 and 1).
# `value_counts()` orders by frequency, so pin the order to [0, 1] and rename
# by key; assigning labels positionally would swap them whenever class 1 is
# the majority.
outcome_counts = (
    df.Outcome.value_counts()
    .reindex([0, 1], fill_value=0)
    .rename(index=outcome_labels)
)

# Plot bar chart with updated labels
p = outcome_counts.plot(kind="bar", color=[color_wheel[key] for key in (0, 1)])

# Set axis labels
p.set_xlabel("Outcome")
p.set_ylabel("Frequency")

# Keep labels straight
plt.xticks(rotation=0)

plt.show()

# Scatter matrix to show relationships between numerical features.
p = scatter_matrix(df, figsize=(20, 20))

# Seaborn pair plot for better visualization
p = sns.pairplot(df_copy, hue='Outcome')

# Correlation heatmap before data cleaning
plt.figure(figsize=(12, 10))
p = sns.heatmap(df.corr(), annot=True, cmap="YlGnBu")

# Correlation heatmap after data cleaning
plt.figure(figsize=(12, 10))
p = sns.heatmap(df_copy.corr(), annot=True, cmap="YlGnBu")

# Print first 5 rows
df_copy.head()

# Standardize feature values for better KNN performance.
# Column labels are taken from the frame being scaled so they cannot drift
# out of sync with the data.
# NOTE(review): the scaler is fitted on the full dataset before the split,
# so test-set statistics leak into training - consider fitting on x_train only.
sc_x = StandardScaler()
features = df_copy.drop(["Outcome"], axis=1)
x = pd.DataFrame(sc_x.fit_transform(features), columns=features.columns)
x.head()
y = df_copy.Outcome
y

# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=1/3, random_state=42, stratify=y)

# Find the best K value using training and testing scores
k_values = range(1, 15)
test_score = []
train_score = []
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train)
    train_score.append(knn.score(x_train, y_train))
    test_score.append(knn.score(x_test, y_test))
train_score
test_score  # the original echoed train_score twice

# Print best K values (list index i corresponds to k = i + 1)
max_train_score = max(train_score)
train_score_ind = [i for i, v in enumerate(train_score) if v == max_train_score]
print("Max train score {} % and k = {}".format(
    max_train_score * 100, [i + 1 for i in train_score_ind]))
max_test_score = max(test_score)
test_score_ind = [i for i, v in enumerate(test_score) if v == max_test_score]
print("Max test score {} % and k = {}".format(
    max_test_score * 100, [i + 1 for i in test_score_ind]))

# Plot accuracy vs number of neighbors (K)
plt.figure(figsize=(12, 5))
plt.plot(k_values, test_score, color="blue", label="Testing Accuracy")
plt.plot(k_values, train_score, color="red", label="Training Accuracy")
plt.legend()

# Train KNN model with optimal K=11
# NOTE(review): K is hard-coded and was chosen by looking at test accuracy;
# GridSearchCV (already imported) on the training split would avoid that.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(x_train, y_train)
knn.score(x_test, y_test)

# Plot decision regions over the first two features of the 8-feature model.
# The remaining features are held at 0, the mean after standardization;
# the original used 20000, which evaluates the model far outside the data.
# `width` only controls which samples are drawn, so it is kept large enough
# to show every sample.
value = 0.0
width = 20000

plot_decision_regions(x.values, y.values, clf=knn, legend=2,
                      filler_feature_values={2: value, 3: value, 4: value, 5: value, 6: value, 7: value},
                      filler_feature_ranges={2: width, 3: width, 4: width, 5: width, 6: width, 7: width},
                      X_highlight=x_test.values)

plt.title('KNN with Diabetes Dataset')
plt.show()

# Two-feature view (Glucose and BMI).
# A separate classifier and separate variables are used on purpose:
# re-assigning `x` and re-fitting `knn` here would replace the trained
# 8-feature model with a 2-feature one, breaking `knn.predict(x_test)`
# below and leaving the saved model inconsistent with the saved scaler.
two_features = ['Glucose', 'BMI']
knn_2d = KNeighborsClassifier(n_neighbors=knn.n_neighbors)
knn_2d.fit(x_train[two_features].values, y_train.values)

# Plot the decision regions for these two features
plot_decision_regions(X=x[two_features].values, y=y.values, clf=knn_2d, legend=2)

# Title for the plot
plt.title('KNN with Diabetes Dataset')
plt.show()

# Predictions on test set (8-feature model)
y_pred = knn.predict(x_test)

# Generate confusion matrix and heatmap.
# fmt="d" prints integer counts instead of scientific notation.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
p = sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt="d")
plt.title("Confusion matrix", y=1.1)
plt.ylabel("Actual label")
plt.xlabel("Predicted label")

# Classification report
print(classification_report(y_test, y_pred))

# Persist the trained model together with the scaler it expects
import joblib
joblib.dump(knn, "knn_model.pkl")
joblib.dump(sc_x, "scaler.pkl")

# =============================================
# INTERACTIVE PREDICTION WIDGET
# =============================================
import ipywidgets as widgets
from IPython.display import display, clear_output
# Define sliders for user input
pregnancies = widgets.IntSlider(min=0, max=20, value=1, description="Pregnancies")
glucose = widgets.IntSlider(min=50, max=200, value=100, description="Glucose")
bp = widgets.IntSlider(min=40, max=150, value=70, description="Blood Pressure")
skin = widgets.IntSlider(min=0, max=100, value=20, description="Skin Thickness")
insulin = widgets.IntSlider(min=0, max=900, value=30, description="Insulin")
bmi = widgets.FloatSlider(min=10, max=60, value=25.0, description="BMI")
dpf = widgets.FloatSlider(min=0.0, max=3.0, value=0.5, description="DPF")
age = widgets.IntSlider(min=10, max=100, value=30, description="Age")

# Column order used when the scaler was fitted; slider values must follow it.
FEATURE_COLUMNS = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                   'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Output area
output = widgets.Output()


def predict_diabetes(b):
    """Predict diabetes risk from the current slider values and render it.

    Button click callback. Reads the eight sliders, scales the values with
    ``sc_x``, classifies them with ``knn`` and writes a text message plus a
    horizontal probability bar into the ``output`` widget.

    Args:
        b: The button instance passed by ``Button.on_click``; unused.
    """
    with output:
        # Replace the previous result instead of stacking outputs.
        clear_output(wait=True)

        # Labelled frames keep the column order explicit and match the
        # feature names the scaler was fitted with.
        user_data = pd.DataFrame(
            [[pregnancies.value, glucose.value, bp.value, skin.value,
              insulin.value, bmi.value, dpf.value, age.value]],
            columns=FEATURE_COLUMNS,
        )
        user_data_scaled = pd.DataFrame(sc_x.transform(user_data),
                                        columns=FEATURE_COLUMNS)
        prediction = knn.predict(user_data_scaled)[0]
        # Column 1 of predict_proba is the probability of the second class
        # (Outcome == 1 for a 0/1 target).
        probability = knn.predict_proba(user_data_scaled)[0][1]

        # Set emoji-based feedback
        if prediction == 1:
            emoji = "⚠️"
            message = f"{emoji} High Diabetes Risk! ({probability:.2%})"
            color = "red"
        else:
            emoji = "🎉"
            message = f"{emoji} Low Diabetes Risk! ({probability:.2%})"
            color = "green"

        # Show result
        print(message)

        # Enhanced Visual Feedback (Gradient Progress Bar)
        fig, ax = plt.subplots(figsize=(6, 1.2))
        bar_color = sns.color_palette("coolwarm", as_cmap=True)(probability)
        ax.barh([""], [probability], color=bar_color, height=0.5)
        ax.set_xlim(0, 1)
        ax.set_xticks([])
        ax.set_yticks([])
        for side in ("top", "right", "bottom", "left"):
            ax.spines[side].set_visible(False)

        # A white label centred in the bar is unreadable when the bar is very
        # short or empty, so short bars get a dark label just past their end.
        if probability >= 0.15:
            label_x, label_align, label_color = probability / 2, "center", "white"
        else:
            label_x, label_align, label_color = probability + 0.02, "left", "black"
        ax.text(label_x, 0, f"{probability:.1%}", va="center", ha=label_align,
                fontsize=12, color=label_color, weight="bold")

        plt.title("Diabetes Risk Probability", fontsize=14, weight="bold", color=color)
        plt.show()


# Create a button
button = widgets.Button(description="🚀 Predict Now")
button.on_click(predict_diabetes)

# Display UI
print("\n\n=== Diabetes Risk Prediction Tool ===")
display(pregnancies, glucose, bp, skin, insulin, bmi, dpf, age, button, output)