├── .gitignore ├── README.md ├── app ├── app.py ├── models │ └── pipeline_xgboost.pkl ├── static │ └── styles.css └── templates │ └── index.html ├── data └── processed │ ├── X_test.csv │ ├── X_test_engineered.csv │ └── X_train.csv ├── models ├── logistic_regression.pkl ├── random_forest.pkl └── xgboost.pkl ├── notebooks └── exploratory_data_analysis.ipynb ├── raw └── diabetes_data.csv ├── requirements.txt ├── scripts ├── __pycache__ │ └── utility_functions.cpython-311.pyc ├── comprehensive_model_report.py ├── data_preprocessing.py ├── feature_engineering.py ├── model_evaluation.py ├── model_training.py ├── plots │ ├── Logistic Regression_confusion_matrix.png │ ├── Logistic Regression_roc_curve.png │ ├── Random Forest_confusion_matrix.png │ ├── Random Forest_roc_curve.png │ ├── XGBoost_confusion_matrix.png │ └── XGBoost_roc_curve.png ├── reports │ ├── Logistic Regression_report.csv │ ├── Random Forest_report.csv │ └── XGBoost_report.csv └── utility_functions.py └── tests ├── __pycache__ ├── test_data_preprocessing.cpython-311-pytest-8.2.2.pyc ├── test_feature_engineering.cpython-311-pytest-8.2.2.pyc ├── test_model_evaluation.cpython-311-pytest-8.2.2.pyc └── test_model_training.cpython-311-pytest-8.2.2.pyc ├── models ├── logistic_regression.pkl ├── random_forest.pkl └── xgboost.pkl ├── test_data_preprocessing.py ├── test_feature_engineering.py └── test_model_training.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # 
before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Diabetes Health Prediction and Analysis 🎉 2 | 3 | ![Diabetes Health Prediction](https://miro.medium.com/v2/resize:fit:828/format:webp/1*KkQbSEI9sT44_yxR9vscJA.gif) 4 | 5 | --- 6 | 7 | Welcome to the **Diabetes Health Prediction and Analysis** project! 
This repository contains a comprehensive pipeline for predicting diabetes diagnosis using various machine learning and deep learning models, along with an in-depth exploratory data analysis and feature engineering steps. 8 | 9 | ## 🚀 Project Overview 10 | 11 | This project aims to provide a thorough analysis of diabetes-related health data, develop predictive models, and evaluate their performance. The key components of the project include: 12 | 13 | - 📊 Data Preprocessing 14 | - 🔍 Exploratory Data Analysis (EDA) 15 | - 🛠️ Feature Engineering 16 | - 🧠 Model Training 17 | - 📈 Model Evaluation 18 | - 📑 Comprehensive Reports 19 | 20 | ## 📂 Project Structure 21 | 22 | Here's an overview of the project directory structure: 23 | 24 | 25 | ```plaintext 26 | Diabetes_Health_Prediction_and_Analysis/ 27 | ├── data/ 28 | │ ├── raw/ 29 | │ │ └── diabetes_data.csv 30 | │ ├── processed/ 31 | │ │ ├── X_train.csv 32 | │ │ ├── X_train_engineered.csv 33 | │ │ ├── X_test.csv 34 | │ │ ├── X_test_engineered.csv 35 | │ │ ├── y_train.csv 36 | │ │ └── y_test.csv 37 | ├── app/ 38 | │ ├── app.py 39 | │ ├── templates/ 40 | │ │ └── index.html 41 | │ └── static/ 42 | │ └── styles.css 43 | ├── models/ 44 | │ ├── logistic_regression.pkl 45 | │ ├── random_forest.pkl 46 | │ └── xgboost.pkl 47 | ├── notebooks/ 48 | │ └── exploratory_data_analysis.ipynb 49 | ├── scripts/ 50 | │ ├── plots/ 51 | │ ├── reports/ 52 | │ ├── data_preprocessing.py 53 | │ ├── feature_engineering.py 54 | │ ├── model_training.py 55 | │ ├── model_evaluation.py 56 | │ └── model_performance_report.py 57 | ├── tests/ 58 | │ ├── models/ 59 | │ ├── test_data_preprocessing.py 60 | │ ├── test_feature_engineering.py 61 | │ ├── test_model_training.py 62 | ├── requirements.txt 63 | └── README.md 64 | ``` 65 | 66 | ## 🔧 Setup and Installation 67 | 68 | To get started with this project, follow the steps below: 69 | 70 | 1. 
**Clone the repository:** 71 | 72 | ```sh 73 | git clone https://github.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis.git 74 | cd Diabetes_Health_Prediction_and_Analysis 75 | ``` 76 | 77 | 2. **Create and activate a virtual environment:** 78 | 79 | ```sh 80 | python -m venv venv 81 | source venv/bin/activate # On Windows use `venv\Scripts\activate` 82 | ``` 83 | 84 | 3. **Install the required packages:** 85 | 86 | ```sh 87 | pip install -r requirements.txt 88 | ``` 89 | 90 | 4. **Run the data preprocessing script:** 91 | 92 | ```sh 93 | python scripts/data_preprocessing.py 94 | ``` 95 | 96 | 5. **Run the feature engineering script:** 97 | 98 | ```sh 99 | python scripts/feature_engineering.py 100 | ``` 101 | 102 | 6. **Train the models:** 103 | 104 | ```sh 105 | python scripts/model_training.py 106 | ``` 107 | 108 | 7. **Evaluate the models:** 109 | 110 | ```sh 111 | python scripts/model_evaluation.py 112 | ``` 113 | 114 | 8. **Generate comprehensive model performance reports:** 115 | 116 | ```sh 117 | python scripts/comprehensive_model_report.py 118 | ``` 119 | 120 | ## 🚀 Usage 121 | 122 | - **Exploratory Data Analysis**: Check the `notebooks/exploratory_data_analysis.ipynb` notebook for detailed data analysis and visualizations. 123 | - **Scripts**: All scripts for data preprocessing, feature engineering, model training, and evaluation are located in the `scripts/` directory. 124 | - **Tests**: To ensure code quality and correctness, tests are included in the `tests/` directory. Run them with `pytest`. 125 | 126 | ## 📊 Models 127 | 128 | The following models are trained and evaluated in this project: 129 | 130 | --- 131 | 132 | ### Logistic Regression 133 | 134 | #### ROC Curve: 135 | ![Logistic Regression ROC Curve](/scripts/plots/Logistic%20Regression_roc_curve.png) 136 | 137 | *The ROC curve illustrates the true positive rate (sensitivity) versus the false positive rate (1-specificity) for different threshold settings. 
A higher area under the curve (AUC) indicates better model performance.* 138 | 139 | #### Confusion Matrix: 140 | ![Logistic Regression Confusion Matrix](/scripts/plots/Logistic%20Regression_confusion_matrix.png) 141 | 142 | *The confusion matrix provides a summary of the prediction results on the classification problem. It shows the number of true positive (TP), true negative (TN), false positive (FP), and false negative (FN) predictions.* 143 | 144 | --- 145 | 146 | ### Random Forest 147 | 148 | #### ROC Curve: 149 | ![Random Forest ROC Curve](/scripts/plots/Random%20Forest_roc_curve.png) 150 | 151 | *The ROC curve illustrates the true positive rate (sensitivity) versus the false positive rate (1-specificity) for different threshold settings. A higher area under the curve (AUC) indicates better model performance.* 152 | 153 | #### Confusion Matrix: 154 | ![Random Forest Confusion Matrix](/scripts/plots/Random%20Forest_confusion_matrix.png) 155 | 156 | *The confusion matrix provides a summary of the prediction results on the classification problem. It shows the number of true positive (TP), true negative (TN), false positive (FP), and false negative (FN) predictions.* 157 | 158 | 159 | 160 | ## 🎯 Performance Metrics 161 | 162 | The performance of the models is evaluated using the following metrics: 163 | 164 | - **Accuracy** 165 | - **Precision** 166 | - **Recall** 167 | - **F1 Score** 168 | - **ROC AUC Score** 169 | - **Confusion Matrix** 170 | 171 | ### Logistic Regression 172 | 173 | - **Accuracy (Doğruluk):** %78.99 174 | - **Precision (Kesinlik):** %73.19 175 | - **Recall (Duyarlılık):** %70.63 176 | - **F1 Score:** %71.89 177 | - **ROC AUC:** %83.86 178 | 179 | **Confusion Matrix:** 180 | ```plaintext 181 | [[196 37] 182 | [ 42 101]] 183 | ``` 184 | Model dosyası: 185 | ```sh 186 | models/logistic_regression.pkl 187 | ``` 188 | 189 | ### Random Forest 190 | 191 | - **Accuracy (Doğruluk):** %91.22 192 | - **Precision (Kesinlik):** %94.35 193 | - **Recall (Duyarlılık):** %81.82 194 | - **F1 Score:** %87.64 195 | - **ROC AUC:** %97.69 196 | 197 | **Confusion Matrix:** 198 | ```plaintext 199 | [[226 7] 200 | 
[ 26 117]] 201 | ``` 202 | Model dosyası: 203 | ```sh 204 | models/random_forest.pkl 205 | ``` 206 | ##### Explanations: 207 | 208 | 1. [x] **_Accuracy:_** The ratio of correctly predicted instances to the total instances. 209 | 2. [x] **_Precision:**_ The ratio of true positive predictions to the total predicted positives. It measures the accuracy of positive predictions. 210 | 3. [x] **_Recall:_** The ratio of true positive predictions to the actual positives. It measures the model's ability to identify positive instances. 211 | 4. [x] **_F1 Score:_** The harmonic mean of precision and recall. It provides a balance between precision and recall. 212 | 5. [x] **_ROC AUC:_** The area under the ROC curve. It summarizes the model's ability to distinguish between classes. 213 | 214 | **Confusion Matrix:** 215 | 216 | * True Positive (TP): 117 - The number of actual positive cases correctly identified by the model. 217 | * True Negative (TN): 226 - The number of actual negative cases correctly identified by the model. 218 | * False Positive (FP): 7 - The number of actual negative cases incorrectly identified as positive by the model. 219 | * False Negative (FN): 26 - The number of actual positive cases incorrectly identified as negative by the model. 220 | 221 | ##### Explanations: 222 | 1. [x] **_Accuracy:_** The ratio of correctly predicted instances to the total instances. 223 | 2. [x] **_Precision:_** The ratio of true positive predictions to the total predicted positives. It measures the accuracy of positive predictions. 224 | 3. [x] **_Recall:_** The ratio of true positive predictions to the actual positives. It measures the model's ability to identify positive instances. 225 | 4. [x] **_F1 Score:_** The harmonic mean of precision and recall. It provides a balance between precision and recall. 226 | 5. [x] **_ROC AUC:_** The area under the ROC curve. It summarizes the model's ability to distinguish between classes. 
227 | 228 | **Confusion Matrix:** 229 | 230 | * True Positive (TP): 117 - The number of actual positive cases correctly identified by the model. 231 | * True Negative (TN): 226 - The number of actual negative cases correctly identified by the model. 232 | * False Positive (FP): 7 - The number of actual negative cases incorrectly identified as positive by the model. 233 | * False Negative (FN): 26 - The number of actual positive cases incorrectly identified as negative by the model. 234 | 235 | ### XGBoost 236 | 237 | - **Accuracy (Doğruluk):** %91.76 238 | - **Precision (Kesinlik):** %93.08 239 | - **Recall (Duyarlılık):** %84.62 240 | - **F1 Score:** %88.64 241 | - **ROC AUC:** %98.41 242 | 243 | **Confusion Matrix:** 244 | ```plaintext 245 | [[224 9] 246 | [ 22 121]] 247 | ``` 248 | Model dosyası: 249 | ```sh 250 | models/xgboost.pkl 251 | ``` 252 | ##### Explanations: 253 | 254 | 1. [x] **_Accuracy_:** The ratio of correctly predicted instances to the total instances. 255 | 2. [x] **_Precision:_** The ratio of true positive predictions to the total predicted positives. It measures the accuracy of positive predictions. 256 | 3. [x] **_Recall:_** The ratio of true positive predictions to the actual positives. It measures the model's ability to identify positive instances. 257 | 4. [x] _**F1 Score:**_ The harmonic mean of precision and recall. It provides a balance between precision and recall. 258 | 5. [x] **_ROC AUC:_** The area under the ROC curve. It summarizes the model's ability to distinguish between classes. 259 | 260 | **Confusion Matrix:** 261 | 262 | * True Positive (TP): 121 - The number of actual positive cases correctly identified by the model. 263 | * True Negative (TN): 224 - The number of actual negative cases correctly identified by the model. 264 | * False Positive (FP): 9 - The number of actual negative cases incorrectly identified as positive by the model. 
265 | * False Negative (FN): 22 - The number of actual positive cases incorrectly identified as negative by the model. 266 | 267 | ## 📈 Results 268 | 269 | Model performance reports and evaluation metrics are saved and displayed in the `comprehensive_model_report.py` script output. 270 | 271 | ## 💡 Future Work 272 | 273 | - Implement more advanced deep learning models (e.g., Neural Networks, LSTM). 274 | - Perform hyperparameter tuning to optimize model performance. 275 | - Explore feature selection techniques to improve model accuracy. 276 | - Integrate additional health datasets for broader analysis. 277 | 278 | ## 🤝 Contributing 279 | 280 | Contributions are welcome! Please feel free to submit a Pull Request. 281 | 282 | Whether it's improving the documentation, adding new features, or fixing bugs, your contributions are highly appreciated. Let's make this project better together! 🚀 283 | 284 | ### How to Contribute: 285 | 286 | 1. **Fork the Repository**: Click on the 'Fork' button at the top right corner of this page to create a copy of this repository in your GitHub account. 287 | 288 | 2. **Clone the Forked Repository**: 289 | ```bash 290 | git clone https://github.com/your-username/Diabetes_Health_Prediction_and_Analysis.git 291 | ``` 292 | 293 | 3. **Create a New Branch**: 294 | ```bash 295 | git checkout -b feature/your-feature-name 296 | ``` 297 | 298 | 4. **Make Your Changes**: Implement your feature, bug fix, or improvement. 299 | 300 | 5. **Commit Your Changes**: 301 | ```bash 302 | git commit -m "Add your commit message here" 303 | ``` 304 | 305 | 6. **Push to Your Forked Repository**: 306 | ```bash 307 | git push origin feature/your-feature-name 308 | ``` 309 | 310 | 7. **Open a Pull Request**: Go to the original repository on GitHub and click on the 'New Pull Request' button. Compare changes from your forked repository and submit the pull request. 311 | 312 | --- 313 | 314 | Thank you for your contributions! 
Together, we can build a more robust and efficient Diabetes Health Prediction and Analysis tool. 🌟 315 | 316 | ## 📄 License 317 | 318 | This project is licensed under the MIT License. 319 | 320 | ## 📬 Contact 321 | 322 | If you have any questions or suggestions, feel free to open an issue or contact me directly. I am always open to feedback and would love to hear from you! 323 | 324 | --- 325 | 326 | ### How to Reach Me: 327 | 328 | - **Email:** [piinartp@gmail.com](mailto:piinartp@gmail.com) 329 | - **GitHub Issues:** [Open an Issue](https://github.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/issues) 330 | - **LinkedIn:** [LinkedIn Profile](https://www.linkedin.com/in/piinartp/) 331 | 332 | --- 333 | 334 | Thank you for your interest in the Diabetes Health Prediction and Analysis project! Your feedback and suggestions are invaluable in making this project better and more useful for everyone. 🌟 335 | 336 | ![Contact Us](https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhgcxnmPWgrukdZFkZONlQ4vUIKWJakRLZqvQUfzkDUbS2nAbQyIxR23-OwOis99pE6UQSxXmxwwuugHQWmwRFfZdw4QKGnk9S_n4yFrfPFTSbKIL6sKUKTwFUyG-8no5Y_9dCLI0LUJIo/s1600/welovehearingfromu.png) 337 | 338 | --- 339 | 340 | 341 | --- 342 | 343 | ⭐️ Don't forget to give this project a star if you found it useful! 
⭐️ 344 | -------------------------------------------------------------------------------- /app/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, render_template 2 | import joblib 3 | import pandas as pd 4 | 5 | # Load the trained model 6 | model = joblib.load("./models/pipeline_xgboost.pkl") 7 | 8 | # Load the feature names 9 | X_train = pd.read_csv("../data/processed/X_train_engineered.csv") 10 | feature_names = X_train.columns.tolist() 11 | 12 | # Remove 'PatientID' from feature names if it exists 13 | if 'PatientID' in feature_names: 14 | feature_names.remove('PatientID') 15 | 16 | # Categorize features 17 | demographic_features = [feature for feature in feature_names if 18 | 'age' in feature or 'gender' in feature or 'ethnicity' in feature] 19 | medical_history_features = [feature for feature in feature_names if 'history' in feature or 'diabetes' in feature] 20 | lifestyle_features = [feature for feature in feature_names if 21 | 'bmi' in feature or 'smoking' in feature or 'activity' in feature] 22 | 23 | app = Flask(__name__) 24 | 25 | 26 | @app.route('/') 27 | def home(): 28 | return render_template('index.html', demographic_features=demographic_features, 29 | medical_history_features=medical_history_features, 30 | lifestyle_features=lifestyle_features) 31 | 32 | 33 | @app.route('/predict', methods=['POST']) 34 | def predict(): 35 | try: 36 | # Extract form data 37 | features = [float(request.form[feature]) for feature in feature_names] 38 | input_data = pd.DataFrame([features], columns=feature_names) 39 | 40 | # Predict using the model 41 | prediction = model.predict(input_data)[0] 42 | prediction_prob = model.predict_proba(input_data)[0][1] 43 | 44 | return render_template('index.html', prediction=prediction, probability=prediction_prob, 45 | demographic_features=demographic_features, 46 | medical_history_features=medical_history_features, 47 | lifestyle_features=lifestyle_features) 
48 | except KeyError as e: 49 | return f"Missing form data for feature: {e.args[0]}", 400 50 | except ValueError as e: 51 | return str(e), 400 52 | 53 | 54 | if __name__ == "__main__": 55 | app.run(debug=True) 56 | -------------------------------------------------------------------------------- /app/models/pipeline_xgboost.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/app/models/pipeline_xgboost.pkl -------------------------------------------------------------------------------- /app/static/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | background-color: #f4f7f6; 3 | font-family: 'Roboto', sans-serif; 4 | } 5 | 6 | .container { 7 | max-width: 800px; 8 | margin: auto; 9 | } 10 | 11 | .card { 12 | border: none; 13 | border-radius: 15px; 14 | overflow: hidden; 15 | } 16 | 17 | .card-header { 18 | background: linear-gradient(45deg, #007bff, #0056b3); 19 | } 20 | 21 | .card-title { 22 | margin: 0; 23 | } 24 | 25 | .btn-primary { 26 | background-color: #007bff; 27 | border-color: #007bff; 28 | transition: background-color 0.3s, border-color 0.3s; 29 | } 30 | 31 | .btn-primary:hover { 32 | background-color: #0056b3; 33 | border-color: #0056b3; 34 | } 35 | 36 | .form-section { 37 | margin-bottom: 20px; 38 | padding: 20px; 39 | border: 1px solid #e0e0e0; 40 | border-radius: 10px; 41 | background-color: #ffffff; 42 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.05); 43 | } 44 | 45 | .section-title { 46 | margin-bottom: 15px; 47 | font-size: 1.3em; 48 | font-weight: bold; 49 | color: #333333; 50 | } 51 | 52 | .result { 53 | background-color: #e9f7ef; 54 | border: 1px solid #d4edda; 55 | border-radius: 10px; 56 | padding: 20px; 57 | } 58 | 59 | .result h2, .result h3 { 60 | margin: 0; 61 | color: #155724; 62 | } 63 | 
-------------------------------------------------------------------------------- /app/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Diabetes Prediction 10 | 11 | 12 | 13 |
14 |
15 |
16 |

🩺 Diabetes Prediction

17 |
18 |
19 |
20 |
21 |

Demographic Details

22 |
23 | {% for feature in demographic_features %} 24 |
25 |
26 | 27 | 28 |
29 |
30 | {% endfor %} 31 |
32 |
33 | 34 |
35 |

Medical History

36 |
37 | {% for feature in medical_history_features %} 38 |
39 |
40 | 41 | 42 |
43 |
44 | {% endfor %} 45 |
46 |
47 | 48 |
49 |

Lifestyle Factors

50 |
51 | {% for feature in lifestyle_features %} 52 |
53 |
54 | 55 | 56 |
57 |
58 | {% endfor %} 59 |
60 |
61 | 62 | 63 |
64 | 65 | {% if prediction is not none %} 66 |
67 |
68 |

Prediction: {{ prediction }}

69 |

Probability: {{ probability }}

70 |
71 |
72 | {% endif %} 73 |
74 |
75 |
76 | 77 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /models/logistic_regression.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/models/logistic_regression.pkl -------------------------------------------------------------------------------- /models/random_forest.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/models/random_forest.pkl -------------------------------------------------------------------------------- /models/xgboost.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/models/xgboost.pkl -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pytest~=8.2.2 2 | pandas~=2.2.2 3 | joblib~=1.4.0 4 | scikit-learn~=1.4.2 5 | xgboost~=2.0.3 6 | numpy~=1.26.4 7 | seaborn~=0.13.2 8 | matplotlib~=3.8.4 9 | -------------------------------------------------------------------------------- /scripts/__pycache__/utility_functions.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/__pycache__/utility_functions.cpython-311.pyc -------------------------------------------------------------------------------- /scripts/comprehensive_model_report.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report, accuracy_score, \ 4 | precision_score, recall_score, f1_score 5 | import seaborn as sns 6 | import joblib 7 | import os 8 | from utility_functions import load_model, load_train_test_data, plot_roc_curve, plot_confusion_matrix 9 | 10 | 11 | # Load the trained model and the test data sets 12 | def load_model_and_data(model_path, X_test_path, y_test_path): 13 | model = load_model(model_path) 14 | X_test = pd.read_csv(X_test_path) 15 | y_test = pd.read_csv(y_test_path).values.ravel().astype('int') 16 | return model, X_test, y_test 17 | 18 | 19 | # Plot the ROC curve and save it under plots/ 20 | def plot_and_save_roc_curve(model, X_test, y_test, model_name): 21 | y_pred_prob = model.predict_proba(X_test)[:, 1] 22 | fpr, tpr, _ = roc_curve(y_test, y_pred_prob) 23 | roc_auc = roc_auc_score(y_test, y_pred_prob) 24 | 25 | plt.figure() 26 | plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})') 27 | plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') 28 | plt.xlim([0.0, 1.0]) 29 | plt.ylim([0.0, 1.05]) 30 | plt.xlabel('False Positive Rate') 31 | plt.ylabel('True Positive Rate') 32 | plt.title(f'Receiver Operating Characteristic - {model_name}') 33 | plt.legend(loc="lower right") 34 | os.makedirs('plots', exist_ok=True) 35 | plt.savefig(f'plots/{model_name}_roc_curve.png') 36 | plt.show() 37 | return roc_auc 38 | 39 | 40 | # Plot the confusion matrix and save it under plots/ 41 | def plot_and_save_confusion_matrix(y_test, y_pred, model_name): 42 | cm = confusion_matrix(y_test, y_pred) 43 | plt.figure() 44 | sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Diabetes', 'Diabetes'], 45 | yticklabels=['No Diabetes', 'Diabetes']) 46 | plt.xlabel('Predicted') 47 | plt.ylabel('True') 48 | plt.title(f'Confusion Matrix - 
{model_name}') 49 | os.makedirs('plots', exist_ok=True) 50 | plt.savefig(f'plots/{model_name}_confusion_matrix.png') 51 | plt.show() 52 | 53 | 54 | # Build a comprehensive evaluation report (plots, classification report, metrics CSV) for one model 55 | def comprehensive_model_report(model_path, X_test_path, y_test_path, model_name): 56 | model, X_test, y_test = load_model_and_data(model_path, X_test_path, y_test_path) 57 | y_pred = model.predict(X_test) 58 | 59 | # ROC curve and AUC 60 | roc_auc = plot_and_save_roc_curve(model, X_test, y_test, model_name) 61 | 62 | # Confusion Matrix 63 | plot_and_save_confusion_matrix(y_test, y_pred, model_name) 64 | 65 | # Classification Report 66 | class_report = classification_report(y_test, y_pred, output_dict=True) 67 | print(f"Classification Report for {model_name}:\n") 68 | print(classification_report(y_test, y_pred)) 69 | 70 | # Additional metrics 71 | accuracy = accuracy_score(y_test, y_pred) 72 | precision = precision_score(y_test, y_pred) 73 | recall = recall_score(y_test, y_pred) 74 | f1 = f1_score(y_test, y_pred) 75 | 76 | metrics = { 77 | 'accuracy': accuracy, 78 | 'precision': precision, 79 | 'recall': recall, 80 | 'f1_score': f1, 81 | 'roc_auc': roc_auc 82 | } 83 | 84 | # Save the results as a CSV report 85 | report_path = f'reports/{model_name}_report.csv' 86 | os.makedirs('reports', exist_ok=True) 87 | report_df = pd.DataFrame(class_report).transpose() 88 | report_df['accuracy'] = accuracy 89 | report_df['roc_auc'] = roc_auc 90 | report_df.to_csv(report_path) 91 | 92 | print(f"Comprehensive report saved as {report_path}") 93 | print(f"Metrics: {metrics}") 94 | 95 | 96 | # Main entry point 97 | def main(): 98 | model_paths = ["models/logistic_regression.pkl", "models/random_forest.pkl", "models/xgboost.pkl"] 99 | model_names = ["Logistic Regression", "Random Forest", "XGBoost"] 100 | X_test_path = "../data/processed/X_test_engineered.csv" 101 | y_test_path = "../data/processed/y_test.csv" 102 | 103 | for model_path, model_name in zip(model_paths, model_names): 104 | 
comprehensive_model_report(model_path, X_test_path, y_test_path, model_name) 105 | 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /scripts/data_preprocessing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.preprocessing import StandardScaler, OneHotEncoder 6 | from sklearn.impute import SimpleImputer 7 | import os 8 | 9 | 10 | # Veri setini yükleme 11 | def load_data(file_path): 12 | df = pd.read_csv(file_path) 13 | return df 14 | 15 | 16 | # Veri Keşfi (Exploratory Data Analysis) 17 | def perform_eda(df, dataset_name=""): 18 | print(f"{dataset_name} Veri Setinin İlk 5 Satırı:\n", df.head()) 19 | print(f"\n{dataset_name} Veri Seti Hakkında Bilgiler:\n", df.info()) 20 | print(f"\n{dataset_name} Veri Setindeki Eksik Değerler:\n", df.isnull().sum()) 21 | print(f"\n{dataset_name} Temel İstatistikler:\n", df.describe()) 22 | 23 | # Kategorik değişkenlerin dağılımı 24 | categorical_columns = df.select_dtypes(include=['object', 'category']).columns 25 | for column in categorical_columns: 26 | plt.figure(figsize=(10, 5)) 27 | sns.countplot(data=df, x=column) 28 | plt.title(f'{dataset_name} {column} Dağılımı') 29 | plt.show() 30 | 31 | # Sayısal değişkenlerin dağılımı 32 | numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns 33 | df[numeric_columns].hist(bins=15, figsize=(20, 15)) 34 | plt.suptitle(f'{dataset_name} Sayısal Değişkenlerin Dağılımı') 35 | plt.show() 36 | 37 | # Korelasyon matrisi (DoctorInCharge sütunu çıkarıldı) 38 | if 'DoctorInCharge' in df.columns: 39 | df_corr = df.drop(columns=['DoctorInCharge']) 40 | else: 41 | df_corr = df.copy() 42 | plt.figure(figsize=(15, 10)) 43 | sns.heatmap(df_corr.corr(), annot=True, cmap='coolwarm') 44 | 
plt.title(f'{dataset_name} Korelasyon Matrisi') 45 | plt.show() 46 | 47 | 48 | # Impute missing numeric values with the column mean
 49 | def handle_missing_values(df): 50 | imputer = SimpleImputer(strategy='mean') 51 | numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns 52 | df[numeric_columns] = imputer.fit_transform(df[numeric_columns]) 53 | return df 54 | 55 | 56 | # One-hot encode the categorical variables (first level dropped) 57 | def encode_categorical_values(df): 58 | categorical_columns = df.select_dtypes(include=['object']).columns 59 | encoder = OneHotEncoder(sparse_output=False, drop='first') 60 | encoded_columns = pd.DataFrame(encoder.fit_transform(df[categorical_columns])) 61 | encoded_columns.columns = encoder.get_feature_names_out(categorical_columns) 62 | df = df.drop(categorical_columns, axis=1) 63 | df = pd.concat([df, encoded_columns], axis=1) 64 | return df 65 | 66 | 67 | # Standardize the numeric columns (zero mean, unit variance) 68 | def normalize_data(df): 69 | scaler = StandardScaler() 70 | numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns 71 | df[numeric_columns] = scaler.fit_transform(df[numeric_columns]) 72 | return df 73 | 74 | 75 | # Split the data into training and test sets 76 | def split_data(df, target_column, test_size=0.2, random_state=42): 77 | X = df.drop(target_column, axis=1) 78 | y = df[target_column] 79 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state) 80 | return X_train, X_test, y_train, y_test 81 | 82 | 83 | # Save the training and test sets as CSV files 84 | def save_datasets(X_train, X_test, y_train, y_test, output_dir): 85 | os.makedirs(output_dir, exist_ok=True) 86 | X_train.to_csv(os.path.join(output_dir, 'X_train.csv'), index=False) 87 | X_test.to_csv(os.path.join(output_dir, 'X_test.csv'), index=False) 88 | y_train.to_csv(os.path.join(output_dir, 'y_train.csv'), index=False) 89 | y_test.to_csv(os.path.join(output_dir, 'y_test.csv'), index=False) 90 | print(f"Datasets saved to {output_dir}") 91 | 92 | 93 | # Ana işlem 
# Main pipeline entry point
def main(file_path, target_column, output_dir):
    """Run EDA, cleaning, encoding, scaling, splitting and persistence."""
    df = load_data(file_path)
    perform_eda(df, dataset_name="Orijinal")

    # Apply the cleaning steps in order.
    for step in (handle_missing_values, encode_categorical_values, normalize_data):
        df = step(df)

    X_train, X_test, y_train, y_test = split_data(df, target_column)
    save_datasets(X_train, X_test, y_train, y_test, output_dir)

    # Column-level EDA on each split.
    for frame, label in ((pd.concat([X_train, y_train], axis=1), "Eğitim Seti"),
                         (pd.concat([X_test, y_test], axis=1), "Test Seti")):
        perform_eda(frame, dataset_name=label)

    return X_train, X_test, y_train, y_test


if __name__ == "__main__":
    file_path = "../data/raw/diabetes_data.csv"  # dataset path
    target_column = "Diagnosis"  # target variable
    output_dir = "../data/processed"  # output directory
    X_train, X_test, y_train, y_test = main(file_path, target_column, output_dir)
    print("Veri ön işleme tamamlandı ve veriler eğitim/test setlerine ayrıldı.")


# --- scripts/feature_engineering.py ---
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures


# Create interaction features.
def create_new_features(df):
    """Add 'Age_BMI', the element-wise product of Age and BMI."""
    df['Age_BMI'] = df['Age'] * df['BMI']
    return df


# Expand numeric columns with polynomial terms.
def add_polynomial_features(df, degree=2):
    """Concatenate PolynomialFeatures terms of every numeric column onto *df*.

    NOTE(review): the expansion re-emits the degree-1 originals, so the
    concat duplicates every numeric column name — presumably tolerated
    downstream; confirm before changing.
    """
    expander = PolynomialFeatures(degree, include_bias=False)
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    expanded = pd.DataFrame(
        expander.fit_transform(df[numeric_cols]),
        columns=expander.get_feature_names_out(numeric_cols),
    )

    # Align on fresh positional indices before the column-wise concat.
    df = df.reset_index(drop=True)
    expanded = expanded.reset_index(drop=True)

    return pd.concat([df, expanded], axis=1)


# Engineer features for both splits and persist them.
def main(train_file_path, test_file_path, train_output_path, test_output_path, degree=2):
    """Apply the feature pipeline to the train and test CSVs."""
    for src, dst, label in ((train_file_path, train_output_path, "Eğitim"),
                            (test_file_path, test_output_path, "Test")):
        frame = pd.read_csv(src)
        frame = create_new_features(frame)
        frame = add_polynomial_features(frame, degree)
        frame.to_csv(dst, index=False)
        print(f"{label} veri seti için feature engineering tamamlandı ve {dst} dosyasına kaydedildi.")


if __name__ == "__main__":
    train_file_path = "../data/processed/X_train.csv"  # training features
    test_file_path = "../data/processed/X_test.csv"  # test features
    train_output_path = "../data/processed/X_train_engineered.csv"  # engineered training output
    test_output_path = "../data/processed/X_test_engineered.csv"  # engineered test output
    degree = 2  # polynomial degree
    main(train_file_path, test_file_path, train_output_path, test_output_path, degree)


# --- scripts/model_evaluation.py (module imports; definitions continue in the file) ---
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report
import joblib
# Load a pickled model and the test data
def load_model_and_data(model_path, X_test_path, y_test_path):
    """Return (model, X_test, y_test); labels flattened to an int ndarray."""
    model = joblib.load(model_path)
    X_test = pd.read_csv(X_test_path)
    y_test = pd.read_csv(y_test_path).values.ravel().astype('int')
    return model, X_test, y_test


# Plot the ROC curve
def plot_roc_curve(model, X_test, y_test, model_name):
    """Plot and save the ROC curve for *model* on the test set."""
    import os  # local: this module does not import os at file level
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    roc_auc = roc_auc_score(y_test, y_pred_prob)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {model_name}')
    plt.legend(loc="lower right")
    # Fix: savefig raises FileNotFoundError when plots/ is missing;
    # utility_functions.py already guards this the same way.
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/{model_name}_roc_curve.png')
    plt.show()


# Plot the confusion matrix
def plot_confusion_matrix(y_test, y_pred, model_name):
    """Render and save the confusion-matrix heatmap for *model_name*."""
    import os  # local: this module does not import os at file level
    cm = confusion_matrix(y_test, y_pred)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Diabetes', 'Diabetes'],
                yticklabels=['No Diabetes', 'Diabetes'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - {model_name}')
    # Fix: ensure the output directory exists before saving (see above).
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/{model_name}_confusion_matrix.png')
    plt.show()


# Evaluate a model
def evaluate_model(model_path, X_test_path, y_test_path, model_name):
    """Full evaluation: ROC curve, confusion matrix, printed report."""
    model, X_test, y_test = load_model_and_data(model_path, X_test_path, y_test_path)
    y_pred = model.predict(X_test)

    # ROC Curve
    plot_roc_curve(model, X_test, y_test, model_name)

    # Confusion Matrix
    plot_confusion_matrix(y_test, y_pred, model_name)

    # Classification Report
    print(f"Classification Report for {model_name}:\n")
    print(classification_report(y_test, y_pred))
# Main driver: evaluate every persisted model on the engineered test set.
def main():
    """Iterate over the three trained models and run the full evaluation."""
    model_paths = ["models/logistic_regression.pkl", "models/random_forest.pkl", "models/xgboost.pkl"]
    model_names = ["Logistic Regression", "Random Forest", "XGBoost"]
    X_test_path = "../data/processed/X_test_engineered.csv"
    y_test_path = "../data/processed/y_test.csv"

    for model_path, model_name in zip(model_paths, model_names):
        evaluate_model(model_path, X_test_path, y_test_path, model_name)


if __name__ == "__main__":
    main()


# --- scripts/model_training.py ---
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib
import os


# Load the four split CSVs.
def load_data(X_train_path, X_test_path, y_train_path, y_test_path):
    """Return X_train, X_test, y_train, y_test; labels flattened to int arrays."""
    X_train = pd.read_csv(X_train_path)
    X_test = pd.read_csv(X_test_path)
    # Targets are cast to int so the classifiers see discrete labels.
    y_train, y_test = (pd.read_csv(p).values.ravel().astype('int')
                       for p in (y_train_path, y_test_path))
    return X_train, X_test, y_train, y_test


# Fit, report, and persist one model.
def train_and_evaluate_model(X_train, X_test, y_train, y_test, model, model_name):
    """Train *model*, print its test metrics, and pickle it under ../models/."""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    scores = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "ROC AUC": roc_auc_score(y_test, y_pred_prob),
    }

    print(f"Model: {model_name}")
    for label, value in scores.items():
        print(f"{label}: {value:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")

    # Persist the fitted model.
    os.makedirs('../models', exist_ok=True)
    joblib.dump(model, f"../models/{model_name}.pkl")
    print(f"Model saved as models/{model_name}.pkl")


# Train the three benchmark classifiers.
def main(X_train_path, X_test_path, y_train_path, y_test_path):
    """Load the splits and train/evaluate each classifier in turn."""
    X_train, X_test, y_train, y_test = load_data(X_train_path, X_test_path, y_train_path, y_test_path)

    for model, name in ((LogisticRegression(max_iter=1000), "logistic_regression"),
                        (RandomForestClassifier(n_estimators=100), "random_forest"),
                        (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), "xgboost")):
        train_and_evaluate_model(X_train, X_test, y_train, y_test, model, name)


if __name__ == "__main__":
    X_train_path = "../data/processed/X_train_engineered.csv"  # engineered training features
    X_test_path = "../data/processed/X_test_engineered.csv"  # engineered test features
    y_train_path = "../data/processed/y_train.csv"  # training target
    y_test_path = "../data/processed/y_test.csv"  # test target
    main(X_train_path, X_test_path, y_train_path, y_test_path)
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/Logistic Regression_confusion_matrix.png -------------------------------------------------------------------------------- /scripts/plots/Logistic Regression_roc_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/Logistic Regression_roc_curve.png -------------------------------------------------------------------------------- /scripts/plots/Random Forest_confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/Random Forest_confusion_matrix.png -------------------------------------------------------------------------------- /scripts/plots/Random Forest_roc_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/Random Forest_roc_curve.png -------------------------------------------------------------------------------- /scripts/plots/XGBoost_confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/XGBoost_confusion_matrix.png -------------------------------------------------------------------------------- /scripts/plots/XGBoost_roc_curve.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/XGBoost_roc_curve.png -------------------------------------------------------------------------------- /scripts/reports/Logistic Regression_report.csv: -------------------------------------------------------------------------------- 1 | ,precision,recall,f1-score,support,accuracy,roc_auc 2 | 0,0.8235294117647058,0.8412017167381974,0.832271762208068,233.0,0.7898936170212766,0.8386206068609503 3 | 1,0.7318840579710145,0.7062937062937062,0.7188612099644128,143.0,0.7898936170212766,0.8386206068609503 4 | accuracy,0.7898936170212766,0.7898936170212766,0.7898936170212766,0.7898936170212766,0.7898936170212766,0.8386206068609503 5 | macro avg,0.7777067348678601,0.7737477115159519,0.7755664860862403,376.0,0.7898936170212766,0.8386206068609503 6 | weighted avg,0.7886749288059349,0.7898936170212766,0.7891395574983799,376.0,0.7898936170212766,0.8386206068609503 7 | -------------------------------------------------------------------------------- /scripts/reports/Random Forest_report.csv: -------------------------------------------------------------------------------- 1 | ,precision,recall,f1-score,support,accuracy,roc_auc 2 | 0,0.8968253968253969,0.9699570815450643,0.931958762886598,233.0,0.9122340425531915,0.9768600498214233 3 | 1,0.9435483870967742,0.8181818181818182,0.8764044943820225,143.0,0.9122340425531915,0.9768600498214233 4 | accuracy,0.9122340425531915,0.9122340425531915,0.9122340425531915,0.9122340425531915,0.9122340425531915,0.9768600498214233 5 | macro avg,0.9201868919610856,0.8940694498634413,0.9041816286343103,376.0,0.9122340425531915,0.9768600498214233 6 | weighted avg,0.91459504472116,0.9122340425531915,0.9108304107691664,376.0,0.9122340425531915,0.9768600498214233 7 | -------------------------------------------------------------------------------- /scripts/reports/XGBoost_report.csv: 
# --- scripts/utility_functions.py ---
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report
import joblib
import os


# Load a dataset
def load_data(file_path):
    """Read the CSV at *file_path* into a DataFrame and return it."""
    df = pd.read_csv(file_path)
    return df


# Load the train/test splits
def load_train_test_data(X_train_path, X_test_path, y_train_path, y_test_path):
    """Read the four split CSVs.

    Returns (X_train, X_test, y_train, y_test); the label frames are
    flattened to 1-D int arrays for the sklearn estimators.
    """
    X_train = pd.read_csv(X_train_path)
    X_test = pd.read_csv(X_test_path)
    y_train = pd.read_csv(y_train_path).values.ravel().astype('int')
    y_test = pd.read_csv(y_test_path).values.ravel().astype('int')
    return X_train, X_test, y_train, y_test


# Persist a model
def save_model(model, model_name):
    """Pickle *model* to ../models/<model_name>.pkl (directory created on demand)."""
    os.makedirs('../models', exist_ok=True)
    joblib.dump(model, f"../models/{model_name}.pkl")
    print(f"Model saved as models/{model_name}.pkl")


# Load a pickled model
def load_model(model_path):
    """Return the model deserialized from *model_path*."""
    model = joblib.load(model_path)
    return model
# Plot the ROC curve and persist it under plots/.
def plot_roc_curve(model, X_test, y_test, model_name):
    """Draw, save, and show the ROC curve of *model* on the test set."""
    scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, scores)
    auc_value = roc_auc_score(y_test, scores)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {auc_value:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {model_name}')
    plt.legend(loc="lower right")
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/{model_name}_roc_curve.png')
    plt.show()


# Render the confusion-matrix heatmap and persist it under plots/.
def plot_confusion_matrix(y_test, y_pred, model_name):
    """Draw, save, and show the confusion matrix for *model_name*."""
    class_labels = ['No Diabetes', 'Diabetes']
    plt.figure()
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - {model_name}')
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/{model_name}_confusion_matrix.png')
    plt.show()


# Full evaluation: ROC plot, confusion matrix, printed report.
def evaluate_model(model, X_test, y_test, model_name):
    """Run every evaluation artifact for an already-loaded *model*."""
    predictions = model.predict(X_test)

    # ROC curve
    plot_roc_curve(model, X_test, y_test, model_name)

    # Confusion matrix
    plot_confusion_matrix(y_test, predictions, model_name)

    # Text report
    print(f"Classification Report for {model_name}:\n")
    print(classification_report(y_test, predictions))
'X_test.csv'), index=False) 91 | y_train.to_csv(os.path.join(output_dir, 'y_train.csv'), index=False) 92 | y_test.to_csv(os.path.join(output_dir, 'y_test.csv'), index=False) 93 | print(f"Datasets saved to {output_dir}") 94 | -------------------------------------------------------------------------------- /tests/__pycache__/test_data_preprocessing.cpython-311-pytest-8.2.2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/__pycache__/test_data_preprocessing.cpython-311-pytest-8.2.2.pyc -------------------------------------------------------------------------------- /tests/__pycache__/test_feature_engineering.cpython-311-pytest-8.2.2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/__pycache__/test_feature_engineering.cpython-311-pytest-8.2.2.pyc -------------------------------------------------------------------------------- /tests/__pycache__/test_model_evaluation.cpython-311-pytest-8.2.2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/__pycache__/test_model_evaluation.cpython-311-pytest-8.2.2.pyc -------------------------------------------------------------------------------- /tests/__pycache__/test_model_training.cpython-311-pytest-8.2.2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/__pycache__/test_model_training.cpython-311-pytest-8.2.2.pyc 
# --- tests/test_data_preprocessing.py ---
import pytest
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import os


# Load a dataset
def load_data(file_path):
    """Read the CSV at *file_path* into a DataFrame and return it."""
    df = pd.read_csv(file_path)
    return df


# Exploratory Data Analysis
def perform_eda(df):
    """Print head, info, missing-value counts and summary statistics of *df*."""
    print("Veri Setinin İlk 5 Satırı:\n", df.head())
    print("\nVeri Seti Hakkında Bilgiler:\n", df.info())
    print("\nVeri Setindeki Eksik Değerler:\n", df.isnull().sum())
    print("\nTemel İstatistikler:\n", df.describe())
# Mean-impute the numeric columns.
def handle_missing_values(df):
    """Fill missing numeric values with the column mean (in place)."""
    filler = SimpleImputer(strategy='mean')
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[num_cols] = filler.fit_transform(df[num_cols])
    return df


# One-hot encode every categorical column (object or category dtype).
def encode_categorical_values(df):
    """Replace categorical columns with one-hot dummies (first level dropped)."""
    # Identify the categorical columns
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    # Build the encoder
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    # Transform the categorical columns
    one_hot = pd.DataFrame(encoder.fit_transform(df[cat_cols]),
                           columns=encoder.get_feature_names_out(cat_cols))
    # Drop the originals and append the dummies
    return pd.concat([df.drop(cat_cols, axis=1), one_hot], axis=1)


# Z-score scale the numeric columns.
def normalize_data(df):
    """Standardize all numeric columns of *df* (in place)."""
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[num_cols] = StandardScaler().fit_transform(df[num_cols])
    return df


# Split features/target into train and test partitions.
def split_data(df, target_column, test_size=0.2, random_state=42):
    """Return X_train, X_test, y_train, y_test for *target_column*."""
    features = df.drop(target_column, axis=1)
    target = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test


# Write the four splits as CSVs into output_dir (created on demand).
def save_datasets(X_train, X_test, y_train, y_test, output_dir):
    """Persist the train/test splits as CSV files."""
    os.makedirs(output_dir, exist_ok=True)
    pieces = {'X_train.csv': X_train, 'X_test.csv': X_test,
              'y_train.csv': y_train, 'y_test.csv': y_test}
    for filename, frame in pieces.items():
        frame.to_csv(os.path.join(output_dir, filename), index=False)
    print(f"Datasets saved to {output_dir}")
@pytest.fixture
def sample_data():
    """Minimal 4-row frame matching the diabetes dataset schema."""
    data = {
        'Age': [25, 35, 45, 55],
        'Gender': ['0', '1', '0', '1'],  # the 'Gender' column must be string-typed
        'BMI': [22.5, 24.5, 28.0, 30.0],
        'Diagnosis': [0, 1, 0, 1]
    }
    return pd.DataFrame(data)


def test_load_data(sample_data):
    """The fixture frame is non-empty."""
    df = sample_data
    assert not df.empty


def test_handle_missing_values(sample_data):
    """Mean imputation removes every NaN from a numeric column."""
    df = sample_data.copy()
    df.loc[0, 'BMI'] = None
    df = handle_missing_values(df)
    assert df['BMI'].isnull().sum() == 0


def test_encode_categorical_values(sample_data):
    """One-hot encoding creates the dummy column for Gender=1."""
    df = sample_data
    df = encode_categorical_values(df)
    assert 'Gender_1' in df.columns


def test_normalize_data(sample_data):
    """After z-scoring the Age mean is near zero (certainly below 1)."""
    df = sample_data
    df = normalize_data(df)
    assert df['Age'].mean() < 1


def test_split_data(sample_data):
    """The split partitions every row exactly once."""
    df = sample_data
    X_train, X_test, y_train, y_test = split_data(df, 'Diagnosis')
    assert len(X_train) + len(X_test) == len(df)


def test_save_datasets(tmpdir, sample_data):
    """save_datasets materialises the split CSVs in the target directory."""
    df = sample_data
    X_train, X_test, y_train, y_test = split_data(df, 'Diagnosis')
    output_dir = tmpdir.mkdir("data")
    save_datasets(X_train, X_test, y_train, y_test, output_dir)
    assert (output_dir / 'X_train.csv').check()


# --- tests/test_feature_engineering.py ---
import pytest
import pandas as pd
import numpy as np


# Create new features
def create_new_features(df):
    """Add 'Age_BMI', the element-wise product of Age and BMI."""
    df['Age_BMI'] = df['Age'] * df['BMI']
    return df


# Add polynomial features
def add_polynomial_features(df, degree=2):
    """Append '<col>^p' columns for every numeric column, 2 <= p <= degree."""
    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        for power in range(2, degree + 1):
            df[f'{column}^{power}'] = np.power(df[column], power)
    return df


@pytest.fixture
def sample_data():
    """Two numeric columns suffice for the feature helpers."""
    data = {
        'Age': [25, 35, 45, 55],
        'BMI': [22.5, 24.5, 28.0, 30.0]
    }
    return pd.DataFrame(data)


def test_create_new_features(sample_data):
    """Age_BMI exists and equals the element-wise product."""
    df = sample_data
    df = create_new_features(df)
    assert 'Age_BMI' in df.columns
    assert (df['Age_BMI'] == df['Age'] * df['BMI']).all()


def test_add_polynomial_features(sample_data):
    """Squared columns exist and match the element-wise squares."""
    df = sample_data
    df = add_polynomial_features(df, degree=2)
    assert 'Age^2' in df.columns
    assert (df['Age^2'] == df['Age'] ** 2).all()
    assert 'BMI^2' in df.columns
    assert (df['BMI^2'] == df['BMI'] ** 2).all()


# --- tests/test_model_training.py ---
import pytest
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import joblib
import os


# Load the train/test splits
def load_data(train_path, test_path):
    """Read the X CSVs plus the matching y_* CSVs (X_ prefix swapped for y_)."""
    X_train = pd.read_csv(train_path)
    X_test = pd.read_csv(test_path)
    y_train = pd.read_csv(train_path.replace('X_', 'y_')).values.ravel().astype('int')
    y_test = pd.read_csv(test_path.replace('X_', 'y_')).values.ravel().astype('int')
    return X_train, X_test, y_train, y_test
@pytest.fixture
def sample_data():
    """Four rows, two numeric features, balanced binary target."""
    X_train = pd.DataFrame({
        'Age': [25, 35, 45, 55],
        'BMI': [22.5, 24.5, 28.0, 30.0]
    })
    y_train = pd.Series([0, 1, 0, 1])
    return X_train, y_train


def test_train_and_evaluate_model(sample_data):
    """Each classifier trains and its pickle lands in models/."""
    X_train, y_train = sample_data
    X_test, y_test = X_train.copy(), y_train.copy()

    # Logistic Regression
    lr = LogisticRegression()
    train_and_evaluate_model(X_train, X_test, y_train, y_test, lr, "logistic_regression")

    # Random Forest
    rf = RandomForestClassifier(n_estimators=10)
    train_and_evaluate_model(X_train, X_test, y_train, y_test, rf, "random_forest")

    # XGBoost
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    train_and_evaluate_model(X_train, X_test, y_train, y_test, xgb, "xgboost")

    # Verify that the models were persisted
    assert os.path.exists('models/logistic_regression.pkl')
    assert os.path.exists('models/random_forest.pkl')
    assert os.path.exists('models/xgboost.pkl')


def test_load_data(tmpdir):
    """load_data reads X/y CSV pairs and returns correctly shaped arrays."""
    # Create temporary data files
    train_file = tmpdir.join("X_train.csv")
    test_file = tmpdir.join("X_test.csv")
    train_file.write("Age,BMI\n25,22.5\n35,24.5\n45,28.0\n55,30.0")
    test_file.write("Age,BMI\n25,22.5\n35,24.5\n45,28.0\n55,30.0")

    y_train_file = tmpdir.join("y_train.csv")
    y_test_file = tmpdir.join("y_test.csv")
    y_train_file.write("Diagnosis\n0\n1\n0\n1")
    y_test_file.write("Diagnosis\n0\n1\n0\n1")

    # Load the datasets
    X_train, X_test, y_train, y_test = load_data(str(train_file), str(test_file))

    # Verify the loaded shapes
    assert X_train.shape == (4, 2)
    assert X_test.shape == (4, 2)
    assert y_train.shape == (4,)
    assert y_test.shape == (4,)