├── .gitignore
├── README.md
├── app
├── app.py
├── models
│ └── pipeline_xgboost.pkl
├── static
│ └── styles.css
└── templates
│ └── index.html
├── data
└── processed
│ ├── X_test.csv
│ ├── X_test_engineered.csv
│ └── X_train.csv
├── models
├── logistic_regression.pkl
├── random_forest.pkl
└── xgboost.pkl
├── notebooks
└── exploratory_data_analysis.ipynb
├── raw
└── diabetes_data.csv
├── requirements.txt
├── scripts
├── __pycache__
│ └── utility_functions.cpython-311.pyc
├── comprehensive_model_report.py
├── data_preprocessing.py
├── feature_engineering.py
├── model_evaluation.py
├── model_training.py
├── plots
│ ├── Logistic Regression_confusion_matrix.png
│ ├── Logistic Regression_roc_curve.png
│ ├── Random Forest_confusion_matrix.png
│ ├── Random Forest_roc_curve.png
│ ├── XGBoost_confusion_matrix.png
│ └── XGBoost_roc_curve.png
├── reports
│ ├── Logistic Regression_report.csv
│ ├── Random Forest_report.csv
│ └── XGBoost_report.csv
└── utility_functions.py
└── tests
├── __pycache__
├── test_data_preprocessing.cpython-311-pytest-8.2.2.pyc
├── test_feature_engineering.cpython-311-pytest-8.2.2.pyc
├── test_model_evaluation.cpython-311-pytest-8.2.2.pyc
└── test_model_training.cpython-311-pytest-8.2.2.pyc
├── models
├── logistic_regression.pkl
├── random_forest.pkl
└── xgboost.pkl
├── test_data_preprocessing.py
├── test_feature_engineering.py
└── test_model_training.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Diabetes Health Prediction and Analysis 🎉
2 |
3 | 
4 |
5 | ---
6 |
7 | Welcome to the **Diabetes Health Prediction and Analysis** project! This repository contains a comprehensive pipeline for predicting diabetes diagnosis using various machine learning and deep learning models, along with an in-depth exploratory data analysis and feature engineering steps.
8 |
9 | ## 🚀 Project Overview
10 |
11 | This project aims to provide a thorough analysis of diabetes-related health data, develop predictive models, and evaluate their performance. The key components of the project include:
12 |
13 | - 📊 Data Preprocessing
14 | - 🔍 Exploratory Data Analysis (EDA)
15 | - 🛠️ Feature Engineering
16 | - 🧠 Model Training
17 | - 📈 Model Evaluation
18 | - 📑 Comprehensive Reports
19 |
20 | ## 📂 Project Structure
21 |
22 | Here's an overview of the project directory structure:
23 |
24 |
25 | ```plaintext
26 | Diabetes_Health_Prediction_and_Analysis/
27 | ├── data/
28 | │ ├── raw/
29 | │ │ └── diabetes_data.csv
30 | │ ├── processed/
31 | │ │ ├── X_train.csv
32 | │ │ ├── X_train_engineered.csv
33 | │ │ ├── X_test.csv
34 | │ │ ├── X_test_engineered.csv
35 | │ │ ├── y_train.csv
36 | │ │ └── y_test.csv
37 | ├── app/
38 | │ ├── app.py
39 | │ ├── templates/
40 | │ │ └── index.html
41 | │ └── static/
42 | │ └── styles.css
43 | ├── models/
44 | │ ├── logistic_regression.pkl
45 | │ ├── random_forest.pkl
46 | │ └── xgboost.pkl
47 | ├── notebooks/
48 | │ └── exploratory_data_analysis.ipynb
49 | ├── scripts/
50 | │ ├── plots/
51 | │ ├── reports/
52 | │ ├── data_preprocessing.py
53 | │ ├── feature_engineering.py
54 | │ ├── model_training.py
55 | │ ├── model_evaluation.py
56 | │ └── model_performance_report.py
57 | ├── tests/
58 | │ ├── models/
59 | │ ├── test_data_preprocessing.py
60 | │ ├── test_feature_engineering.py
61 | │ ├── test_model_training.py
62 | ├── requirements.txt
63 | └── README.md
64 | ```
65 |
66 | ## 🔧 Setup and Installation
67 |
68 | To get started with this project, follow the steps below:
69 |
70 | 1. **Clone the repository:**
71 |
72 | ```sh
73 | git clone https://github.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis.git
74 | cd Diabetes_Health_Prediction_and_Analysis
75 | ```
76 |
77 | 2. **Create and activate a virtual environment:**
78 |
79 | ```sh
80 | python -m venv venv
81 | source venv/bin/activate # On Windows use `venv\Scripts\activate`
82 | ```
83 |
84 | 3. **Install the required packages:**
85 |
86 | ```sh
87 | pip install -r requirements.txt
88 | ```
89 |
90 | 4. **Run the data preprocessing script:**
91 |
92 | ```sh
93 | python scripts/data_preprocessing.py
94 | ```
95 |
96 | 5. **Run the feature engineering script:**
97 |
98 | ```sh
99 | python scripts/feature_engineering.py
100 | ```
101 |
102 | 6. **Train the models:**
103 |
104 | ```sh
105 | python scripts/model_training.py
106 | ```
107 |
108 | 7. **Evaluate the models:**
109 |
110 | ```sh
111 | python scripts/model_evaluation.py
112 | ```
113 |
114 | 8. **Generate comprehensive model performance reports:**
115 |
116 | ```sh
117 | python scripts/comprehensive_model_report.py
118 | ```
119 |
120 | ## 🚀 Usage
121 |
122 | - **Exploratory Data Analysis**: Check the `notebooks/exploratory_data_analysis.ipynb` notebook for detailed data analysis and visualizations.
123 | - **Scripts**: All scripts for data preprocessing, feature engineering, model training, and evaluation are located in the `scripts/` directory.
124 | - **Tests**: To ensure code quality and correctness, tests are included in the `tests/` directory. Run them with `pytest`.
125 |
126 | ## 📊 Models
127 |
128 | The following models are trained and evaluated in this project:
129 |
130 | ---
131 |
132 | ### Logistic Regression
133 |
134 | #### ROC Curve:
135 | 
136 |
137 | *The ROC curve illustrates the true positive rate (sensitivity) versus the false positive rate (1-specificity) for different threshold settings. A higher area under the curve (AUC) indicates better model performance.*
138 |
139 | #### Confusion Matrix:
140 | 
141 |
142 | *The confusion matrix provides a summary of the prediction results on the classification problem. It shows the number of true positive (TP), true negative (TN), false positive (FP), and false negative (FN) predictions.*
143 |
144 | ---
145 |
146 | ### Random Forest
147 |
148 | #### ROC Curve:
149 | 
150 |
151 | *The ROC curve illustrates the true positive rate (sensitivity) versus the false positive rate (1-specificity) for different threshold settings. A higher area under the curve (AUC) indicates better model performance.*
152 |
153 | #### Confusion Matrix:
154 | 
155 |
156 | *The confusion matrix provides a summary of the prediction results on the classification problem. It shows the number of true positive (TP), true negative (TN), false positive (FP), and false negative (FN) predictions.*
157 |
158 |
159 |
160 | ## 🎯 Performance Metrics
161 |
162 | The performance of the models is evaluated using the following metrics:
163 |
164 | - **Accuracy**
165 | - **Precision**
166 | - **Recall**
167 | - **F1 Score**
168 | - **ROC AUC Score**
169 | - **Confusion Matrix**
170 |
171 | ### Logistic Regression
172 |
173 | - **Accuracy (Doğruluk):** %78.99
174 | - **Precision (Kesinlik):** %73.19
175 | - **Recall (Duyarlılık):** %70.63
176 | - **F1 Score:** %71.89
177 | - **ROC AUC:** %83.86
178 |
179 | **Confusion Matrix:**
180 | ```plaintext
181 | [[196 37]
182 | [ 42 101]]
183 | ```
184 | Model dosyası:
185 | ```sh
186 | models/logistic_regression.pkl
187 | ```
188 |
189 | ### Random Forest
190 |
191 | - **Accuracy (Doğruluk):** %91.22
192 | - **Precision (Kesinlik):** %94.35
193 | - **Recall (Duyarlılık):** %81.82
194 | - **F1 Score:** %87.64
195 | - **ROC AUC:** %97.69
196 |
197 | **Confusion Matrix:**
198 | ```plaintext
199 | [[226 7]
200 | [ 26 117]]
201 | ```
202 | Model dosyası:
203 | ```sh
204 | models/random_forest.pkl
205 | ```
206 | ##### Explanations:
207 |
208 | 1. [x] **_Accuracy:_** The ratio of correctly predicted instances to the total instances.
209 | 2. [x] **_Precision:_** The ratio of true positive predictions to the total predicted positives. It measures the accuracy of positive predictions.
210 | 3. [x] **_Recall:_** The ratio of true positive predictions to the actual positives. It measures the model's ability to identify positive instances.
211 | 4. [x] **_F1 Score:_** The harmonic mean of precision and recall. It provides a balance between precision and recall.
212 | 5. [x] **_ROC AUC:_** The area under the ROC curve. It summarizes the model's ability to distinguish between classes.
213 |
214 | **Confusion Matrix:**
215 |
216 | * True Positive (TP): 117 - The number of actual positive cases correctly identified by the model.
217 | * True Negative (TN): 226 - The number of actual negative cases correctly identified by the model.
218 | * False Positive (FP): 7 - The number of actual negative cases incorrectly identified as positive by the model.
219 | * False Negative (FN): 26 - The number of actual positive cases incorrectly identified as negative by the model.
220 |
221 | ##### Explanations:
222 | 1. [x] **_Accuracy:_** The ratio of correctly predicted instances to the total instances.
223 | 2. [x] **_Precision:_** The ratio of true positive predictions to the total predicted positives. It measures the accuracy of positive predictions.
224 | 3. [x] **_Recall:_** The ratio of true positive predictions to the actual positives. It measures the model's ability to identify positive instances.
225 | 4. [x] **_F1 Score:_** The harmonic mean of precision and recall. It provides a balance between precision and recall.
226 | 5. [x] **_ROC AUC:_** The area under the ROC curve. It summarizes the model's ability to distinguish between classes.
227 |
228 | **Confusion Matrix:**
229 |
230 | * True Positive (TP): 117 - The number of actual positive cases correctly identified by the model.
231 | * True Negative (TN): 226 - The number of actual negative cases correctly identified by the model.
232 | * False Positive (FP): 7 - The number of actual negative cases incorrectly identified as positive by the model.
233 | * False Negative (FN): 26 - The number of actual positive cases incorrectly identified as negative by the model.
234 |
235 | ### XGBoost
236 |
237 | - **Accuracy (Doğruluk):** %91.76
238 | - **Precision (Kesinlik):** %93.08
239 | - **Recall (Duyarlılık):** %84.62
240 | - **F1 Score:** %88.64
241 | - **ROC AUC:** %98.41
242 |
243 | **Confusion Matrix:**
244 | ```plaintext
245 | [[224 9]
246 | [ 22 121]]
247 | ```
248 | Model dosyası:
249 | ```sh
250 | models/xgboost.pkl
251 | ```
252 | ##### Explanations:
253 |
254 | 1. [x] **_Accuracy:_** The ratio of correctly predicted instances to the total instances.
255 | 2. [x] **_Precision:_** The ratio of true positive predictions to the total predicted positives. It measures the accuracy of positive predictions.
256 | 3. [x] **_Recall:_** The ratio of true positive predictions to the actual positives. It measures the model's ability to identify positive instances.
257 | 4. [x] _**F1 Score:**_ The harmonic mean of precision and recall. It provides a balance between precision and recall.
258 | 5. [x] **_ROC AUC:_** The area under the ROC curve. It summarizes the model's ability to distinguish between classes.
259 |
260 | **Confusion Matrix:**
261 |
262 | * True Positive (TP): 121 - The number of actual positive cases correctly identified by the model.
263 | * True Negative (TN): 224 - The number of actual negative cases correctly identified by the model.
264 | * False Positive (FP): 9 - The number of actual negative cases incorrectly identified as positive by the model.
265 | * False Negative (FN): 22 - The number of actual positive cases incorrectly identified as negative by the model.
266 |
267 | ## 📈 Results
268 |
269 | Model performance reports and evaluation metrics are saved and displayed in the `comprehensive_model_report.py` script output.
270 |
271 | ## 💡 Future Work
272 |
273 | - Implement more advanced deep learning models (e.g., Neural Networks, LSTM).
274 | - Perform hyperparameter tuning to optimize model performance.
275 | - Explore feature selection techniques to improve model accuracy.
276 | - Integrate additional health datasets for broader analysis.
277 |
278 | ## 🤝 Contributing
279 |
280 | Contributions are welcome! Please feel free to submit a Pull Request.
281 |
282 | Whether it's improving the documentation, adding new features, or fixing bugs, your contributions are highly appreciated. Let's make this project better together! 🚀
283 |
284 | ### How to Contribute:
285 |
286 | 1. **Fork the Repository**: Click on the 'Fork' button at the top right corner of this page to create a copy of this repository in your GitHub account.
287 |
288 | 2. **Clone the Forked Repository**:
289 | ```bash
290 | git clone https://github.com/your-username/Diabetes_Health_Prediction_and_Analysis.git
291 | ```
292 |
293 | 3. **Create a New Branch**:
294 | ```bash
295 | git checkout -b feature/your-feature-name
296 | ```
297 |
298 | 4. **Make Your Changes**: Implement your feature, bug fix, or improvement.
299 |
300 | 5. **Commit Your Changes**:
301 | ```bash
302 | git commit -m "Add your commit message here"
303 | ```
304 |
305 | 6. **Push to Your Forked Repository**:
306 | ```bash
307 | git push origin feature/your-feature-name
308 | ```
309 |
310 | 7. **Open a Pull Request**: Go to the original repository on GitHub and click on the 'New Pull Request' button. Compare changes from your forked repository and submit the pull request.
311 |
312 | ---
313 |
314 | Thank you for your contributions! Together, we can build a more robust and efficient Diabetes Health Prediction and Analysis tool. 🌟
315 |
316 | ## 📄 License
317 |
318 | This project is licensed under the MIT License.
319 |
320 | ## 📬 Contact
321 |
322 | If you have any questions or suggestions, feel free to open an issue or contact me directly. I am always open to feedback and would love to hear from you!
323 |
324 | ---
325 |
326 | ### How to Reach Me:
327 |
328 | - **Email:** [piinartp@gmail.com](mailto:piinartp@gmail.com)
329 | - **GitHub Issues:** [Open an Issue](https://github.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/issues)
330 | - **LinkedIn:** [My LinkedIn Profile](https://www.linkedin.com/in/piinartp/)
331 |
332 | ---
333 |
334 | Thank you for your interest in the Diabetes Health Prediction and Analysis project! Your feedback and suggestions are invaluable in making this project better and more useful for everyone. 🌟
335 |
336 | 
337 |
338 | ---
339 |
340 |
341 | ---
342 |
343 | ⭐️ Don't forget to give this project a star if you found it useful! ⭐️
344 |
--------------------------------------------------------------------------------
/app/app.py:
--------------------------------------------------------------------------------
from flask import Flask, request, render_template
import joblib
import pandas as pd
from pathlib import Path

# BUG FIX: paths were CWD-relative and mixed ("./models" vs "../data"), so the
# app only worked when launched from inside app/. Anchor everything to this
# file's location so it works from any working directory.
_APP_DIR = Path(__file__).resolve().parent

# Load the trained pipeline (preprocessing + XGBoost classifier).
model = joblib.load(_APP_DIR / "models" / "pipeline_xgboost.pkl")

# Recover the feature names the model was trained on from the engineered
# training set (repo-root data/ directory, one level above app/).
X_train = pd.read_csv(_APP_DIR.parent / "data" / "processed" / "X_train_engineered.csv")
feature_names = X_train.columns.tolist()

# 'PatientID' is an identifier, not a predictive feature.
if 'PatientID' in feature_names:
    feature_names.remove('PatientID')

# Group features into form sections by substring match.
# NOTE(review): matching is case-sensitive; confirm the engineered column
# names are lower-cased, otherwise e.g. 'Age' would not match 'age'.
demographic_features = [feature for feature in feature_names if
                        'age' in feature or 'gender' in feature or 'ethnicity' in feature]
medical_history_features = [feature for feature in feature_names if 'history' in feature or 'diabetes' in feature]
lifestyle_features = [feature for feature in feature_names if
                      'bmi' in feature or 'smoking' in feature or 'activity' in feature]

app = Flask(__name__)
24 |
25 |
@app.route('/')
def home():
    """Render the input form with features grouped into page sections."""
    context = {
        'demographic_features': demographic_features,
        'medical_history_features': medical_history_features,
        'lifestyle_features': lifestyle_features,
    }
    return render_template('index.html', **context)
31 |
32 |
@app.route('/predict', methods=['POST'])
def predict():
    """Read the submitted form, run the model, and re-render with the result.

    Returns a 400 response when a feature is missing from the form or a
    submitted value cannot be parsed as a float.
    """
    try:
        # Collect every expected feature from the form, coercing to float.
        row = {name: float(request.form[name]) for name in feature_names}
        input_data = pd.DataFrame([row], columns=feature_names)

        # Predicted class label and probability of the positive class.
        prediction = model.predict(input_data)[0]
        prediction_prob = model.predict_proba(input_data)[0][1]

        return render_template('index.html', prediction=prediction, probability=prediction_prob,
                               demographic_features=demographic_features,
                               medical_history_features=medical_history_features,
                               lifestyle_features=lifestyle_features)
    except KeyError as e:
        return f"Missing form data for feature: {e.args[0]}", 400
    except ValueError as e:
        return str(e), 400
52 |
53 |
if __name__ == "__main__":
    # Start Flask's built-in development server with debug mode enabled
    # (auto-reload and interactive tracebacks; not suitable for production).
    app.run(debug=True)
56 |
--------------------------------------------------------------------------------
/app/models/pipeline_xgboost.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/app/models/pipeline_xgboost.pkl
--------------------------------------------------------------------------------
/app/static/styles.css:
--------------------------------------------------------------------------------
/* Global page styling for the diabetes prediction form. */
body {
    background-color: #f4f7f6;
    font-family: 'Roboto', sans-serif;
}

/* Centered content column. */
.container {
    max-width: 800px;
    margin: auto;
}

/* Card wrapper around the form. */
.card {
    border: none;
    border-radius: 15px;
    overflow: hidden;
}

/* Blue gradient header bar. */
.card-header {
    background: linear-gradient(45deg, #007bff, #0056b3);
}

.card-title {
    margin: 0;
}

/* Primary action button with a color transition on hover. */
.btn-primary {
    background-color: #007bff;
    border-color: #007bff;
    transition: background-color 0.3s, border-color 0.3s;
}

.btn-primary:hover {
    background-color: #0056b3;
    border-color: #0056b3;
}

/* Bordered white box for each group of form inputs. */
.form-section {
    margin-bottom: 20px;
    padding: 20px;
    border: 1px solid #e0e0e0;
    border-radius: 10px;
    background-color: #ffffff;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.05);
}

.section-title {
    margin-bottom: 15px;
    font-size: 1.3em;
    font-weight: bold;
    color: #333333;
}

/* Green-tinted panel that displays the prediction result. */
.result {
    background-color: #e9f7ef;
    border: 1px solid #d4edda;
    border-radius: 10px;
    padding: 20px;
}

.result h2, .result h3 {
    margin: 0;
    color: #155724;
}
--------------------------------------------------------------------------------
/app/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | Diabetes Prediction
10 |
11 |
12 |
13 |
14 |
15 |
18 |
19 |
64 |
65 | {% if prediction is not none %}
66 |
67 |
68 |
Prediction: {{ prediction }}
69 | Probability: {{ probability }}
70 |
71 |
72 | {% endif %}
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
--------------------------------------------------------------------------------
/models/logistic_regression.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/models/logistic_regression.pkl
--------------------------------------------------------------------------------
/models/random_forest.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/models/random_forest.pkl
--------------------------------------------------------------------------------
/models/xgboost.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/models/xgboost.pkl
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pytest~=8.2.2
2 | pandas~=2.2.2
3 | joblib~=1.4.0
4 | scikit-learn~=1.4.2
5 | xgboost~=2.0.3
6 | numpy~=1.26.4
7 | seaborn~=0.13.2
8 | matplotlib~=3.8.4
9 | flask~=3.0.3
9 |
--------------------------------------------------------------------------------
/scripts/__pycache__/utility_functions.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/__pycache__/utility_functions.cpython-311.pyc
--------------------------------------------------------------------------------
/scripts/comprehensive_model_report.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import matplotlib.pyplot as plt
3 | from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report, accuracy_score, \
4 | precision_score, recall_score, f1_score
5 | import seaborn as sns
6 | import joblib
7 | import os
8 | from utility_functions import load_model, load_train_test_data, plot_roc_curve, plot_confusion_matrix
9 |
10 |
# Load a persisted model together with its held-out evaluation data.
def load_model_and_data(model_path, X_test_path, y_test_path):
    """Return (model, X_test, y_test), with labels as a flat 1-D int array."""
    model = load_model(model_path)
    features = pd.read_csv(X_test_path)
    labels = pd.read_csv(y_test_path).values.ravel().astype('int')
    return model, features, labels
17 |
18 |
# Draw the ROC curve for a fitted classifier and persist it under plots/.
def plot_and_save_roc_curve(model, X_test, y_test, model_name):
    """Plot and save the ROC curve for *model*; return its ROC AUC score."""
    positive_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    auc_value = roc_auc_score(y_test, positive_scores)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {auc_value:.2f})')
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {model_name}')
    plt.legend(loc="lower right")
    # Persist before showing; plt.show() blocks until the window closes.
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/{model_name}_roc_curve.png')
    plt.show()
    return auc_value
38 |
39 |
# Render a labelled confusion-matrix heatmap and persist it under plots/.
def plot_and_save_confusion_matrix(y_test, y_pred, model_name):
    """Plot and save the confusion matrix for one model's predictions."""
    class_labels = ['No Diabetes', 'Diabetes']
    matrix = confusion_matrix(y_test, y_pred)
    plt.figure()
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - {model_name}')
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/{model_name}_confusion_matrix.png')
    plt.show()
52 |
53 |
# Build the full evaluation artefacts (plots, metrics, CSV) for one model.
def comprehensive_model_report(model_path, X_test_path, y_test_path, model_name):
    """Evaluate one persisted model; save its ROC/confusion plots and CSV report."""
    model, X_test, y_test = load_model_and_data(model_path, X_test_path, y_test_path)
    y_pred = model.predict(X_test)

    # ROC curve plot (also yields the AUC reused in the report below).
    roc_auc = plot_and_save_roc_curve(model, X_test, y_test, model_name)

    # Confusion-matrix heatmap.
    plot_and_save_confusion_matrix(y_test, y_pred, model_name)

    # Per-class precision/recall/F1 table: printed and kept for the CSV.
    class_report = classification_report(y_test, y_pred, output_dict=True)
    print(f"Classification Report for {model_name}:\n")
    print(classification_report(y_test, y_pred))

    # Scalar summary metrics.
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'roc_auc': roc_auc
    }

    # Persist the classification report, annotated with accuracy and AUC.
    report_path = f'reports/{model_name}_report.csv'
    os.makedirs('reports', exist_ok=True)
    report_df = pd.DataFrame(class_report).transpose()
    report_df['accuracy'] = metrics['accuracy']
    report_df['roc_auc'] = roc_auc
    report_df.to_csv(report_path)

    print(f"Comprehensive report saved as {report_path}")
    print(f"Metrics: {metrics}")
94 |
95 |
# Entry point: evaluate every trained model against the engineered test set.
def main():
    """Generate comprehensive reports for all trained models.

    All paths are relative to the scripts/ directory, matching the other
    pipeline scripts (which use "../data/...").
    """
    # BUG FIX: the model paths previously lacked the "../" prefix while the
    # data paths had it, so one of the two sets was always wrong regardless
    # of the working directory. Both now resolve from the repository root
    # when the script is run from scripts/.
    model_paths = ["../models/logistic_regression.pkl", "../models/random_forest.pkl", "../models/xgboost.pkl"]
    model_names = ["Logistic Regression", "Random Forest", "XGBoost"]
    X_test_path = "../data/processed/X_test_engineered.csv"
    y_test_path = "../data/processed/y_test.csv"

    for model_path, model_name in zip(model_paths, model_names):
        comprehensive_model_report(model_path, X_test_path, y_test_path, model_name)
105 |
106 |
if __name__ == "__main__":
    # Run the full reporting pipeline when executed as a script.
    main()
109 |
--------------------------------------------------------------------------------
/scripts/data_preprocessing.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import seaborn as sns
3 | import matplotlib.pyplot as plt
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.preprocessing import StandardScaler, OneHotEncoder
6 | from sklearn.impute import SimpleImputer
7 | import os
8 |
9 |
# Read the raw dataset from disk.
def load_data(file_path):
    """Return the CSV file at *file_path* as a DataFrame."""
    return pd.read_csv(file_path)
14 |
15 |
# Exploratory data analysis: printed summaries plus distribution and
# correlation plots.
def perform_eda(df, dataset_name=""):
    """Print summary information and show EDA plots for *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to explore.
    dataset_name : str
        Label prefixed to printed headers and plot titles.
    """
    print(f"{dataset_name} Veri Setinin İlk 5 Satırı:\n", df.head())
    # BUG FIX: DataFrame.info() prints directly and returns None, so passing
    # it as a print() argument used to emit a stray "None". Call it on its
    # own line instead.
    print(f"\n{dataset_name} Veri Seti Hakkında Bilgiler:\n")
    df.info()
    print(f"\n{dataset_name} Veri Setindeki Eksik Değerler:\n", df.isnull().sum())
    print(f"\n{dataset_name} Temel İstatistikler:\n", df.describe())

    # Distribution of categorical variables.
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    for column in categorical_columns:
        plt.figure(figsize=(10, 5))
        sns.countplot(data=df, x=column)
        plt.title(f'{dataset_name} {column} Dağılımı')
        plt.show()

    # Distribution of numeric variables.
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns].hist(bins=15, figsize=(20, 15))
    plt.suptitle(f'{dataset_name} Sayısal Değişkenlerin Dağılımı')
    plt.show()

    # Correlation matrix. The DoctorInCharge column is dropped first —
    # presumably it is non-numeric and would break DataFrame.corr(); confirm.
    if 'DoctorInCharge' in df.columns:
        df_corr = df.drop(columns=['DoctorInCharge'])
    else:
        df_corr = df.copy()
    plt.figure(figsize=(15, 10))
    sns.heatmap(df_corr.corr(), annot=True, cmap='coolwarm')
    plt.title(f'{dataset_name} Korelasyon Matrisi')
    plt.show()
46 |
47 |
# Impute missing numeric values.
def handle_missing_values(df):
    """Fill NaNs in numeric columns with the column mean (in place); return df."""
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    mean_imputer = SimpleImputer(strategy='mean')
    df[numeric_cols] = mean_imputer.fit_transform(df[numeric_cols])
    return df
54 |
55 |
# One-hot encode categorical (object-dtype) columns.
def encode_categorical_values(df):
    """Replace object-dtype columns with one-hot dummies (first level dropped).

    Numeric columns are left untouched; returns the transformed DataFrame.
    """
    categorical_columns = df.select_dtypes(include=['object']).columns
    # Robustness: nothing to encode — return unchanged instead of letting
    # OneHotEncoder fail on an empty column selection.
    if len(categorical_columns) == 0:
        return df

    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_columns = pd.DataFrame(
        encoder.fit_transform(df[categorical_columns]),
        columns=encoder.get_feature_names_out(categorical_columns),
        # BUG FIX: reuse df's index. A fresh RangeIndex made pd.concat
        # misalign rows (introducing NaNs) whenever df's index was not 0..n-1.
        index=df.index,
    )
    df = df.drop(categorical_columns, axis=1)
    df = pd.concat([df, encoded_columns], axis=1)
    return df
65 |
66 |
# Standardize numeric columns (zero mean, unit variance).
def normalize_data(df):
    """Scale numeric columns with StandardScaler (in place); return df."""
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_cols] = StandardScaler().fit_transform(df[numeric_cols])
    return df
73 |
74 |
# Split the dataset into train and test partitions.
def split_data(df, target_column, test_size=0.2, random_state=42):
    """Return (X_train, X_test, y_train, y_test) split on *target_column*."""
    y = df[target_column]
    X = df.drop(target_column, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test
81 |
82 |
# Persist the split datasets as CSV files.
def save_datasets(X_train, X_test, y_train, y_test, output_dir):
    """Write the four data splits to *output_dir* (created if missing) as CSVs."""
    os.makedirs(output_dir, exist_ok=True)
    named_splits = {
        'X_train.csv': X_train,
        'X_test.csv': X_test,
        'y_train.csv': y_train,
        'y_test.csv': y_test,
    }
    for filename, frame in named_splits.items():
        frame.to_csv(os.path.join(output_dir, filename), index=False)
    print(f"Datasets saved to {output_dir}")
91 |
92 |
# End-to-end preprocessing pipeline
def main(file_path, target_column, output_dir):
    """Load, explore, clean, encode, scale, split and save the dataset.

    Returns the four splits (X_train, X_test, y_train, y_test).
    """
    df = load_data(file_path)
    perform_eda(df, dataset_name="Orijinal")

    # Cleaning / transformation steps, applied in order.
    for step in (handle_missing_values, encode_categorical_values, normalize_data):
        df = step(df)

    X_train, X_test, y_train, y_test = split_data(df, target_column)
    save_datasets(X_train, X_test, y_train, y_test, output_dir)

    # Per-split EDA (features joined back with their targets).
    perform_eda(pd.concat([X_train, y_train], axis=1), dataset_name="Eğitim Seti")
    perform_eda(pd.concat([X_test, y_test], axis=1), dataset_name="Test Seti")

    return X_train, X_test, y_train, y_test
108 |
109 |
if __name__ == "__main__":
    file_path = "../data/raw/diabetes_data.csv" # Path to the raw dataset
    target_column = "Diagnosis" # Target variable
    output_dir = "../data/processed" # Directory the splits are written to
    X_train, X_test, y_train, y_test = main(file_path, target_column, output_dir)
    print("Veri ön işleme tamamlandı ve veriler eğitim/test setlerine ayrıldı.")
116 |
--------------------------------------------------------------------------------
/scripts/feature_engineering.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.preprocessing import PolynomialFeatures
3 |
4 |
# Derived interaction features
def create_new_features(df):
    """Add Age_BMI = Age * BMI as a new column (in place).

    E.g. the age/BMI product serves as an interaction feature.
    """
    df['Age_BMI'] = df['BMI'].mul(df['Age'])
    return df
10 |
11 |
# Polynomial feature expansion
def add_polynomial_features(df, degree=2):
    """Append polynomial/interaction terms of the numeric columns.

    Fix: PolynomialFeatures re-emits the original degree-1 columns, so
    concatenating its raw output duplicated every numeric column name in
    the result. The degree-1 duplicates are now dropped before joining,
    leaving only genuinely new (power and interaction) columns.
    """
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    poly = PolynomialFeatures(degree, include_bias=False)
    poly_features = poly.fit_transform(df[numeric_columns])
    poly_df = pd.DataFrame(poly_features,
                           columns=poly.get_feature_names_out(numeric_columns))

    # Drop the degree-1 copies of the existing numeric columns.
    poly_df = poly_df.drop(columns=list(numeric_columns), errors='ignore')

    # Align on a fresh positional index before concatenation.
    df = df.reset_index(drop=True)
    poly_df = poly_df.reset_index(drop=True)

    return pd.concat([df, poly_df], axis=1)
25 |
26 |
# Feature-engineering pipeline entry point
def main(train_file_path, test_file_path, train_output_path, test_output_path, degree=2):
    """Apply feature engineering to the train and test CSVs and save them."""
    jobs = ((train_file_path, train_output_path, "Eğitim"),
            (test_file_path, test_output_path, "Test"))
    for input_path, output_path, label in jobs:
        frame = pd.read_csv(input_path)
        frame = create_new_features(frame)
        frame = add_polynomial_features(frame, degree)
        frame.to_csv(output_path, index=False)
        print(f"{label} veri seti için feature engineering tamamlandı ve {output_path} dosyasına kaydedildi.")
42 |
43 |
if __name__ == "__main__":
    train_file_path = "../data/processed/X_train.csv" # Path to the training feature set
    test_file_path = "../data/processed/X_test.csv" # Path to the test feature set
    train_output_path = "../data/processed/X_train_engineered.csv" # Output path for the engineered training set
    test_output_path = "../data/processed/X_test_engineered.csv" # Output path for the engineered test set
    degree = 2 # Polynomial degree
    main(train_file_path, test_file_path, train_output_path, test_output_path, degree)
51 |
--------------------------------------------------------------------------------
/scripts/model_evaluation.py:
--------------------------------------------------------------------------------
import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report
6 |
7 |
# Load a fitted model together with its test data
def load_model_and_data(model_path, X_test_path, y_test_path):
    """Return (model, X_test, y_test); targets are flattened to int."""
    return (
        joblib.load(model_path),
        pd.read_csv(X_test_path),
        pd.read_csv(y_test_path).values.ravel().astype('int'),
    )
14 |
15 |
# Plot and save the ROC curve
def plot_roc_curve(model, X_test, y_test, model_name):
    """Plot the ROC curve for *model* on the test set and save it to
    plots/<model_name>_roc_curve.png.

    Fix: the 'plots' directory is now created before saving (matching
    utility_functions.py); previously savefig raised FileNotFoundError
    when the directory did not exist yet.
    """
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    roc_auc = roc_auc_score(y_test, y_pred_prob)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {model_name}')
    plt.legend(loc="lower right")
    os.makedirs('plots', exist_ok=True)  # ensure output directory exists
    plt.savefig(f'plots/{model_name}_roc_curve.png')
    plt.show()
33 |
34 |
# Plot and save the confusion matrix
def plot_confusion_matrix(y_test, y_pred, model_name):
    """Draw the confusion matrix heatmap and save it to
    plots/<model_name>_confusion_matrix.png.

    Fix: the 'plots' directory is now created before saving (matching
    utility_functions.py); previously savefig raised FileNotFoundError
    when the directory did not exist yet.
    """
    cm = confusion_matrix(y_test, y_pred)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Diabetes', 'Diabetes'],
                yticklabels=['No Diabetes', 'Diabetes'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - {model_name}')
    os.makedirs('plots', exist_ok=True)  # ensure output directory exists
    plt.savefig(f'plots/{model_name}_confusion_matrix.png')
    plt.show()
46 |
47 |
# Full evaluation of one serialized model
def evaluate_model(model_path, X_test_path, y_test_path, model_name):
    """Load a saved model plus its test data and run the evaluation suite:
    ROC curve, confusion matrix, and a printed classification report."""
    model, X_test, y_test = load_model_and_data(model_path, X_test_path, y_test_path)
    predictions = model.predict(X_test)

    plot_roc_curve(model, X_test, y_test, model_name)
    plot_confusion_matrix(y_test, predictions, model_name)

    print(f"Classification Report for {model_name}:\n")
    print(classification_report(y_test, predictions))
62 |
63 |
# Evaluate every trained model against the engineered test set
def main():
    """Run the evaluation suite for each serialized classifier."""
    jobs = (
        ("models/logistic_regression.pkl", "Logistic Regression"),
        ("models/random_forest.pkl", "Random Forest"),
        ("models/xgboost.pkl", "XGBoost"),
    )
    X_test_path = "../data/processed/X_test_engineered.csv"
    y_test_path = "../data/processed/y_test.csv"

    for model_path, model_name in jobs:
        evaluate_model(model_path, X_test_path, y_test_path, model_name)
73 |
74 |
# Script entry point: evaluate each saved model.
if __name__ == "__main__":
    main()
77 |
--------------------------------------------------------------------------------
/scripts/model_training.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.model_selection import train_test_split, GridSearchCV
3 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
4 | from sklearn.linear_model import LogisticRegression
5 | from sklearn.ensemble import RandomForestClassifier
6 | from xgboost import XGBClassifier
7 | import joblib
8 | import os
9 |
10 |
# Load the persisted train/test splits
def load_data(X_train_path, X_test_path, y_train_path, y_test_path):
    """Read feature and target CSVs; targets come back as flat int arrays."""
    def read_target(path):
        # Single-column CSV -> 1-D integer vector.
        return pd.read_csv(path).values.ravel().astype('int')

    return (pd.read_csv(X_train_path),
            pd.read_csv(X_test_path),
            read_target(y_train_path),
            read_target(y_test_path))
19 |
20 |
# Train a model, print evaluation metrics, and persist it
def train_and_evaluate_model(X_train, X_test, y_train, y_test, model, model_name):
    """Fit *model* on the training split, report standard classification
    metrics on the test split, and save the fitted model with joblib.

    Fix: the confirmation message previously claimed the model was saved
    as 'models/<name>.pkl' while joblib actually wrote
    '../models/<name>.pkl'; the message now reports the real path.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]  # positive-class scores

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    cm = confusion_matrix(y_test, y_pred)

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("\n")

    # Persist the fitted model.
    os.makedirs('../models', exist_ok=True)
    model_path = f"../models/{model_name}.pkl"
    joblib.dump(model, model_path)
    print(f"Model saved as {model_path}")
48 |
49 |
# Train and evaluate all three classifiers
def main(X_train_path, X_test_path, y_train_path, y_test_path):
    """Train logistic regression, random forest and XGBoost models and
    print their evaluation metrics."""
    X_train, X_test, y_train, y_test = load_data(
        X_train_path, X_test_path, y_train_path, y_test_path)

    classifiers = (
        ("logistic_regression", LogisticRegression(max_iter=1000)),
        ("random_forest", RandomForestClassifier(n_estimators=100)),
        ("xgboost", XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    )
    for name, estimator in classifiers:
        train_and_evaluate_model(X_train, X_test, y_train, y_test, estimator, name)
65 |
66 |
if __name__ == "__main__":
    X_train_path = "../data/processed/X_train_engineered.csv" # Engineered training features
    X_test_path = "../data/processed/X_test_engineered.csv" # Engineered test features
    y_train_path = "../data/processed/y_train.csv" # Training target values
    y_test_path = "../data/processed/y_test.csv" # Test target values
    main(X_train_path, X_test_path, y_train_path, y_test_path)
73 |
--------------------------------------------------------------------------------
/scripts/plots/Logistic Regression_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/Logistic Regression_confusion_matrix.png
--------------------------------------------------------------------------------
/scripts/plots/Logistic Regression_roc_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/Logistic Regression_roc_curve.png
--------------------------------------------------------------------------------
/scripts/plots/Random Forest_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/Random Forest_confusion_matrix.png
--------------------------------------------------------------------------------
/scripts/plots/Random Forest_roc_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/Random Forest_roc_curve.png
--------------------------------------------------------------------------------
/scripts/plots/XGBoost_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/XGBoost_confusion_matrix.png
--------------------------------------------------------------------------------
/scripts/plots/XGBoost_roc_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/XGBoost_roc_curve.png
--------------------------------------------------------------------------------
/scripts/reports/Logistic Regression_report.csv:
--------------------------------------------------------------------------------
1 | ,precision,recall,f1-score,support,accuracy,roc_auc
2 | 0,0.8235294117647058,0.8412017167381974,0.832271762208068,233.0,0.7898936170212766,0.8386206068609503
3 | 1,0.7318840579710145,0.7062937062937062,0.7188612099644128,143.0,0.7898936170212766,0.8386206068609503
4 | accuracy,0.7898936170212766,0.7898936170212766,0.7898936170212766,0.7898936170212766,0.7898936170212766,0.8386206068609503
5 | macro avg,0.7777067348678601,0.7737477115159519,0.7755664860862403,376.0,0.7898936170212766,0.8386206068609503
6 | weighted avg,0.7886749288059349,0.7898936170212766,0.7891395574983799,376.0,0.7898936170212766,0.8386206068609503
7 |
--------------------------------------------------------------------------------
/scripts/reports/Random Forest_report.csv:
--------------------------------------------------------------------------------
1 | ,precision,recall,f1-score,support,accuracy,roc_auc
2 | 0,0.8968253968253969,0.9699570815450643,0.931958762886598,233.0,0.9122340425531915,0.9768600498214233
3 | 1,0.9435483870967742,0.8181818181818182,0.8764044943820225,143.0,0.9122340425531915,0.9768600498214233
4 | accuracy,0.9122340425531915,0.9122340425531915,0.9122340425531915,0.9122340425531915,0.9122340425531915,0.9768600498214233
5 | macro avg,0.9201868919610856,0.8940694498634413,0.9041816286343103,376.0,0.9122340425531915,0.9768600498214233
6 | weighted avg,0.91459504472116,0.9122340425531915,0.9108304107691664,376.0,0.9122340425531915,0.9768600498214233
7 |
--------------------------------------------------------------------------------
/scripts/reports/XGBoost_report.csv:
--------------------------------------------------------------------------------
1 | ,precision,recall,f1-score,support,accuracy,roc_auc
2 | 0,0.9105691056910569,0.9613733905579399,0.9352818371607515,233.0,0.9175531914893617,0.9840631471532759
3 | 1,0.9307692307692308,0.8461538461538461,0.8864468864468864,143.0,0.9175531914893617,0.9840631471532759
4 | accuracy,0.9175531914893617,0.9175531914893617,0.9175531914893617,0.9175531914893617,0.9175531914893617,0.9840631471532759
5 | macro avg,0.9206691682301438,0.903763618355893,0.910864361803819,376.0,0.9175531914893617,0.9840631471532759
6 | weighted avg,0.9182516000691922,0.9175531914893617,0.9167089702669144,376.0,0.9175531914893617,0.9840631471532759
7 |
--------------------------------------------------------------------------------
/scripts/utility_functions.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import seaborn as sns
3 | import matplotlib.pyplot as plt
4 | from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report
5 | import joblib
6 | import os
7 |
8 |
# Read a CSV file into a DataFrame
def load_data(file_path):
    """Return the CSV at *file_path* as a pandas DataFrame."""
    return pd.read_csv(file_path)
13 |
14 |
# Load the four persisted splits
def load_train_test_data(X_train_path, X_test_path, y_train_path, y_test_path):
    """Return (X_train, X_test, y_train, y_test); targets as flat int arrays."""
    frames = [pd.read_csv(path) for path in (X_train_path, X_test_path)]
    targets = [pd.read_csv(path).values.ravel().astype('int')
               for path in (y_train_path, y_test_path)]
    return frames[0], frames[1], targets[0], targets[1]
22 |
23 |
# Persist a fitted model
def save_model(model, model_name):
    """Serialize *model* to ../models/<model_name>.pkl, creating the
    directory if needed.

    Fix: the confirmation message previously claimed the model was saved
    under 'models/...' while it was actually written to '../models/...';
    it now reports the real path.
    """
    os.makedirs('../models', exist_ok=True)
    model_path = f"../models/{model_name}.pkl"
    joblib.dump(model, model_path)
    print(f"Model saved as {model_path}")
29 |
30 |
# Restore a serialized model
def load_model(model_path):
    """Load and return a model previously saved with joblib."""
    return joblib.load(model_path)
35 |
36 |
# ROC curve: plot, save under plots/, and display
def plot_roc_curve(model, X_test, y_test, model_name):
    """Plot the ROC curve for *model* and save it to
    plots/<model_name>_roc_curve.png."""
    scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, scores)
    area = roc_auc_score(y_test, scores)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {area:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {model_name}')
    plt.legend(loc="lower right")
    # Create the output directory before saving.
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/{model_name}_roc_curve.png')
    plt.show()
55 |
56 |
# Confusion matrix heatmap: plot, save under plots/, and display
def plot_confusion_matrix(y_test, y_pred, model_name):
    """Draw the confusion matrix and save it to
    plots/<model_name>_confusion_matrix.png."""
    class_labels = ['No Diabetes', 'Diabetes']
    matrix = confusion_matrix(y_test, y_pred)

    plt.figure()
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - {model_name}')
    # Create the output directory before saving.
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/{model_name}_confusion_matrix.png')
    plt.show()
69 |
70 |
# Full evaluation for an already-fitted classifier
def evaluate_model(model, X_test, y_test, model_name):
    """Run the standard evaluation suite: ROC curve, confusion matrix,
    and a printed classification report."""
    predictions = model.predict(X_test)

    plot_roc_curve(model, X_test, y_test, model_name)
    plot_confusion_matrix(y_test, predictions, model_name)

    print(f"Classification Report for {model_name}:\n")
    print(classification_report(y_test, predictions))
84 |
85 |
# Write the train/test splits to CSV
def save_datasets(X_train, X_test, y_train, y_test, output_dir):
    """Save the four splits into *output_dir*, creating it if necessary."""
    os.makedirs(output_dir, exist_ok=True)
    for name, frame in zip(('X_train', 'X_test', 'y_train', 'y_test'),
                           (X_train, X_test, y_train, y_test)):
        frame.to_csv(os.path.join(output_dir, f'{name}.csv'), index=False)
    print(f"Datasets saved to {output_dir}")
94 |
--------------------------------------------------------------------------------
/tests/__pycache__/test_data_preprocessing.cpython-311-pytest-8.2.2.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/__pycache__/test_data_preprocessing.cpython-311-pytest-8.2.2.pyc
--------------------------------------------------------------------------------
/tests/__pycache__/test_feature_engineering.cpython-311-pytest-8.2.2.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/__pycache__/test_feature_engineering.cpython-311-pytest-8.2.2.pyc
--------------------------------------------------------------------------------
/tests/__pycache__/test_model_evaluation.cpython-311-pytest-8.2.2.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/__pycache__/test_model_evaluation.cpython-311-pytest-8.2.2.pyc
--------------------------------------------------------------------------------
/tests/__pycache__/test_model_training.cpython-311-pytest-8.2.2.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/__pycache__/test_model_training.cpython-311-pytest-8.2.2.pyc
--------------------------------------------------------------------------------
/tests/models/logistic_regression.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/models/logistic_regression.pkl
--------------------------------------------------------------------------------
/tests/models/random_forest.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/models/random_forest.pkl
--------------------------------------------------------------------------------
/tests/models/xgboost.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/models/xgboost.pkl
--------------------------------------------------------------------------------
/tests/test_data_preprocessing.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 | from sklearn.impute import SimpleImputer
4 | from sklearn.preprocessing import OneHotEncoder, StandardScaler
5 | from sklearn.model_selection import train_test_split
6 | import os
7 |
8 |
# Load the dataset from a CSV file
def load_data(file_path):
    """Read *file_path* into a DataFrame."""
    return pd.read_csv(file_path)
13 |
14 |
# Exploratory Data Analysis: console-only summaries of the frame.
def perform_eda(df):
    """Print head, info, missing-value counts and basic statistics.

    Console output only; returns None. Note that df.info() prints its
    report itself and returns None, so the second line also shows "None".
    """
    print("Veri Setinin İlk 5 Satırı:\n", df.head())
    print("\nVeri Seti Hakkında Bilgiler:\n", df.info())
    print("\nVeri Setindeki Eksik Değerler:\n", df.isnull().sum())
    print("\nTemel İstatistikler:\n", df.describe())
21 |
22 |
# Mean-impute missing numeric values
def handle_missing_values(df):
    """Replace NaNs in numeric columns with the column mean (in place)."""
    numeric = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric] = SimpleImputer(strategy='mean').fit_transform(df[numeric])
    return df
29 |
30 |
# One-hot encode categorical columns (first level dropped)
def encode_categorical_values(df):
    """Replace object/category columns with one-hot indicator columns."""
    # Columns to encode: both object- and category-dtype.
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    ohe = OneHotEncoder(sparse_output=False, drop='first')
    dummies = pd.DataFrame(ohe.fit_transform(df[cat_cols]),
                           columns=ohe.get_feature_names_out(cat_cols))
    # Swap the originals for their encoded counterparts.
    return pd.concat([df.drop(cat_cols, axis=1), dummies], axis=1)
45 |
46 |
# Standard-score the numeric columns
def normalize_data(df):
    """Scale numeric columns to zero mean and unit variance (in place)."""
    cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[cols] = StandardScaler().fit_transform(df[cols])
    return df
53 |
54 |
# Train/test split helper
def split_data(df, target_column, test_size=0.2, random_state=42):
    """Split *df* into (X_train, X_test, y_train, y_test)."""
    return train_test_split(df.drop(target_column, axis=1),
                            df[target_column],
                            test_size=test_size,
                            random_state=random_state)
61 |
62 |
# Save the splits into the output directory
def save_datasets(X_train, X_test, y_train, y_test, output_dir):
    """Write the four splits as CSVs under *output_dir* (created if missing)."""
    os.makedirs(output_dir, exist_ok=True)
    for filename, frame in (('X_train.csv', X_train), ('X_test.csv', X_test),
                            ('y_train.csv', y_train), ('y_test.csv', y_test)):
        frame.to_csv(os.path.join(output_dir, filename), index=False)
    print(f"Datasets saved to {output_dir}")
71 |
72 |
@pytest.fixture
def sample_data():
    """Small mixed-type frame: numeric Age/BMI, string Gender, binary target."""
    return pd.DataFrame({
        'Age': [25, 35, 45, 55],
        'Gender': ['0', '1', '0', '1'],  # kept as strings so encoding is exercised
        'BMI': [22.5, 24.5, 28.0, 30.0],
        'Diagnosis': [0, 1, 0, 1],
    })
82 |
83 |
84 | def test_load_data(sample_data):
85 | df = sample_data
86 | assert not df.empty
87 |
88 |
def test_handle_missing_values(sample_data):
    """A NaN injected into BMI must be imputed away."""
    frame = sample_data.copy()
    frame.loc[0, 'BMI'] = None
    assert handle_missing_values(frame)['BMI'].isnull().sum() == 0
94 |
95 |
def test_encode_categorical_values(sample_data):
    """Encoding must produce the Gender_1 indicator column."""
    encoded = encode_categorical_values(sample_data)
    assert 'Gender_1' in encoded.columns
100 |
101 |
def test_normalize_data(sample_data):
    """After standardization the Age mean is (near) zero, hence below 1."""
    normalized = normalize_data(sample_data)
    assert normalized['Age'].mean() < 1
106 |
107 |
def test_split_data(sample_data):
    """Train and test partitions together cover every row."""
    X_train, X_test, y_train, y_test = split_data(sample_data, 'Diagnosis')
    assert len(X_train) + len(X_test) == len(sample_data)
112 |
113 |
def test_save_datasets(tmpdir, sample_data):
    """save_datasets must write X_train.csv into the target directory."""
    X_train, X_test, y_train, y_test = split_data(sample_data, 'Diagnosis')
    target_dir = tmpdir.mkdir("data")
    save_datasets(X_train, X_test, y_train, y_test, target_dir)
    assert (target_dir / 'X_train.csv').check()
120 |
--------------------------------------------------------------------------------
/tests/test_feature_engineering.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 | import numpy as np
4 |
5 |
# Interaction feature
def create_new_features(df):
    """Add Age_BMI = Age * BMI as a new column (in place)."""
    df['Age_BMI'] = df['BMI'] * df['Age']
    return df
10 |
11 |
# Add power terms for each numeric column
def add_polynomial_features(df, degree=2):
    """Append '<col>^k' columns for k in 2..degree (in place).

    The set of numeric columns is snapshotted before any new columns are
    added, so the generated power columns are not themselves expanded.
    """
    numeric = df.select_dtypes(include=['float64', 'int64']).columns
    for column in numeric:
        for power in range(2, degree + 1):
            df[f'{column}^{power}'] = np.power(df[column], power)
    return df
18 |
19 |
@pytest.fixture
def sample_data():
    """Four-row numeric frame with Age and BMI columns."""
    return pd.DataFrame({'Age': [25, 35, 45, 55],
                         'BMI': [22.5, 24.5, 28.0, 30.0]})
27 |
28 |
def test_create_new_features(sample_data):
    """Age_BMI must exist and equal the element-wise product."""
    enriched = create_new_features(sample_data)
    assert 'Age_BMI' in enriched.columns
    assert (enriched['Age_BMI'] == enriched['Age'] * enriched['BMI']).all()
34 |
35 |
def test_add_polynomial_features(sample_data):
    """Degree-2 expansion adds squared columns with correct values."""
    expanded = add_polynomial_features(sample_data, degree=2)
    for column in ('Age', 'BMI'):
        assert f'{column}^2' in expanded.columns
        assert (expanded[f'{column}^2'] == expanded[column] ** 2).all()
43 |
--------------------------------------------------------------------------------
/tests/test_model_training.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 | from sklearn.linear_model import LogisticRegression
4 | from sklearn.ensemble import RandomForestClassifier
5 | from xgboost import XGBClassifier
6 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
7 | import joblib
8 | import os
9 |
10 |
# Load feature CSVs; target paths are derived by the X_ -> y_ convention
def load_data(train_path, test_path):
    """Return (X_train, X_test, y_train, y_test).

    NOTE(review): the target files are located by replacing 'X_' with
    'y_' in the feature paths — this relies on that naming convention.
    """
    def read_targets(path):
        return pd.read_csv(path.replace('X_', 'y_')).values.ravel().astype('int')

    X_train = pd.read_csv(train_path)
    X_test = pd.read_csv(test_path)
    return X_train, X_test, read_targets(train_path), read_targets(test_path)
18 |
19 |
# Fit a model, print standard metrics, and persist it under models/
def train_and_evaluate_model(X_train, X_test, y_train, y_test, model, model_name):
    """Train *model*, print classification metrics, and save it with joblib."""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    # Metric label -> value, in the exact order they are reported.
    metrics = (
        ("Accuracy", accuracy_score(y_test, y_pred)),
        ("Precision", precision_score(y_test, y_pred)),
        ("Recall", recall_score(y_test, y_pred)),
        ("F1 Score", f1_score(y_test, y_pred)),
        ("ROC AUC", roc_auc_score(y_test, y_pred_prob)),
    )

    print(f"Model: {model_name}")
    for label, value in metrics:
        print(f"{label}: {value:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")

    # Persist the fitted model.
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, f"models/{model_name}.pkl")
    print(f"Model saved as models/{model_name}.pkl")
47 |
48 |
@pytest.fixture
def sample_data():
    """Tiny training set (Age, BMI) with binary labels."""
    features = pd.DataFrame({'Age': [25, 35, 45, 55],
                             'BMI': [22.5, 24.5, 28.0, 30.0]})
    labels = pd.Series([0, 1, 0, 1])
    return features, labels
57 |
58 |
def test_train_and_evaluate_model(sample_data):
    """Each classifier trains on the toy data and is written to disk."""
    X_train, y_train = sample_data
    X_test, y_test = X_train.copy(), y_train.copy()

    classifiers = (
        (LogisticRegression(), "logistic_regression"),
        (RandomForestClassifier(n_estimators=10), "random_forest"),
        (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), "xgboost"),
    )
    for clf, name in classifiers:
        train_and_evaluate_model(X_train, X_test, y_train, y_test, clf, name)

    # All three pickles must exist afterwards.
    for name in ("logistic_regression", "random_forest", "xgboost"):
        assert os.path.exists(f'models/{name}.pkl')
79 |
80 |
def test_load_data(tmpdir):
    """load_data reads feature CSVs and finds the matching y_ files."""
    # Feature files (identical content for train and test).
    csv_body = "Age,BMI\n25,22.5\n35,24.5\n45,28.0\n55,30.0"
    train_file = tmpdir.join("X_train.csv")
    test_file = tmpdir.join("X_test.csv")
    train_file.write(csv_body)
    test_file.write(csv_body)

    # Matching target files (X_ -> y_ naming convention).
    tmpdir.join("y_train.csv").write("Diagnosis\n0\n1\n0\n1")
    tmpdir.join("y_test.csv").write("Diagnosis\n0\n1\n0\n1")

    X_train, X_test, y_train, y_test = load_data(str(train_file), str(test_file))

    # Shapes confirm both features and targets loaded correctly.
    assert X_train.shape == (4, 2)
    assert X_test.shape == (4, 2)
    assert y_train.shape == (4,)
    assert y_test.shape == (4,)
101 |
--------------------------------------------------------------------------------