├── .gitignore
├── README.md
├── app
├── app.py
├── models
│ └── pipeline_xgboost.pkl
├── static
│ └── styles.css
└── templates
│ └── index.html
├── data
└── processed
│ ├── X_test.csv
│ ├── X_test_engineered.csv
│ └── X_train.csv
├── models
├── logistic_regression.pkl
├── random_forest.pkl
└── xgboost.pkl
├── notebooks
└── exploratory_data_analysis.ipynb
├── raw
└── diabetes_data.csv
├── requirements.txt
├── scripts
├── __pycache__
│ └── utility_functions.cpython-311.pyc
├── comprehensive_model_report.py
├── data_preprocessing.py
├── feature_engineering.py
├── model_evaluation.py
├── model_training.py
├── plots
│ ├── Logistic Regression_confusion_matrix.png
│ ├── Logistic Regression_roc_curve.png
│ ├── Random Forest_confusion_matrix.png
│ ├── Random Forest_roc_curve.png
│ ├── XGBoost_confusion_matrix.png
│ └── XGBoost_roc_curve.png
├── reports
│ ├── Logistic Regression_report.csv
│ ├── Random Forest_report.csv
│ └── XGBoost_report.csv
└── utility_functions.py
└── tests
├── __pycache__
├── test_data_preprocessing.cpython-311-pytest-8.2.2.pyc
├── test_feature_engineering.cpython-311-pytest-8.2.2.pyc
├── test_model_evaluation.cpython-311-pytest-8.2.2.pyc
└── test_model_training.cpython-311-pytest-8.2.2.pyc
├── models
├── logistic_regression.pkl
├── random_forest.pkl
└── xgboost.pkl
├── test_data_preprocessing.py
├── test_feature_engineering.py
└── test_model_training.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Diabetes Health Prediction and Analysis 🎉
2 |
3 | 
4 |
5 | ---
6 |
7 | Welcome to the **Diabetes Health Prediction and Analysis** project! This repository contains a comprehensive pipeline for predicting diabetes diagnosis using various machine learning and deep learning models, along with an in-depth exploratory data analysis and feature engineering steps.
8 |
9 | ## 🚀 Project Overview
10 |
11 | This project aims to provide a thorough analysis of diabetes-related health data, develop predictive models, and evaluate their performance. The key components of the project include:
12 |
13 | - 📊 Data Preprocessing
14 | - 🔍 Exploratory Data Analysis (EDA)
15 | - 🛠️ Feature Engineering
16 | - 🧠 Model Training
17 | - 📈 Model Evaluation
18 | - 📑 Comprehensive Reports
19 |
20 | ## 📂 Project Structure
21 |
22 | Here's an overview of the project directory structure:
23 |
24 |
25 | ```plaintext
26 | Diabetes_Health_Prediction_and_Analysis/
27 | ├── data/
28 | │ ├── raw/
29 | │ │ └── diabetes_data.csv
30 | │ ├── processed/
31 | │ │ ├── X_train.csv
32 | │ │ ├── X_train_engineered.csv
33 | │ │ ├── X_test.csv
34 | │ │ ├── X_test_engineered.csv
35 | │ │ ├── y_train.csv
36 | │ │ └── y_test.csv
37 | ├── app/
38 | │ ├── app.py
39 | │ ├── templates/
40 | │ │ └── index.html
41 | │ └── static/
42 | │ └── styles.css
43 | ├── models/
44 | │ ├── logistic_regression.pkl
45 | │ ├── random_forest.pkl
46 | │ └── xgboost.pkl
47 | ├── notebooks/
48 | │ └── exploratory_data_analysis.ipynb
49 | ├── scripts/
50 | │ ├── plots/
51 | │ ├── reports/
52 | │ ├── data_preprocessing.py
53 | │ ├── feature_engineering.py
54 | │ ├── model_training.py
55 | │ ├── model_evaluation.py
56 | │ └── model_performance_report.py
57 | ├── tests/
58 | │ ├── models/
59 | │ ├── test_data_preprocessing.py
60 | │ ├── test_feature_engineering.py
61 | │ ├── test_model_training.py
62 | ├── requirements.txt
63 | └── README.md
64 | ```
65 |
66 | ## 🔧 Setup and Installation
67 |
68 | To get started with this project, follow the steps below:
69 |
70 | 1. **Clone the repository:**
71 |
72 | ```sh
73 | git clone https://github.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis.git
74 | cd Diabetes_Health_Prediction_and_Analysis
75 | ```
76 |
77 | 2. **Create and activate a virtual environment:**
78 |
79 | ```sh
80 | python -m venv venv
81 | source venv/bin/activate # On Windows use `venv\Scripts\activate`
82 | ```
83 |
84 | 3. **Install the required packages:**
85 |
86 | ```sh
87 | pip install -r requirements.txt
88 | ```
89 |
90 | 4. **Run the data preprocessing script:**
91 |
92 | ```sh
93 | python scripts/data_preprocessing.py
94 | ```
95 |
96 | 5. **Run the feature engineering script:**
97 |
98 | ```sh
99 | python scripts/feature_engineering.py
100 | ```
101 |
102 | 6. **Train the models:**
103 |
104 | ```sh
105 | python scripts/model_training.py
106 | ```
107 |
108 | 7. **Evaluate the models:**
109 |
110 | ```sh
111 | python scripts/model_evaluation.py
112 | ```
113 |
114 | 8. **Generate comprehensive model performance reports:**
115 |
116 | ```sh
117 | python scripts/comprehensive_model_report.py
118 | ```
119 |
120 | ## 🚀 Usage
121 |
122 | - **Exploratory Data Analysis**: Check the `notebooks/exploratory_data_analysis.ipynb` notebook for detailed data analysis and visualizations.
123 | - **Scripts**: All scripts for data preprocessing, feature engineering, model training, and evaluation are located in the `scripts/` directory.
124 | - **Tests**: To ensure code quality and correctness, tests are included in the `tests/` directory. Run them with `pytest`.
125 |
126 | ## 📊 Models
127 |
128 | The following models are trained and evaluated in this project:
129 |
130 | ---
131 |
132 | ### Logistic Regression
133 |
134 | #### ROC Curve:
135 | 
136 |
137 | *The ROC curve illustrates the true positive rate (sensitivity) versus the false positive rate (1-specificity) for different threshold settings. A higher area under the curve (AUC) indicates better model performance.*
138 |
139 | #### Confusion Matrix:
140 | 
141 |
142 | *The confusion matrix provides a summary of the prediction results on the classification problem. It shows the number of true positive (TP), true negative (TN), false positive (FP), and false negative (FN) predictions.*
143 |
144 | ---
145 |
146 | ### Random Forest
147 |
148 | #### ROC Curve:
149 | 
150 |
151 | *The ROC curve illustrates the true positive rate (sensitivity) versus the false positive rate (1-specificity) for different threshold settings. A higher area under the curve (AUC) indicates better model performance.*
152 |
153 | #### Confusion Matrix:
154 | 
155 |
156 | *The confusion matrix provides a summary of the prediction results on the classification problem. It shows the number of true positive (TP), true negative (TN), false positive (FP), and false negative (FN) predictions.*
157 |
158 |
159 |
160 | ## 🎯 Performance Metrics
161 |
162 | The performance of the models is evaluated using the following metrics:
163 |
164 | - **Accuracy**
165 | - **Precision**
166 | - **Recall**
167 | - **F1 Score**
168 | - **ROC AUC Score**
169 | - **Confusion Matrix**
170 |
171 | ### Logistic Regression
172 |
173 | - **Accuracy (Doğruluk):** %78.99
174 | - **Precision (Kesinlik):** %73.19
175 | - **Recall (Duyarlılık):** %70.63
176 | - **F1 Score:** %71.89
177 | - **ROC AUC:** %83.86
178 |
179 | **Confusion Matrix:**
180 | ```plaintext
181 | [[196 37]
182 | [ 42 101]]
183 | ```
184 | Model dosyası:
185 | ```sh
186 | models/logistic_regression.pkl
187 | ```
188 |
189 | ### Random Forest
190 |
191 | - **Accuracy (Doğruluk):** %91.22
192 | - **Precision (Kesinlik):** %94.35
193 | - **Recall (Duyarlılık):** %81.82
194 | - **F1 Score:** %87.64
195 | - **ROC AUC:** %97.69
196 |
197 | **Confusion Matrix:**
198 | ```plaintext
199 | [[226 7]
200 | [ 26 117]]
201 | ```
202 | Model dosyası:
203 | ```sh
204 | models/random_forest.pkl
205 | ```
206 | ##### Explanations:
207 |
208 | 1. [x] **_Accuracy:_** The ratio of correctly predicted instances to the total instances.
209 | 2. [x] **_Precision:_** The ratio of true positive predictions to the total predicted positives. It measures the accuracy of positive predictions.
210 | 3. [x] **_Recall:_** The ratio of true positive predictions to the actual positives. It measures the model's ability to identify positive instances.
211 | 4. [x] **_F1 Score:_** The harmonic mean of precision and recall. It provides a balance between precision and recall.
212 | 5. [x] **_ROC AUC:_** The area under the ROC curve. It summarizes the model's ability to distinguish between classes.
213 |
214 | **Confusion Matrix:**
215 |
216 | * True Positive (TP): 117 - The number of actual positive cases correctly identified by the model.
217 | * True Negative (TN): 226 - The number of actual negative cases correctly identified by the model.
218 | * False Positive (FP): 7 - The number of actual negative cases incorrectly identified as positive by the model.
219 | * False Negative (FN): 26 - The number of actual positive cases incorrectly identified as negative by the model.
220 |
221 | ##### Explanations:
222 | 1. [x] **_Accuracy:_** The ratio of correctly predicted instances to the total instances.
223 | 2. [x] **_Precision:_** The ratio of true positive predictions to the total predicted positives. It measures the accuracy of positive predictions.
224 | 3. [x] **_Recall:_** The ratio of true positive predictions to the actual positives. It measures the model's ability to identify positive instances.
225 | 4. [x] **_F1 Score:_** The harmonic mean of precision and recall. It provides a balance between precision and recall.
226 | 5. [x] **_ROC AUC:_** The area under the ROC curve. It summarizes the model's ability to distinguish between classes.
227 |
228 | **Confusion Matrix:**
229 |
230 | * True Positive (TP): 117 - The number of actual positive cases correctly identified by the model.
231 | * True Negative (TN): 226 - The number of actual negative cases correctly identified by the model.
232 | * False Positive (FP): 7 - The number of actual negative cases incorrectly identified as positive by the model.
233 | * False Negative (FN): 26 - The number of actual positive cases incorrectly identified as negative by the model.
234 |
235 | ### XGBoost
236 |
237 | - **Accuracy (Doğruluk):** %91.76
238 | - **Precision (Kesinlik):** %93.08
239 | - **Recall (Duyarlılık):** %84.62
240 | - **F1 Score:** %88.64
241 | - **ROC AUC:** %98.41
242 |
243 | **Confusion Matrix:**
244 | ```plaintext
245 | [[224 9]
246 | [ 22 121]]
247 | ```
248 | Model dosyası:
249 | ```sh
250 | models/xgboost.pkl
251 | ```
252 | ##### Explanations:
253 |
254 | 1. [x] **_Accuracy:_** The ratio of correctly predicted instances to the total instances.
255 | 2. [x] **_Precision:_** The ratio of true positive predictions to the total predicted positives. It measures the accuracy of positive predictions.
256 | 3. [x] **_Recall:_** The ratio of true positive predictions to the actual positives. It measures the model's ability to identify positive instances.
257 | 4. [x] _**F1 Score:**_ The harmonic mean of precision and recall. It provides a balance between precision and recall.
258 | 5. [x] **_ROC AUC:_** The area under the ROC curve. It summarizes the model's ability to distinguish between classes.
259 |
260 | **Confusion Matrix:**
261 |
262 | * True Positive (TP): 121 - The number of actual positive cases correctly identified by the model.
263 | * True Negative (TN): 224 - The number of actual negative cases correctly identified by the model.
264 | * False Positive (FP): 9 - The number of actual negative cases incorrectly identified as positive by the model.
265 | * False Negative (FN): 22 - The number of actual positive cases incorrectly identified as negative by the model.
266 |
267 | ## 📈 Results
268 |
269 | Model performance reports and evaluation metrics are saved and displayed in the `comprehensive_model_report.py` script output.
270 |
271 | ## 💡 Future Work
272 |
273 | - Implement more advanced deep learning models (e.g., Neural Networks, LSTM).
274 | - Perform hyperparameter tuning to optimize model performance.
275 | - Explore feature selection techniques to improve model accuracy.
276 | - Integrate additional health datasets for broader analysis.
277 |
278 | ## 🤝 Contributing
279 |
280 | Contributions are welcome! Please feel free to submit a Pull Request.
281 |
282 | Whether it's improving the documentation, adding new features, or fixing bugs, your contributions are highly appreciated. Let's make this project better together! 🚀
283 |
284 | ### How to Contribute:
285 |
286 | 1. **Fork the Repository**: Click on the 'Fork' button at the top right corner of this page to create a copy of this repository in your GitHub account.
287 |
288 | 2. **Clone the Forked Repository**:
289 | ```bash
290 | git clone https://github.com/your-username/Diabetes_Health_Prediction_and_Analysis.git
291 | ```
292 |
293 | 3. **Create a New Branch**:
294 | ```bash
295 | git checkout -b feature/your-feature-name
296 | ```
297 |
298 | 4. **Make Your Changes**: Implement your feature, bug fix, or improvement.
299 |
300 | 5. **Commit Your Changes**:
301 | ```bash
302 | git commit -m "Add your commit message here"
303 | ```
304 |
305 | 6. **Push to Your Forked Repository**:
306 | ```bash
307 | git push origin feature/your-feature-name
308 | ```
309 |
310 | 7. **Open a Pull Request**: Go to the original repository on GitHub and click on the 'New Pull Request' button. Compare changes from your forked repository and submit the pull request.
311 |
312 | ---
313 |
314 | Thank you for your contributions! Together, we can build a more robust and efficient Diabetes Health Prediction and Analysis tool. 🌟
315 |
316 | ## 📄 License
317 |
318 | This project is licensed under the MIT License.
319 |
320 | ## 📬 Contact
321 |
322 | If you have any questions or suggestions, feel free to open an issue or contact me directly. I am always open to feedback and would love to hear from you!
323 |
324 | ---
325 |
326 | ### How to Reach Me:
327 |
328 | - **Email:** [piinartp@gmail.com](mailto:piinartp@gmail.com)
329 | - **GitHub Issues:** [Open an Issue](https://github.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/issues)
330 | - **LinkedIn:** [My LinkedIn Profile](https://www.linkedin.com/in/piinartp/)
331 |
332 | ---
333 |
334 | Thank you for your interest in the Diabetes Health Prediction and Analysis project! Your feedback and suggestions are invaluable in making this project better and more useful for everyone. 🌟
335 |
336 | 
337 |
338 | ---
339 |
340 |
341 | ---
342 |
343 | ⭐️ Don't forget to give this project a star if you found it useful! ⭐️
344 |
--------------------------------------------------------------------------------
/app/app.py:
--------------------------------------------------------------------------------
from flask import Flask, request, render_template
import joblib
import pandas as pd
from pathlib import Path

# BUG FIX: paths were CWD-relative and mixed ("./models" vs "../data"), so the
# app only worked when launched from inside app/. Anchor everything to this
# file's location so it works from any working directory.
_APP_DIR = Path(__file__).resolve().parent

# Load the trained pipeline (preprocessing + XGBoost classifier).
model = joblib.load(_APP_DIR / "models" / "pipeline_xgboost.pkl")

# Recover the feature names the model was trained on from the engineered
# training set (repo-root data/ directory, one level above app/).
X_train = pd.read_csv(_APP_DIR.parent / "data" / "processed" / "X_train_engineered.csv")
feature_names = X_train.columns.tolist()

# 'PatientID' is an identifier, not a predictive feature.
if 'PatientID' in feature_names:
    feature_names.remove('PatientID')

# Group features into form sections by substring match.
# NOTE(review): matching is case-sensitive; confirm the engineered column
# names are lower-cased, otherwise e.g. 'Age' would not match 'age'.
demographic_features = [feature for feature in feature_names if
                        'age' in feature or 'gender' in feature or 'ethnicity' in feature]
medical_history_features = [feature for feature in feature_names if 'history' in feature or 'diabetes' in feature]
lifestyle_features = [feature for feature in feature_names if
                      'bmi' in feature or 'smoking' in feature or 'activity' in feature]

app = Flask(__name__)
24 |
25 |
@app.route('/')
def home():
    """Render the input form with features grouped into page sections."""
    context = {
        'demographic_features': demographic_features,
        'medical_history_features': medical_history_features,
        'lifestyle_features': lifestyle_features,
    }
    return render_template('index.html', **context)
31 |
32 |
@app.route('/predict', methods=['POST'])
def predict():
    """Read the submitted form, run the model, and re-render with the result.

    Returns a 400 response when a feature is missing from the form or a
    submitted value cannot be parsed as a float.
    """
    try:
        # Collect every expected feature from the form, coercing to float.
        row = {name: float(request.form[name]) for name in feature_names}
        input_data = pd.DataFrame([row], columns=feature_names)

        # Predicted class label and probability of the positive class.
        prediction = model.predict(input_data)[0]
        prediction_prob = model.predict_proba(input_data)[0][1]

        return render_template('index.html', prediction=prediction, probability=prediction_prob,
                               demographic_features=demographic_features,
                               medical_history_features=medical_history_features,
                               lifestyle_features=lifestyle_features)
    except KeyError as e:
        return f"Missing form data for feature: {e.args[0]}", 400
    except ValueError as e:
        return str(e), 400
52 |
53 |
if __name__ == "__main__":
    # Start Flask's built-in development server with debug mode enabled
    # (auto-reload and interactive tracebacks; not suitable for production).
    app.run(debug=True)
56 |
--------------------------------------------------------------------------------
/app/models/pipeline_xgboost.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/app/models/pipeline_xgboost.pkl
--------------------------------------------------------------------------------
/app/static/styles.css:
--------------------------------------------------------------------------------
/* Global page styling for the diabetes prediction form. */
body {
    background-color: #f4f7f6;
    font-family: 'Roboto', sans-serif;
}

/* Centered content column. */
.container {
    max-width: 800px;
    margin: auto;
}

/* Card wrapper around the form. */
.card {
    border: none;
    border-radius: 15px;
    overflow: hidden;
}

/* Blue gradient header bar. */
.card-header {
    background: linear-gradient(45deg, #007bff, #0056b3);
}

.card-title {
    margin: 0;
}

/* Primary action button with a color transition on hover. */
.btn-primary {
    background-color: #007bff;
    border-color: #007bff;
    transition: background-color 0.3s, border-color 0.3s;
}

.btn-primary:hover {
    background-color: #0056b3;
    border-color: #0056b3;
}

/* Bordered white box for each group of form inputs. */
.form-section {
    margin-bottom: 20px;
    padding: 20px;
    border: 1px solid #e0e0e0;
    border-radius: 10px;
    background-color: #ffffff;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.05);
}

.section-title {
    margin-bottom: 15px;
    font-size: 1.3em;
    font-weight: bold;
    color: #333333;
}

/* Green-tinted panel that displays the prediction result. */
.result {
    background-color: #e9f7ef;
    border: 1px solid #d4edda;
    border-radius: 10px;
    padding: 20px;
}

.result h2, .result h3 {
    margin: 0;
    color: #155724;
}
--------------------------------------------------------------------------------
/app/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | Diabetes Prediction
10 |
11 |
12 |
13 |
14 |
15 |
18 |
19 |
64 |
65 | {% if prediction is not none %}
66 |
67 |
68 |
Prediction: {{ prediction }}
69 | Probability: {{ probability }}
70 |
71 |
72 | {% endif %}
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
--------------------------------------------------------------------------------
/models/logistic_regression.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/models/logistic_regression.pkl
--------------------------------------------------------------------------------
/models/random_forest.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/models/random_forest.pkl
--------------------------------------------------------------------------------
/models/xgboost.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/models/xgboost.pkl
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pytest~=8.2.2
2 | pandas~=2.2.2
3 | joblib~=1.4.0
4 | scikit-learn~=1.4.2
5 | xgboost~=2.0.3
6 | numpy~=1.26.4
7 | seaborn~=0.13.2
8 | matplotlib~=3.8.4
9 | flask~=3.0.3
9 |
--------------------------------------------------------------------------------
/scripts/__pycache__/utility_functions.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/__pycache__/utility_functions.cpython-311.pyc
--------------------------------------------------------------------------------
/scripts/comprehensive_model_report.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import matplotlib.pyplot as plt
3 | from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report, accuracy_score, \
4 | precision_score, recall_score, f1_score
5 | import seaborn as sns
6 | import joblib
7 | import os
8 | from utility_functions import load_model, load_train_test_data, plot_roc_curve, plot_confusion_matrix
9 |
10 |
# Load a persisted model together with its held-out evaluation data.
def load_model_and_data(model_path, X_test_path, y_test_path):
    """Return (model, X_test, y_test), with labels as a flat 1-D int array."""
    model = load_model(model_path)
    features = pd.read_csv(X_test_path)
    labels = pd.read_csv(y_test_path).values.ravel().astype('int')
    return model, features, labels
17 |
18 |
# Draw the ROC curve for a fitted classifier and persist it under plots/.
def plot_and_save_roc_curve(model, X_test, y_test, model_name):
    """Plot and save the ROC curve for *model*; return its ROC AUC score."""
    positive_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    auc_value = roc_auc_score(y_test, positive_scores)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {auc_value:.2f})')
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {model_name}')
    plt.legend(loc="lower right")
    # Persist before showing; plt.show() blocks until the window closes.
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/{model_name}_roc_curve.png')
    plt.show()
    return auc_value
38 |
39 |
# Render a labelled confusion-matrix heatmap and persist it under plots/.
def plot_and_save_confusion_matrix(y_test, y_pred, model_name):
    """Plot and save the confusion matrix for one model's predictions."""
    class_labels = ['No Diabetes', 'Diabetes']
    matrix = confusion_matrix(y_test, y_pred)
    plt.figure()
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - {model_name}')
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/{model_name}_confusion_matrix.png')
    plt.show()
52 |
53 |
# Build the full evaluation artefacts (plots, metrics, CSV) for one model.
def comprehensive_model_report(model_path, X_test_path, y_test_path, model_name):
    """Evaluate one persisted model; save its ROC/confusion plots and CSV report."""
    model, X_test, y_test = load_model_and_data(model_path, X_test_path, y_test_path)
    y_pred = model.predict(X_test)

    # ROC curve plot (also yields the AUC reused in the report below).
    roc_auc = plot_and_save_roc_curve(model, X_test, y_test, model_name)

    # Confusion-matrix heatmap.
    plot_and_save_confusion_matrix(y_test, y_pred, model_name)

    # Per-class precision/recall/F1 table: printed and kept for the CSV.
    class_report = classification_report(y_test, y_pred, output_dict=True)
    print(f"Classification Report for {model_name}:\n")
    print(classification_report(y_test, y_pred))

    # Scalar summary metrics.
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'roc_auc': roc_auc
    }

    # Persist the classification report, annotated with accuracy and AUC.
    report_path = f'reports/{model_name}_report.csv'
    os.makedirs('reports', exist_ok=True)
    report_df = pd.DataFrame(class_report).transpose()
    report_df['accuracy'] = metrics['accuracy']
    report_df['roc_auc'] = roc_auc
    report_df.to_csv(report_path)

    print(f"Comprehensive report saved as {report_path}")
    print(f"Metrics: {metrics}")
94 |
95 |
# Entry point: evaluate every trained model against the engineered test set.
def main():
    """Generate comprehensive reports for all trained models.

    All paths are relative to the scripts/ directory, matching the other
    pipeline scripts (which use "../data/...").
    """
    # BUG FIX: the model paths previously lacked the "../" prefix while the
    # data paths had it, so one of the two sets was always wrong regardless
    # of the working directory. Both now resolve from the repository root
    # when the script is run from scripts/.
    model_paths = ["../models/logistic_regression.pkl", "../models/random_forest.pkl", "../models/xgboost.pkl"]
    model_names = ["Logistic Regression", "Random Forest", "XGBoost"]
    X_test_path = "../data/processed/X_test_engineered.csv"
    y_test_path = "../data/processed/y_test.csv"

    for model_path, model_name in zip(model_paths, model_names):
        comprehensive_model_report(model_path, X_test_path, y_test_path, model_name)
105 |
106 |
if __name__ == "__main__":
    # Run the full reporting pipeline when executed as a script.
    main()
109 |
--------------------------------------------------------------------------------
/scripts/data_preprocessing.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import seaborn as sns
3 | import matplotlib.pyplot as plt
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.preprocessing import StandardScaler, OneHotEncoder
6 | from sklearn.impute import SimpleImputer
7 | import os
8 |
9 |
# Read the raw dataset from disk.
def load_data(file_path):
    """Return the CSV file at *file_path* as a DataFrame."""
    return pd.read_csv(file_path)
14 |
15 |
# Exploratory data analysis: printed summaries plus distribution and
# correlation plots.
def perform_eda(df, dataset_name=""):
    """Print summary information and show EDA plots for *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to explore.
    dataset_name : str
        Label prefixed to printed headers and plot titles.
    """
    print(f"{dataset_name} Veri Setinin İlk 5 Satırı:\n", df.head())
    # BUG FIX: DataFrame.info() prints directly and returns None, so passing
    # it as a print() argument used to emit a stray "None". Call it on its
    # own line instead.
    print(f"\n{dataset_name} Veri Seti Hakkında Bilgiler:\n")
    df.info()
    print(f"\n{dataset_name} Veri Setindeki Eksik Değerler:\n", df.isnull().sum())
    print(f"\n{dataset_name} Temel İstatistikler:\n", df.describe())

    # Distribution of categorical variables.
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    for column in categorical_columns:
        plt.figure(figsize=(10, 5))
        sns.countplot(data=df, x=column)
        plt.title(f'{dataset_name} {column} Dağılımı')
        plt.show()

    # Distribution of numeric variables.
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns].hist(bins=15, figsize=(20, 15))
    plt.suptitle(f'{dataset_name} Sayısal Değişkenlerin Dağılımı')
    plt.show()

    # Correlation matrix. The DoctorInCharge column is dropped first —
    # presumably it is non-numeric and would break DataFrame.corr(); confirm.
    if 'DoctorInCharge' in df.columns:
        df_corr = df.drop(columns=['DoctorInCharge'])
    else:
        df_corr = df.copy()
    plt.figure(figsize=(15, 10))
    sns.heatmap(df_corr.corr(), annot=True, cmap='coolwarm')
    plt.title(f'{dataset_name} Korelasyon Matrisi')
    plt.show()
46 |
47 |
# Impute missing numeric values.
def handle_missing_values(df):
    """Fill NaNs in numeric columns with the column mean (in place); return df."""
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    mean_imputer = SimpleImputer(strategy='mean')
    df[numeric_cols] = mean_imputer.fit_transform(df[numeric_cols])
    return df
54 |
55 |
# One-hot encode categorical (object-dtype) columns.
def encode_categorical_values(df):
    """Replace object-dtype columns with one-hot dummies (first level dropped).

    Numeric columns are left untouched; returns the transformed DataFrame.
    """
    categorical_columns = df.select_dtypes(include=['object']).columns
    # Robustness: nothing to encode — return unchanged instead of letting
    # OneHotEncoder fail on an empty column selection.
    if len(categorical_columns) == 0:
        return df

    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_columns = pd.DataFrame(
        encoder.fit_transform(df[categorical_columns]),
        columns=encoder.get_feature_names_out(categorical_columns),
        # BUG FIX: reuse df's index. A fresh RangeIndex made pd.concat
        # misalign rows (introducing NaNs) whenever df's index was not 0..n-1.
        index=df.index,
    )
    df = df.drop(categorical_columns, axis=1)
    df = pd.concat([df, encoded_columns], axis=1)
    return df
65 |
66 |
# Standardize numeric columns (zero mean, unit variance).
def normalize_data(df):
    """Scale numeric columns with StandardScaler (in place); return df."""
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_cols] = StandardScaler().fit_transform(df[numeric_cols])
    return df
73 |
74 |
# Split the dataset into train and test partitions.
def split_data(df, target_column, test_size=0.2, random_state=42):
    """Return (X_train, X_test, y_train, y_test) split on *target_column*."""
    y = df[target_column]
    X = df.drop(target_column, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test
81 |
82 |
# Persist the split datasets as CSV files.
def save_datasets(X_train, X_test, y_train, y_test, output_dir):
    """Write the four data splits to *output_dir* (created if missing) as CSVs."""
    os.makedirs(output_dir, exist_ok=True)
    named_splits = {
        'X_train.csv': X_train,
        'X_test.csv': X_test,
        'y_train.csv': y_train,
        'y_test.csv': y_test,
    }
    for filename, frame in named_splits.items():
        frame.to_csv(os.path.join(output_dir, filename), index=False)
    print(f"Datasets saved to {output_dir}")
91 |
92 |
# End-to-end preprocessing pipeline
def main(file_path, target_column, output_dir):
    """Load, explore, clean, encode, scale, split and save the dataset.

    Returns the four splits (X_train, X_test, y_train, y_test).
    """
    df = load_data(file_path)
    perform_eda(df, dataset_name="Orijinal")

    # Cleaning / transformation steps, applied in order.
    for step in (handle_missing_values, encode_categorical_values, normalize_data):
        df = step(df)

    X_train, X_test, y_train, y_test = split_data(df, target_column)
    save_datasets(X_train, X_test, y_train, y_test, output_dir)

    # Per-split EDA (features joined back with their targets).
    perform_eda(pd.concat([X_train, y_train], axis=1), dataset_name="Eğitim Seti")
    perform_eda(pd.concat([X_test, y_test], axis=1), dataset_name="Test Seti")

    return X_train, X_test, y_train, y_test
108 |
109 |
if __name__ == "__main__":
    file_path = "../data/raw/diabetes_data.csv" # Path to the raw dataset
    target_column = "Diagnosis" # Target variable
    output_dir = "../data/processed" # Directory the splits are written to
    X_train, X_test, y_train, y_test = main(file_path, target_column, output_dir)
    print("Veri ön işleme tamamlandı ve veriler eğitim/test setlerine ayrıldı.")
116 |
--------------------------------------------------------------------------------
/scripts/feature_engineering.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.preprocessing import PolynomialFeatures
3 |
4 |
# Derived interaction features
def create_new_features(df):
    """Add Age_BMI = Age * BMI as a new column (in place).

    E.g. the age/BMI product serves as an interaction feature.
    """
    df['Age_BMI'] = df['BMI'].mul(df['Age'])
    return df
10 |
11 |
# Polynomial feature expansion
def add_polynomial_features(df, degree=2):
    """Append polynomial/interaction terms of the numeric columns.

    Fix: PolynomialFeatures re-emits the original degree-1 columns, so
    concatenating its raw output duplicated every numeric column name in
    the result. The degree-1 duplicates are now dropped before joining,
    leaving only genuinely new (power and interaction) columns.
    """
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    poly = PolynomialFeatures(degree, include_bias=False)
    poly_features = poly.fit_transform(df[numeric_columns])
    poly_df = pd.DataFrame(poly_features,
                           columns=poly.get_feature_names_out(numeric_columns))

    # Drop the degree-1 copies of the existing numeric columns.
    poly_df = poly_df.drop(columns=list(numeric_columns), errors='ignore')

    # Align on a fresh positional index before concatenation.
    df = df.reset_index(drop=True)
    poly_df = poly_df.reset_index(drop=True)

    return pd.concat([df, poly_df], axis=1)
25 |
26 |
# Feature-engineering pipeline entry point
def main(train_file_path, test_file_path, train_output_path, test_output_path, degree=2):
    """Apply feature engineering to the train and test CSVs and save them."""
    jobs = ((train_file_path, train_output_path, "Eğitim"),
            (test_file_path, test_output_path, "Test"))
    for input_path, output_path, label in jobs:
        frame = pd.read_csv(input_path)
        frame = create_new_features(frame)
        frame = add_polynomial_features(frame, degree)
        frame.to_csv(output_path, index=False)
        print(f"{label} veri seti için feature engineering tamamlandı ve {output_path} dosyasına kaydedildi.")
42 |
43 |
if __name__ == "__main__":
    train_file_path = "../data/processed/X_train.csv" # Path to the training feature set
    test_file_path = "../data/processed/X_test.csv" # Path to the test feature set
    train_output_path = "../data/processed/X_train_engineered.csv" # Output path for the engineered training set
    test_output_path = "../data/processed/X_test_engineered.csv" # Output path for the engineered test set
    degree = 2 # Polynomial degree
    main(train_file_path, test_file_path, train_output_path, test_output_path, degree)
51 |
--------------------------------------------------------------------------------
/scripts/model_evaluation.py:
--------------------------------------------------------------------------------
import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report
6 |
7 |
# Load a fitted model together with its test data
def load_model_and_data(model_path, X_test_path, y_test_path):
    """Return (model, X_test, y_test); targets are flattened to int."""
    return (
        joblib.load(model_path),
        pd.read_csv(X_test_path),
        pd.read_csv(y_test_path).values.ravel().astype('int'),
    )
14 |
15 |
# Plot and save the ROC curve
def plot_roc_curve(model, X_test, y_test, model_name):
    """Plot the ROC curve for *model* on the test set and save it to
    plots/<model_name>_roc_curve.png.

    Fix: the 'plots' directory is now created before saving (matching
    utility_functions.py); previously savefig raised FileNotFoundError
    when the directory did not exist yet.
    """
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    roc_auc = roc_auc_score(y_test, y_pred_prob)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {model_name}')
    plt.legend(loc="lower right")
    os.makedirs('plots', exist_ok=True)  # ensure output directory exists
    plt.savefig(f'plots/{model_name}_roc_curve.png')
    plt.show()
33 |
34 |
# Plot and save the confusion matrix
def plot_confusion_matrix(y_test, y_pred, model_name):
    """Draw the confusion matrix heatmap and save it to
    plots/<model_name>_confusion_matrix.png.

    Fix: the 'plots' directory is now created before saving (matching
    utility_functions.py); previously savefig raised FileNotFoundError
    when the directory did not exist yet.
    """
    cm = confusion_matrix(y_test, y_pred)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Diabetes', 'Diabetes'],
                yticklabels=['No Diabetes', 'Diabetes'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - {model_name}')
    os.makedirs('plots', exist_ok=True)  # ensure output directory exists
    plt.savefig(f'plots/{model_name}_confusion_matrix.png')
    plt.show()
46 |
47 |
# Full evaluation of one serialized model
def evaluate_model(model_path, X_test_path, y_test_path, model_name):
    """Load a saved model plus its test data and run the evaluation suite:
    ROC curve, confusion matrix, and a printed classification report."""
    model, X_test, y_test = load_model_and_data(model_path, X_test_path, y_test_path)
    predictions = model.predict(X_test)

    plot_roc_curve(model, X_test, y_test, model_name)
    plot_confusion_matrix(y_test, predictions, model_name)

    print(f"Classification Report for {model_name}:\n")
    print(classification_report(y_test, predictions))
62 |
63 |
# Evaluate every trained model against the engineered test set
def main():
    """Run the evaluation suite for each serialized classifier."""
    jobs = (
        ("models/logistic_regression.pkl", "Logistic Regression"),
        ("models/random_forest.pkl", "Random Forest"),
        ("models/xgboost.pkl", "XGBoost"),
    )
    X_test_path = "../data/processed/X_test_engineered.csv"
    y_test_path = "../data/processed/y_test.csv"

    for model_path, model_name in jobs:
        evaluate_model(model_path, X_test_path, y_test_path, model_name)
73 |
74 |
# Script entry point: evaluate each saved model.
if __name__ == "__main__":
    main()
77 |
--------------------------------------------------------------------------------
/scripts/model_training.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.model_selection import train_test_split, GridSearchCV
3 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
4 | from sklearn.linear_model import LogisticRegression
5 | from sklearn.ensemble import RandomForestClassifier
6 | from xgboost import XGBClassifier
7 | import joblib
8 | import os
9 |
10 |
# Load the persisted train/test splits
def load_data(X_train_path, X_test_path, y_train_path, y_test_path):
    """Read feature and target CSVs; targets come back as flat int arrays."""
    def read_target(path):
        # Single-column CSV -> 1-D integer vector.
        return pd.read_csv(path).values.ravel().astype('int')

    return (pd.read_csv(X_train_path),
            pd.read_csv(X_test_path),
            read_target(y_train_path),
            read_target(y_test_path))
19 |
20 |
# Train a model, print evaluation metrics, and persist it
def train_and_evaluate_model(X_train, X_test, y_train, y_test, model, model_name):
    """Fit *model* on the training split, report standard classification
    metrics on the test split, and save the fitted model with joblib.

    Fix: the confirmation message previously claimed the model was saved
    as 'models/<name>.pkl' while joblib actually wrote
    '../models/<name>.pkl'; the message now reports the real path.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]  # positive-class scores

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    cm = confusion_matrix(y_test, y_pred)

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("\n")

    # Persist the fitted model.
    os.makedirs('../models', exist_ok=True)
    model_path = f"../models/{model_name}.pkl"
    joblib.dump(model, model_path)
    print(f"Model saved as {model_path}")
48 |
49 |
# Train and evaluate all three classifiers
def main(X_train_path, X_test_path, y_train_path, y_test_path):
    """Train logistic regression, random forest and XGBoost models and
    print their evaluation metrics."""
    X_train, X_test, y_train, y_test = load_data(
        X_train_path, X_test_path, y_train_path, y_test_path)

    classifiers = (
        ("logistic_regression", LogisticRegression(max_iter=1000)),
        ("random_forest", RandomForestClassifier(n_estimators=100)),
        ("xgboost", XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    )
    for name, estimator in classifiers:
        train_and_evaluate_model(X_train, X_test, y_train, y_test, estimator, name)
65 |
66 |
if __name__ == "__main__":
    X_train_path = "../data/processed/X_train_engineered.csv" # Engineered training features
    X_test_path = "../data/processed/X_test_engineered.csv" # Engineered test features
    y_train_path = "../data/processed/y_train.csv" # Training target values
    y_test_path = "../data/processed/y_test.csv" # Test target values
    main(X_train_path, X_test_path, y_train_path, y_test_path)
73 |
--------------------------------------------------------------------------------
/scripts/plots/Logistic Regression_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/Logistic Regression_confusion_matrix.png
--------------------------------------------------------------------------------
/scripts/plots/Logistic Regression_roc_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/Logistic Regression_roc_curve.png
--------------------------------------------------------------------------------
/scripts/plots/Random Forest_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/Random Forest_confusion_matrix.png
--------------------------------------------------------------------------------
/scripts/plots/Random Forest_roc_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/Random Forest_roc_curve.png
--------------------------------------------------------------------------------
/scripts/plots/XGBoost_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/XGBoost_confusion_matrix.png
--------------------------------------------------------------------------------
/scripts/plots/XGBoost_roc_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/scripts/plots/XGBoost_roc_curve.png
--------------------------------------------------------------------------------
/scripts/reports/Logistic Regression_report.csv:
--------------------------------------------------------------------------------
1 | ,precision,recall,f1-score,support,accuracy,roc_auc
2 | 0,0.8235294117647058,0.8412017167381974,0.832271762208068,233.0,0.7898936170212766,0.8386206068609503
3 | 1,0.7318840579710145,0.7062937062937062,0.7188612099644128,143.0,0.7898936170212766,0.8386206068609503
4 | accuracy,0.7898936170212766,0.7898936170212766,0.7898936170212766,0.7898936170212766,0.7898936170212766,0.8386206068609503
5 | macro avg,0.7777067348678601,0.7737477115159519,0.7755664860862403,376.0,0.7898936170212766,0.8386206068609503
6 | weighted avg,0.7886749288059349,0.7898936170212766,0.7891395574983799,376.0,0.7898936170212766,0.8386206068609503
7 |
--------------------------------------------------------------------------------
/scripts/reports/Random Forest_report.csv:
--------------------------------------------------------------------------------
1 | ,precision,recall,f1-score,support,accuracy,roc_auc
2 | 0,0.8968253968253969,0.9699570815450643,0.931958762886598,233.0,0.9122340425531915,0.9768600498214233
3 | 1,0.9435483870967742,0.8181818181818182,0.8764044943820225,143.0,0.9122340425531915,0.9768600498214233
4 | accuracy,0.9122340425531915,0.9122340425531915,0.9122340425531915,0.9122340425531915,0.9122340425531915,0.9768600498214233
5 | macro avg,0.9201868919610856,0.8940694498634413,0.9041816286343103,376.0,0.9122340425531915,0.9768600498214233
6 | weighted avg,0.91459504472116,0.9122340425531915,0.9108304107691664,376.0,0.9122340425531915,0.9768600498214233
7 |
--------------------------------------------------------------------------------
/scripts/reports/XGBoost_report.csv:
--------------------------------------------------------------------------------
1 | ,precision,recall,f1-score,support,accuracy,roc_auc
2 | 0,0.9105691056910569,0.9613733905579399,0.9352818371607515,233.0,0.9175531914893617,0.9840631471532759
3 | 1,0.9307692307692308,0.8461538461538461,0.8864468864468864,143.0,0.9175531914893617,0.9840631471532759
4 | accuracy,0.9175531914893617,0.9175531914893617,0.9175531914893617,0.9175531914893617,0.9175531914893617,0.9840631471532759
5 | macro avg,0.9206691682301438,0.903763618355893,0.910864361803819,376.0,0.9175531914893617,0.9840631471532759
6 | weighted avg,0.9182516000691922,0.9175531914893617,0.9167089702669144,376.0,0.9175531914893617,0.9840631471532759
7 |
--------------------------------------------------------------------------------
/scripts/utility_functions.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import seaborn as sns
3 | import matplotlib.pyplot as plt
4 | from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report
5 | import joblib
6 | import os
7 |
8 |
# Read a CSV file into a DataFrame
def load_data(file_path):
    """Return the CSV at *file_path* as a pandas DataFrame."""
    return pd.read_csv(file_path)
13 |
14 |
# Load the four persisted splits
def load_train_test_data(X_train_path, X_test_path, y_train_path, y_test_path):
    """Return (X_train, X_test, y_train, y_test); targets as flat int arrays."""
    frames = [pd.read_csv(path) for path in (X_train_path, X_test_path)]
    targets = [pd.read_csv(path).values.ravel().astype('int')
               for path in (y_train_path, y_test_path)]
    return frames[0], frames[1], targets[0], targets[1]
22 |
23 |
# Persist a fitted model
def save_model(model, model_name):
    """Serialize *model* to ../models/<model_name>.pkl, creating the
    directory if needed.

    Fix: the confirmation message previously claimed the model was saved
    under 'models/...' while it was actually written to '../models/...';
    it now reports the real path.
    """
    os.makedirs('../models', exist_ok=True)
    model_path = f"../models/{model_name}.pkl"
    joblib.dump(model, model_path)
    print(f"Model saved as {model_path}")
29 |
30 |
# Restore a serialized model
def load_model(model_path):
    """Load and return a model previously saved with joblib."""
    return joblib.load(model_path)
35 |
36 |
# ROC curve: plot, save under plots/, and display
def plot_roc_curve(model, X_test, y_test, model_name):
    """Plot the ROC curve for *model* and save it to
    plots/<model_name>_roc_curve.png."""
    scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, scores)
    area = roc_auc_score(y_test, scores)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {area:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {model_name}')
    plt.legend(loc="lower right")
    # Create the output directory before saving.
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/{model_name}_roc_curve.png')
    plt.show()
55 |
56 |
# Confusion matrix heatmap: plot, save under plots/, and display
def plot_confusion_matrix(y_test, y_pred, model_name):
    """Draw the confusion matrix and save it to
    plots/<model_name>_confusion_matrix.png."""
    class_labels = ['No Diabetes', 'Diabetes']
    matrix = confusion_matrix(y_test, y_pred)

    plt.figure()
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - {model_name}')
    # Create the output directory before saving.
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/{model_name}_confusion_matrix.png')
    plt.show()
69 |
70 |
# Full evaluation for an already-fitted classifier
def evaluate_model(model, X_test, y_test, model_name):
    """Run the standard evaluation suite: ROC curve, confusion matrix,
    and a printed classification report."""
    predictions = model.predict(X_test)

    plot_roc_curve(model, X_test, y_test, model_name)
    plot_confusion_matrix(y_test, predictions, model_name)

    print(f"Classification Report for {model_name}:\n")
    print(classification_report(y_test, predictions))
84 |
85 |
# Write the train/test splits to CSV
def save_datasets(X_train, X_test, y_train, y_test, output_dir):
    """Save the four splits into *output_dir*, creating it if necessary."""
    os.makedirs(output_dir, exist_ok=True)
    for name, frame in zip(('X_train', 'X_test', 'y_train', 'y_test'),
                           (X_train, X_test, y_train, y_test)):
        frame.to_csv(os.path.join(output_dir, f'{name}.csv'), index=False)
    print(f"Datasets saved to {output_dir}")
94 |
--------------------------------------------------------------------------------
/tests/__pycache__/test_data_preprocessing.cpython-311-pytest-8.2.2.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/__pycache__/test_data_preprocessing.cpython-311-pytest-8.2.2.pyc
--------------------------------------------------------------------------------
/tests/__pycache__/test_feature_engineering.cpython-311-pytest-8.2.2.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/__pycache__/test_feature_engineering.cpython-311-pytest-8.2.2.pyc
--------------------------------------------------------------------------------
/tests/__pycache__/test_model_evaluation.cpython-311-pytest-8.2.2.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/__pycache__/test_model_evaluation.cpython-311-pytest-8.2.2.pyc
--------------------------------------------------------------------------------
/tests/__pycache__/test_model_training.cpython-311-pytest-8.2.2.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/__pycache__/test_model_training.cpython-311-pytest-8.2.2.pyc
--------------------------------------------------------------------------------
/tests/models/logistic_regression.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/models/logistic_regression.pkl
--------------------------------------------------------------------------------
/tests/models/random_forest.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/models/random_forest.pkl
--------------------------------------------------------------------------------
/tests/models/xgboost.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThecoderPinar/Diabetes_Health_Prediction_and_Analysis/ab675d7a3824652a15fcc552693687d62aa183bb/tests/models/xgboost.pkl
--------------------------------------------------------------------------------
/tests/test_data_preprocessing.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 | from sklearn.impute import SimpleImputer
4 | from sklearn.preprocessing import OneHotEncoder, StandardScaler
5 | from sklearn.model_selection import train_test_split
6 | import os
7 |
8 |
# Load the dataset from a CSV file
def load_data(file_path):
    """Read *file_path* into a DataFrame."""
    return pd.read_csv(file_path)
13 |
14 |
# Exploratory Data Analysis: console-only summaries of the frame.
def perform_eda(df):
    """Print head, info, missing-value counts and basic statistics.

    Console output only; returns None. Note that df.info() prints its
    report itself and returns None, so the second line also shows "None".
    """
    print("Veri Setinin İlk 5 Satırı:\n", df.head())
    print("\nVeri Seti Hakkında Bilgiler:\n", df.info())
    print("\nVeri Setindeki Eksik Değerler:\n", df.isnull().sum())
    print("\nTemel İstatistikler:\n", df.describe())
21 |
22 |
# Mean-impute missing numeric values
def handle_missing_values(df):
    """Replace NaNs in numeric columns with the column mean (in place)."""
    numeric = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric] = SimpleImputer(strategy='mean').fit_transform(df[numeric])
    return df
29 |
30 |
# One-hot encode categorical columns (first level dropped)
def encode_categorical_values(df):
    """Replace object/category columns with one-hot indicator columns."""
    # Columns to encode: both object- and category-dtype.
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    ohe = OneHotEncoder(sparse_output=False, drop='first')
    dummies = pd.DataFrame(ohe.fit_transform(df[cat_cols]),
                           columns=ohe.get_feature_names_out(cat_cols))
    # Swap the originals for their encoded counterparts.
    return pd.concat([df.drop(cat_cols, axis=1), dummies], axis=1)
45 |
46 |
# Standard-score the numeric columns
def normalize_data(df):
    """Scale numeric columns to zero mean and unit variance (in place)."""
    cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[cols] = StandardScaler().fit_transform(df[cols])
    return df
53 |
54 |
# Train/test split helper
def split_data(df, target_column, test_size=0.2, random_state=42):
    """Split *df* into (X_train, X_test, y_train, y_test)."""
    return train_test_split(df.drop(target_column, axis=1),
                            df[target_column],
                            test_size=test_size,
                            random_state=random_state)
61 |
62 |
# Save the splits into the output directory
def save_datasets(X_train, X_test, y_train, y_test, output_dir):
    """Write the four splits as CSVs under *output_dir* (created if missing)."""
    os.makedirs(output_dir, exist_ok=True)
    for filename, frame in (('X_train.csv', X_train), ('X_test.csv', X_test),
                            ('y_train.csv', y_train), ('y_test.csv', y_test)):
        frame.to_csv(os.path.join(output_dir, filename), index=False)
    print(f"Datasets saved to {output_dir}")
71 |
72 |
@pytest.fixture
def sample_data():
    """Small mixed-type frame: numeric Age/BMI, string Gender, binary target."""
    return pd.DataFrame({
        'Age': [25, 35, 45, 55],
        'Gender': ['0', '1', '0', '1'],  # kept as strings so encoding is exercised
        'BMI': [22.5, 24.5, 28.0, 30.0],
        'Diagnosis': [0, 1, 0, 1],
    })
82 |
83 |
84 | def test_load_data(sample_data):
85 | df = sample_data
86 | assert not df.empty
87 |
88 |
def test_handle_missing_values(sample_data):
    """A NaN injected into BMI must be imputed away."""
    frame = sample_data.copy()
    frame.loc[0, 'BMI'] = None
    assert handle_missing_values(frame)['BMI'].isnull().sum() == 0
94 |
95 |
def test_encode_categorical_values(sample_data):
    """Encoding must produce the Gender_1 indicator column."""
    encoded = encode_categorical_values(sample_data)
    assert 'Gender_1' in encoded.columns
100 |
101 |
def test_normalize_data(sample_data):
    """After standardization the Age mean is (near) zero, hence below 1."""
    normalized = normalize_data(sample_data)
    assert normalized['Age'].mean() < 1
106 |
107 |
def test_split_data(sample_data):
    """Train and test partitions together cover every row."""
    X_train, X_test, y_train, y_test = split_data(sample_data, 'Diagnosis')
    assert len(X_train) + len(X_test) == len(sample_data)
112 |
113 |
def test_save_datasets(tmpdir, sample_data):
    """save_datasets must write X_train.csv into the target directory."""
    X_train, X_test, y_train, y_test = split_data(sample_data, 'Diagnosis')
    target_dir = tmpdir.mkdir("data")
    save_datasets(X_train, X_test, y_train, y_test, target_dir)
    assert (target_dir / 'X_train.csv').check()
120 |
--------------------------------------------------------------------------------
/tests/test_feature_engineering.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 | import numpy as np
4 |
5 |
# Interaction feature
def create_new_features(df):
    """Add Age_BMI = Age * BMI as a new column (in place)."""
    df['Age_BMI'] = df['BMI'] * df['Age']
    return df
10 |
11 |
# Add power terms for each numeric column
def add_polynomial_features(df, degree=2):
    """Append '<col>^k' columns for k in 2..degree (in place).

    The set of numeric columns is snapshotted before any new columns are
    added, so the generated power columns are not themselves expanded.
    """
    numeric = df.select_dtypes(include=['float64', 'int64']).columns
    for column in numeric:
        for power in range(2, degree + 1):
            df[f'{column}^{power}'] = np.power(df[column], power)
    return df
18 |
19 |
@pytest.fixture
def sample_data():
    """Four-row numeric frame with Age and BMI columns."""
    return pd.DataFrame({'Age': [25, 35, 45, 55],
                         'BMI': [22.5, 24.5, 28.0, 30.0]})
27 |
28 |
def test_create_new_features(sample_data):
    """Age_BMI must exist and equal the element-wise product."""
    enriched = create_new_features(sample_data)
    assert 'Age_BMI' in enriched.columns
    assert (enriched['Age_BMI'] == enriched['Age'] * enriched['BMI']).all()
34 |
35 |
def test_add_polynomial_features(sample_data):
    """Degree-2 expansion adds squared columns with correct values."""
    expanded = add_polynomial_features(sample_data, degree=2)
    for column in ('Age', 'BMI'):
        assert f'{column}^2' in expanded.columns
        assert (expanded[f'{column}^2'] == expanded[column] ** 2).all()
43 |
--------------------------------------------------------------------------------
/tests/test_model_training.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 | from sklearn.linear_model import LogisticRegression
4 | from sklearn.ensemble import RandomForestClassifier
5 | from xgboost import XGBClassifier
6 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
7 | import joblib
8 | import os
9 |
10 |
# Load feature CSVs; target paths are derived by the X_ -> y_ convention
def load_data(train_path, test_path):
    """Return (X_train, X_test, y_train, y_test).

    NOTE(review): the target files are located by replacing 'X_' with
    'y_' in the feature paths — this relies on that naming convention.
    """
    def read_targets(path):
        return pd.read_csv(path.replace('X_', 'y_')).values.ravel().astype('int')

    X_train = pd.read_csv(train_path)
    X_test = pd.read_csv(test_path)
    return X_train, X_test, read_targets(train_path), read_targets(test_path)
18 |
19 |
# Fit a model, print standard metrics, and persist it under models/
def train_and_evaluate_model(X_train, X_test, y_train, y_test, model, model_name):
    """Train *model*, print classification metrics, and save it with joblib."""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    # Metric label -> value, in the exact order they are reported.
    metrics = (
        ("Accuracy", accuracy_score(y_test, y_pred)),
        ("Precision", precision_score(y_test, y_pred)),
        ("Recall", recall_score(y_test, y_pred)),
        ("F1 Score", f1_score(y_test, y_pred)),
        ("ROC AUC", roc_auc_score(y_test, y_pred_prob)),
    )

    print(f"Model: {model_name}")
    for label, value in metrics:
        print(f"{label}: {value:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")

    # Persist the fitted model.
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, f"models/{model_name}.pkl")
    print(f"Model saved as models/{model_name}.pkl")
47 |
48 |
@pytest.fixture
def sample_data():
    """Tiny training set (Age, BMI) with binary labels."""
    features = pd.DataFrame({'Age': [25, 35, 45, 55],
                             'BMI': [22.5, 24.5, 28.0, 30.0]})
    labels = pd.Series([0, 1, 0, 1])
    return features, labels
57 |
58 |
def test_train_and_evaluate_model(sample_data):
    """Each classifier trains on the toy data and is written to disk."""
    X_train, y_train = sample_data
    X_test, y_test = X_train.copy(), y_train.copy()

    classifiers = (
        (LogisticRegression(), "logistic_regression"),
        (RandomForestClassifier(n_estimators=10), "random_forest"),
        (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), "xgboost"),
    )
    for clf, name in classifiers:
        train_and_evaluate_model(X_train, X_test, y_train, y_test, clf, name)

    # All three pickles must exist afterwards.
    for name in ("logistic_regression", "random_forest", "xgboost"):
        assert os.path.exists(f'models/{name}.pkl')
79 |
80 |
def test_load_data(tmpdir):
    """load_data reads feature CSVs and finds the matching y_ files."""
    # Feature files (identical content for train and test).
    csv_body = "Age,BMI\n25,22.5\n35,24.5\n45,28.0\n55,30.0"
    train_file = tmpdir.join("X_train.csv")
    test_file = tmpdir.join("X_test.csv")
    train_file.write(csv_body)
    test_file.write(csv_body)

    # Matching target files (X_ -> y_ naming convention).
    tmpdir.join("y_train.csv").write("Diagnosis\n0\n1\n0\n1")
    tmpdir.join("y_test.csv").write("Diagnosis\n0\n1\n0\n1")

    X_train, X_test, y_train, y_test = load_data(str(train_file), str(test_file))

    # Shapes confirm both features and targets loaded correctly.
    assert X_train.shape == (4, 2)
    assert X_test.shape == (4, 2)
    assert y_train.shape == (4,)
    assert y_test.shape == (4,)
101 |
--------------------------------------------------------------------------------