├── file 3
├── file2
├── file1
├── README.md
├── README(1).md
└── hospital_readmission_prediction.ipynb

/file 3:
--------------------------------------------------------------------------------
# Assumes X_train, X_test, y_train, y_test are already defined (see file1).
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Build the model
mlp_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification
])

mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = mlp_model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate
mlp_loss, mlp_accuracy = mlp_model.evaluate(X_test, y_test)
y_pred_dl = (mlp_model.predict(X_test) > 0.5).astype("int32")

print("\nMLP Model Results:")
print("---------------------------")
print("Accuracy:", mlp_accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred_dl))

# Plot training history
plt.figure(figsize=(10, 4))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('MLP Training Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------------
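The script above reports accuracy but not the ROC AUC that the README lists among the evaluation metrics (the notebook does compute it). A minimal sketch of adding it here, assuming `mlp_model`, `X_test`, and `y_test` are in scope from the code above:

```python
from sklearn.metrics import roc_auc_score

# The sigmoid output is the positive-class probability directly
y_proba_dl = mlp_model.predict(X_test).ravel()
print("MLP ROC AUC:", roc_auc_score(y_test, y_proba_dl))
```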
/file2:
--------------------------------------------------------------------------------
# Assumes X_train, X_test, y_train, y_test are already defined (see file1).
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from xgboost import XGBClassifier

# Initialize models
log_reg = LogisticRegression()
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_clf = XGBClassifier(eval_metric='logloss')  # use_label_encoder is deprecated since XGBoost 1.6

# Dictionary to store results
results = {}

# Function to train, predict and evaluate models
def evaluate_model(name, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    print(f"\n{name} Results:")
    print("---------------------------")
    print("Accuracy:", acc)
    print("ROC AUC Score:", auc)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    results[name] = {'accuracy': acc, 'roc_auc': auc}

# Evaluate all models
evaluate_model("Logistic Regression", log_reg)
evaluate_model("Random Forest", rf_clf)
evaluate_model("XGBoost", xgb_clf)

# Plot comparison
plt.figure(figsize=(8, 5))
bars = plt.bar(results.keys(), [v['accuracy'] for v in results.values()])
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.ylim(0, 1)

# Annotate each bar with its accuracy value
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, f"{yval:.2f}", ha='center', fontsize=10)

plt.tight_layout()
plt.show()
--------------------------------------------------------------------------------
/file1:
--------------------------------------------------------------------------------
# Install dependencies first (shell command, not Python):
# pip install pandas numpy scikit-learn xgboost matplotlib seaborn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

# Set random seed
np.random.seed(42)

# Step 1: Generate Synthetic Data
n_samples = 1000
data = pd.DataFrame({
    'age': np.random.randint(18, 90, n_samples),
    'gender': np.random.choice(['Male', 'Female'], n_samples),
    'length_of_stay': np.random.randint(1, 30, n_samples),
    'num_prev_admissions': np.random.randint(0, 10, n_samples),
    'comorbidity_score': np.random.normal(loc=2, scale=1, size=n_samples).round(1),
    'has_diabetes': np.random.choice([0, 1], n_samples),
    'has_hypertension': np.random.choice([0, 1], n_samples),
    'discharged_to_home': np.random.choice([0, 1], n_samples),
    'readmitted_within_30_days': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
})

# Step 2: Preprocessing
# Encode gender as 0/1 (LabelEncoder sorts alphabetically: Female=0, Male=1)
data['gender'] = LabelEncoder().fit_transform(data['gender'])

# Features and target
X = data.drop('readmitted_within_30_days', axis=1)
y = data['readmitted_within_30_days']

# Standardize features (zero mean, unit variance)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

# Step 3: Train XGBoost Classifier
model = XGBClassifier(eval_metric='logloss')  # use_label_encoder is deprecated since XGBoost 1.6
model.fit(X_train, y_train)

# Step 4: Evaluate Model
y_pred = model.predict(X_test)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Step 5: Feature Importance Plot
plt.figure(figsize=(10, 6))
sns.barplot(x=model.feature_importances_, y=X.columns)
plt.title("Feature Importance - Hospital Readmission Prediction")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------------
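Once file1 has run, scoring a new patient means applying the same fitted scaler before predicting. A minimal sketch, assuming `scaler` and `model` from file1 are in scope; the patient values below are made up for illustration:

```python
import pandas as pd

# Hypothetical new patient; columns must match X's order and encoding
new_patient = pd.DataFrame([{
    'age': 72, 'gender': 1, 'length_of_stay': 12,
    'num_prev_admissions': 3, 'comorbidity_score': 2.8,
    'has_diabetes': 1, 'has_hypertension': 1, 'discharged_to_home': 0
}])

# Scale with the scaler fitted on the training data, then predict
risk = model.predict_proba(scaler.transform(new_patient))[0, 1]
print(f"Predicted 30-day readmission probability: {risk:.2f}")
```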
/README.md:
--------------------------------------------------------------------------------
# 🏥 Predicting Hospital Readmission Rates

This project demonstrates how to predict hospital readmission within 30 days using **synthetic data** and multiple machine learning models. The pipeline includes classical ML algorithms and a deep learning approach using Keras (TensorFlow).

---

## 📦 Project Structure

- `hospital_readmission_prediction.ipynb` – Jupyter Notebook with the full end-to-end workflow.
- `predict_hospital_readmission.py` – Python script version of the same workflow.
- `roc_curve_comparison.png` – Visualization of ROC curves comparing model performance.
- `hospital_readmission_report.txt` – Summary report of results and findings.

---

## 🧠 Models Included

| Model               | Description                               |
|---------------------|-------------------------------------------|
| Logistic Regression | Baseline linear model                     |
| Random Forest       | Ensemble-based model using decision trees |
| XGBoost             | Gradient boosting algorithm               |
| MLP (Keras)         | Deep learning model with dense layers     |

---

## 🗂 Features Used (Synthetic)

- Age
- Gender
- Length of Stay
- Number of Previous Admissions
- Comorbidity Score
- Diabetes Status
- Hypertension Status
- Discharge Destination (Home or Not)

---

## ⚙️ How to Run

1. Clone this repo or download the `.ipynb` and `.py` files.
2. Install dependencies:

   ```bash
   pip install pandas numpy scikit-learn xgboost tensorflow matplotlib seaborn
   ```

3. Run the notebook:

   ```bash
   jupyter notebook hospital_readmission_prediction.ipynb
   ```

   Or execute the script:

   ```bash
   python predict_hospital_readmission.py
   ```

---

## 📊 Output

- Evaluation metrics: Accuracy, ROC AUC, Precision, Recall, F1-score
- Visualization: Combined ROC Curve Comparison
- Summary report with key findings and future recommendations.

---

## 📌 Summary of Findings

- XGBoost and MLP performed best in terms of ROC AUC.
- MLP handled non-linear relationships effectively.
- Logistic Regression provided fast and interpretable results.
- The workflow is extendable to real-world EHR datasets like MIMIC-III.

---

## 📈 Future Work

- Incorporate real hospital datasets.
- Apply feature selection and hyperparameter tuning.
- Deploy as an API or interactive app using Flask or Streamlit (see the sketch after this README).

---

## 👨‍⚕️ Author

**Okes Imoni**
Email: jennyimoni@gmail.com

_Data Scientist & Health Tech Enthusiast_
📧 Contact me | 🌐 LinkedIn
--------------------------------------------------------------------------------
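The Future Work section above mentions deploying the model as an interactive app with Flask or Streamlit. A minimal hypothetical Streamlit sketch follows; the file names, the joblib-saved artifacts, and the input fields are all assumptions, not part of this repo (the current scripts never save the fitted scaler or model):

```python
# app.py - hypothetical Streamlit front end (not part of this repo)
import joblib
import pandas as pd
import streamlit as st

# Assumes the fitted scaler and XGBoost model were saved with joblib.dump
scaler = joblib.load("scaler.joblib")
model = joblib.load("readmission_model.joblib")

st.title("30-Day Readmission Risk")
age = st.number_input("Age", 18, 90, 65)
los = st.number_input("Length of stay (days)", 1, 30, 5)
prev = st.number_input("Previous admissions", 0, 10, 1)
comorbidity = st.number_input("Comorbidity score", 0.0, 6.0, 2.0)
gender = st.selectbox("Gender", ["Female", "Male"])
diabetes = st.checkbox("Diabetes")
hypertension = st.checkbox("Hypertension")
home = st.checkbox("Discharged to home")

if st.button("Predict"):
    # Same column order and encoding as the training data
    row = pd.DataFrame([{
        'age': age, 'gender': int(gender == "Male"), 'length_of_stay': los,
        'num_prev_admissions': prev, 'comorbidity_score': comorbidity,
        'has_diabetes': int(diabetes), 'has_hypertension': int(hypertension),
        'discharged_to_home': int(home)
    }])
    risk = model.predict_proba(scaler.transform(row))[0, 1]
    st.metric("Readmission probability", f"{risk:.0%}")
```

Run locally with `streamlit run app.py`.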
/README(1).md:
--------------------------------------------------------------------------------
# 🏥 Predicting Hospital Readmission Rates

This project demonstrates how to predict hospital readmission within 30 days using **synthetic data** and multiple machine learning models. The pipeline includes classical ML algorithms and a deep learning approach using Keras (TensorFlow).

---

## 📦 Project Structure

- `hospital_readmission_prediction.ipynb` – Jupyter Notebook with the full end-to-end workflow.
- `predict_hospital_readmission.py` – Python script version of the same workflow.
- `roc_curve_comparison.png` – Visualization of ROC curves comparing model performance.
- `hospital_readmission_report.txt` – Summary report of results and findings.

---

## 🧠 Models Included

| Model               | Description                               |
|---------------------|-------------------------------------------|
| Logistic Regression | Baseline linear model                     |
| Random Forest       | Ensemble-based model using decision trees |
| XGBoost             | Gradient boosting algorithm               |
| MLP (Keras)         | Deep learning model with dense layers     |

---

## 🗂 Features Used (Synthetic)

- Age
- Gender
- Length of Stay
- Number of Previous Admissions
- Comorbidity Score
- Diabetes Status
- Hypertension Status
- Discharge Destination (Home or Not)

---

## ⚙️ How to Run

1. Clone this repo or download the `.ipynb` and `.py` files.
2. Install dependencies:

   ```bash
   pip install pandas numpy scikit-learn xgboost tensorflow matplotlib seaborn
   ```

3. Run the notebook:

   ```bash
   jupyter notebook hospital_readmission_prediction.ipynb
   ```

   Or execute the script:

   ```bash
   python predict_hospital_readmission.py
   ```

---

## 📊 Output

- Evaluation metrics: **Accuracy, ROC AUC, Precision, Recall, F1-score**
- Visualization: Combined **ROC Curve Comparison**
- Summary report with key findings and future recommendations.

---

## 📌 Summary of Findings

- XGBoost and MLP performed best in terms of ROC AUC.
- MLP handled non-linear relationships effectively.
- Logistic Regression provided fast and interpretable results.
- The workflow is extendable to real-world EHR datasets like MIMIC-III.

---

## 📈 Future Work

- Incorporate real hospital datasets.
- Apply feature selection and hyperparameter tuning (a tuning sketch follows this file).
- Deploy as an API or interactive app using Flask or Streamlit.

---

## 👨‍⚕️ Author

**Agbozu Ebingiye Nelvin**
_Data Scientist & Health Tech Enthusiast_
📧 [Contact me](mailto:your-email@example.com) | 🌐 [LinkedIn](https://www.linkedin.com)
--------------------------------------------------------------------------------
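Both READMEs list hyperparameter tuning as future work. A minimal sketch using scikit-learn's `GridSearchCV` over the XGBoost model, assuming `X_train` and `y_train` from file1; the parameter grid is illustrative, not tuned:

```python
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Illustrative grid; expand or switch to RandomizedSearchCV for larger spaces
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1],
}
search = GridSearchCV(
    XGBClassifier(eval_metric='logloss'),
    param_grid,
    scoring='roc_auc',  # matches the metric the project reports
    cv=5,
    n_jobs=-1,
)
search.fit(X_train, y_train)
print("Best params:", search.best_params_)
print("Best CV ROC AUC:", search.best_score_)
```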
/hospital_readmission_prediction.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c61de6c8",
   "metadata": {},
   "source": [
    "# 🏥 Predicting Hospital Readmission Rates\n",
    "An end-to-end machine learning + deep learning pipeline using synthetic hospital data.\n",
    "Models used: Logistic Regression, Random Forest, XGBoost, MLP (TensorFlow).\n",
    "Includes: data generation, preprocessing, training, evaluation, ROC comparison."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "007728e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc\n",
    "\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense, Dropout\n",
    "from tensorflow.keras.callbacks import EarlyStopping\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "866da584",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate synthetic data\n",
    "np.random.seed(42)\n",
    "n_samples = 1000\n",
    "data = pd.DataFrame({\n",
    "    'age': np.random.randint(18, 90, n_samples),\n",
    "    'gender': np.random.choice(['Male', 'Female'], n_samples),\n",
    "    'length_of_stay': np.random.randint(1, 30, n_samples),\n",
    "    'num_prev_admissions': np.random.randint(0, 10, n_samples),\n",
    "    'comorbidity_score': np.random.normal(loc=2, scale=1, size=n_samples).round(1),\n",
    "    'has_diabetes': np.random.choice([0, 1], n_samples),\n",
    "    'has_hypertension': np.random.choice([0, 1], n_samples),\n",
    "    'discharged_to_home': np.random.choice([0, 1], n_samples),\n",
    "    'readmitted_within_30_days': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])\n",
    "})\n",
    "data['gender'] = LabelEncoder().fit_transform(data['gender'])\n",
    "\n",
    "# Split data\n",
    "X = data.drop('readmitted_within_30_days', axis=1)\n",
    "y = data['readmitted_within_30_days']\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, stratify=y, random_state=42)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a561c9c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train classical ML models\n",
    "models = {\n",
    "    \"Logistic Regression\": LogisticRegression(),\n",
    "    \"Random Forest\": RandomForestClassifier(n_estimators=100, random_state=42),\n",
    "    \"XGBoost\": XGBClassifier(eval_metric='logloss')  # use_label_encoder is deprecated since XGBoost 1.6\n",
    "}\n",
    "roc_data = {}\n",
    "for name, model in models.items():\n",
    "    model.fit(X_train, y_train)\n",
    "    y_proba = model.predict_proba(X_test)[:, 1]\n",
    "    fpr, tpr, _ = roc_curve(y_test, y_proba)\n",
    "    roc_data[name] = (fpr, tpr, auc(fpr, tpr))\n",
    "    print(f\"\\n{name} Accuracy: {accuracy_score(y_test, model.predict(X_test)):.2f}\")\n",
    "    print(classification_report(y_test, model.predict(X_test)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "001bf42b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train MLP with TensorFlow\n",
    "mlp_model = Sequential([\n",
    "    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),\n",
    "    Dropout(0.3),\n",
    "    Dense(32, activation='relu'),\n",
    "    Dropout(0.3),\n",
    "    Dense(1, activation='sigmoid')\n",
    "])\n",
    "mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n",
    "early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)\n",
    "history = mlp_model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32, callbacks=[early_stop], verbose=0)\n",
    "\n",
    "# Evaluate and store ROC\n",
    "y_proba_dl = mlp_model.predict(X_test).ravel()\n",
    "fpr_dl, tpr_dl, _ = roc_curve(y_test, y_proba_dl)\n",
    "roc_data[\"MLP (Deep Learning)\"] = (fpr_dl, tpr_dl, auc(fpr_dl, tpr_dl))\n",
    "\n",
    "print(f\"\\nMLP Accuracy: {mlp_model.evaluate(X_test, y_test, verbose=0)[1]:.2f}\")\n",
    "y_pred_dl = (y_proba_dl > 0.5).astype('int')\n",
    "print(classification_report(y_test, y_pred_dl))\n"
   ]
  },
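  {
   "cell_type": "markdown",
   "id": "b7e1d2a0",
   "metadata": {},
   "source": [
    "The script version (file 3) also plots the MLP's training/validation loss, which the notebook omitted. A minimal sketch of the same plot, assuming `history` from the cell above (this cell's id is arbitrary):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4c3b2a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot MLP training history (mirrors the loss plot in file 3)\n",
    "plt.figure(figsize=(10, 4))\n",
    "plt.plot(history.history['loss'], label='Train Loss')\n",
    "plt.plot(history.history['val_loss'], label='Val Loss')\n",
    "plt.title('MLP Training Loss Over Epochs')\n",
    "plt.xlabel('Epoch')\n",
    "plt.ylabel('Loss')\n",
    "plt.legend()\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },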
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fdaaefa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ROC Curve Comparison\n",
    "plt.figure(figsize=(10, 6))\n",
    "for name, (fpr, tpr, roc_auc) in roc_data.items():\n",
    "    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')\n",
    "plt.plot([0, 1], [0, 1], 'k--')\n",
    "plt.title('ROC Curve Comparison')\n",
    "plt.xlabel('False Positive Rate')\n",
    "plt.ylabel('True Positive Rate')\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "plt.savefig('roc_curve_comparison.png')  # the figure referenced in the README\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f4bcf232",
   "metadata": {},
   "source": [
    "### 📊 Summary\n",
    "- XGBoost and MLP performed best in terms of ROC AUC.\n",
    "- MLP captured nonlinear relationships but took longer to train.\n",
    "- Logistic Regression gave fast, interpretable results.\n",
    "- Future work: use real patient data (e.g. MIMIC-III), hyperparameter tuning, feature engineering."
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}
--------------------------------------------------------------------------------