├── file 3
├── file2
├── file1
├── README.md
├── README(1).md
└── hospital_readmission_prediction.ipynb

/file 3:
--------------------------------------------------------------------------------
# Assumes X_train, X_test, y_train, y_test are already defined (see file1).
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Build the model
mlp_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification
])

mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = mlp_model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate
mlp_loss, mlp_accuracy = mlp_model.evaluate(X_test, y_test)
y_pred_dl = (mlp_model.predict(X_test) > 0.5).astype("int32")

print("\nMLP Model Results:")
print("---------------------------")
print("Accuracy:", mlp_accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred_dl))

# Plot training history
plt.figure(figsize=(10, 4))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('MLP Training Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------------
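The script above reports accuracy but not the ROC AUC that the README lists among the evaluation metrics (the notebook does compute it). A minimal sketch of adding it here, assuming `mlp_model`, `X_test`, and `y_test` are in scope from the code above:

```python
from sklearn.metrics import roc_auc_score

# The sigmoid output is the positive-class probability directly
y_proba_dl = mlp_model.predict(X_test).ravel()
print("MLP ROC AUC:", roc_auc_score(y_test, y_proba_dl))
```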
/file2:
--------------------------------------------------------------------------------
# Assumes X_train, X_test, y_train, y_test are already defined (see file1).
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from xgboost import XGBClassifier

# Initialize models
log_reg = LogisticRegression()
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_clf = XGBClassifier(eval_metric='logloss')  # use_label_encoder is deprecated since XGBoost 1.6

# Dictionary to store results
results = {}

# Function to train, predict and evaluate models
def evaluate_model(name, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    print(f"\n{name} Results:")
    print("---------------------------")
    print("Accuracy:", acc)
    print("ROC AUC Score:", auc)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    results[name] = {'accuracy': acc, 'roc_auc': auc}

# Evaluate all models
evaluate_model("Logistic Regression", log_reg)
evaluate_model("Random Forest", rf_clf)
evaluate_model("XGBoost", xgb_clf)

# Plot comparison
plt.figure(figsize=(8, 5))
bars = plt.bar(results.keys(), [v['accuracy'] for v in results.values()])
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.ylim(0, 1)

# Annotate each bar with its accuracy value
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, f"{yval:.2f}", ha='center', fontsize=10)

plt.tight_layout()
plt.show()
--------------------------------------------------------------------------------
/file1:
--------------------------------------------------------------------------------
# Install dependencies first (shell command, not Python):
# pip install pandas numpy scikit-learn xgboost matplotlib seaborn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

# Set random seed
np.random.seed(42)

# Step 1: Generate Synthetic Data
n_samples = 1000
data = pd.DataFrame({
    'age': np.random.randint(18, 90, n_samples),
    'gender': np.random.choice(['Male', 'Female'], n_samples),
    'length_of_stay': np.random.randint(1, 30, n_samples),
    'num_prev_admissions': np.random.randint(0, 10, n_samples),
    'comorbidity_score': np.random.normal(loc=2, scale=1, size=n_samples).round(1),
    'has_diabetes': np.random.choice([0, 1], n_samples),
    'has_hypertension': np.random.choice([0, 1], n_samples),
    'discharged_to_home': np.random.choice([0, 1], n_samples),
    'readmitted_within_30_days': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
})

# Step 2: Preprocessing
# Encode gender as 0/1 (LabelEncoder sorts alphabetically: Female=0, Male=1)
data['gender'] = LabelEncoder().fit_transform(data['gender'])

# Features and target
X = data.drop('readmitted_within_30_days', axis=1)
y = data['readmitted_within_30_days']

# Standardize features (zero mean, unit variance)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

# Step 3: Train XGBoost Classifier
model = XGBClassifier(eval_metric='logloss')  # use_label_encoder is deprecated since XGBoost 1.6
model.fit(X_train, y_train)

# Step 4: Evaluate Model
y_pred = model.predict(X_test)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Step 5: Feature Importance Plot
plt.figure(figsize=(10, 6))
sns.barplot(x=model.feature_importances_, y=X.columns)
plt.title("Feature Importance - Hospital Readmission Prediction")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------------
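Once file1 has run, scoring a new patient means applying the same fitted scaler before predicting. A minimal sketch, assuming `scaler` and `model` from file1 are in scope; the patient values below are made up for illustration:

```python
import pandas as pd

# Hypothetical new patient; columns must match X's order and encoding
new_patient = pd.DataFrame([{
    'age': 72, 'gender': 1, 'length_of_stay': 12,
    'num_prev_admissions': 3, 'comorbidity_score': 2.8,
    'has_diabetes': 1, 'has_hypertension': 1, 'discharged_to_home': 0
}])

# Scale with the scaler fitted on the training data, then predict
risk = model.predict_proba(scaler.transform(new_patient))[0, 1]
print(f"Predicted 30-day readmission probability: {risk:.2f}")
```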
/README.md:
--------------------------------------------------------------------------------
# 🏥 Predicting Hospital Readmission Rates

This project demonstrates how to predict hospital readmission within 30 days using **synthetic data** and multiple machine learning models. The pipeline includes classical ML algorithms and a deep learning approach using Keras (TensorFlow).

---

## 📦 Project Structure

- `hospital_readmission_prediction.ipynb` – Jupyter Notebook with the full end-to-end workflow.
- `predict_hospital_readmission.py` – Python script version of the same workflow.
- `roc_curve_comparison.png` – Visualization of ROC curves comparing model performance.
- `hospital_readmission_report.txt` – Summary report of results and findings.

---

## 🧠 Models Included

| Model               | Description                               |
|---------------------|-------------------------------------------|
| Logistic Regression | Baseline linear model                     |
| Random Forest       | Ensemble-based model using decision trees |
| XGBoost             | Gradient boosting algorithm               |
| MLP (Keras)         | Deep learning model with dense layers     |

---

## 🗂 Features Used (Synthetic)

- Age
- Gender
- Length of Stay
- Number of Previous Admissions
- Comorbidity Score
- Diabetes Status
- Hypertension Status
- Discharge Destination (Home or Not)

---

## ⚙️ How to Run

1. Clone this repo or download the `.ipynb` and `.py` files.
2. Install dependencies:

   ```bash
   pip install pandas numpy scikit-learn xgboost tensorflow matplotlib seaborn
   ```

3. Run the notebook:

   ```bash
   jupyter notebook hospital_readmission_prediction.ipynb
   ```

   Or execute the script:

   ```bash
   python predict_hospital_readmission.py
   ```

---

## 📊 Output

- Evaluation metrics: Accuracy, ROC AUC, Precision, Recall, F1-score
- Visualization: Combined ROC Curve Comparison
- Summary report with key findings and future recommendations.

---

## 📌 Summary of Findings

- XGBoost and MLP performed best in terms of ROC AUC.
- MLP handled non-linear relationships effectively.
- Logistic Regression provided fast and interpretable results.
- The workflow is extendable to real-world EHR datasets like MIMIC-III.

---

## 📈 Future Work

- Incorporate real hospital datasets.
- Apply feature selection and hyperparameter tuning.
- Deploy as an API or interactive app using Flask or Streamlit (see the sketch after this README).

---

## 👨‍⚕️ Author

**Okes Imoni**
Email: jennyimoni@gmail.com

_Data Scientist & Health Tech Enthusiast_
📧 Contact me | 🌐 LinkedIn
--------------------------------------------------------------------------------
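The Future Work section above mentions deploying the model as an interactive app with Flask or Streamlit. A minimal hypothetical Streamlit sketch follows; the file names, the joblib-saved artifacts, and the input fields are all assumptions, not part of this repo (the current scripts never save the fitted scaler or model):

```python
# app.py - hypothetical Streamlit front end (not part of this repo)
import joblib
import pandas as pd
import streamlit as st

# Assumes the fitted scaler and XGBoost model were saved with joblib.dump
scaler = joblib.load("scaler.joblib")
model = joblib.load("readmission_model.joblib")

st.title("30-Day Readmission Risk")
age = st.number_input("Age", 18, 90, 65)
los = st.number_input("Length of stay (days)", 1, 30, 5)
prev = st.number_input("Previous admissions", 0, 10, 1)
comorbidity = st.number_input("Comorbidity score", 0.0, 6.0, 2.0)
gender = st.selectbox("Gender", ["Female", "Male"])
diabetes = st.checkbox("Diabetes")
hypertension = st.checkbox("Hypertension")
home = st.checkbox("Discharged to home")

if st.button("Predict"):
    # Same column order and encoding as the training data
    row = pd.DataFrame([{
        'age': age, 'gender': int(gender == "Male"), 'length_of_stay': los,
        'num_prev_admissions': prev, 'comorbidity_score': comorbidity,
        'has_diabetes': int(diabetes), 'has_hypertension': int(hypertension),
        'discharged_to_home': int(home)
    }])
    risk = model.predict_proba(scaler.transform(row))[0, 1]
    st.metric("Readmission probability", f"{risk:.0%}")
```

Run locally with `streamlit run app.py`.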
/README(1).md:
--------------------------------------------------------------------------------
# 🏥 Predicting Hospital Readmission Rates

This project demonstrates how to predict hospital readmission within 30 days using **synthetic data** and multiple machine learning models. The pipeline includes classical ML algorithms and a deep learning approach using Keras (TensorFlow).

---

## 📦 Project Structure

- `hospital_readmission_prediction.ipynb` – Jupyter Notebook with the full end-to-end workflow.
- `predict_hospital_readmission.py` – Python script version of the same workflow.
- `roc_curve_comparison.png` – Visualization of ROC curves comparing model performance.
- `hospital_readmission_report.txt` – Summary report of results and findings.

---

## 🧠 Models Included

| Model               | Description                               |
|---------------------|-------------------------------------------|
| Logistic Regression | Baseline linear model                     |
| Random Forest       | Ensemble-based model using decision trees |
| XGBoost             | Gradient boosting algorithm               |
| MLP (Keras)         | Deep learning model with dense layers     |

---

## 🗂 Features Used (Synthetic)

- Age
- Gender
- Length of Stay
- Number of Previous Admissions
- Comorbidity Score
- Diabetes Status
- Hypertension Status
- Discharge Destination (Home or Not)

---

## ⚙️ How to Run

1. Clone this repo or download the `.ipynb` and `.py` files.
2. Install dependencies:

   ```bash
   pip install pandas numpy scikit-learn xgboost tensorflow matplotlib seaborn
   ```

3. Run the notebook:

   ```bash
   jupyter notebook hospital_readmission_prediction.ipynb
   ```

   Or execute the script:

   ```bash
   python predict_hospital_readmission.py
   ```

---

## 📊 Output

- Evaluation metrics: **Accuracy, ROC AUC, Precision, Recall, F1-score**
- Visualization: Combined **ROC Curve Comparison**
- Summary report with key findings and future recommendations.

---

## 📌 Summary of Findings

- XGBoost and MLP performed best in terms of ROC AUC.
- MLP handled non-linear relationships effectively.
- Logistic Regression provided fast and interpretable results.
- The workflow is extendable to real-world EHR datasets like MIMIC-III.

---

## 📈 Future Work

- Incorporate real hospital datasets.
- Apply feature selection and hyperparameter tuning (a tuning sketch follows this file).
- Deploy as an API or interactive app using Flask or Streamlit.

---

## 👨‍⚕️ Author

**Agbozu Ebingiye Nelvin**
_Data Scientist & Health Tech Enthusiast_
📧 [Contact me](mailto:your-email@example.com) | 🌐 [LinkedIn](https://www.linkedin.com)
--------------------------------------------------------------------------------
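Both READMEs list hyperparameter tuning as future work. A minimal sketch using scikit-learn's `GridSearchCV` over the XGBoost model, assuming `X_train` and `y_train` from file1; the parameter grid is illustrative, not tuned:

```python
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Illustrative grid; expand or switch to RandomizedSearchCV for larger spaces
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1],
}
search = GridSearchCV(
    XGBClassifier(eval_metric='logloss'),
    param_grid,
    scoring='roc_auc',  # matches the metric the project reports
    cv=5,
    n_jobs=-1,
)
search.fit(X_train, y_train)
print("Best params:", search.best_params_)
print("Best CV ROC AUC:", search.best_score_)
```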
/hospital_readmission_prediction.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c61de6c8",
   "metadata": {},
   "source": [
    "# 🏥 Predicting Hospital Readmission Rates\n",
    "An end-to-end machine learning + deep learning pipeline using synthetic hospital data.\n",
    "Models used: Logistic Regression, Random Forest, XGBoost, MLP (TensorFlow).\n",
    "Includes: data generation, preprocessing, training, evaluation, ROC comparison."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "007728e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc\n",
    "\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense, Dropout\n",
    "from tensorflow.keras.callbacks import EarlyStopping\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "866da584",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate synthetic data\n",
    "np.random.seed(42)\n",
    "n_samples = 1000\n",
    "data = pd.DataFrame({\n",
    "    'age': np.random.randint(18, 90, n_samples),\n",
    "    'gender': np.random.choice(['Male', 'Female'], n_samples),\n",
    "    'length_of_stay': np.random.randint(1, 30, n_samples),\n",
    "    'num_prev_admissions': np.random.randint(0, 10, n_samples),\n",
    "    'comorbidity_score': np.random.normal(loc=2, scale=1, size=n_samples).round(1),\n",
    "    'has_diabetes': np.random.choice([0, 1], n_samples),\n",
    "    'has_hypertension': np.random.choice([0, 1], n_samples),\n",
    "    'discharged_to_home': np.random.choice([0, 1], n_samples),\n",
    "    'readmitted_within_30_days': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])\n",
    "})\n",
    "data['gender'] = LabelEncoder().fit_transform(data['gender'])\n",
    "\n",
    "# Split data\n",
    "X = data.drop('readmitted_within_30_days', axis=1)\n",
    "y = data['readmitted_within_30_days']\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, stratify=y, random_state=42)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a561c9c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train classical ML models\n",
    "models = {\n",
    "    \"Logistic Regression\": LogisticRegression(),\n",
    "    \"Random Forest\": RandomForestClassifier(n_estimators=100, random_state=42),\n",
    "    \"XGBoost\": XGBClassifier(eval_metric='logloss')  # use_label_encoder is deprecated since XGBoost 1.6\n",
    "}\n",
    "roc_data = {}\n",
    "for name, model in models.items():\n",
    "    model.fit(X_train, y_train)\n",
    "    y_proba = model.predict_proba(X_test)[:, 1]\n",
    "    fpr, tpr, _ = roc_curve(y_test, y_proba)\n",
    "    roc_data[name] = (fpr, tpr, auc(fpr, tpr))\n",
    "    print(f\"\\n{name} Accuracy: {accuracy_score(y_test, model.predict(X_test)):.2f}\")\n",
    "    print(classification_report(y_test, model.predict(X_test)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "001bf42b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train MLP with TensorFlow\n",
    "mlp_model = Sequential([\n",
    "    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),\n",
    "    Dropout(0.3),\n",
    "    Dense(32, activation='relu'),\n",
    "    Dropout(0.3),\n",
    "    Dense(1, activation='sigmoid')\n",
    "])\n",
    "mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n",
    "early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)\n",
    "history = mlp_model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32, callbacks=[early_stop], verbose=0)\n",
    "\n",
    "# Evaluate and store ROC\n",
    "y_proba_dl = mlp_model.predict(X_test).ravel()\n",
    "fpr_dl, tpr_dl, _ = roc_curve(y_test, y_proba_dl)\n",
    "roc_data[\"MLP (Deep Learning)\"] = (fpr_dl, tpr_dl, auc(fpr_dl, tpr_dl))\n",
    "\n",
    "print(f\"\\nMLP Accuracy: {mlp_model.evaluate(X_test, y_test, verbose=0)[1]:.2f}\")\n",
    "y_pred_dl = (y_proba_dl > 0.5).astype('int')\n",
    "print(classification_report(y_test, y_pred_dl))\n"
   ]
  },
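  {
   "cell_type": "markdown",
   "id": "b7e1d2a0",
   "metadata": {},
   "source": [
    "The script version (file 3) also plots the MLP's training/validation loss, which the notebook omitted. A minimal sketch of the same plot, assuming `history` from the cell above (this cell's id is arbitrary):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4c3b2a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot MLP training history (mirrors the loss plot in file 3)\n",
    "plt.figure(figsize=(10, 4))\n",
    "plt.plot(history.history['loss'], label='Train Loss')\n",
    "plt.plot(history.history['val_loss'], label='Val Loss')\n",
    "plt.title('MLP Training Loss Over Epochs')\n",
    "plt.xlabel('Epoch')\n",
    "plt.ylabel('Loss')\n",
    "plt.legend()\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },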
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fdaaefa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ROC Curve Comparison\n",
    "plt.figure(figsize=(10, 6))\n",
    "for name, (fpr, tpr, roc_auc) in roc_data.items():\n",
    "    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')\n",
    "plt.plot([0, 1], [0, 1], 'k--')\n",
    "plt.title('ROC Curve Comparison')\n",
    "plt.xlabel('False Positive Rate')\n",
    "plt.ylabel('True Positive Rate')\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "plt.savefig('roc_curve_comparison.png')  # the figure referenced in the README\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f4bcf232",
   "metadata": {},
   "source": [
    "### 📊 Summary\n",
    "- XGBoost and MLP performed best in terms of ROC AUC.\n",
    "- MLP captured nonlinear relationships but took longer to train.\n",
    "- Logistic Regression gave fast, interpretable results.\n",
    "- Future work: use real patient data (e.g. MIMIC-III), hyperparameter tuning, feature engineering."
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}
--------------------------------------------------------------------------------