├── vr_exposure_therapy_data.xlsx ├── README.md ├── VR_Exposure_Therapy_Analysis.ipynb └── file /vr_exposure_therapy_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Okes2024/VR-Data-Science-for-Exposure-Therapy-Analysis/HEAD/vr_exposure_therapy_data.xlsx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 📊 Features 2 | Generates synthetic VR therapy session data with over 100 data points. 3 | 4 | Analyzes anxiety levels, heart rate, and engagement metrics. 5 | 6 | Provides visual insights for therapy progress tracking. 7 | 8 | Implements statistical summaries for informed decision-making. 9 | 10 | 📁 Dataset 11 | The dataset includes: 12 | 13 | SessionID 14 | 15 | PatientID 16 | 17 | AnxietyLevelBefore / AnxietyLevelAfter 18 | 19 | HeartRateBefore / HeartRateAfter 20 | 21 | EngagementScore 22 | 23 | SessionDuration 24 | 25 | 🚀 Technologies Used 26 | Python (Pandas, NumPy, Matplotlib, Seaborn) 27 | 28 | Jupyter Notebook 29 | 30 | Synthetic Data Generation Techniques 31 | 32 | 📥 How to Use 33 | Clone the repository: 34 | 35 | ```bash 36 | git clone https://github.com/Okes2024/VR-Data-Science-for-Exposure-Therapy-Analysis.git 37 | ``` 38 | 39 | Install required libraries (openpyxl is needed by pandas to read the .xlsx dataset): 40 | 41 | ```bash 42 | pip install pandas numpy matplotlib seaborn openpyxl 43 | ``` 44 | 45 | Open the Jupyter Notebook: 46 | 47 | ```bash 48 | jupyter notebook VR_Exposure_Therapy_Analysis.ipynb 49 | ``` 50 | 51 | 👨‍💻 Author 52 | Name: Okes Imoni 53 | GitHub: Okes2024 54 | -------------------------------------------------------------------------------- /VR_Exposure_Therapy_Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "b2f53bdd", 7 | "metadata": {}, 8 | 
"outputs": [], 9 | "source": [ 10 | "\n", 11 | "import pandas as pd\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import seaborn as sns\n", 14 | "\n", 15 | "# Load dataset\n", 16 | "df = pd.read_excel('vr_exposure_therapy_data.xlsx')\n", 17 | "\n", 18 | "# Display first few rows\n", 19 | "print(df.head())\n", 20 | "\n", 21 | "# Summary statistics\n", 22 | "print(df.describe())\n", 23 | "\n", 24 | "# Distribution of Exposure Levels\n", 25 | "plt.figure(figsize=(6,4))\n", 26 | "sns.countplot(x='ExposureLevel', data=df, palette='Set2')\n", 27 | "plt.title('Distribution of Exposure Levels')\n", 28 | "plt.show()\n", 29 | "\n", 30 | "# Heart Rate vs Stress Level\n", 31 | "plt.figure(figsize=(6,4))\n", 32 | "sns.scatterplot(x='HeartRate', y='StressLevel', hue='ExposureLevel', data=df, palette='Set1')\n", 33 | "plt.title('Heart Rate vs Stress Level by Exposure Level')\n", 34 | "plt.show()\n", 35 | "\n", 36 | "# Improvement Score distribution\n", 37 | "plt.figure(figsize=(6,4))\n", 38 | "sns.histplot(df['ImprovementScore'], bins=10, kde=True, color='blue')\n", 39 | "plt.title('Improvement Score Distribution')\n", 40 | "plt.show()\n" 41 | ] 42 | } 43 | ], 44 | "metadata": {}, 45 | "nbformat": 4, 46 | "nbformat_minor": 5 47 | } 48 | -------------------------------------------------------------------------------- /file: -------------------------------------------------------------------------------- 1 | """ 2 | VR-Data-Science-for-Exposure-Therapy-Analysis 3 | Synthetic dataset generation (>500 samples), EDA, basic modeling (classification + regression), 4 | and export of CSV + visualizations. 
5 | 6 | Save as: vr_exposure_analysis.py 7 | Requires: numpy, pandas, scikit-learn, matplotlib, seaborn 8 | Install: pip install numpy pandas scikit-learn matplotlib seaborn 9 | """ 10 | 11 | import os 12 | import random 13 | from pathlib import Path 14 | from datetime import datetime, timedelta 15 | 16 | import numpy as np 17 | import pandas as pd 18 | import matplotlib.pyplot as plt 19 | import seaborn as sns 20 | 21 | from sklearn.model_selection import train_test_split 22 | from sklearn.preprocessing import StandardScaler 23 | from sklearn.pipeline import make_pipeline 24 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 25 | from sklearn.metrics import ( 26 | accuracy_score, classification_report, confusion_matrix, 27 | mean_squared_error, r2_score 28 | ) 29 | 30 | # ------------------ Settings ------------------ 31 | RANDOM_SEED = 42 32 | NUM_PARTICIPANTS = 150 33 | SESSIONS_PER_PARTICIPANT = 4 # average sessions each 34 | NUM_SAMPLES = NUM_PARTICIPANTS * SESSIONS_PER_PARTICIPANT # 600 samples 35 | OUTPUT_CSV = "vr_exposure_synthetic.csv" 36 | PLOTS_DIR = Path("plots") 37 | PLOTS_DIR.mkdir(parents=True, exist_ok=True) 38 | np.random.seed(RANDOM_SEED) 39 | random.seed(RANDOM_SEED) 40 | 41 | # ------------------ Synthetic data design ------------------ 42 | # Features to simulate: 43 | # - ParticipantID, Age, Gender 44 | # - BaselineAnxiety (0-100), BaselineSUDS (0-10) 45 | # - SessionNumber (1..n), Environment (phobia type), ExposureIntensity (0-1) 46 | # - AvgHeartRate (bpm), PeakHeartRate, GSR_mean, GSR_peak 47 | # - HeadRotationMean, HeadRotationStd (degrees) 48 | # - TimeInHotzone (seconds), TasksCompleted (0..5) 49 | # - TherapistPresent (Yes/No), SessionDuration (sec) 50 | # - SelfReportedSUDS_post (0-10), AnxietyScore_post (0-100) 51 | # - ImprovementBinary (1 if clinically meaningful improvement), ImprovementDelta (baseline - post) 52 | 53 | environments = [ 54 | "Heights", "Crowds", "PublicSpeaking", "ClosedSpaces", 
"Animals", "Driving", "SocialInteraction" 55 | ] 56 | genders = ["Male", "Female", "Other"] 57 | 58 | def generate_participant(part_id): 59 | """Generate stable participant-level attributes.""" 60 | age = int(np.clip(np.random.normal(34, 12), 18, 75)) 61 | gender = random.choices(genders, weights=[0.48, 0.48, 0.04])[0] 62 | baseline_anxiety = float(np.clip(np.random.normal(65 - (age-30)*0.2, 12), 20, 95)) 63 | baseline_suds = float(np.clip((baseline_anxiety / 10.0) + np.random.normal(0, 1.2), 0, 10)) 64 | trait_sensitivity = float(np.clip(np.random.beta(2, 4) * 1.5, 0.05, 1.0)) # participant sensitivity multiplier 65 | return {"ParticipantID": f"P{1000+part_id}", "Age": age, "Gender": gender, 66 | "BaselineAnxiety": round(baseline_anxiety, 1), 67 | "BaselineSUDS": round(baseline_suds, 1), 68 | "TraitSensitivity": round(trait_sensitivity, 3)} 69 | 70 | rows = [] 71 | start_date = datetime.now() - timedelta(days=300) 72 | 73 | for pid in range(NUM_PARTICIPANTS): 74 | p = generate_participant(pid) 75 | # assign a favored environment per participant (simulate targeted therapy) 76 | fav_env = random.choice(environments) 77 | for session in range(1, SESSIONS_PER_PARTICIPANT + 1): 78 | # session-level variables 79 | env = fav_env if random.random() < 0.75 else random.choice(environments) 80 | exposure_intensity = float(np.clip(np.random.beta(2 + session*0.2, 3 - session*0.05) , 0.05, 0.99)) 81 | therapist_present = random.random() < 0.85 82 | session_duration = int(np.clip(np.random.normal(900 + session*20, 120), 300, 1800)) # seconds 83 | time_in_hotzone = int(np.clip(session_duration * exposure_intensity * np.random.uniform(0.4, 0.95), 10, session_duration)) 84 | tasks_completed = int(np.clip(np.random.poisson(2 + session*0.2), 0, 6)) 85 | # physiological signals (simulate increases proportional to exposure and trait sensitivity) 86 | hr_baseline = 60 + (p["Age"] - 30)*0.2 + np.random.normal(0, 3) 87 | avg_hr = hr_baseline + (exposure_intensity * 25 * 
p["TraitSensitivity"]) + np.random.normal(0, 5) 88 | peak_hr = max(avg_hr + np.random.normal(8, 6), avg_hr + 3) 89 | gsr_mean = float(np.clip(0.2 + exposure_intensity * 2.5 * p["TraitSensitivity"] + np.random.normal(0, 0.15), 0.05, 6.0)) 90 | gsr_peak = float(np.clip(gsr_mean + abs(np.random.normal(0.4, 0.3)), 0.1, 10.0)) 91 | head_rot_mean = float(np.clip(np.random.normal(10 + exposure_intensity*30, 8), 0, 120)) 92 | head_rot_std = float(np.clip(np.random.normal(5 + exposure_intensity*8, 3), 0.5, 60)) 93 | # immediate subjective distress after session 94 | suds_post = float(np.clip(p["BaselineSUDS"] * (1 - 0.08*session) + (exposure_intensity * 2.5) * p["TraitSensitivity"] 95 | + np.random.normal(0, 0.9), 0, 10)) 96 | anxiety_post = float(np.clip(p["BaselineAnxiety"] * (1 - 0.07*session) + exposure_intensity * 8 * p["TraitSensitivity"] 97 | + np.random.normal(0, 6), 0, 100)) 98 | # improvement metrics (delta) 99 | improvement_delta = p["BaselineAnxiety"] - anxiety_post 100 | # mark clinically meaningful improvement if delta >= 10 points OR percentage reduction >= 15% 101 | improved_binary = int((improvement_delta >= 10) or (improvement_delta / max(1.0, p["BaselineAnxiety"]) >= 0.15)) 102 | # session timestamp 103 | session_date = start_date + timedelta(days=random.randint(0, 300), hours=random.randint(0,23), minutes=random.randint(0,59)) 104 | rows.append({ 105 | "ParticipantID": p["ParticipantID"], 106 | "SessionDate": session_date.isoformat(sep=' '), 107 | "Age": p["Age"], 108 | "Gender": p["Gender"], 109 | "BaselineAnxiety": p["BaselineAnxiety"], 110 | "BaselineSUDS": p["BaselineSUDS"], 111 | "Environment": env, 112 | "SessionNumber": session, 113 | "ExposureIntensity": round(exposure_intensity, 3), 114 | "TherapistPresent": "Yes" if therapist_present else "No", 115 | "SessionDuration_sec": session_duration, 116 | "TimeInHotzone_sec": time_in_hotzone, 117 | "TasksCompleted": tasks_completed, 118 | "AvgHeartRate_bpm": round(avg_hr, 1), 119 | "PeakHeartRate_bpm": 
round(peak_hr, 1), 120 | "GSR_mean_uS": round(gsr_mean, 3), 121 | "GSR_peak_uS": round(gsr_peak, 3), 122 | "HeadRotationMean_deg": round(head_rot_mean, 2), 123 | "HeadRotationStd_deg": round(head_rot_std, 2), 124 | "SUDS_post": round(suds_post, 2), 125 | "AnxietyScore_post": round(anxiety_post, 1), 126 | "ImprovementDelta": round(improvement_delta, 2), 127 | "ImprovedBinary": improved_binary 128 | }) 129 | 130 | # ------------------ Create DataFrame and save CSV ------------------ 131 | df = pd.DataFrame(rows) 132 | df.to_csv(OUTPUT_CSV, index=False) 133 | print(f"Synthetic dataset saved to: {Path(OUTPUT_CSV).resolve()}") 134 | print("Dataset shape:", df.shape) 135 | print(df["ImprovedBinary"].value_counts(normalize=True).round(3)) 136 | 137 | # ------------------ Exploratory Visuals ------------------ 138 | sns.set(style="whitegrid", context="notebook") 139 | 140 | # 1) Distribution of baseline anxiety 141 | plt.figure(figsize=(8,4)) 142 | sns.histplot(df["BaselineAnxiety"], bins=25, kde=True) 143 | plt.title("Baseline Anxiety Distribution") 144 | plt.xlabel("Baseline Anxiety (0-100)") 145 | plt.tight_layout() 146 | plt.savefig(PLOTS_DIR / "baseline_anxiety_distribution.png") 147 | plt.close() 148 | 149 | # 2) Post-session anxiety by session number 150 | plt.figure(figsize=(8,5)) 151 | sns.boxplot(x="SessionNumber", y="AnxietyScore_post", data=df) 152 | plt.title("Post-session Anxiety by Session Number") 153 | plt.tight_layout() 154 | plt.savefig(PLOTS_DIR / "anxiety_by_session.png") 155 | plt.close() 156 | 157 | # 3) AvgHeartRate vs ExposureIntensity (sampled scatter) 158 | plt.figure(figsize=(8,5)) 159 | sns.scatterplot(x="ExposureIntensity", y="AvgHeartRate_bpm", hue="ImprovedBinary", data=df, alpha=0.6) 160 | plt.title("Avg Heart Rate vs Exposure Intensity (colored by improvement)") 161 | plt.tight_layout() 162 | plt.savefig(PLOTS_DIR / "hr_vs_exposure.png") 163 | plt.close() 164 | 165 | # 4) Improvement rate by environment 166 | plt.figure(figsize=(9,5)) 167 
# Mean of the 0/1 improvement flag per environment, best environment first.
impr_by_env = (
    df.groupby("Environment")["ImprovedBinary"]
    .mean()
    .sort_values(ascending=False)
)
sns.barplot(x=impr_by_env.index, y=impr_by_env.values)
plt.xticks(rotation=45)
plt.ylabel("Improvement Rate (fraction)")
plt.title("Improvement Rate by VR Environment")
plt.tight_layout()
plt.savefig(PLOTS_DIR / "improvement_by_environment.png")
plt.close()

print(f"Saved plots to {PLOTS_DIR.resolve()}")

# ------------------ Modeling ------------------
# Two random-forest models share one feature matrix and one split:
# - Classification: predict ImprovedBinary (did the session produce meaningful improvement)
# - Regression: predict AnxietyScore_post (continuous)


def _save_top_importances(model, columns, title, filename, top_n=20):
    """Plot the `top_n` largest feature importances of `model` and save the figure.

    Returns the plotted importances as a Series sorted in descending order.
    """
    top = (
        pd.Series(model.feature_importances_, index=columns)
        .sort_values(ascending=False)
        .head(top_n)
    )
    plt.figure(figsize=(8, 6))
    sns.barplot(x=top.values, y=top.index)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / filename)
    plt.close()
    return top


# Numeric columns used as-is (they are standardized further down).
feature_cols = [
    "Age", "BaselineAnxiety", "BaselineSUDS", "SessionNumber",
    "ExposureIntensity", "SessionDuration_sec", "TimeInHotzone_sec",
    "TasksCompleted", "AvgHeartRate_bpm", "PeakHeartRate_bpm", "GSR_mean_uS",
    "GSR_peak_uS", "HeadRotationMean_deg", "HeadRotationStd_deg"
]
# One-hot encode the three categorical columns.
env_dummies, ther_dummies, gender_dummies = (
    pd.get_dummies(df[column], prefix=prefix)
    for column, prefix in (
        ("Environment", "Env"),
        ("TherapistPresent", "Ther"),
        ("Gender", "Gender"),
    )
)

X = pd.concat([df[feature_cols], env_dummies, ther_dummies, gender_dummies], axis=1)
y_class = df["ImprovedBinary"].values
y_reg = df["AnxietyScore_post"].values

# One split shared by both targets, stratified on the class label.
# NOTE(review): rows are sessions and each participant contributes several of
# them, so this row-wise split can place the same participant in both train
# and test; confirm whether a participant-grouped split is wanted.
(
    X_train, X_test,
    y_train_cl, y_test_cl,
    y_train_reg, y_test_reg,
) = train_test_split(
    X, y_class, y_reg,
    test_size=0.25, random_state=RANDOM_SEED, stratify=y_class,
)

# Standardize the numeric columns; statistics come from the training rows only.
# (RandomForest is insensitive to scaling, kept as good practice.)
num_cols = feature_cols
scaler = StandardScaler().fit(X_train[num_cols])
X_train_scaled = X_train.copy()
X_train_scaled[num_cols] = scaler.transform(X_train[num_cols])
X_test_scaled = X_test.copy()
X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])

# Classification model
clf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_SEED, n_jobs=-1)
clf.fit(X_train_scaled, y_train_cl)
y_pred_cl = clf.predict(X_test_scaled)
acc = accuracy_score(y_test_cl, y_pred_cl)
cm = confusion_matrix(y_test_cl, y_pred_cl)
print("\nClassification (ImprovedBinary) Results:")
print(f"Accuracy: {acc:.3f}")
print(classification_report(y_test_cl, y_pred_cl, digits=3))
print("Confusion matrix:\n", cm)

# Save classification feature importances
feat_imp = _save_top_importances(
    clf, X.columns,
    "Top 20 Feature Importances (Classification RF)",
    "feat_importance_classification.png",
)

# Regression model
reg = RandomForestRegressor(n_estimators=200, random_state=RANDOM_SEED, n_jobs=-1)
reg.fit(X_train_scaled, y_train_reg)
y_pred_reg = reg.predict(X_test_scaled)
mse = mean_squared_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)
print("\nRegression (AnxietyScore_post) Results:")
print(f"MSE: {mse:.3f}, RMSE: {np.sqrt(mse):.3f}, R2: {r2:.3f}")

# Save regression feature importances
feat_imp_reg = _save_top_importances(
    reg, X.columns,
    "Top 20 Feature Importances (Regression RF)",
    "feat_importance_regression.png",
)

# Figure for the confusion-matrix heatmap drawn next
plt.figure(figsize=(5, 4))
# Draw the confusion matrix (rows = actual class, columns = predicted class)
# onto the current figure and save it.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (ImprovedBinary)")
plt.tight_layout()
plt.savefig(PLOTS_DIR / "confusion_matrix.png")
plt.close()

# ------------------ Example predictions ------------------
# Never ask for more rows than the test split holds (sample() would raise).
examples = X_test_scaled.sample(min(5, len(X_test_scaled)), random_state=RANDOM_SEED)
examples_orig_idx = examples.index
# y_test_cl / y_test_reg are plain arrays aligned with the row order of
# X_test_scaled, so translate the sampled labels to positions in one lookup.
examples_pos = X_test_scaled.index.get_indexer(examples_orig_idx)
# Score all sampled rows in one call per model.
example_pred_cl = clf.predict(examples)
example_prob_cl = clf.predict_proba(examples).max(axis=1)  # probability of the predicted class
example_pred_reg = reg.predict(examples)
print("\nExample predictions (sample):")
for idx, pos, pred_cl, prob_cl, pred_reg in zip(
    examples_orig_idx, examples_pos, example_pred_cl, example_prob_cl, example_pred_reg
):
    actual_cl = y_test_cl[pos]
    actual_reg = y_test_reg[pos]
    print(f"- Index {idx}: PredImproved={pred_cl} (p~{prob_cl:.2f}) | PredAnxiety={pred_reg:.1f} | TrueImp={actual_cl} | TrueAnx={actual_reg:.1f}")

# ------------------ Save processed features and models (optional) ------------------
# Save the processed feature CSV for reproducibility / downstream analysis.
# Rows are the training split followed by the test split, re-indexed from 0;
# the numeric columns hold standardized values.
processed_csv = "vr_exposure_features_for_modeling.csv"
X_scaled_full = pd.concat([X_train_scaled, X_test_scaled], axis=0, ignore_index=True)
y_full = np.concatenate([y_train_cl, y_test_cl])
# assign() returns a new frame, so X_scaled_full stays features-only.
out_df = X_scaled_full.assign(ImprovedBinary=y_full)
out_df.to_csv(processed_csv, index=False)
print(f"\nSaved processed feature CSV: {Path(processed_csv).resolve()}")
print("All done. Plots and CSV files are in the repository folder.")