├── vr_exposure_therapy_data.xlsx
├── README.md
├── VR_Exposure_Therapy_Analysis.ipynb
└── file


/vr_exposure_therapy_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Okes2024/VR-Data-Science-for-Exposure-Therapy-Analysis/HEAD/vr_exposure_therapy_data.xlsx


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 📊 Features
 2 | Generates synthetic VR therapy session data with over 100 data points.
 3 | 
 4 | Analyzes anxiety levels, heart rate, and engagement metrics.
 5 | 
 6 | Provides visual insights for therapy progress tracking.
 7 | 
 8 | Implements statistical summaries for informed decision-making.
 9 | 
10 | 📁 Dataset
11 | The dataset includes:
12 | 
13 | SessionID
14 | 
15 | PatientID
16 | 
17 | AnxietyLevelBefore / AnxietyLevelAfter
18 | 
19 | HeartRateBefore / HeartRateAfter
20 | 
21 | EngagementScore
22 | 
23 | SessionDuration
24 | 
25 | 🚀 Technologies Used
26 | Python (Pandas, NumPy, Matplotlib, Seaborn)
27 | 
28 | Jupyter Notebook
29 | 
30 | Synthetic Data Generation Techniques
31 | 
32 | 📥 How to Use
33 | Clone the repository:
34 | 
35 | ```bash
36 | git clone https://github.com/Okes2024/VR-Data-Science-for-Exposure-Therapy-Analysis.git
37 | ```
38 | 
39 | Install required libraries:
40 | 
41 | ```bash
42 | pip install pandas numpy matplotlib seaborn openpyxl
43 | ```
44 | 
45 | Open the Jupyter Notebook:
46 | 
47 | ```bash
48 | jupyter notebook VR_Exposure_Therapy_Analysis.ipynb
49 | ```
50 | 
51 | 👨‍💻 Author
52 | Name: Okes Imoni
53 | GitHub: Okes2024
54 | 


--------------------------------------------------------------------------------
/VR_Exposure_Therapy_Analysis.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "id": "b2f53bdd",
 7 |    "metadata": {},
 8 |    "outputs": [],
 9 |    "source": [
10 |     "\n",
11 |     "import pandas as pd\n",
12 |     "import matplotlib.pyplot as plt\n",
13 |     "import seaborn as sns\n",
14 |     "\n",
15 |     "# Load dataset\n",
16 |     "df = pd.read_excel('vr_exposure_therapy_data.xlsx')\n",
17 |     "\n",
18 |     "# Display first few rows\n",
19 |     "print(df.head())\n",
20 |     "\n",
21 |     "# Summary statistics\n",
22 |     "print(df.describe())\n",
23 |     "\n",
24 |     "# Distribution of Exposure Levels\n",
25 |     "plt.figure(figsize=(6,4))\n",
26 |     "sns.countplot(x='ExposureLevel', data=df, palette='Set2')\n",
27 |     "plt.title('Distribution of Exposure Levels')\n",
28 |     "plt.show()\n",
29 |     "\n",
30 |     "# Heart Rate vs Stress Level\n",
31 |     "plt.figure(figsize=(6,4))\n",
32 |     "sns.scatterplot(x='HeartRate', y='StressLevel', hue='ExposureLevel', data=df, palette='Set1')\n",
33 |     "plt.title('Heart Rate vs Stress Level by Exposure Level')\n",
34 |     "plt.show()\n",
35 |     "\n",
36 |     "# Improvement Score distribution\n",
37 |     "plt.figure(figsize=(6,4))\n",
38 |     "sns.histplot(df['ImprovementScore'], bins=10, kde=True, color='blue')\n",
39 |     "plt.title('Improvement Score Distribution')\n",
40 |     "plt.show()\n"
41 |    ]
42 |   }
43 |  ],
44 |  "metadata": {},
45 |  "nbformat": 4,
46 |  "nbformat_minor": 5
47 | }
48 | 


--------------------------------------------------------------------------------
/file:
--------------------------------------------------------------------------------
  1 | """
  2 | VR-Data-Science-for-Exposure-Therapy-Analysis
  3 | Synthetic dataset generation (>500 samples), EDA, basic modeling (classification + regression),
  4 | and export of CSV + visualizations.
  5 | 
  6 | Save as: vr_exposure_analysis.py
  7 | Requires: numpy, pandas, scikit-learn, matplotlib, seaborn
  8 | Install: pip install numpy pandas scikit-learn matplotlib seaborn
  9 | """
 10 | 
 11 | import os
 12 | import random
 13 | from pathlib import Path
 14 | from datetime import datetime, timedelta
 15 | 
 16 | import numpy as np
 17 | import pandas as pd
 18 | import matplotlib.pyplot as plt
 19 | import seaborn as sns
 20 | 
 21 | from sklearn.model_selection import train_test_split
 22 | from sklearn.preprocessing import StandardScaler
 23 | from sklearn.pipeline import make_pipeline
 24 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 25 | from sklearn.metrics import (
 26 |     accuracy_score, classification_report, confusion_matrix,
 27 |     mean_squared_error, r2_score
 28 | )
 29 | 
 30 | # ------------------ Settings ------------------
 31 | RANDOM_SEED = 42  # seeds both NumPy's and the stdlib's global RNGs below
 32 | NUM_PARTICIPANTS = 150
 33 | SESSIONS_PER_PARTICIPANT = 4   # exact number of sessions simulated for every participant
 34 | NUM_SAMPLES = NUM_PARTICIPANTS * SESSIONS_PER_PARTICIPANT  # 600 rows; informational only, not read elsewhere in this script
 35 | OUTPUT_CSV = "vr_exposure_synthetic.csv"  # relative path, resolved against the current working directory
 36 | PLOTS_DIR = Path("plots")
 37 | PLOTS_DIR.mkdir(parents=True, exist_ok=True)  # side effect at import/run time: creates ./plots if missing
 38 | np.random.seed(RANDOM_SEED)
 39 | random.seed(RANDOM_SEED)  # the generator draws from both np.random and random, so both are seeded
 40 | 
 41 | # ------------------ Synthetic data design ------------------
 42 | # Columns simulated (names as they appear in the output rows):
 43 | # - ParticipantID, SessionDate, Age, Gender
 44 | # - BaselineAnxiety (clipped to 20-95), BaselineSUDS (0-10)
 45 | # - SessionNumber (1..SESSIONS_PER_PARTICIPANT), Environment (phobia type), ExposureIntensity (0.05-0.99)
 46 | # - AvgHeartRate_bpm, PeakHeartRate_bpm, GSR_mean_uS, GSR_peak_uS
 47 | # - HeadRotationMean_deg, HeadRotationStd_deg
 48 | # - TimeInHotzone_sec, TasksCompleted (0..6)
 49 | # - TherapistPresent (Yes/No), SessionDuration_sec (300-1800)
 50 | # - SUDS_post (0-10), AnxietyScore_post (0-100)
 51 | # - ImprovedBinary (1 if clinically meaningful improvement), ImprovementDelta (baseline - post)
 52 | 
 53 | environments = [  # VR scenarios a session can be assigned to
 54 |     "Heights", "Crowds", "PublicSpeaking", "ClosedSpaces", "Animals", "Driving", "SocialInteraction"
 55 | ]
 56 | genders = ["Male", "Female", "Other"]  # sampled with weights 0.48 / 0.48 / 0.04 in generate_participant
 57 | 
 58 | def generate_participant(part_id):
 59 |     """Generate stable participant-level attributes."""
 60 |     age = int(np.clip(np.random.normal(34, 12), 18, 75))
 61 |     gender = random.choices(genders, weights=[0.48, 0.48, 0.04])[0]
 62 |     baseline_anxiety = float(np.clip(np.random.normal(65 - (age-30)*0.2, 12), 20, 95))
 63 |     baseline_suds = float(np.clip((baseline_anxiety / 10.0) + np.random.normal(0, 1.2), 0, 10))
 64 |     trait_sensitivity = float(np.clip(np.random.beta(2, 4) * 1.5, 0.05, 1.0))  # participant sensitivity multiplier
 65 |     return {"ParticipantID": f"P{1000+part_id}", "Age": age, "Gender": gender,
 66 |             "BaselineAnxiety": round(baseline_anxiety, 1),
 67 |             "BaselineSUDS": round(baseline_suds, 1),
 68 |             "TraitSensitivity": round(trait_sensitivity, 3)}
 69 | 
 70 | rows = []
 71 | start_date = datetime.now() - timedelta(days=300)
 72 | 
 73 | for pid in range(NUM_PARTICIPANTS):
 74 |     p = generate_participant(pid)
 75 |     # assign a favored environment per participant (simulate targeted therapy)
 76 |     fav_env = random.choice(environments)
 77 |     for session in range(1, SESSIONS_PER_PARTICIPANT + 1):
 78 |         # session-level variables
 79 |         env = fav_env if random.random() < 0.75 else random.choice(environments)
 80 |         exposure_intensity = float(np.clip(np.random.beta(2 + session*0.2, 3 - session*0.05) , 0.05, 0.99))
 81 |         therapist_present = random.random() < 0.85
 82 |         session_duration = int(np.clip(np.random.normal(900 + session*20, 120), 300, 1800))  # seconds
 83 |         time_in_hotzone = int(np.clip(session_duration * exposure_intensity * np.random.uniform(0.4, 0.95), 10, session_duration))
 84 |         tasks_completed = int(np.clip(np.random.poisson(2 + session*0.2), 0, 6))
 85 |         # physiological signals (simulate increases proportional to exposure and trait sensitivity)
 86 |         hr_baseline = 60 + (p["Age"] - 30)*0.2 + np.random.normal(0, 3)
 87 |         avg_hr = hr_baseline + (exposure_intensity * 25 * p["TraitSensitivity"]) + np.random.normal(0, 5)
 88 |         peak_hr = max(avg_hr + np.random.normal(8, 6), avg_hr + 3)
 89 |         gsr_mean = float(np.clip(0.2 + exposure_intensity * 2.5 * p["TraitSensitivity"] + np.random.normal(0, 0.15), 0.05, 6.0))
 90 |         gsr_peak = float(np.clip(gsr_mean + abs(np.random.normal(0.4, 0.3)), 0.1, 10.0))
 91 |         head_rot_mean = float(np.clip(np.random.normal(10 + exposure_intensity*30, 8), 0, 120))
 92 |         head_rot_std = float(np.clip(np.random.normal(5 + exposure_intensity*8, 3), 0.5, 60))
 93 |         # immediate subjective distress after session
 94 |         suds_post = float(np.clip(p["BaselineSUDS"] * (1 - 0.08*session) + (exposure_intensity * 2.5) * p["TraitSensitivity"]
 95 |                                   + np.random.normal(0, 0.9), 0, 10))
 96 |         anxiety_post = float(np.clip(p["BaselineAnxiety"] * (1 - 0.07*session) + exposure_intensity * 8 * p["TraitSensitivity"]
 97 |                                      + np.random.normal(0, 6), 0, 100))
 98 |         # improvement metrics (delta)
 99 |         improvement_delta = p["BaselineAnxiety"] - anxiety_post
100 |         # mark clinically meaningful improvement if delta >= 10 points OR percentage reduction >= 15%
101 |         improved_binary = int((improvement_delta >= 10) or (improvement_delta / max(1.0, p["BaselineAnxiety"]) >= 0.15))
102 |         # session timestamp
103 |         session_date = start_date + timedelta(days=random.randint(0, 300), hours=random.randint(0,23), minutes=random.randint(0,59))
104 |         rows.append({
105 |             "ParticipantID": p["ParticipantID"],
106 |             "SessionDate": session_date.isoformat(sep=' '),
107 |             "Age": p["Age"],
108 |             "Gender": p["Gender"],
109 |             "BaselineAnxiety": p["BaselineAnxiety"],
110 |             "BaselineSUDS": p["BaselineSUDS"],
111 |             "Environment": env,
112 |             "SessionNumber": session,
113 |             "ExposureIntensity": round(exposure_intensity, 3),
114 |             "TherapistPresent": "Yes" if therapist_present else "No",
115 |             "SessionDuration_sec": session_duration,
116 |             "TimeInHotzone_sec": time_in_hotzone,
117 |             "TasksCompleted": tasks_completed,
118 |             "AvgHeartRate_bpm": round(avg_hr, 1),
119 |             "PeakHeartRate_bpm": round(peak_hr, 1),
120 |             "GSR_mean_uS": round(gsr_mean, 3),
121 |             "GSR_peak_uS": round(gsr_peak, 3),
122 |             "HeadRotationMean_deg": round(head_rot_mean, 2),
123 |             "HeadRotationStd_deg": round(head_rot_std, 2),
124 |             "SUDS_post": round(suds_post, 2),
125 |             "AnxietyScore_post": round(anxiety_post, 1),
126 |             "ImprovementDelta": round(improvement_delta, 2),
127 |             "ImprovedBinary": improved_binary
128 |         })
129 | 
130 | # ------------------ Create DataFrame and save CSV ------------------
131 | df = pd.DataFrame(rows)  # one row per simulated session
132 | df.to_csv(OUTPUT_CSV, index=False)  # index=False: the positional row index is not written as a column
133 | print(f"Synthetic dataset saved to: {Path(OUTPUT_CSV).resolve()}")
134 | print("Dataset shape:", df.shape)
135 | print(df["ImprovedBinary"].value_counts(normalize=True).round(3))  # class balance as fractions, 3 decimals
136 | 
137 | # ------------------ Exploratory Visuals ------------------
138 | sns.set(style="whitegrid", context="notebook")  # global seaborn theme applied to every figure below
139 | 
140 | # 1) Distribution of baseline anxiety (taken over session rows, so each participant contributes once per session)
141 | plt.figure(figsize=(8,4))
142 | sns.histplot(df["BaselineAnxiety"], bins=25, kde=True)
143 | plt.title("Baseline Anxiety Distribution")
144 | plt.xlabel("Baseline Anxiety (0-100)")
145 | plt.tight_layout()
146 | plt.savefig(PLOTS_DIR / "baseline_anxiety_distribution.png")
147 | plt.close()  # figures are written to disk and closed, never shown interactively
148 | 
149 | # 2) Post-session anxiety by session number (one box per SessionNumber)
150 | plt.figure(figsize=(8,5))
151 | sns.boxplot(x="SessionNumber", y="AnxietyScore_post", data=df)
152 | plt.title("Post-session Anxiety by Session Number")
153 | plt.tight_layout()
154 | plt.savefig(PLOTS_DIR / "anxiety_by_session.png")
155 | plt.close()
156 | 
157 | # 3) AvgHeartRate vs ExposureIntensity (every row is plotted; points colored by ImprovedBinary)
158 | plt.figure(figsize=(8,5))
159 | sns.scatterplot(x="ExposureIntensity", y="AvgHeartRate_bpm", hue="ImprovedBinary", data=df, alpha=0.6)
160 | plt.title("Avg Heart Rate vs Exposure Intensity (colored by improvement)")
161 | plt.tight_layout()
162 | plt.savefig(PLOTS_DIR / "hr_vs_exposure.png")
163 | plt.close()
164 | 
165 | # 4) Improvement rate by environment (mean of ImprovedBinary per Environment, highest first)
166 | plt.figure(figsize=(9,5))
167 | impr_by_env = df.groupby("Environment")["ImprovedBinary"].mean().sort_values(ascending=False)
168 | sns.barplot(x=impr_by_env.index, y=impr_by_env.values)
169 | plt.xticks(rotation=45)
170 | plt.ylabel("Improvement Rate (fraction)")
171 | plt.title("Improvement Rate by VR Environment")
172 | plt.tight_layout()
173 | plt.savefig(PLOTS_DIR / "improvement_by_environment.png")
174 | plt.close()
175 | 
176 | print(f"Saved plots to {PLOTS_DIR.resolve()}")
177 | 
178 | # ------------------ Modeling ------------------
179 | # We'll do:
180 | # - Classification: predict ImprovedBinary (did the session produce meaningful improvement)
181 | # - Regression: predict AnxietyScore_post (continuous)
182 | 
183 | # Numeric predictors; the post-session outcomes (SUDS_post, AnxietyScore_post, ImprovementDelta) are not included
184 | feature_cols = [
185 |     "Age", "BaselineAnxiety", "BaselineSUDS", "SessionNumber",
186 |     "ExposureIntensity", "SessionDuration_sec", "TimeInHotzone_sec",
187 |     "TasksCompleted", "AvgHeartRate_bpm", "PeakHeartRate_bpm", "GSR_mean_uS",
188 |     "GSR_peak_uS", "HeadRotationMean_deg", "HeadRotationStd_deg"
189 | ]
190 | # One-hot encode the three categorical columns (Environment, TherapistPresent, Gender)
191 | env_dummies = pd.get_dummies(df["Environment"], prefix="Env")
192 | ther_dummies = pd.get_dummies(df["TherapistPresent"], prefix="Ther")
193 | gender_dummies = pd.get_dummies(df["Gender"], prefix="Gender")
194 | 
195 | X = pd.concat([df[feature_cols], env_dummies, ther_dummies, gender_dummies], axis=1)
196 | y_class = df["ImprovedBinary"].values
197 | y_reg = df["AnxietyScore_post"].values
198 | 
199 | # Row-wise 75/25 split, stratified on ImprovedBinary. NOTE(review): sessions of one participant can land on both sides.
200 | X_train, X_test, y_train_cl, y_test_cl, y_train_reg, y_test_reg = train_test_split(
201 |     X, y_class, y_reg, test_size=0.25, random_state=RANDOM_SEED, stratify=y_class
202 | )
203 | 
204 | # Standardize the numeric columns, fitting on the training split only; the scaled frames feed both models below
205 | num_cols = feature_cols  # alias of the same list, not a copy
206 | scaler = StandardScaler()
207 | X_train_scaled = X_train.copy()
208 | X_test_scaled = X_test.copy()
209 | X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])
210 | X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])  # reuse training statistics; the dummy columns stay unscaled
211 | 
212 | # Classification model: random forest predicting ImprovedBinary, evaluated on the held-out split
213 | clf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_SEED, n_jobs=-1)
214 | clf.fit(X_train_scaled, y_train_cl)
215 | y_pred_cl = clf.predict(X_test_scaled)
216 | acc = accuracy_score(y_test_cl, y_pred_cl)
217 | print("\nClassification (ImprovedBinary) Results:")
218 | print(f"Accuracy: {acc:.3f}")
219 | print(classification_report(y_test_cl, y_pred_cl, digits=3))
220 | cm = confusion_matrix(y_test_cl, y_pred_cl)  # also drawn as a heatmap later in the script
221 | print("Confusion matrix:\n", cm)
222 | 
223 | # Save classification feature importances (largest first, at most 20 bars)
224 | feat_imp = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False).head(20)
225 | plt.figure(figsize=(8,6))
226 | sns.barplot(x=feat_imp.values, y=feat_imp.index)
227 | plt.title("Top 20 Feature Importances (Classification RF)")
228 | plt.tight_layout()
229 | plt.savefig(PLOTS_DIR / "feat_importance_classification.png")
230 | plt.close()
231 | 
232 | # Regression model: random forest predicting AnxietyScore_post, evaluated on the held-out split
233 | reg = RandomForestRegressor(n_estimators=200, random_state=RANDOM_SEED, n_jobs=-1)
234 | reg.fit(X_train_scaled, y_train_reg)
235 | y_pred_reg = reg.predict(X_test_scaled)
236 | mse = mean_squared_error(y_test_reg, y_pred_reg)
237 | r2 = r2_score(y_test_reg, y_pred_reg)
238 | print("\nRegression (AnxietyScore_post) Results:")
239 | print(f"MSE: {mse:.3f}, RMSE: {np.sqrt(mse):.3f}, R2: {r2:.3f}")  # RMSE is derived from the MSE here
240 | 
241 | # Save regression feature importances (largest first, at most 20 bars)
242 | feat_imp_reg = pd.Series(reg.feature_importances_, index=X.columns).sort_values(ascending=False).head(20)
243 | plt.figure(figsize=(8,6))
244 | sns.barplot(x=feat_imp_reg.values, y=feat_imp_reg.index)
245 | plt.title("Top 20 Feature Importances (Regression RF)")
246 | plt.tight_layout()
247 | plt.savefig(PLOTS_DIR / "feat_importance_regression.png")
248 | plt.close()
249 | 
250 | # Save the classifier's confusion matrix (`cm`, computed with the classification results) as a heatmap
251 | plt.figure(figsize=(5,4))
252 | sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)  # annot + fmt='d' prints the integer count in each cell
253 | plt.xlabel("Predicted")
254 | plt.ylabel("Actual")
255 | plt.title("Confusion Matrix (ImprovedBinary)")
256 | plt.tight_layout()
257 | plt.savefig(PLOTS_DIR / "confusion_matrix.png")
258 | plt.close()
259 | 
260 | # ------------------ Example predictions ------------------
261 | examples = X_test_scaled.sample(5, random_state=RANDOM_SEED)
262 | examples_orig_idx = examples.index
263 | print("\nExample predictions (sample):")
264 | for idx in examples_orig_idx:
265 |     rowX = X_test_scaled.loc[idx:idx]
266 |     pred_cl = clf.predict(rowX)[0]
267 |     prob_cl = clf.predict_proba(rowX)[0].max()
268 |     pred_reg = reg.predict(rowX)[0]
269 |     actual_cl = y_test_cl[list(X_test_scaled.index).index(idx)]
270 |     actual_reg = y_test_reg[list(X_test_scaled.index).index(idx)]
271 |     print(f"- Index {idx}: PredImproved={pred_cl} (p~{prob_cl:.2f}) | PredAnxiety={pred_reg:.1f} | TrueImp={actual_cl} | TrueAnx={actual_reg:.1f}")
272 | 
273 | # ------------------ Save processed features and models (optional) ------------------
274 | # Save the processed feature CSV for reproducibility / downstream analysis
275 | processed_csv = "vr_exposure_features_for_modeling.csv"
276 | X_scaled_full = pd.concat([X_train_scaled, X_test_scaled], axis=0)
277 | y_full = np.concatenate([y_train_cl, y_test_cl])
278 | X_scaled_full = X_scaled_full.reset_index(drop=True)
279 | out_df = X_scaled_full.copy()
280 | out_df["ImprovedBinary"] = np.concatenate([y_train_cl, y_test_cl])
281 | out_df.to_csv(processed_csv, index=False)
282 | print(f"\nSaved processed feature CSV: {Path(processed_csv).resolve()}")
283 | print("All done. Plots and CSV files are in the repository folder.")
284 | 


--------------------------------------------------------------------------------