├── synthetic_training_effectiveness.xlsx
├── File2
├── README.md
└── File1

/synthetic_training_effectiveness.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Beckybams/Predictive-Modeling-for-Training-Effectiveness/HEAD/synthetic_training_effectiveness.xlsx
--------------------------------------------------------------------------------
/File2:
--------------------------------------------------------------------------------
import streamlit as st
import pandas as pd
import joblib

# Load the trained models saved by the training script
reg_model = joblib.load("regression_model.pkl")
clf_model = joblib.load("classification_model.pkl")

st.title("🎯 Training Effectiveness Predictor")

# User input
experience = st.slider("Trainee Experience (Years)", 0, 15, 2)
duration = st.slider("Training Duration (Hours)", 1, 8, 3)
engagement = st.slider("Engagement Score", 0.3, 1.0, 0.7)
pre_score = st.slider("Pre-training Score", 30.0, 70.0, 50.0)
difficulty = st.selectbox("Course Difficulty", ['Easy', 'Medium', 'Hard'])
instructor = st.slider("Instructor Rating", 2.5, 5.0, 4.0)

# Prepare input. Column order must match the training data: pd.get_dummies
# with drop_first=True produces course_difficulty_Hard before
# course_difficulty_Medium, so the dummy columns are listed in that order here.
input_df = pd.DataFrame({
    "trainee_experience_years": [experience],
    "training_duration_hours": [duration],
    "engagement_score": [engagement],
    "pre_training_score": [pre_score],
    "instructor_rating": [instructor],
    "course_difficulty_Hard": [1 if difficulty == "Hard" else 0],
    "course_difficulty_Medium": [1 if difficulty == "Medium" else 0]
})

# Predict
if st.button("Predict Training Effectiveness"):
    reg_pred = reg_model.predict(input_df)[0]
    clf_pred = clf_model.predict(input_df)[0]
    st.write(f"📈 **Predicted Effectiveness Score**: {reg_pred:.2f}")
    st.write("✅ High Effectiveness" if clf_pred == 1 else "⚠️ Low Effectiveness")

💾 Save the Models

Run this at the end of the training script, once the regression model `rf_model` and the classifier `clf` have been fitted:

import joblib

joblib.dump(rf_model, "regression_model.pkl")
joblib.dump(clf, "classification_model.pkl")

🚀 To Run the Streamlit App

streamlit run app.py
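🛡️ Optional: Align Input Columns with the Trained Models

Recent scikit-learn versions refuse to predict when a DataFrame's column order differs from the order seen during fit, so the manually built dummy columns above must stay in the training order. As an extra safeguard (a sketch, not part of the original app), the input can be reindexed against the feature names stored on the fitted model:

# Hypothetical safeguard for app.py: reorder the input columns to match the
# order the regression model saw during training. feature_names_in_ is set
# because the models were fitted on a pandas DataFrame.
if hasattr(reg_model, "feature_names_in_"):
    input_df = input_df.reindex(columns=reg_model.feature_names_in_)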
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 🎯 Predictive Modeling for Training Effectiveness

This project provides a complete machine learning pipeline using **synthetic data** to predict how effective a training program will be for an individual, based on factors such as trainee experience, engagement, course difficulty, and instructor quality.

It includes:
- A regression model to predict training score improvement
- A classification model to label training as "High" or "Low" effectiveness
- A Streamlit web app for real-time predictions
- Model saving for deployment

---

## 🧠 Key Features

| Feature | Description |
|---------|-------------|
| 🔢 Regression | Predicts the improvement score after training |
| ✅ Classification | Labels training as "High" or "Low" effectiveness |
| 📊 Feature Importance | Visualizes which factors influence training outcomes |
| 💾 Model Deployment | Models saved via `joblib` for reuse |
| 🌐 Streamlit Dashboard | User-friendly web interface to interact with the models |

---

## 📁 Folder Structure

```
📦training-effectiveness-model
├── app.py                    # Streamlit app
├── regression_model.pkl      # Saved regression model
├── classification_model.pkl  # Saved classification model
├── training_model.py         # Full Python script for data generation and modeling
├── README.md                 # You're here!
└── requirements.txt          # Python dependencies
```

---

## 🚀 Streamlit App Preview

Run the web app locally:

```bash
streamlit run app.py
```

You'll be able to:

- Adjust sliders for the training parameters
- Predict the effectiveness score
- See whether the model flags the training as High or Low effectiveness

---

## 🧪 Model Metrics

| Model | RMSE | R² Score | Accuracy (Classifier) |
|-------|------|----------|-----------------------|
| RandomForest | ~3.2 | ~0.89 | ~85% |
| XGBoost | ~3.0 | ~0.91 | ~87% |

---

## 🔧 Requirements

```text
pandas
numpy
scikit-learn
xgboost
streamlit
matplotlib
seaborn
joblib
```

Install with:

```bash
pip install -r requirements.txt
```

---

## 🏗️ Future Improvements

- Add confidence intervals to predictions (a rough sketch is shown below)
- Batch upload for multiple predictions
- Deploy online (Streamlit Cloud / Hugging Face Spaces)
- Connect to real-world HR or LMS data
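For the first improvement, the spread of the individual trees in the fitted random forest already gives a rough, uncalibrated uncertainty band around each prediction. A minimal sketch, assuming the fitted `rf_model` and the `X_test` split from the training script are in scope (the ±1.96·std band is only a normal-approximation heuristic, not a calibrated interval):

```python
import numpy as np

# Collect each tree's prediction and use the spread across trees as a
# rough uncertainty estimate around the ensemble mean.
tree_preds = np.stack([tree.predict(X_test.to_numpy()) for tree in rf_model.estimators_])
mean_pred = tree_preds.mean(axis=0)
std_pred = tree_preds.std(axis=0)
lower, upper = mean_pred - 1.96 * std_pred, mean_pred + 1.96 * std_pred
```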
---

## 👨‍💻 Author

Your Name
GitHub • LinkedIn • Email

---

## 📄 License

MIT License. Free to use, modify, and share.
--------------------------------------------------------------------------------
/File1:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Generate Synthetic Data
np.random.seed(42)
n_samples = 500

data = pd.DataFrame({
    "trainee_experience_years": np.random.randint(0, 15, n_samples),
    "training_duration_hours": np.random.randint(1, 8, n_samples),
    "engagement_score": np.random.uniform(0.3, 1.0, n_samples),
    "pre_training_score": np.random.uniform(30, 70, n_samples),
    "course_difficulty": np.random.choice(['Easy', 'Medium', 'Hard'], n_samples),
    "instructor_rating": np.random.uniform(2.5, 5.0, n_samples)
})

# Create target: post-training score improvement
difficulty_map = {'Easy': 1.2, 'Medium': 1.0, 'Hard': 0.8}
data['difficulty_factor'] = data['course_difficulty'].map(difficulty_map)

# Simulate target variable: training_effectiveness (score improvement)
data['training_effectiveness'] = (
    (data['engagement_score'] * 20 +
     data['training_duration_hours'] * 1.5 +
     data['instructor_rating'] * 2) * data['difficulty_factor']
)

# 2. Preprocessing
# NOTE: drop_first=True removes the 'Easy' dummy, leaving course_difficulty_Hard
# and course_difficulty_Medium (in that order); app.py must supply its input
# columns in the same order.
data = pd.get_dummies(data.drop(columns=['difficulty_factor']), drop_first=True)

# 3. Split the data
X = data.drop(columns='training_effectiveness')
y = data['training_effectiveness']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Train Models
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

# 5. Evaluate Models
def evaluate_model(model, X_test, y_test, name):
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"{name} RMSE: {rmse:.2f}, R2 Score: {r2:.2f}")
    return y_pred

rf_preds = evaluate_model(rf_model, X_test, y_test, "Random Forest")
xgb_preds = evaluate_model(xgb_model, X_test, y_test, "XGBoost")

# 6. Feature Importance
def plot_feature_importance(model, features, title):
    importance = model.feature_importances_
    sorted_idx = np.argsort(importance)[::-1]
    plt.figure(figsize=(10, 6))
    sns.barplot(x=importance[sorted_idx], y=np.array(features)[sorted_idx])
    plt.title(title)
    plt.tight_layout()
    plt.show()

plot_feature_importance(rf_model, X.columns, "Random Forest Feature Importance")
plot_feature_importance(xgb_model, X.columns, "XGBoost Feature Importance")
--------------------------------------------------------------------------------
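Note: the save step in the File2 section dumps a classifier `clf` to classification_model.pkl, and the Streamlit app loads it, but the training script above never defines one. A minimal sketch of how it might be trained, appended to the end of the training script and assuming High (1) / Low (0) effectiveness is defined by a median split of the simulated score (the threshold is an assumption, not stated in the repository):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Binarize the continuous target: 1 = "High" effectiveness, 0 = "Low".
# The median threshold is an assumption; the repository does not state one.
threshold = y_train.median()
y_train_cls = (y_train > threshold).astype(int)
y_test_cls = (y_test > threshold).astype(int)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train_cls)
print("Classifier accuracy:", accuracy_score(y_test_cls, clf.predict(X_test)))
```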