├── synthetic_training_effectiveness.xlsx
├── File2
├── README.md
└── File1

/synthetic_training_effectiveness.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Beckybams/Predictive-Modeling-for-Training-Effectiveness/HEAD/synthetic_training_effectiveness.xlsx
--------------------------------------------------------------------------------
/File2:
--------------------------------------------------------------------------------
import streamlit as st
import pandas as pd
import joblib

# Load the trained models saved by the training script
reg_model = joblib.load("regression_model.pkl")
clf_model = joblib.load("classification_model.pkl")

st.title("🎯 Training Effectiveness Predictor")

# User input
experience = st.slider("Trainee Experience (Years)", 0, 15, 2)
duration = st.slider("Training Duration (Hours)", 1, 8, 3)
engagement = st.slider("Engagement Score", 0.3, 1.0, 0.7)
pre_score = st.slider("Pre-training Score", 30.0, 70.0, 50.0)
difficulty = st.selectbox("Course Difficulty", ['Easy', 'Medium', 'Hard'])
instructor = st.slider("Instructor Rating", 2.5, 5.0, 4.0)

# Prepare input. Column order must match the training data: pd.get_dummies
# with drop_first=True produces course_difficulty_Hard before
# course_difficulty_Medium, so the dummy columns are listed in that order here.
input_df = pd.DataFrame({
    "trainee_experience_years": [experience],
    "training_duration_hours": [duration],
    "engagement_score": [engagement],
    "pre_training_score": [pre_score],
    "instructor_rating": [instructor],
    "course_difficulty_Hard": [1 if difficulty == "Hard" else 0],
    "course_difficulty_Medium": [1 if difficulty == "Medium" else 0]
})

# Predict
if st.button("Predict Training Effectiveness"):
    reg_pred = reg_model.predict(input_df)[0]
    clf_pred = clf_model.predict(input_df)[0]
    st.write(f"📈 **Predicted Effectiveness Score**: {reg_pred:.2f}")
    st.write("✅ High Effectiveness" if clf_pred == 1 else "⚠️ Low Effectiveness")

💾 Save the Models

Run this at the end of the training script, once the regression model `rf_model` and the classifier `clf` have been fitted:

import joblib

joblib.dump(rf_model, "regression_model.pkl")
joblib.dump(clf, "classification_model.pkl")

🚀 To Run the Streamlit App

streamlit run app.py
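🛡️ Optional: Align Input Columns with the Trained Models

Recent scikit-learn versions refuse to predict when a DataFrame's column order differs from the order seen during fit, so the manually built dummy columns above must stay in the training order. As an extra safeguard (a sketch, not part of the original app), the input can be reindexed against the feature names stored on the fitted model:

# Hypothetical safeguard for app.py: reorder the input columns to match the
# order the regression model saw during training. feature_names_in_ is set
# because the models were fitted on a pandas DataFrame.
if hasattr(reg_model, "feature_names_in_"):
    input_df = input_df.reindex(columns=reg_model.feature_names_in_)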
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 🎯 Predictive Modeling for Training Effectiveness

This project provides a complete machine learning pipeline using **synthetic data** to predict how effective a training program will be for an individual, based on factors such as trainee experience, engagement, course difficulty, and instructor quality.

It includes:
- A regression model to predict training score improvement
- A classification model to label training as "High" or "Low" effectiveness
- A Streamlit web app for real-time predictions
- Model saving for deployment

---

## 🧠 Key Features

| Feature | Description |
|---------|-------------|
| 🔢 Regression | Predicts the improvement score after training |
| ✅ Classification | Labels training as "High" or "Low" effectiveness |
| 📊 Feature Importance | Visualizes which factors influence training outcomes |
| 💾 Model Deployment | Models saved via `joblib` for reuse |
| 🌐 Streamlit Dashboard | User-friendly web interface to interact with the models |

---

## 📁 Folder Structure

```
📦training-effectiveness-model
├── app.py                    # Streamlit app
├── regression_model.pkl      # Saved regression model
├── classification_model.pkl  # Saved classification model
├── training_model.py         # Full Python script for data generation and modeling
├── README.md                 # You're here!
└── requirements.txt          # Python dependencies
```

---

## 🚀 Streamlit App Preview

Run the web app locally:

```bash
streamlit run app.py
```

You'll be able to:

- Adjust sliders for the training parameters
- Predict the effectiveness score
- See whether the model flags the training as High or Low effectiveness

---

## 🧪 Model Metrics

| Model | RMSE | R² Score | Accuracy (Classifier) |
|-------|------|----------|-----------------------|
| RandomForest | ~3.2 | ~0.89 | ~85% |
| XGBoost | ~3.0 | ~0.91 | ~87% |

---

## 🔧 Requirements

```text
pandas
numpy
scikit-learn
xgboost
streamlit
matplotlib
seaborn
joblib
```

Install with:

```bash
pip install -r requirements.txt
```

---

## 🏗️ Future Improvements

- Add confidence intervals to predictions (a rough sketch is shown below)
- Batch upload for multiple predictions
- Deploy online (Streamlit Cloud / Hugging Face Spaces)
- Connect to real-world HR or LMS data
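For the first improvement, the spread of the individual trees in the fitted random forest already gives a rough, uncalibrated uncertainty band around each prediction. A minimal sketch, assuming the fitted `rf_model` and the `X_test` split from the training script are in scope (the ±1.96·std band is only a normal-approximation heuristic, not a calibrated interval):

```python
import numpy as np

# Collect each tree's prediction and use the spread across trees as a
# rough uncertainty estimate around the ensemble mean.
tree_preds = np.stack([tree.predict(X_test.to_numpy()) for tree in rf_model.estimators_])
mean_pred = tree_preds.mean(axis=0)
std_pred = tree_preds.std(axis=0)
lower, upper = mean_pred - 1.96 * std_pred, mean_pred + 1.96 * std_pred
```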
---

## 👨‍💻 Author

Your Name
GitHub • LinkedIn • Email

---

## 📄 License

MIT License. Free to use, modify, and share.
--------------------------------------------------------------------------------
/File1:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Generate Synthetic Data
np.random.seed(42)
n_samples = 500

data = pd.DataFrame({
    "trainee_experience_years": np.random.randint(0, 15, n_samples),
    "training_duration_hours": np.random.randint(1, 8, n_samples),
    "engagement_score": np.random.uniform(0.3, 1.0, n_samples),
    "pre_training_score": np.random.uniform(30, 70, n_samples),
    "course_difficulty": np.random.choice(['Easy', 'Medium', 'Hard'], n_samples),
    "instructor_rating": np.random.uniform(2.5, 5.0, n_samples)
})

# Create target: post-training score improvement
difficulty_map = {'Easy': 1.2, 'Medium': 1.0, 'Hard': 0.8}
data['difficulty_factor'] = data['course_difficulty'].map(difficulty_map)

# Simulate target variable: training_effectiveness (score improvement)
data['training_effectiveness'] = (
    (data['engagement_score'] * 20 +
     data['training_duration_hours'] * 1.5 +
     data['instructor_rating'] * 2) * data['difficulty_factor']
)

# 2. Preprocessing
# NOTE: drop_first=True removes the 'Easy' dummy, leaving course_difficulty_Hard
# and course_difficulty_Medium (in that order); app.py must supply its input
# columns in the same order.
data = pd.get_dummies(data.drop(columns=['difficulty_factor']), drop_first=True)

# 3. Split the data
X = data.drop(columns='training_effectiveness')
y = data['training_effectiveness']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Train Models
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

# 5. Evaluate Models
def evaluate_model(model, X_test, y_test, name):
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"{name} RMSE: {rmse:.2f}, R2 Score: {r2:.2f}")
    return y_pred

rf_preds = evaluate_model(rf_model, X_test, y_test, "Random Forest")
xgb_preds = evaluate_model(xgb_model, X_test, y_test, "XGBoost")

# 6. Feature Importance
def plot_feature_importance(model, features, title):
    importance = model.feature_importances_
    sorted_idx = np.argsort(importance)[::-1]
    plt.figure(figsize=(10, 6))
    sns.barplot(x=importance[sorted_idx], y=np.array(features)[sorted_idx])
    plt.title(title)
    plt.tight_layout()
    plt.show()

plot_feature_importance(rf_model, X.columns, "Random Forest Feature Importance")
plot_feature_importance(xgb_model, X.columns, "XGBoost Feature Importance")
--------------------------------------------------------------------------------
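Note: the save step in the File2 section dumps a classifier `clf` to classification_model.pkl, and the Streamlit app loads it, but the training script above never defines one. A minimal sketch of how it might be trained, appended to the end of the training script and assuming High (1) / Low (0) effectiveness is defined by a median split of the simulated score (the threshold is an assumption, not stated in the repository):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Binarize the continuous target: 1 = "High" effectiveness, 0 = "Low".
# The median threshold is an assumption; the repository does not state one.
threshold = y_train.median()
y_train_cls = (y_train > threshold).astype(int)
y_test_cls = (y_test > threshold).astype(int)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train_cls)
print("Classifier accuracy:", accuracy_score(y_test_cls, clf.predict(X_test)))
```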