├── .devcontainer └── devcontainer.json ├── .gitignore ├── Association_Rule_Learning ├── Association_Rule_Learning_projects │ ├── market_basket_analysis.py │ └── recommendation_system.py ├── README.md ├── main.py ├── requirements.txt └── screenshots │ ├── feature_sel.png │ ├── img_comp.png │ ├── mark_bask.png │ └── recomm.png ├── DBSCAN_HDBSCAN ├── DBSCAN_HDBSCAN_projects │ ├── anomaly_detection.py │ └── customer_behavior_analysis.py ├── main.py ├── readme.md ├── requirements.txt └── screenshots │ ├── anom_det.png │ └── customer_behavior.png ├── Decision_Trees ├── Decision_Trees_projects │ ├── gini_impurity_implementation.py │ └── gym_decision_tree.py ├── main.py ├── readme.md ├── requirements.txt └── screenshots │ ├── gini.png │ └── gym1.png ├── Dimensionality_Reduction ├── Dimensionality_Reduction_projects │ ├── feature_selection.py │ └── image_compression.py ├── README.md ├── main.py ├── requirements.txt └── screenshots │ ├── feature_sel.png │ └── img_comp.png ├── Fuzzy_C_Means ├── Fuzzy_C_Means_projects │ ├── customer_profiling.py │ └── image_segmentation.py ├── README.md ├── main.py ├── requirements.txt └── screenshots │ ├── cust_prof.png │ └── imag_seg.png ├── GMM ├── GMM_projects │ ├── customer_segmentation.py │ └── image_color_segmentation.py ├── main.py ├── readme.md ├── requirements.txt └── screenshots │ ├── cust.png │ └── image_clust.png ├── Hierarchical_Clustering ├── Hierarchical_projects │ ├── document_clustering.py │ └── market_basket_analysis.py ├── README.md ├── main.py ├── requirements.txt └── screenshots │ ├── doc_clust.png │ └── market_basket.png ├── K-Means ├── K_Means_projects │ ├── customer_segmentation.py │ └── loan_approval.py ├── main.py ├── readme.md ├── requirements.txt └── screenshots │ ├── cust_seg.png │ └── loan.png ├── KNN ├── KNN_projects │ ├── TShirt_size.csv │ ├── movie_recommendation.py │ ├── netflix_titles.csv │ └── tshirt_size_prediction.py ├── main.py ├── readme.md ├── requirements.txt └── screenshots │ ├── movie1.png │ ├── movie2.png │ └── t-shirt.png ├── Linear_Regression ├── Linear_regression_projects │ ├── Salary_dataset.csv │ ├── __init__.py │ ├── house_price_prediction.py │ ├── messi_goal_prediction.py │ ├── normal_equation_vs_gradient_descent.py │ ├── salary_prediction.py │ └── study_hours_exam_prediction.py ├── README.md ├── main.py ├── requirements.txt └── screenshots │ ├── house1.png │ ├── leo1.png │ ├── leo2.png │ ├── leo3.png │ ├── norm_grad1.png │ ├── norm_grad2.png │ ├── salary_pred.png │ ├── score1.png │ └── score2.png ├── Logistic_Regression ├── Logistic_Regression_projects │ ├── Copy of sonar data.csv │ ├── diabetes.csv │ ├── diabetes_prediction.py │ ├── rock_vs_mine.py │ └── simple_hiv_prediction.py ├── main.py ├── readme.md ├── requirements.txt └── screenshots │ ├── diab1.png │ ├── diab2.png │ ├── hiv.png │ └── rock_mine.png ├── Naive_Bayes ├── Naive_Bayes_projects │ ├── FakeNewsNet.csv │ ├── fake_news_detection.py │ ├── fake_news_prediction.py │ ├── spam.csv │ ├── spam_detection_nb.py │ └── weather_prediction.py ├── main.py ├── readme.md ├── requirements.txt └── screenshots │ ├── fake_news.png │ ├── spamde.png │ └── weath.png ├── Poisson_Regression ├── Poisson_Regression_projects │ ├── competition_award.py │ ├── competition_awards_data.csv │ └── no_of_car_accident.py ├── main.py ├── readme.md ├── requirements.txt └── screenshots │ ├── caraccident.png │ ├── caraccidents.png │ └── comp.png ├── README.md ├── SVM ├── SVM_projects │ ├── breast_cancer_prediction.py │ └── spam_detection.py ├── main.py ├── readme.md ├── requirements.txt 
└── screenshots │ ├── breast.png │ └── spam.png ├── main.py ├── packages.txt └── requirements.txt /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Python 3", 3 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 4 | "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye", 5 | "customizations": { 6 | "codespaces": { 7 | "openFiles": [ 8 | "README.md", 9 | "main.py" 10 | ] 11 | }, 12 | "vscode": { 13 | "settings": {}, 14 | "extensions": [ 15 | "ms-python.python", 16 | "ms-python.vscode-pylance" 17 | ] 18 | } 19 | }, 20 | "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y = min_lift] 105 | 106 | if len(rules) > 0: 107 | # Create a copy of rules with formatted strings for display 108 | display_rules = rules.copy() 109 | display_rules['rule'] = display_rules.apply(format_rule, axis=1) 110 | 111 | # Display rules 112 | st.subheader("Association Rules") 113 | st.dataframe(display_rules[['rule', 'support', 'confidence', 'lift']]) 114 | 115 | # Visualize support vs confidence 116 | fig = px.scatter(rules, x="support", y="confidence", 117 | size="lift", color="lift", 118 | hover_data=["antecedents", "consequents"], 119 | title="Support vs Confidence") 120 | st.plotly_chart(fig) 121 | 122 | # Network visualization 123 | st.subheader("Rule Network") 124 | G = nx.Graph() 125 | 126 | # Add nodes and edges 127 | for _, rule in rules.iterrows(): 128 | antecedents = list(rule['antecedents'])[0] 129 | consequents = list(rule['consequents'])[0] 130 | G.add_edge(antecedents, consequents, weight=rule['lift']) 131 | 132 | # Create plot 133 | plt.figure(figsize=(12, 8)) 134 | pos = nx.spring_layout(G) 135 | nx.draw(G, pos, with_labels=True, node_color='lightblue', 136 | node_size=1500, font_size=10, font_weight='bold') 137 | st.pyplot(plt) 138 | 139 | # Top rules by lift 140 | st.subheader("Top Rules by Lift") 141 | top_rules = rules.sort_values('lift', ascending=False).head(5) 142 | for _, rule in top_rules.iterrows(): 143 | antecedents = list(rule['antecedents'])[0] 144 | consequents = list(rule['consequents'])[0] 145 | st.write(f"If {antecedents} → {consequents}") 146 | st.write(f"Support: {rule['support']:.3f}, Confidence: {rule['confidence']:.3f}, Lift: {rule['lift']:.3f}") 147 | st.write("---") 148 | else: 149 | st.warning("No rules found with the current parameters. 
Try adjusting the thresholds.") 150 | except Exception as e: 151 | st.error(f"An error occurred: {str(e)}") 152 | st.info("Try adjusting the parameters or using different data.") 153 | 154 | if __name__ == "__main__": 155 | run() -------------------------------------------------------------------------------- /Association_Rule_Learning/Association_Rule_Learning_projects/recommendation_system.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | from mlxtend.frequent_patterns import apriori, association_rules 5 | import plotly.express as px 6 | import plotly.graph_objects as go 7 | import networkx as nx 8 | import matplotlib.pyplot as plt 9 | 10 | def generate_sample_data(n_users=1000): 11 | # Define product categories and their items 12 | categories = { 13 | 'Electronics': ['Smartphone', 'Laptop', 'Tablet', 'Headphones', 'Camera'], 14 | 'Books': ['Fiction', 'Non-Fiction', 'Biography', 'Science', 'History'], 15 | 'Movies': ['Action', 'Comedy', 'Drama', 'Sci-Fi', 'Documentary'], 16 | 'Music': ['Pop', 'Rock', 'Classical', 'Jazz', 'Hip-Hop'], 17 | 'Games': ['Action', 'Strategy', 'Puzzle', 'Sports', 'RPG'] 18 | } 19 | 20 | # Define common user preferences 21 | common_preferences = [ 22 | ['Smartphone', 'Headphones'], 23 | ['Laptop', 'Tablet'], 24 | ['Fiction', 'Biography'], 25 | ['Action', 'Sci-Fi'], 26 | ['Pop', 'Rock'], 27 | ['Action', 'Strategy'], 28 | ['Comedy', 'Drama'], 29 | ['Classical', 'Jazz'] 30 | ] 31 | 32 | # Generate user interactions 33 | interactions = [] 34 | for _ in range(n_users): 35 | user_interactions = [] 36 | 37 | # 80% chance to include a common preference 38 | if np.random.random() < 0.8: 39 | pref_idx = np.random.randint(0, len(common_preferences)) 40 | user_interactions.extend(common_preferences[pref_idx]) 41 | 42 | # Add 2-4 random items 43 | n_additional = np.random.randint(2, 5) 44 | for _ in range(n_additional): 45 | category = np.random.choice(list(categories.keys())) 46 | item = np.random.choice(categories[category]) 47 | if item not in user_interactions: # Avoid duplicates 48 | user_interactions.append(item) 49 | 50 | interactions.append(user_interactions) 51 | 52 | return interactions 53 | 54 | def format_rule(rule): 55 | """Convert frozenset to string for display""" 56 | antecedents = ', '.join(list(rule['antecedents'])) 57 | consequents = ', '.join(list(rule['consequents'])) 58 | return f"{antecedents} → {consequents}" 59 | 60 | def run(): 61 | st.header("Recommendation System using Association Rules") 62 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Association_Rule_Learning)", unsafe_allow_html=True) 63 | 64 | # Load or generate dataset 65 | uploaded_file = st.file_uploader("Upload a CSV file with user-item interactions", type=["csv"]) 66 | if uploaded_file is not None: 67 | df = pd.read_csv(uploaded_file) 68 | # Convert to list of interactions 69 | interactions = df.values.tolist() 70 | else: 71 | st.info("Using sample user-item interaction data") 72 | interactions = generate_sample_data() 73 | 74 | # Convert interactions to one-hot encoded DataFrame 75 | unique_items = list(set(item for interaction in interactions for item in interaction)) 76 | df = pd.DataFrame([[1 if item in interaction else 0 for item in unique_items] 77 | for interaction in interactions], columns=unique_items) 78 | 79 | # Display data info 80 | st.subheader("Dataset Information") 81 | st.write(f"Number of users: 
{len(interactions)}") 82 | st.write(f"Number of unique items: {len(unique_items)}") 83 | st.write("Sample interactions:") 84 | st.dataframe(df.head()) 85 | 86 | # Parameters 87 | st.subheader("Association Rule Parameters") 88 | min_support = st.slider("Minimum Support", min_value=0.001, max_value=0.5, value=0.003, step=0.001) 89 | min_confidence = st.slider("Minimum Confidence", min_value=0.1, max_value=1.0, value=0.15, step=0.05) 90 | min_lift = st.slider("Minimum Lift", min_value=1.0, max_value=5.0, value=1.1, step=0.1) 91 | 92 | if st.button("Generate Recommendations"): 93 | try: 94 | # Generate frequent itemsets 95 | frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True) 96 | 97 | if len(frequent_itemsets) == 0: 98 | st.warning("No frequent itemsets found. Try lowering the minimum support threshold.") 99 | return 100 | 101 | # Generate rules 102 | rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence) 103 | rules = rules[rules['lift'] >= min_lift] 104 | 105 | if len(rules) > 0: 106 | # Create a copy of rules with formatted strings for display 107 | display_rules = rules.copy() 108 | display_rules['rule'] = display_rules.apply(format_rule, axis=1) 109 | 110 | # Display rules 111 | st.subheader("Association Rules") 112 | st.dataframe(display_rules[['rule', 'support', 'confidence', 'lift']]) 113 | 114 | # Visualize support vs confidence 115 | fig = px.scatter(rules, x="support", y="confidence", 116 | size="lift", color="lift", 117 | hover_data=["antecedents", "consequents"], 118 | title="Support vs Confidence") 119 | st.plotly_chart(fig) 120 | 121 | # Interactive recommendation 122 | st.subheader("Get Recommendations") 123 | selected_items = st.multiselect("Select items you like:", unique_items) 124 | 125 | if selected_items: 126 | # Find rules where selected items are in antecedents 127 | recommendations = [] 128 | for _, rule in rules.iterrows(): 129 | if all(item in rule['antecedents'] for item in selected_items): 130 | recommendations.extend(list(rule['consequents'])) 131 | 132 | if recommendations: 133 | # Remove duplicates and selected items 134 | recommendations = list(set(recommendations) - set(selected_items)) 135 | 136 | # Sort by frequency 137 | recommendation_counts = pd.Series(recommendations).value_counts() 138 | 139 | st.write("Recommended items based on your selection:") 140 | for item, count in recommendation_counts.items(): 141 | st.write(f"- {item} (recommended {count} times)") 142 | else: 143 | st.info("No specific recommendations found. Try selecting different items or adjusting the parameters.") 144 | else: 145 | st.warning("No rules found with the current parameters. Try adjusting the thresholds.") 146 | except Exception as e: 147 | st.error(f"An error occurred: {str(e)}") 148 | st.info("Try adjusting the parameters or using different data.") 149 | 150 | if __name__ == "__main__": 151 | run() -------------------------------------------------------------------------------- /Association_Rule_Learning/README.md: -------------------------------------------------------------------------------- 1 | # Association Rule Learning Projects 2 | 3 | This repository contains projects that demonstrate the application of Association Rule Learning algorithms in various domains. 4 | 5 | ## Projects 6 | 7 | ### 1. 
Market Basket Analysis 8 | 9 | **Screenshots:** 10 | ![Market Basket Analysis](screenshots/mark_bask.png) 11 | - Interactive transaction data upload 12 | - Customizable support, confidence, and lift thresholds 13 | - Visualization of association rules 14 | - Network visualization of item relationships 15 | - Top rules analysis 16 | - Support vs Confidence scatter plot 17 | 18 | ### 2. Recommendation System 19 | 20 | **Screenshots:** 21 | ![Recommendation System](screenshots/recomm.png) 22 | - Interactive user-item interaction data upload 23 | - Customizable support, confidence, and lift thresholds 24 | - Visualization of item relationships 25 | - Network visualization of item connections 26 | - Personalized recommendations based on selected items 27 | - Rule-based item suggestions 28 | 29 | ### 3. Feature Selection 30 | 31 | **Screenshots:** 32 | ![Feature Selection](screenshots/feature_sel.png) 33 | - Association rule-based feature selection 34 | - Interactive parameter tuning 35 | - Rule visualization 36 | - Feature importance analysis 37 | 38 | ### 4. Image Compression 39 | 40 | **Screenshots:** 41 | ![Image Compression](screenshots/img_comp.png) 42 | - Association rule-based image compression 43 | - Visualization of compressed vs. original image 44 | - Parameter tuning 45 | - Compression ratio analysis 46 | 47 | ## How to Run 48 | 49 | 1. Install the required packages: 50 | ```bash 51 | pip install -r requirements.txt 52 | ``` 53 | 54 | 2. Run the Streamlit app: 55 | ```bash 56 | streamlit run main.py 57 | ``` 58 | 59 | ## Project Structure 60 | 61 | - `main.py`: Main entry point for running the projects 62 | - `Association_Rule_Learning_projects/`: Directory containing individual project files 63 | - `market_basket_analysis.py`: Market basket analysis using association rules 64 | - `recommendation_system.py`: Recommendation system using association rules 65 | 66 | ## Features 67 | 68 | - Interactive parameter tuning 69 | - Rich visualizations using Plotly and NetworkX 70 | - Support for custom data upload 71 | - Sample data generation 72 | - Detailed analysis tools 73 | - Interactive recommendation capabilities 74 | 75 | ## Contributing 76 | 77 | Contributions are welcome! Please feel free to submit a Pull Request. 78 | 79 | ## License 80 | 81 | This project is licensed under the MIT License. 
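## Appendix: the core mining step

Both apps wrap the same mlxtend pipeline in Streamlit. The sketch below is a minimal, self-contained version of that pipeline; the five baskets and the threshold values are invented purely for illustration and are not taken from the apps:

```python
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Five hypothetical baskets, just to show the shape of the pipeline
baskets = [
    ["Milk", "Bread", "Eggs"],
    ["Milk", "Bread"],
    ["Bread", "Butter"],
    ["Milk", "Eggs"],
    ["Milk", "Bread", "Butter"],
]

# One-hot encode the baskets, the same way the project scripts build their frames
items = sorted({item for basket in baskets for item in basket})
onehot = pd.DataFrame(
    [[item in basket for item in items] for basket in baskets],
    columns=items,
)

# Mine frequent itemsets, derive rules, then keep only rules with enough lift
itemsets = apriori(onehot, min_support=0.4, use_colnames=True)
rules = association_rules(itemsets, metric="confidence", min_threshold=0.6)
rules = rules[rules["lift"] >= 1.0]

print(rules[["antecedents", "consequents", "support", "confidence", "lift"]])
```

If the rule table comes back empty, lower `min_support` or `min_threshold`, which is exactly what the apps suggest in their warnings.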
-------------------------------------------------------------------------------- /Association_Rule_Learning/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the Decision_Trees_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | from Association_Rule_Learning_projects import ( 8 | market_basket_analysis, 9 | recommendation_system 10 | ) 11 | 12 | def run(): 13 | st.title("Association Rule Learning Projects") 14 | 15 | # Sidebar for project selection 16 | project = st.sidebar.selectbox( 17 | "Select a project", 18 | [ 19 | "Market Basket Analysis", 20 | "Recommendation System" 21 | ], 22 | ) 23 | 24 | # Run the selected project 25 | if project == "Market Basket Analysis": 26 | market_basket_analysis.run() 27 | elif project == "Recommendation System": 28 | recommendation_system.run() 29 | 30 | if __name__ == "__main__": 31 | run() -------------------------------------------------------------------------------- /Association_Rule_Learning/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.22.0 2 | pandas==1.5.3 3 | numpy==1.24.3 4 | mlxtend==0.23.1 5 | plotly==5.13.1 6 | matplotlib==3.7.1 7 | networkx==3.1 8 | scipy==1.10.1 9 | seaborn -------------------------------------------------------------------------------- /Association_Rule_Learning/screenshots/feature_sel.png: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Association_Rule_Learning/screenshots/img_comp.png: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Association_Rule_Learning/screenshots/mark_bask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Association_Rule_Learning/screenshots/mark_bask.png -------------------------------------------------------------------------------- /Association_Rule_Learning/screenshots/recomm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Association_Rule_Learning/screenshots/recomm.png -------------------------------------------------------------------------------- /DBSCAN_HDBSCAN/DBSCAN_HDBSCAN_projects/anomaly_detection.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | import plotly.express as px 5 | import plotly.graph_objects as go 6 | from sklearn.cluster import DBSCAN 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.decomposition import PCA 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | 12 | def run(): 13 | st.header("Anomaly Detection using DBSCAN") 14 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/DBSCAN_HDBSCAN)", unsafe_allow_html=True) 15 | 16 | # Load or generate dataset 17 | uploaded_file = st.file_uploader("Upload a CSV file with transaction data", type=["csv"]) 18 | if uploaded_file is not None: 19 | df = pd.read_csv(uploaded_file) 20 | else: 21 | 
st.info("Using sample transaction data") 22 | # Generate sample transaction data 23 | np.random.seed(42) 24 | n_transactions = 1000 25 | 26 | # Generate normal transactions 27 | normal_amounts = np.random.normal(100, 20, int(n_transactions * 0.95)) 28 | normal_times = np.random.normal(12, 2, int(n_transactions * 0.95)) 29 | 30 | # Generate anomalous transactions 31 | anomaly_amounts = np.random.uniform(500, 1000, int(n_transactions * 0.05)) 32 | anomaly_times = np.random.uniform(0, 24, int(n_transactions * 0.05)) 33 | 34 | # Combine normal and anomalous data 35 | amounts = np.concatenate([normal_amounts, anomaly_amounts]) 36 | times = np.concatenate([normal_times, anomaly_times]) 37 | 38 | data = { 39 | 'Transaction_ID': range(1, n_transactions + 1), 40 | 'Amount': amounts, 41 | 'Time': times, 42 | 'Location_X': np.random.normal(0, 1, n_transactions), 43 | 'Location_Y': np.random.normal(0, 1, n_transactions), 44 | 'Merchant_Category': np.random.choice(['Retail', 'Food', 'Travel', 'Other'], n_transactions) 45 | } 46 | df = pd.DataFrame(data) 47 | 48 | # Display data info 49 | st.subheader("Dataset Information") 50 | st.write(f"Number of transactions: {len(df)}") 51 | st.write("Sample data:") 52 | st.dataframe(df.head()) 53 | 54 | # Feature selection 55 | st.subheader("Feature Selection") 56 | features = ['Amount', 'Time', 'Location_X', 'Location_Y'] 57 | selected_features = st.multiselect("Select features for anomaly detection", features, 58 | default=['Amount', 'Time']) 59 | 60 | if len(selected_features) >= 2: 61 | # Prepare data 62 | X = df[selected_features] 63 | scaler = StandardScaler() 64 | X_scaled = scaler.fit_transform(X) 65 | 66 | # DBSCAN parameters 67 | st.subheader("DBSCAN Parameters") 68 | eps = st.slider("Epsilon (eps)", min_value=0.1, max_value=2.0, value=0.5, step=0.1) 69 | min_samples = st.slider("Minimum Samples", min_value=2, max_value=20, value=5) 70 | 71 | # Apply DBSCAN 72 | dbscan = DBSCAN(eps=eps, min_samples=min_samples) 73 | df['Cluster'] = dbscan.fit_predict(X_scaled) 74 | 75 | # Label clusters 76 | df['Anomaly'] = df['Cluster'].apply(lambda x: 'Anomaly' if x == -1 else 'Normal') 77 | 78 | # Visualize clusters using PCA 79 | pca = PCA(n_components=2) 80 | X_pca = pca.fit_transform(X_scaled) 81 | df['PCA1'] = X_pca[:, 0] 82 | df['PCA2'] = X_pca[:, 1] 83 | 84 | # PCA Scatter plot 85 | fig = px.scatter(df, x='PCA1', y='PCA2', color='Anomaly', 86 | hover_data=selected_features, 87 | title='Transaction Clusters (PCA Visualization)') 88 | st.plotly_chart(fig) 89 | 90 | # Anomaly Analysis 91 | st.subheader("Anomaly Analysis") 92 | anomaly_count = len(df[df['Anomaly'] == 'Anomaly']) 93 | st.write(f"Number of anomalies detected: {anomaly_count}") 94 | st.write(f"Percentage of anomalies: {(anomaly_count/len(df))*100:.2f}%") 95 | 96 | # Display anomaly statistics 97 | st.write("\nAnomaly Statistics:") 98 | anomaly_stats = df[df['Anomaly'] == 'Anomaly'][selected_features].describe() 99 | st.dataframe(anomaly_stats) 100 | 101 | # Feature importance visualization 102 | st.subheader("Feature Distribution by Cluster") 103 | for feature in selected_features: 104 | fig = px.box(df, x='Anomaly', y=feature, title=f'{feature} Distribution by Cluster') 105 | st.plotly_chart(fig) 106 | 107 | # Correlation heatmap 108 | st.subheader("Feature Correlation Matrix") 109 | correlation_matrix = df[selected_features].corr() 110 | fig, ax = plt.subplots(figsize=(10, 8)) 111 | sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0) 112 | st.pyplot(fig) 113 | 114 | # Interactive 
prediction 115 | st.subheader("Check New Transaction") 116 | input_values = {} 117 | for feature in selected_features: 118 | if feature == 'Amount': 119 | input_values[feature] = st.number_input(feature, min_value=0.0, max_value=1000.0, value=100.0) 120 | elif feature == 'Time': 121 | input_values[feature] = st.number_input(feature, min_value=0.0, max_value=24.0, value=12.0) 122 | elif feature == 'Location_X': 123 | input_values[feature] = st.number_input(feature, min_value=-3.0, max_value=3.0, value=0.0) 124 | elif feature == 'Location_Y': 125 | input_values[feature] = st.number_input(feature, min_value=-3.0, max_value=3.0, value=0.0) 126 | 127 | if st.button("Check for Anomaly"): 128 | # Create input array with only the selected features 129 | new_transaction = np.array([[input_values[feature] for feature in selected_features]]) 130 | new_transaction_scaled = scaler.transform(new_transaction) 131 | prediction = dbscan.fit_predict(np.vstack([X_scaled, new_transaction_scaled]))[-1] 132 | result = "Anomaly" if prediction == -1 else "Normal" 133 | st.success(f"Transaction is classified as: {result}") 134 | 135 | if __name__ == "__main__": 136 | run() -------------------------------------------------------------------------------- /DBSCAN_HDBSCAN/DBSCAN_HDBSCAN_projects/customer_behavior_analysis.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | import plotly.express as px 5 | import plotly.graph_objects as go 6 | import hdbscan 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.decomposition import PCA 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | 12 | def run(): 13 | st.header("Customer Behavior Analysis using HDBSCAN") 14 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/DBSCAN_HDBSCAN)", unsafe_allow_html=True) 15 | 16 | # Load or generate dataset 17 | uploaded_file = st.file_uploader("Upload a CSV file with customer behavior data", type=["csv"]) 18 | if uploaded_file is not None: 19 | df = pd.read_csv(uploaded_file) 20 | else: 21 | st.info("Using sample customer behavior data") 22 | # Generate sample customer behavior data 23 | np.random.seed(42) 24 | n_customers = 1000 25 | 26 | # Generate different customer segments 27 | segments = { 28 | 'High_Value': {'size': 0.2, 'income': (80000, 150000), 'frequency': (20, 40), 'recency': (0, 30)}, 29 | 'Regular': {'size': 0.4, 'income': (40000, 80000), 'frequency': (10, 20), 'recency': (30, 90)}, 30 | 'Occasional': {'size': 0.3, 'income': (20000, 40000), 'frequency': (5, 10), 'recency': (90, 180)}, 31 | 'Inactive': {'size': 0.1, 'income': (0, 20000), 'frequency': (0, 5), 'recency': (180, 365)} 32 | } 33 | 34 | data = { 35 | 'Customer_ID': range(1, n_customers + 1), 36 | 'Annual_Income': [], 37 | 'Purchase_Frequency': [], 38 | 'Days_Since_Last_Purchase': [], 39 | 'Average_Order_Value': [], 40 | 'Website_Time_Spent': [], 41 | 'App_Usage_Frequency': [] 42 | } 43 | 44 | for segment, params in segments.items(): 45 | n_segment = int(n_customers * params['size']) 46 | data['Annual_Income'].extend(np.random.uniform(*params['income'], n_segment)) 47 | data['Purchase_Frequency'].extend(np.random.uniform(*params['frequency'], n_segment)) 48 | data['Days_Since_Last_Purchase'].extend(np.random.uniform(*params['recency'], n_segment)) 49 | data['Average_Order_Value'].extend(np.random.uniform(50, 500, n_segment)) 50 | 
data['Website_Time_Spent'].extend(np.random.uniform(5, 60, n_segment)) 51 | data['App_Usage_Frequency'].extend(np.random.uniform(1, 30, n_segment)) 52 | 53 | df = pd.DataFrame(data) 54 | 55 | # Display data info 56 | st.subheader("Dataset Information") 57 | st.write(f"Number of customers: {len(df)}") 58 | st.write("Sample data:") 59 | st.dataframe(df.head()) 60 | 61 | # Feature selection 62 | st.subheader("Feature Selection") 63 | features = ['Annual_Income', 'Purchase_Frequency', 'Days_Since_Last_Purchase', 64 | 'Average_Order_Value', 'Website_Time_Spent', 'App_Usage_Frequency'] 65 | selected_features = st.multiselect("Select features for behavior analysis", features, 66 | default=['Annual_Income', 'Purchase_Frequency', 'Days_Since_Last_Purchase']) 67 | 68 | if len(selected_features) >= 2: 69 | # Prepare data 70 | X = df[selected_features] 71 | scaler = StandardScaler() 72 | X_scaled = scaler.fit_transform(X) 73 | 74 | # HDBSCAN parameters 75 | st.subheader("HDBSCAN Parameters") 76 | min_cluster_size = st.slider("Minimum Cluster Size", min_value=5, max_value=50, value=15) 77 | min_samples = st.slider("Minimum Samples", min_value=1, max_value=20, value=5) 78 | 79 | # Apply HDBSCAN 80 | clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples) 81 | df['Cluster'] = clusterer.fit_predict(X_scaled) 82 | 83 | # Label clusters 84 | df['Segment'] = df['Cluster'].apply(lambda x: f'Segment {x}' if x != -1 else 'Noise') 85 | 86 | # Visualize clusters using PCA 87 | pca = PCA(n_components=2) 88 | X_pca = pca.fit_transform(X_scaled) 89 | df['PCA1'] = X_pca[:, 0] 90 | df['PCA2'] = X_pca[:, 1] 91 | 92 | # PCA Scatter plot 93 | fig = px.scatter(df, x='PCA1', y='PCA2', color='Segment', 94 | hover_data=selected_features, 95 | title='Customer Segments (PCA Visualization)') 96 | st.plotly_chart(fig) 97 | 98 | # Segment Analysis 99 | st.subheader("Segment Analysis") 100 | segment_counts = df['Segment'].value_counts() 101 | st.write("Segment Distribution:") 102 | st.write(segment_counts) 103 | 104 | # Display segment statistics 105 | for segment in df['Segment'].unique(): 106 | st.write(f"\n{segment} Customers:") 107 | segment_data = df[df['Segment'] == segment] 108 | st.write(f"Number of customers: {len(segment_data)}") 109 | stats = segment_data[selected_features].describe() 110 | st.write("Segment Statistics:") 111 | st.dataframe(stats) 112 | 113 | # Feature importance visualization 114 | st.subheader("Feature Distribution by Segment") 115 | for feature in selected_features: 116 | fig = px.box(df, x='Segment', y=feature, title=f'{feature} Distribution by Segment') 117 | st.plotly_chart(fig) 118 | 119 | # Correlation heatmap 120 | st.subheader("Feature Correlation Matrix") 121 | correlation_matrix = df[selected_features].corr() 122 | fig, ax = plt.subplots(figsize=(10, 8)) 123 | sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0) 124 | st.pyplot(fig) 125 | 126 | # Interactive prediction 127 | st.subheader("Analyze New Customer") 128 | input_values = {} 129 | for feature in selected_features: 130 | if feature == 'Annual_Income': 131 | input_values[feature] = st.number_input(feature, min_value=0.0, max_value=200000.0, value=50000.0) 132 | elif feature == 'Purchase_Frequency': 133 | input_values[feature] = st.number_input(feature, min_value=0.0, max_value=50.0, value=10.0) 134 | elif feature == 'Days_Since_Last_Purchase': 135 | input_values[feature] = st.number_input(feature, min_value=0.0, max_value=365.0, value=30.0) 136 | elif feature == 'Average_Order_Value': 
137 | input_values[feature] = st.number_input(feature, min_value=0.0, max_value=1000.0, value=100.0) 138 | elif feature == 'Website_Time_Spent': 139 | input_values[feature] = st.number_input(feature, min_value=0.0, max_value=120.0, value=30.0) 140 | elif feature == 'App_Usage_Frequency': 141 | input_values[feature] = st.number_input(feature, min_value=0.0, max_value=50.0, value=10.0) 142 | 143 | if st.button("Analyze Customer"): 144 | # Create input array with only the selected features 145 | new_customer = np.array([[input_values[feature] for feature in selected_features]]) 146 | new_customer_scaled = scaler.transform(new_customer) 147 | prediction = clusterer.fit_predict(np.vstack([X_scaled, new_customer_scaled]))[-1] 148 | segment = f'Segment {prediction}' if prediction != -1 else 'Noise' 149 | st.success(f"Customer belongs to: {segment}") 150 | 151 | if __name__ == "__main__": 152 | run() -------------------------------------------------------------------------------- /DBSCAN_HDBSCAN/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the Decision_Trees_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | from DBSCAN_HDBSCAN_projects import ( 8 | anomaly_detection, 9 | customer_behavior_analysis 10 | ) 11 | 12 | def run(): 13 | st.title("DBSCAN & HDBSCAN Clustering Projects") 14 | 15 | # Sidebar for project selection 16 | project = st.sidebar.selectbox( 17 | "Select a project", 18 | [ 19 | "Anomaly Detection", 20 | "Customer Behavior Analysis" 21 | ], 22 | ) 23 | 24 | # Run the selected project 25 | if project == "Anomaly Detection": 26 | anomaly_detection.run() 27 | elif project == "Customer Behavior Analysis": 28 | customer_behavior_analysis.run() 29 | 30 | if __name__ == "__main__": 31 | run() -------------------------------------------------------------------------------- /DBSCAN_HDBSCAN/readme.md: -------------------------------------------------------------------------------- 1 | # DBSCAN/HDBSCAN Projects 2 | 3 | This repository contains various DBSCAN and HDBSCAN clustering projects implemented in Python. Each project demonstrates the application of density-based clustering algorithms to solve real-world problems using datasets. 4 | 5 | ## Project Structure 6 | 7 | ``` 8 | DBSCAN_HDBSCAN/ 9 | ├── main.py 10 | ├── requirements.txt 11 | ├── DBSCAN_HDBSCAN_projects/ 12 | │ ├── customer_behavior_analysis.py 13 | ``` 14 | 15 | ### Key Files 16 | - **`main.py`**: The main entry point for running the Streamlit app. 17 | - **`requirements.txt`**: Contains the dependencies required to run the project. 18 | - **`DBSCAN_HDBSCAN_projects/`**: Contains individual project scripts. 19 | 20 | ## Projects Included 21 | 22 | 1. **Customer Behavior Analysis** 23 | Analyzes customer behavior patterns using HDBSCAN clustering to identify distinct customer segments. 24 | 25 | **Screenshots:** 26 | ![Customer Behavior Analysis](screenshots/customer_behavior.png) 27 | 28 | - Interactive parameter tuning 29 | - Cluster visualization 30 | - Behavior pattern analysis 31 | - Noise point identification 32 | 33 | 2. **Anomaly Detection** 34 | Detects anomalies in data using DBSCAN clustering. 35 | 36 | **Screenshots:** 37 | ![Anomaly Detection](screenshots/anom_det.png) 38 | 39 | - Outlier detection 40 | - Cluster visualization 41 | - Interactive parameter tuning 42 | 43 | ## How to Run 44 | 45 | 1. 
Clone the repository: 46 | ```bash 47 | git clone https://github.com/benasphy/ML_projects.git 48 | cd ML_projects/DBSCAN_HDBSCAN 49 | ``` 50 | 51 | 2. Install dependencies: 52 | ```bash 53 | pip install -r requirements.txt 54 | ``` 55 | 56 | 3. Run the Streamlit app: 57 | ```bash 58 | streamlit run main.py 59 | ``` 60 | 61 | 4. Select a project from the sidebar to explore its functionality. 62 | 63 | ## Requirements 64 | 65 | The project requires the following Python libraries: 66 | - `streamlit` 67 | - `numpy` 68 | - `pandas` 69 | - `scikit-learn` 70 | - `hdbscan` 71 | - `matplotlib` 72 | - `plotly` 73 | 74 | ## Datasets 75 | 76 | - No dataset files are bundled: both apps generate sample data in-app, and you can upload your own CSV through the file uploader instead. 77 | 78 | ## License 79 | 80 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. 81 | 82 | ## Acknowledgments 83 | 84 | - Sample data used in this project is generated synthetically for demonstration purposes. 85 | - Special thanks to the contributors of the Python libraries used in this project. 86 | 87 | --- 88 | Feel free to contribute to this repository by submitting issues or pull requests. -------------------------------------------------------------------------------- /DBSCAN_HDBSCAN/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.22.0 2 | pandas==1.5.3 3 | numpy==1.24.3 4 | scikit-learn==1.2.2 5 | plotly==5.13.1 6 | matplotlib==3.7.1 7 | seaborn==0.12.2 8 | scipy==1.10.1 9 | hdbscan -------------------------------------------------------------------------------- /DBSCAN_HDBSCAN/screenshots/anom_det.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/DBSCAN_HDBSCAN/screenshots/anom_det.png -------------------------------------------------------------------------------- /DBSCAN_HDBSCAN/screenshots/customer_behavior.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/DBSCAN_HDBSCAN/screenshots/customer_behavior.png -------------------------------------------------------------------------------- /Decision_Trees/Decision_Trees_projects/gym_decision_tree.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text 5 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 6 | import plotly.express as px 7 | import plotly.graph_objects as go 8 | import matplotlib.pyplot as plt 9 | import seaborn as sns 10 | 11 | def generate_sample_data(): 12 | np.random.seed(42) 13 | n_samples = 1000 14 | 15 | # Generate features 16 | energy_levels = np.random.choice(['High', 'Low'], n_samples, p=[0.4, 0.6]) 17 | motivation_levels = np.random.choice(['Highly Motivated', 'Neutral', 'No Motivation'], 18 | n_samples, p=[0.3, 0.4, 0.3]) 19 | 20 | # Create DataFrame 21 | df = pd.DataFrame({ 22 | 'Energy': energy_levels, 23 | 'Motivation': motivation_levels 24 | }) 25 | 26 | # Generate target (gym attendance) with some patterns 27 | def determine_gym_attendance(row): 28 | if row['Energy'] == 'High' and row['Motivation'] in ['Highly Motivated', 'Neutral']: 29 | return 1 30 | elif row['Energy'] == 'Low' and row['Motivation'] == 'No Motivation': 31 | return 0 32 | else: 33 | #
Add some randomness for other combinations 34 | return np.random.choice([0, 1], p=[0.7, 0.3]) 35 | 36 | df['Gym'] = df.apply(determine_gym_attendance, axis=1) 37 | 38 | return df 39 | 40 | def run(): 41 | st.header("Gym Attendance Prediction using Decision Trees") 42 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Decision_Trees)", unsafe_allow_html=True) 43 | 44 | # Generate sample data 45 | df = generate_sample_data() 46 | 47 | # Display dataset info 48 | st.subheader("Dataset Overview") 49 | col1, col2 = st.columns(2) 50 | with col1: 51 | st.write("Dataset Shape:", df.shape) 52 | st.write("Features:", ", ".join(df.columns[:-1])) 53 | st.write("Target: Gym Attendance (0: No, 1: Yes)") 54 | with col2: 55 | st.write("Class Distribution:") 56 | class_dist = df['Gym'].value_counts() 57 | fig = px.pie(values=class_dist.values, names=['No', 'Yes'], 58 | title='Gym Attendance Distribution') 59 | st.plotly_chart(fig) 60 | 61 | # Feature Analysis 62 | st.subheader("Feature Analysis") 63 | 64 | # Energy level impact 65 | fig = px.histogram(df, x='Energy', color='Gym', 66 | title='Gym Attendance by Energy Level', 67 | barmode='group') 68 | st.plotly_chart(fig) 69 | 70 | # Motivation level impact 71 | fig = px.histogram(df, x='Motivation', color='Gym', 72 | title='Gym Attendance by Motivation Level', 73 | barmode='group') 74 | st.plotly_chart(fig) 75 | 76 | # Prepare data 77 | X = pd.get_dummies(df[['Energy', 'Motivation']]) 78 | y = df['Gym'] 79 | 80 | # Train model 81 | model = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42) 82 | model.fit(X, y) 83 | 84 | # Model evaluation 85 | st.subheader("Model Performance") 86 | y_pred = model.predict(X) 87 | accuracy = accuracy_score(y, y_pred) 88 | 89 | # Display metrics 90 | col1, col2, col3 = st.columns(3) 91 | with col1: 92 | st.metric("Accuracy", f"{accuracy:.2%}") 93 | with col2: 94 | st.metric("Precision", f"{classification_report(y, y_pred, output_dict=True)['1']['precision']:.2%}") 95 | with col3: 96 | st.metric("Recall", f"{classification_report(y, y_pred, output_dict=True)['1']['recall']:.2%}") 97 | 98 | # Confusion Matrix 99 | st.subheader("Confusion Matrix") 100 | cm = confusion_matrix(y, y_pred) 101 | fig = px.imshow(cm, 102 | labels=dict(x="Predicted", y="Actual", color="Count"), 103 | x=['No', 'Yes'], 104 | y=['No', 'Yes'], 105 | text_auto=True, 106 | aspect="auto") 107 | st.plotly_chart(fig) 108 | 109 | # Feature Importance 110 | st.subheader("Feature Importance") 111 | importance = pd.DataFrame({ 112 | 'Feature': X.columns, 113 | 'Importance': model.feature_importances_ 114 | }) 115 | fig = px.bar(importance, x='Feature', y='Importance', 116 | title='Feature Importance in Prediction') 117 | st.plotly_chart(fig) 118 | 119 | # Decision Tree Visualization 120 | st.subheader("Decision Tree Structure") 121 | fig, ax = plt.subplots(figsize=(12, 8)) 122 | plot_tree(model, feature_names=X.columns, class_names=['No', 'Yes'], 123 | filled=True, rounded=True, fontsize=10) 124 | st.pyplot(fig) 125 | 126 | # Interactive Prediction 127 | st.subheader("Make a Prediction") 128 | st.write("Enter your current state:") 129 | 130 | col1, col2 = st.columns(2) 131 | with col1: 132 | energy = st.selectbox("Energy Level:", df['Energy'].unique()) 133 | with col2: 134 | motivation = st.selectbox("Motivation Level:", df['Motivation'].unique()) 135 | 136 | if st.button("Predict"): 137 | # Prepare input data 138 | input_data = pd.DataFrame({ 139 | 'Energy': [energy], 140 | 'Motivation': 
[motivation] 141 | }) 142 | 143 | # One-hot encode categorical variables 144 | input_encoded = pd.get_dummies(input_data) 145 | # Ensure all columns from training data are present 146 | for col in X.columns: 147 | if col not in input_encoded.columns: 148 | input_encoded[col] = 0 149 | input_encoded = input_encoded[X.columns] 150 | 151 | # Make prediction 152 | prediction = model.predict(input_encoded)[0] 153 | probability = model.predict_proba(input_encoded)[0] 154 | 155 | # Display prediction 156 | st.subheader("Prediction Result") 157 | col1, col2 = st.columns(2) 158 | with col1: 159 | st.metric("Prediction", "Will go to the gym" if prediction == 1 else "Will not go to the gym") 160 | with col2: 161 | st.metric("Confidence", f"{max(probability):.2%}") 162 | 163 | # Visualize prediction probability 164 | fig = go.Figure(data=[ 165 | go.Bar(x=['No', 'Yes'], 166 | y=probability, 167 | text=[f'{p:.2%}' for p in probability], 168 | textposition='auto', 169 | ) 170 | ]) 171 | fig.update_layout(title='Prediction Probabilities') 172 | st.plotly_chart(fig) 173 | 174 | # Data Insights 175 | st.subheader("Data Insights") 176 | 177 | # Energy and Motivation combination analysis 178 | fig = px.sunburst(df, path=['Energy', 'Motivation', 'Gym'], 179 | title='Gym Attendance by Energy and Motivation Levels') 180 | st.plotly_chart(fig) 181 | 182 | # Success rate by feature combinations 183 | success_rate = df.groupby(['Energy', 'Motivation'])['Gym'].mean().reset_index() 184 | fig = px.treemap(success_rate, path=['Energy', 'Motivation'], 185 | values='Gym', 186 | title='Success Rate by Feature Combinations') 187 | st.plotly_chart(fig) 188 | 189 | if __name__ == "__main__": 190 | run() -------------------------------------------------------------------------------- /Decision_Trees/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the Decision_Trees_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | 8 | from Decision_Trees_projects import ( 9 | gym_decision_tree, 10 | gini_impurity_implementation, 11 | ) 12 | 13 | def run(): 14 | st.title("Decision Tree Projects") 15 | 16 | # Sidebar for project selection 17 | project = st.sidebar.selectbox( 18 | "Select a project", 19 | [ 20 | "Gym Decision Tree", 21 | "Gini Impurity Implementation", 22 | ], 23 | ) 24 | 25 | # Run the selected project 26 | if project == "Gym Decision Tree": 27 | gym_decision_tree.run() 28 | elif project == "Gini Impurity Implementation": 29 | gini_impurity_implementation.run() 30 | 31 | if __name__ == "__main__": 32 | run() -------------------------------------------------------------------------------- /Decision_Trees/readme.md: -------------------------------------------------------------------------------- 1 | # Decision Tree Projects 2 | 3 | This repository contains various Decision Tree projects implemented in Python. Each project demonstrates the application of Decision Trees to solve real-world problems using datasets. 4 | 5 | ## Project Structure 6 | 7 | ``` 8 | Decision_Tree/ 9 | ├── main.py 10 | ├── requirements.txt 11 | ├── Decision_Tree_projects/ 12 | │ ├── gym_decision_tree.py 13 | │ ├── gini_impurity_implementation.py 14 | ``` 15 | 16 | ### Key Files 17 | - **`main.py`**: The main entry point for running the Streamlit app. 18 | - **`requirements.txt`**: Contains the dependencies required to run the project. 
19 | - **`Decision_Trees_projects/`**: Contains individual project scripts. 20 | 21 | ## Projects Included 22 | 23 | 1. **Gym Decision Tree** 24 | Predicts whether a person will go to the gym based on their energy level and motivation using a Decision Tree. 25 | - Visualizes the decision tree. 26 | - Allows user input to predict gym attendance. 27 | 28 | **Screenshots:** 29 | ![Gym Decision Tree](screenshots/gym1.png) 30 | 31 | 2. **Gini Impurity Implementation** 32 | Demonstrates the use of Gini Impurity to build a Decision Tree for predicting gym attendance. 33 | - Visualizes the decision tree. 34 | - Allows user input to predict gym attendance. 35 | 36 | **Screenshots:** 37 | ![Gini Impurity Implementation](screenshots/gini.png) 38 | 39 | ## How to Run 40 | 41 | 1. Clone the repository: 42 | ```bash 43 | git clone https://github.com/benasphy/ML_projects.git 44 | cd ML_projects/Decision_Trees 45 | ``` 46 | 47 | 2. Install dependencies: 48 | ```bash 49 | pip install -r requirements.txt 50 | ``` 51 | 52 | 3. Run the Streamlit app: 53 | ```bash 54 | streamlit run main.py 55 | ``` 56 | 57 | 4. Select a project from the sidebar to explore its functionality. 58 | 59 | ## Requirements 60 | 61 | The project requires the Python libraries listed in `requirements.txt`, including: 62 | - `streamlit` 63 | - `numpy` 64 | - `pandas` 65 | - `scikit-learn` 66 | - `matplotlib` 67 | 68 | ## Screenshots 69 | 70 | Screenshots for each project are embedded in the project descriptions above. 71 | 72 | ## License 73 | 74 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. 75 | 76 | ## Acknowledgments 77 | 78 | - Datasets used in this project are synthetic and created for demonstration purposes. 79 | - Special thanks to the contributors of the Python libraries used in this project. 80 | 81 | --- 82 | Feel free to contribute to this repository by submitting issues or pull requests.
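## Appendix: Gini impurity by hand

The second project is built around the Gini criterion, and the formula itself is tiny: a node's impurity is 1 minus the sum of squared class proportions, and a candidate split is scored by the size-weighted impurity of its children. The sketch below illustrates that calculation on invented gym-attendance labels; it is a generic illustration of the formula, not the code in `gini_impurity_implementation.py`:

```python
import numpy as np

def gini_impurity(labels):
    """Gini impurity of one node: 1 - sum(p_i^2) over the class proportions p_i."""
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def split_impurity(left, right):
    """Size-weighted Gini impurity of a binary split."""
    n = len(left) + len(right)
    return len(left) / n * gini_impurity(left) + len(right) / n * gini_impurity(right)

# Toy gym-attendance labels (1 = went, 0 = skipped) split on energy level
high_energy = np.array([1, 1, 1, 0])
low_energy = np.array([0, 0, 1, 0])

parent = gini_impurity(np.concatenate([high_energy, low_energy]))
split = split_impurity(high_energy, low_energy)
print(f"Parent Gini: {parent:.3f}, split Gini: {split:.3f}, gain: {parent - split:.3f}")
```

A split is worth making when the weighted child impurity drops below the parent's, i.e. when the printed gain is positive.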
83 | -------------------------------------------------------------------------------- /Decision_Trees/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | numpy 3 | pandas 4 | scikit-learn 5 | plotly 6 | scipy 7 | seaborn 8 | matplotlib -------------------------------------------------------------------------------- /Decision_Trees/screenshots/gini.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Decision_Trees/screenshots/gini.png -------------------------------------------------------------------------------- /Decision_Trees/screenshots/gym1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Decision_Trees/screenshots/gym1.png -------------------------------------------------------------------------------- /Dimensionality_Reduction/Dimensionality_Reduction_projects/feature_selection.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.decomposition import PCA 5 | from sklearn.manifold import TSNE 6 | from sklearn.preprocessing import StandardScaler 7 | import plotly.express as px 8 | import plotly.graph_objects as go 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | 12 | def generate_sample_data(n_samples=1000): 13 | # Generate features with different patterns 14 | np.random.seed(42) 15 | 16 | # Generate correlated features 17 | x1 = np.random.normal(0, 1, n_samples) 18 | x2 = x1 + np.random.normal(0, 0.5, n_samples) 19 | x3 = x1 - x2 + np.random.normal(0, 0.3, n_samples) 20 | 21 | # Generate independent features 22 | x4 = np.random.normal(0, 1, n_samples) 23 | x5 = np.random.normal(0, 1, n_samples) 24 | 25 | # Generate target variable 26 | y = 2*x1 + 3*x2 - x3 + np.random.normal(0, 0.5, n_samples) 27 | 28 | # Create DataFrame 29 | data = { 30 | 'Feature1': x1, 31 | 'Feature2': x2, 32 | 'Feature3': x3, 33 | 'Feature4': x4, 34 | 'Feature5': x5, 35 | 'Target': y 36 | } 37 | 38 | return pd.DataFrame(data) 39 | 40 | def run(): 41 | st.header("Feature Selection using Dimensionality Reduction") 42 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Dimensionality_Reduction)", unsafe_allow_html=True) 43 | 44 | # Load or generate dataset 45 | uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"]) 46 | if uploaded_file is not None: 47 | df = pd.read_csv(uploaded_file) 48 | else: 49 | st.info("Using sample dataset") 50 | df = generate_sample_data() 51 | 52 | # Display data info 53 | st.subheader("Dataset Information") 54 | st.write(f"Number of samples: {len(df)}") 55 | st.write(f"Number of features: {len(df.columns) - 1}") # Excluding target 56 | st.write("Sample data:") 57 | st.dataframe(df.head()) 58 | 59 | # Feature selection 60 | st.subheader("Feature Selection") 61 | features = [col for col in df.columns if col != 'Target'] 62 | selected_features = st.multiselect("Select features for analysis", features, default=features) 63 | 64 | if len(selected_features) >= 2: 65 | # Prepare data 66 | X = df[selected_features] 67 | y = df['Target'] if 'Target' in df.columns else None 68 | 69 | # Scale data 70 | scaler = StandardScaler() 71 | X_scaled = scaler.fit_transform(X) 72 | 73 | # 
Dimensionality reduction methods 74 | st.subheader("Dimensionality Reduction Methods") 75 | method = st.selectbox( 76 | "Select method", 77 | ["PCA", "t-SNE"] 78 | ) 79 | 80 | if method == "PCA": 81 | # PCA parameters 82 | n_components = st.slider("Number of Components", min_value=1, max_value=len(selected_features), value=2) 83 | 84 | # Apply PCA 85 | pca = PCA(n_components=n_components) 86 | X_pca = pca.fit_transform(X_scaled) 87 | 88 | # Create DataFrame with PCA results 89 | pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(n_components)]) 90 | if y is not None: 91 | pca_df['Target'] = y 92 | 93 | # Plot PCA results 94 | st.subheader("PCA Results") 95 | 96 | # Scatter plot 97 | fig = px.scatter(pca_df, x='PC1', y='PC2', color='Target' if y is not None else None, 98 | title="PCA Visualization", 99 | labels={'PC1': f'PC1 ({pca.explained_variance_ratio_[0]:.2%})', 100 | 'PC2': f'PC2 ({pca.explained_variance_ratio_[1]:.2%})'}) 101 | st.plotly_chart(fig) 102 | 103 | # Explained variance 104 | fig = px.bar(x=range(1, len(pca.explained_variance_ratio_) + 1), 105 | y=pca.explained_variance_ratio_, 106 | title="Explained Variance by Component", 107 | labels={'x': 'Component', 'y': 'Explained Variance'}) 108 | st.plotly_chart(fig) 109 | 110 | # Cumulative explained variance 111 | cumulative_variance = np.cumsum(pca.explained_variance_ratio_) 112 | fig = px.line(x=range(1, len(cumulative_variance) + 1), 113 | y=cumulative_variance, 114 | title="Cumulative Explained Variance", 115 | labels={'x': 'Number of Components', 'y': 'Cumulative Explained Variance'}) 116 | st.plotly_chart(fig) 117 | 118 | # Feature importance 119 | st.subheader("Feature Importance") 120 | feature_importance = pd.DataFrame( 121 | pca.components_.T, 122 | columns=[f'PC{i+1}' for i in range(n_components)], 123 | index=selected_features 124 | ) 125 | st.dataframe(feature_importance) 126 | 127 | # Plot feature importance 128 | fig, ax = plt.subplots(figsize=(10, 6)) 129 | sns.heatmap(feature_importance, annot=True, cmap='coolwarm', center=0, ax=ax) 130 | st.pyplot(fig) 131 | 132 | else: # t-SNE 133 | # t-SNE parameters 134 | perplexity = st.slider("Perplexity", min_value=5, max_value=50, value=30) 135 | learning_rate = st.slider("Learning Rate", min_value=10, max_value=1000, value=200) 136 | 137 | # Apply t-SNE 138 | tsne = TSNE(n_components=2, perplexity=perplexity, learning_rate=learning_rate) 139 | X_tsne = tsne.fit_transform(X_scaled) 140 | 141 | # Create DataFrame with t-SNE results 142 | tsne_df = pd.DataFrame(X_tsne, columns=['t-SNE1', 't-SNE2']) 143 | if y is not None: 144 | tsne_df['Target'] = y 145 | 146 | # Plot t-SNE results 147 | st.subheader("t-SNE Results") 148 | fig = px.scatter(tsne_df, x='t-SNE1', y='t-SNE2', color='Target' if y is not None else None, 149 | title="t-SNE Visualization") 150 | st.plotly_chart(fig) 151 | 152 | # Feature correlation 153 | st.subheader("Feature Correlation") 154 | correlation_matrix = df[selected_features].corr() 155 | fig, ax = plt.subplots(figsize=(10, 8)) 156 | sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax) 157 | st.pyplot(fig) 158 | 159 | if __name__ == "__main__": 160 | run() -------------------------------------------------------------------------------- /Dimensionality_Reduction/Dimensionality_Reduction_projects/image_compression.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.decomposition import PCA 5 | 
import plotly.express as px 6 | import plotly.graph_objects as go 7 | from skimage import io 8 | import matplotlib.pyplot as plt 9 | from PIL import Image 10 | import io as io_lib 11 | 12 | def compress_image(image, n_components): 13 | # Reshape image to 2D array 14 | h, w, d = image.shape 15 | image_2d = image.reshape(h * w, d) 16 | 17 | # Apply PCA 18 | pca = PCA(n_components=min(n_components, d)) 19 | compressed = pca.fit_transform(image_2d) 20 | 21 | # Reconstruct image 22 | reconstructed = pca.inverse_transform(compressed) 23 | reconstructed = reconstructed.reshape(h, w, d) 24 | 25 | # Clip values to valid range 26 | reconstructed = np.clip(reconstructed, 0, 1) 27 | 28 | # Calculate actual compressed size 29 | # For each pixel, we store: 30 | # 1. n_components values (compressed data) 31 | # 2. mean vector (d values) 32 | # 3. component vectors (n_components * d values) 33 | compressed_size = (h * w * n_components + # compressed data 34 | d + # mean vector 35 | n_components * d) # component vectors 36 | 37 | return reconstructed, pca.explained_variance_ratio_, compressed_size 38 | 39 | def run(): 40 | st.header("Image Compression using PCA") 41 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Dimensionality_Reduction)", unsafe_allow_html=True) 42 | 43 | # File uploader 44 | uploaded_file = st.file_uploader("Upload an image", type=['jpg', 'jpeg', 'png']) 45 | 46 | if uploaded_file is not None: 47 | # Read image 48 | image = io.imread(uploaded_file) 49 | 50 | # Display original image 51 | st.subheader("Original Image") 52 | st.image(image, use_column_width=True) 53 | 54 | # Convert to float and normalize 55 | image_float = image.astype(np.float32) / 255.0 56 | 57 | # Parameters 58 | st.subheader("Compression Parameters") 59 | max_components = min(image.shape[2], 3) # Limit to number of color channels 60 | n_components = st.slider("Number of Components", min_value=1, max_value=max_components, value=1) 61 | 62 | if st.button("Compress Image"): 63 | # Compress image 64 | compressed_image, explained_variance, compressed_size = compress_image(image_float, n_components) 65 | 66 | # Convert back to uint8 for display 67 | compressed_display = (compressed_image * 255).astype(np.uint8) 68 | 69 | # Display compressed image 70 | st.subheader("Compressed Image") 71 | st.image(compressed_display, use_column_width=True) 72 | 73 | # Display compression statistics 74 | st.subheader("Compression Statistics") 75 | original_size = image.size # Number of pixels * number of channels 76 | compression_ratio = original_size / compressed_size 77 | st.write(f"Original Size: {original_size/1024:.1f} KB") 78 | st.write(f"Compressed Size: {compressed_size/1024:.1f} KB") 79 | st.write(f"Compression Ratio: {compression_ratio:.1f}x") 80 | st.write(f"Components Used: {n_components}") 81 | st.write(f"Explained Variance: {np.sum(explained_variance)*100:.1f}%") 82 | 83 | # Display explained variance 84 | st.subheader("Explained Variance") 85 | fig = go.Figure() 86 | fig.add_trace(go.Bar( 87 | x=list(range(1, len(explained_variance) + 1)), 88 | y=explained_variance, 89 | name='Individual' 90 | )) 91 | fig.add_trace(go.Scatter( 92 | x=list(range(1, len(explained_variance) + 1)), 93 | y=np.cumsum(explained_variance), 94 | name='Cumulative', 95 | mode='lines+markers' 96 | )) 97 | fig.update_layout( 98 | title='Explained Variance by Component', 99 | xaxis_title='Component', 100 | yaxis_title='Explained Variance', 101 | showlegend=True 102 | ) 103 | st.plotly_chart(fig) 104 | 105 | # 
Download compressed image 106 | compressed_pil = Image.fromarray(compressed_display) 107 | img_byte_arr = io_lib.BytesIO() 108 | compressed_pil.save(img_byte_arr, format='PNG') 109 | img_byte_arr = img_byte_arr.getvalue() 110 | st.download_button( 111 | label="Download Compressed Image", 112 | data=img_byte_arr, 113 | file_name="compressed_image.png", 114 | mime="image/png" 115 | ) 116 | 117 | if __name__ == "__main__": 118 | run() -------------------------------------------------------------------------------- /Dimensionality_Reduction/README.md: -------------------------------------------------------------------------------- 1 | # Dimensionality Reduction Projects 2 | 3 | This repository contains projects that demonstrate the application of various dimensionality reduction techniques in different domains. 4 | 5 | ## Projects 6 | 7 | ### 1. Image Compression 8 | 9 | **Screenshots:** 10 | ![Image Compression](screenshots/img_comp.png) 11 | - Interactive image upload and processing 12 | - PCA-based image compression 13 | - Adjustable number of components 14 | - Compression statistics and analysis 15 | - Explained variance visualization 16 | - Color channel analysis 17 | - Download compressed images 18 | 19 | ### 2. Feature Selection 20 | 21 | **Screenshots:** 22 | ![Feature Selection](screenshots/feature_sel.png) 23 | - Interactive dataset upload 24 | - Multiple dimensionality reduction methods (PCA, t-SNE) 25 | - Feature importance analysis 26 | - Explained variance visualization 27 | - Interactive parameter tuning 28 | - Correlation analysis 29 | - Visual exploration of reduced dimensions 30 | 31 | ## How to Run 32 | 33 | 1. Install the required packages: 34 | ```bash 35 | pip install -r requirements.txt 36 | ``` 37 | 38 | 2. Run the Streamlit app: 39 | ```bash 40 | streamlit run main.py 41 | ``` 42 | 43 | ## Project Structure 44 | 45 | - `main.py`: Main entry point for running the projects 46 | - `Dimensionality_Reduction_projects/`: Directory containing individual project files 47 | - `image_compression.py`: Image compression using PCA 48 | - `feature_selection.py`: Feature selection using PCA and t-SNE 49 | 50 | ## Features 51 | 52 | - Interactive parameter tuning 53 | - Rich visualizations using Plotly 54 | - Support for custom data upload 55 | - Sample data generation 56 | - Detailed analysis tools 57 | - Multiple dimensionality reduction methods 58 | - Comprehensive visualizations 59 | 60 | ## Contributing 61 | 62 | Contributions are welcome! Please feel free to submit a Pull Request. 63 | 64 | ## License 65 | 66 | This project is licensed under the MIT License. 
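## Appendix: the PCA trade-off in a nutshell

`image_compression.py` above runs PCA across the colour channels of each pixel, so at most three components are available. The sketch below shows the more general components-versus-quality trade-off by running PCA over the rows of a single synthetic channel; the random data and the component counts are purely illustrative:

```python
import numpy as np
from sklearn.decomposition import PCA

# Synthetic 64x64 "channel" standing in for one image plane
rng = np.random.default_rng(0)
channel = rng.random((64, 64))

for n_components in (2, 8, 32):
    pca = PCA(n_components=n_components)
    compressed = pca.fit_transform(channel)            # shape (64, n_components)
    reconstructed = pca.inverse_transform(compressed)  # back to shape (64, 64)
    mse = np.mean((channel - reconstructed) ** 2)
    kept = pca.explained_variance_ratio_.sum()
    print(f"{n_components:2d} components: {kept:.1%} variance kept, reconstruction MSE {mse:.5f}")
```

The same pattern (fit, transform, inverse-transform, compare) applies whichever axis you treat as the sample dimension.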
-------------------------------------------------------------------------------- /Dimensionality_Reduction/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the Decision_Trees_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | from Dimensionality_Reduction_projects import ( 8 | image_compression, 9 | feature_selection 10 | ) 11 | 12 | def run(): 13 | st.title("Dimensionality Reduction Projects") 14 | 15 | # Sidebar for project selection 16 | project = st.sidebar.selectbox( 17 | "Select a project", 18 | [ 19 | "Image Compression", 20 | "Feature Selection" 21 | ], 22 | ) 23 | 24 | # Run the selected project 25 | if project == "Image Compression": 26 | image_compression.run() 27 | elif project == "Feature Selection": 28 | feature_selection.run() 29 | 30 | if __name__ == "__main__": 31 | run() -------------------------------------------------------------------------------- /Dimensionality_Reduction/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.22.0 2 | pandas==1.5.3 3 | numpy==1.24.3 4 | scikit-learn==1.2.2 5 | plotly==5.13.1 6 | matplotlib==3.7.1 7 | seaborn==0.12.2 8 | scikit-image==0.20.0 9 | pillow==9.5.0 10 | scipy==1.10.1 11 | scikit-image -------------------------------------------------------------------------------- /Dimensionality_Reduction/screenshots/feature_sel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Dimensionality_Reduction/screenshots/feature_sel.png -------------------------------------------------------------------------------- /Dimensionality_Reduction/screenshots/img_comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Dimensionality_Reduction/screenshots/img_comp.png -------------------------------------------------------------------------------- /Fuzzy_C_Means/Fuzzy_C_Means_projects/image_segmentation.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.decomposition import PCA 5 | import plotly.express as px 6 | import plotly.graph_objects as go 7 | from skimage import io 8 | import matplotlib.pyplot as plt 9 | from PIL import Image 10 | import io as io_lib 11 | 12 | def fuzzy_c_means(data, n_clusters, m=2, max_iter=100, error=1e-5): 13 | # Initialize membership matrix randomly 14 | n_samples = data.shape[0] 15 | membership = np.random.random((n_samples, n_clusters)) 16 | membership = membership / membership.sum(axis=1)[:, np.newaxis] 17 | 18 | # Initialize cluster centers 19 | centers = np.zeros((n_clusters, data.shape[1])) 20 | 21 | # Iterate until convergence 22 | for _ in range(max_iter): 23 | # Update cluster centers 24 | for j in range(n_clusters): 25 | centers[j] = np.sum(membership[:, j:j+1] ** m * data, axis=0) / np.sum(membership[:, j:j+1] ** m) 26 | 27 | # Update membership matrix 28 | old_membership = membership.copy() 29 | 30 | # Calculate distances between data points and centers 31 | distances = np.zeros((n_samples, n_clusters)) 32 | for j in range(n_clusters): 33 | distances[:, j] = np.sum((data - centers[j]) ** 2, axis=1) 34 | 35 | 
# Update membership values 36 | for j in range(n_clusters): 37 | membership[:, j] = 1 / np.sum((distances[:, j:j+1] / distances) ** (1/(m-1)), axis=1) 38 | 39 | # Check convergence 40 | if np.max(np.abs(membership - old_membership)) < error: 41 | break 42 | 43 | return membership, centers 44 | 45 | def run(): 46 | st.header("Image Segmentation using Fuzzy C-Means") 47 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Fuzzy_C_Means)", unsafe_allow_html=True) 48 | 49 | # File uploader 50 | uploaded_file = st.file_uploader("Upload an image", type=['jpg', 'jpeg', 'png']) 51 | 52 | if uploaded_file is not None: 53 | # Read image 54 | image = io.imread(uploaded_file) 55 | 56 | # Display original image 57 | st.subheader("Original Image") 58 | st.image(image, use_column_width=True) 59 | 60 | # Convert to LAB color space for better segmentation 61 | if len(image.shape) == 3: # Color image 62 | # Convert to float and normalize 63 | image_float = image.astype(np.float32) / 255.0 64 | 65 | # Parameters 66 | st.subheader("Segmentation Parameters") 67 | n_segments = st.slider("Number of Segments", min_value=2, max_value=8, value=4) 68 | fuzziness = st.slider("Fuzziness Parameter (m)", min_value=1.1, max_value=3.0, value=2.0, step=0.1) 69 | 70 | if st.button("Segment Image"): 71 | # Reshape image for clustering 72 | h, w, d = image_float.shape 73 | image_2d = image_float.reshape(h * w, d) 74 | 75 | # Apply Fuzzy C-Means 76 | membership, centers = fuzzy_c_means(image_2d, n_segments, m=fuzziness) 77 | 78 | # Get segment labels 79 | labels = np.argmax(membership, axis=1) 80 | 81 | # Create segmented image 82 | segmented = centers[labels].reshape(h, w, d) 83 | segmented = np.clip(segmented * 255, 0, 255).astype(np.uint8) 84 | 85 | # Display segmented image 86 | st.subheader("Segmented Image") 87 | st.image(segmented, use_column_width=True) 88 | 89 | # Display membership maps 90 | st.subheader("Membership Maps") 91 | fig, axes = plt.subplots(1, n_segments, figsize=(15, 5)) 92 | for i in range(n_segments): 93 | membership_map = membership[:, i].reshape(h, w) 94 | axes[i].imshow(membership_map, cmap='viridis') 95 | axes[i].set_title(f'Segment {i+1}') 96 | axes[i].axis('off') 97 | st.pyplot(fig) 98 | 99 | # Display segment statistics 100 | st.subheader("Segment Statistics") 101 | for i in range(n_segments): 102 | segment_size = np.sum(labels == i) 103 | segment_percentage = (segment_size / (h * w)) * 100 104 | st.write(f"Segment {i+1}: {segment_size} pixels ({segment_percentage:.1f}%)") 105 | 106 | # Display segment colors 107 | st.subheader("Segment Colors") 108 | fig, ax = plt.subplots(figsize=(10, 2)) 109 | for i in range(n_segments): 110 | color = centers[i] 111 | # Convert color to RGB tuple for matplotlib 112 | rgb_color = tuple(color) 113 | ax.bar(i, 1, color=rgb_color) 114 | ax.set_xticks(range(n_segments)) 115 | ax.set_xticklabels([f'Segment {i+1}' for i in range(n_segments)]) 116 | ax.set_yticks([]) 117 | st.pyplot(fig) 118 | 119 | # Download segmented image 120 | segmented_pil = Image.fromarray(segmented) 121 | img_byte_arr = io_lib.BytesIO() 122 | segmented_pil.save(img_byte_arr, format='PNG') 123 | img_byte_arr = img_byte_arr.getvalue() 124 | st.download_button( 125 | label="Download Segmented Image", 126 | data=img_byte_arr, 127 | file_name="segmented_image.png", 128 | mime="image/png" 129 | ) 130 | else: 131 | st.warning("Please upload a color image.") 132 | 133 | if __name__ == "__main__": 134 | run() 
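
A note on the membership update in the loop above: `distances` holds *squared* Euclidean distances, so raising the ratio of squared distances to the power 1/(m-1) is equivalent to the textbook Fuzzy C-Means update u_ij = 1 / sum_k (||x_i - c_j|| / ||x_i - c_k||)^(2/(m-1)). One edge case the loop does not guard against is a sample that coincides exactly with a cluster center, which makes a distance zero and the division undefined. Below is a minimal, vectorized sketch of the same update with an epsilon floor (the `1e-12` value is an illustrative assumption, not something taken from the project code):

```python
import numpy as np

def update_membership(distances_sq: np.ndarray, m: float, eps: float = 1e-12) -> np.ndarray:
    """Recompute fuzzy memberships from squared distances of shape (n_samples, n_clusters)."""
    d = np.maximum(distances_sq, eps)          # floor distances to avoid division by zero
    ratios = d[:, :, None] / d[:, None, :]     # ratios[i, j, k] = d[i, j] / d[i, k]
    return 1.0 / np.sum(ratios ** (1.0 / (m - 1.0)), axis=2)
```

Each row of the returned matrix sums to 1, so the values can be read directly as the per-segment membership maps plotted above.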
-------------------------------------------------------------------------------- /Fuzzy_C_Means/README.md: -------------------------------------------------------------------------------- 1 | # Fuzzy C-Means Projects 2 | 3 | This repository contains projects that demonstrate the application of Fuzzy C-Means clustering algorithm in various domains. 4 | 5 | ## Projects 6 | 7 | ### 1. Image Segmentation 8 | 9 | **Screenshots:** 10 | ![Image Segmentation](screenshots/imag_seg.png) 11 | - Interactive image upload and processing 12 | - Customizable number of segments 13 | - Adjustable fuzziness parameter 14 | - Visualization of membership maps 15 | - Segment statistics and analysis 16 | - Color distribution analysis 17 | 18 | ### 2. Customer Profiling 19 | 20 | **Screenshots:** 21 | ![Customer Profiling](screenshots/cust_prof.png) 22 | - Interactive feature selection 23 | - Dynamic cluster number adjustment 24 | - Adjustable fuzziness parameter 25 | - PCA visualization with hover data 26 | - Detailed segment analysis 27 | - Feature importance visualization 28 | - Correlation heatmap 29 | - Interactive prediction for new customers 30 | 31 | ## How to Run 32 | 33 | 1. Install the required packages: 34 | ```bash 35 | pip install -r requirements.txt 36 | ``` 37 | 38 | 2. Run the Streamlit app: 39 | ```bash 40 | streamlit run main.py 41 | ``` 42 | 43 | ## Project Structure 44 | 45 | - `main.py`: Main entry point for running the projects 46 | - `Fuzzy_C_Means_projects/`: Directory containing individual project files 47 | - `image_segmentation.py`: Image segmentation using Fuzzy C-Means 48 | - `customer_profiling.py`: Customer profiling using Fuzzy C-Means 49 | 50 | ## Features 51 | 52 | - Interactive parameter tuning 53 | - Rich visualizations using Plotly 54 | - Support for custom data upload 55 | - Sample data generation 56 | - Detailed analysis tools 57 | - Interactive prediction capabilities 58 | 59 | ## Contributing 60 | 61 | Contributions are welcome! Please feel free to submit a Pull Request. 62 | 63 | ## License 64 | 65 | This project is licensed under the MIT License. 
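
For the interactive prediction feature listed under Customer Profiling, the natural way to score a new customer against an already-fitted fuzzy model is to reuse the membership rule from training: compute the squared distance d_j from the new point to each fitted center and set u_j = 1 / sum_k (d_j / d_k)^(1/(m-1)); the largest u_j gives the hard profile, and its value doubles as a confidence score. (This is a sketch of the standard Fuzzy C-Means rule stated in general terms, not a line-by-line description of `customer_profiling.py`.)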
-------------------------------------------------------------------------------- /Fuzzy_C_Means/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the Decision_Trees_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | from Fuzzy_C_Means_projects import ( 8 | image_segmentation, 9 | customer_profiling 10 | ) 11 | 12 | def run(): 13 | st.title("Fuzzy C-Means Clustering Projects") 14 | 15 | # Sidebar for project selection 16 | project = st.sidebar.selectbox( 17 | "Select a project", 18 | [ 19 | "Image Segmentation", 20 | "Customer Profiling" 21 | ], 22 | ) 23 | 24 | # Run the selected project 25 | if project == "Image Segmentation": 26 | image_segmentation.run() 27 | elif project == "Customer Profiling": 28 | customer_profiling.run() 29 | 30 | if __name__ == "__main__": 31 | run() -------------------------------------------------------------------------------- /Fuzzy_C_Means/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.22.0 2 | pandas==1.5.3 3 | numpy==1.24.3 4 | scikit-learn==1.2.2 5 | plotly==5.13.1 6 | matplotlib==3.7.1 7 | seaborn==0.12.2 8 | scikit-image==0.20.0 9 | scipy==1.10.1 -------------------------------------------------------------------------------- /Fuzzy_C_Means/screenshots/cust_prof.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Fuzzy_C_Means/screenshots/cust_prof.png -------------------------------------------------------------------------------- /Fuzzy_C_Means/screenshots/imag_seg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Fuzzy_C_Means/screenshots/imag_seg.png -------------------------------------------------------------------------------- /GMM/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the Linear_regression_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | from GMM_projects import ( 8 | customer_segmentation, 9 | image_color_segmentation, 10 | ) 11 | 12 | def run(): 13 | st.title("GMM Projects") 14 | 15 | # Sidebar for project selection 16 | project = st.sidebar.selectbox( 17 | "Select a project", 18 | [ 19 | "Customer Segmentation", 20 | "Image Color Segmentation", 21 | ], 22 | ) 23 | 24 | # Run the selected project 25 | if project == "Customer Segmentation": 26 | customer_segmentation.run() 27 | elif project == "Image Color Segmentation": 28 | image_color_segmentation.run() 29 | 30 | if __name__ == "__main__": 31 | run() 32 | -------------------------------------------------------------------------------- /GMM/readme.md: -------------------------------------------------------------------------------- 1 | # GMM Projects 2 | 3 | This folder contains various projects that utilize Gaussian Mixture Models (GMM) for different applications. Each project is designed to demonstrate the use of GMM in machine learning tasks with interactive visualizations. 4 | 5 | ## Projects 6 | 7 | 1. **Customer Segmentation**: Clusters customers based on their characteristics using GMM. 
Features include: 8 | 9 | **Screenshots:** 10 | ![Customer Segmentation](screenshots/cust.png) 11 | - Interactive parameter tuning 12 | - 2D and 3D visualizations 13 | - Cluster analysis and interpretation 14 | - Model evaluation metrics 15 | 16 | 2. **Image Color Segmentation**: Segments images into color clusters using GMM. Features include: 17 | 18 | **Screenshots:** 19 | ![Image Color Segmentation](screenshots/image_clust.png) 20 | - Interactive image upload 21 | - Color cluster visualization 22 | - 3D color space analysis 23 | - Cluster information and statistics 24 | 25 | ## How to Run 26 | 27 | To run any of the projects, follow these steps: 28 | 29 | 1. Ensure you have the required dependencies installed. You can install them using pip: 30 | 31 | ```bash 32 | pip install streamlit pandas numpy scikit-learn plotly pillow 33 | ``` 34 | 35 | 2. Navigate to the GMM directory in your terminal. 36 | 37 | 3. Run the Streamlit app using the following command: 38 | 39 | ```bash 40 | streamlit run main.py 41 | ``` 42 | 43 | 4. Use the sidebar to select the project you want to run. 44 | 45 | ## Project Structure 46 | 47 | - `main.py`: The main entry point for running the projects. 48 | - `GMM_projects/`: Contains individual project files: 49 | - `customer_segmentation.py`: Customer segmentation project. 50 | - `image_color_segmentation.py`: Image color segmentation project. 51 | 52 | ## Features 53 | 54 | - Interactive parameter tuning 55 | - Real-time visualizations 56 | - Model evaluation metrics 57 | - Detailed cluster analysis 58 | - Support for custom data input 59 | 60 | ## Contributing 61 | 62 | Feel free to contribute to these projects by submitting pull requests or opening issues for any bugs or feature requests. 63 | 64 | ## License 65 | 66 | This project is licensed under the MIT License - see the LICENSE file for details. 
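
The projects above are built around Gaussian Mixture Models. Here is a minimal, self-contained sketch of the typical scikit-learn workflow for the same idea, not a description of the project scripts themselves; the synthetic two-blob data, component count and random seed are illustrative assumptions:

```python
import numpy as np
from sklearn.mixture import GaussianMixture

# Illustrative stand-in for pixel colors or customer features: two loose blobs in 3-D
rng = np.random.default_rng(0)
X = np.vstack([
    rng.normal(loc=0.2, scale=0.05, size=(250, 3)),
    rng.normal(loc=0.8, scale=0.05, size=(250, 3)),
])

gmm = GaussianMixture(n_components=2, covariance_type="full", random_state=42)
labels = gmm.fit_predict(X)        # hard cluster assignment per sample
probs = gmm.predict_proba(X)       # soft responsibilities, one column per component

print("component means:\n", gmm.means_)
print("component weights:", gmm.weights_)
print("BIC:", gmm.bic(X))
```

The soft responsibilities are what distinguish a GMM from plain K-Means: every sample gets a probability under each component rather than a single hard label.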
67 | -------------------------------------------------------------------------------- /GMM/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.22.0 2 | pandas==1.5.3 3 | numpy==1.24.3 4 | scikit-learn==1.2.2 5 | plotly==5.13.1 6 | pillow==9.5.0 7 | opencv-python 8 | -------------------------------------------------------------------------------- /GMM/screenshots/cust.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/GMM/screenshots/cust.png -------------------------------------------------------------------------------- /GMM/screenshots/image_clust.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/GMM/screenshots/image_clust.png -------------------------------------------------------------------------------- /Hierarchical_Clustering/Hierarchical_projects/document_clustering.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.feature_extraction.text import TfidfVectorizer 5 | from sklearn.cluster import AgglomerativeClustering 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | import plotly.express as px 8 | import plotly.graph_objects as go 9 | from scipy.cluster.hierarchy import dendrogram, linkage 10 | import matplotlib.pyplot as plt 11 | import seaborn as sns 12 | 13 | def run(): 14 | st.header("Document Clustering using Hierarchical Clustering") 15 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Hierarchical_Clustering)", unsafe_allow_html=True) 16 | 17 | # Load dataset 18 | uploaded_file = st.file_uploader("Upload a CSV file with text documents", type=["csv"]) 19 | if uploaded_file is not None: 20 | df = pd.read_csv(uploaded_file) 21 | # Assuming the text column is named 'text' 22 | texts = df['text'].values 23 | else: 24 | st.info("Using sample document data") 25 | # Generate sample documents 26 | texts = [ 27 | "Machine learning is a subset of artificial intelligence", 28 | "Deep learning uses neural networks with multiple layers", 29 | "Natural language processing helps computers understand text", 30 | "Computer vision enables machines to interpret images", 31 | "Data science combines statistics and programming", 32 | "Big data refers to large and complex datasets", 33 | "Cloud computing provides on-demand computing resources", 34 | "Internet of Things connects physical devices to the internet", 35 | "Cybersecurity protects systems from digital attacks", 36 | "Blockchain is a distributed ledger technology", 37 | "Quantum computing uses quantum bits for calculations", 38 | "Augmented reality overlays digital content on the real world", 39 | "Virtual reality creates immersive digital environments", 40 | "Robotics combines mechanical and electronic systems", 41 | "5G technology enables faster wireless communication" 42 | ] 43 | 44 | # Text preprocessing and vectorization 45 | vectorizer = TfidfVectorizer(stop_words='english') 46 | X = vectorizer.fit_transform(texts) 47 | 48 | # Hierarchical Clustering parameters 49 | st.subheader("Clustering Parameters") 50 | n_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5) 51 | linkage_method = st.selectbox("Linkage 
Method", ['ward', 'complete', 'average', 'single']) 52 | 53 | # Perform hierarchical clustering 54 | clustering = AgglomerativeClustering( 55 | n_clusters=n_clusters, 56 | linkage=linkage_method 57 | ) 58 | clusters = clustering.fit_predict(X.toarray()) 59 | 60 | # Create dendrogram 61 | st.subheader("Dendrogram") 62 | Z = linkage(X.toarray(), method=linkage_method) 63 | fig, ax = plt.subplots(figsize=(10, 7)) 64 | dendrogram(Z, truncate_mode='level', p=5) 65 | plt.title('Hierarchical Clustering Dendrogram') 66 | plt.xlabel('Sample Index') 67 | plt.ylabel('Distance') 68 | st.pyplot(fig) 69 | 70 | # Document similarity matrix 71 | st.subheader("Document Similarity Matrix") 72 | similarity_matrix = cosine_similarity(X) 73 | fig, ax = plt.subplots(figsize=(10, 8)) 74 | sns.heatmap(similarity_matrix, cmap='viridis') 75 | plt.title('Document Similarity Matrix') 76 | st.pyplot(fig) 77 | 78 | # Cluster visualization using PCA 79 | from sklearn.decomposition import PCA 80 | pca = PCA(n_components=2) 81 | X_pca = pca.fit_transform(X.toarray()) 82 | 83 | # Create DataFrame for visualization 84 | viz_df = pd.DataFrame({ 85 | 'PC1': X_pca[:, 0], 86 | 'PC2': X_pca[:, 1], 87 | 'Cluster': clusters, 88 | 'Document': [f"Doc {i+1}" for i in range(len(texts))] 89 | }) 90 | 91 | # 2D Scatter plot 92 | fig = px.scatter(viz_df, x='PC1', y='PC2', color='Cluster', 93 | hover_data=['Document'], 94 | title='Document Clusters (PCA Visualization)') 95 | st.plotly_chart(fig) 96 | 97 | # Cluster analysis 98 | st.subheader("Cluster Analysis") 99 | for cluster in range(n_clusters): 100 | st.write(f"\nCluster {cluster}:") 101 | cluster_docs = [texts[i] for i in range(len(texts)) if clusters[i] == cluster] 102 | st.write(f"Number of documents: {len(cluster_docs)}") 103 | st.write("Documents in this cluster:") 104 | for doc in cluster_docs: 105 | st.write(f"- {doc}") 106 | 107 | # Cluster statistics 108 | st.subheader("Cluster Statistics") 109 | cluster_sizes = pd.Series(clusters).value_counts().sort_index() 110 | fig = px.bar(x=cluster_sizes.index, y=cluster_sizes.values, 111 | title='Number of Documents per Cluster', 112 | labels={'x': 'Cluster', 'y': 'Number of Documents'}) 113 | st.plotly_chart(fig) 114 | 115 | # Word clouds for each cluster 116 | st.subheader("Word Clouds by Cluster") 117 | from wordcloud import WordCloud 118 | 119 | for cluster in range(n_clusters): 120 | cluster_docs = [texts[i] for i in range(len(texts)) if clusters[i] == cluster] 121 | if cluster_docs: 122 | text = ' '.join(cluster_docs) 123 | wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text) 124 | fig, ax = plt.subplots(figsize=(10, 5)) 125 | ax.imshow(wordcloud, interpolation='bilinear') 126 | ax.axis('off') 127 | plt.title(f'Word Cloud - Cluster {cluster}') 128 | st.pyplot(fig) 129 | 130 | if __name__ == "__main__": 131 | run() -------------------------------------------------------------------------------- /Hierarchical_Clustering/Hierarchical_projects/market_basket_analysis.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.cluster import AgglomerativeClustering 5 | from sklearn.preprocessing import StandardScaler 6 | import plotly.express as px 7 | import plotly.graph_objects as go 8 | from scipy.cluster.hierarchy import dendrogram, linkage 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | 12 | def run(): 13 | st.header("Market Basket Analysis using Hierarchical 
Clustering") 14 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Hierarchical_Clustering)", unsafe_allow_html=True) 15 | 16 | # Load dataset 17 | uploaded_file = st.file_uploader("Upload a CSV file with purchase data", type=["csv"]) 18 | if uploaded_file is not None: 19 | df = pd.read_csv(uploaded_file) 20 | else: 21 | st.info("Using sample market basket data") 22 | # Generate sample market basket data 23 | np.random.seed(42) 24 | n_transactions = 1000 25 | n_items = 20 26 | 27 | # Generate random purchase patterns 28 | data = np.random.binomial(1, 0.3, (n_transactions, n_items)) 29 | 30 | # Create item names 31 | item_names = [f'Item_{i+1}' for i in range(n_items)] 32 | 33 | # Create DataFrame 34 | df = pd.DataFrame(data, columns=item_names) 35 | 36 | # Display data info 37 | st.subheader("Dataset Information") 38 | st.write(f"Number of transactions: {len(df)}") 39 | st.write(f"Number of items: {len(df.columns)}") 40 | st.write("Sample data:") 41 | st.dataframe(df.head()) 42 | 43 | # Hierarchical Clustering parameters 44 | st.subheader("Clustering Parameters") 45 | n_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5) 46 | linkage_method = st.selectbox("Linkage Method", ['ward', 'complete', 'average', 'single']) 47 | 48 | # Prepare data for clustering 49 | X = df.values 50 | scaler = StandardScaler() 51 | X_scaled = scaler.fit_transform(X) 52 | 53 | # Perform hierarchical clustering 54 | clustering = AgglomerativeClustering( 55 | n_clusters=n_clusters, 56 | linkage=linkage_method 57 | ) 58 | clusters = clustering.fit_predict(X_scaled) 59 | 60 | # Create dendrogram 61 | st.subheader("Dendrogram") 62 | Z = linkage(X_scaled, method=linkage_method) 63 | fig, ax = plt.subplots(figsize=(10, 7)) 64 | dendrogram(Z, truncate_mode='level', p=5) 65 | plt.title('Hierarchical Clustering Dendrogram') 66 | plt.xlabel('Sample Index') 67 | plt.ylabel('Distance') 68 | st.pyplot(fig) 69 | 70 | # Item correlation matrix 71 | st.subheader("Item Correlation Matrix") 72 | correlation_matrix = df.corr() 73 | fig, ax = plt.subplots(figsize=(12, 10)) 74 | sns.heatmap(correlation_matrix, cmap='coolwarm', center=0) 75 | plt.title('Item Correlation Matrix') 76 | st.pyplot(fig) 77 | 78 | # Cluster visualization using PCA 79 | from sklearn.decomposition import PCA 80 | pca = PCA(n_components=2) 81 | X_pca = pca.fit_transform(X_scaled) 82 | 83 | # Create DataFrame for visualization 84 | viz_df = pd.DataFrame({ 85 | 'PC1': X_pca[:, 0], 86 | 'PC2': X_pca[:, 1], 87 | 'Cluster': clusters 88 | }) 89 | 90 | # 2D Scatter plot 91 | fig = px.scatter(viz_df, x='PC1', y='PC2', color='Cluster', 92 | title='Transaction Clusters (PCA Visualization)') 93 | st.plotly_chart(fig) 94 | 95 | # Cluster analysis 96 | st.subheader("Cluster Analysis") 97 | for cluster in range(n_clusters): 98 | st.write(f"\nCluster {cluster}:") 99 | cluster_data = df[clusters == cluster] 100 | st.write(f"Number of transactions: {len(cluster_data)}") 101 | 102 | # Calculate item frequencies in this cluster 103 | item_freq = cluster_data.mean().sort_values(ascending=False) 104 | top_items = item_freq[item_freq > 0.1] # Show items that appear in more than 10% of transactions 105 | 106 | st.write("Top items in this cluster:") 107 | for item, freq in top_items.items(): 108 | st.write(f"- {item}: {freq:.1%} of transactions") 109 | 110 | # Cluster statistics 111 | st.subheader("Cluster Statistics") 112 | cluster_sizes = pd.Series(clusters).value_counts().sort_index() 113 | fig = 
px.bar(x=cluster_sizes.index, y=cluster_sizes.values, 114 | title='Number of Transactions per Cluster', 115 | labels={'x': 'Cluster', 'y': 'Number of Transactions'}) 116 | st.plotly_chart(fig) 117 | 118 | # Item frequency by cluster 119 | st.subheader("Item Frequency by Cluster") 120 | cluster_item_freq = df.groupby(clusters).mean() 121 | 122 | # Create heatmap 123 | fig, ax = plt.subplots(figsize=(12, 8)) 124 | sns.heatmap(cluster_item_freq, cmap='YlOrRd') 125 | plt.title('Item Frequency by Cluster') 126 | st.pyplot(fig) 127 | 128 | # Association rules 129 | st.subheader("Association Rules") 130 | from mlxtend.frequent_patterns import apriori, association_rules 131 | 132 | # Generate frequent itemsets 133 | frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True) 134 | 135 | # Generate rules 136 | rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1) 137 | 138 | if not rules.empty: 139 | st.write("Top Association Rules:") 140 | # Convert frozensets to strings for display 141 | display_rules = rules.copy() 142 | display_rules['antecedents'] = display_rules['antecedents'].apply(lambda x: ', '.join(list(x))) 143 | display_rules['consequents'] = display_rules['consequents'].apply(lambda x: ', '.join(list(x))) 144 | st.dataframe(display_rules.head()) 145 | 146 | # Create a copy of rules for visualization with string versions of frozensets 147 | viz_rules = rules.copy() 148 | viz_rules['antecedents_str'] = viz_rules['antecedents'].apply(lambda x: ', '.join(list(x))) 149 | viz_rules['consequents_str'] = viz_rules['consequents'].apply(lambda x: ', '.join(list(x))) 150 | 151 | # Visualize rules 152 | fig = px.scatter(viz_rules, x='support', y='confidence', 153 | size='lift', color='lift', 154 | hover_data=['antecedents_str', 'consequents_str'], 155 | title='Association Rules Visualization') 156 | st.plotly_chart(fig) 157 | 158 | if __name__ == "__main__": 159 | run() -------------------------------------------------------------------------------- /Hierarchical_Clustering/README.md: -------------------------------------------------------------------------------- 1 | # Hierarchical Clustering Projects 2 | 3 | This folder contains various projects that utilize Hierarchical Clustering for different applications. Each project is designed to demonstrate the use of hierarchical clustering in machine learning tasks with interactive visualizations. 4 | 5 | ## Projects 6 | 7 | 1. **Document Clustering**: Clusters text documents based on their content using hierarchical clustering. Features include: 8 | 9 | **Screenshots:** 10 | ![Document Clustering](screenshots/doc_clust.png) 11 | - Interactive parameter tuning 12 | - Dendrogram visualization 13 | - Document similarity matrix 14 | - Word cloud visualization 15 | - Cluster analysis and interpretation 16 | 17 | 2. **Market Basket Analysis**: Analyzes shopping patterns using hierarchical clustering. Features include: 18 | 19 | **Screenshots:** 20 | ![Market Basket Analysis](screenshots/market_basket.png) 21 | - Transaction clustering 22 | - Item correlation analysis 23 | - Association rules mining 24 | - Interactive visualizations 25 | - Cluster analysis and interpretation 26 | 27 | ## How to Run 28 | 29 | To run any of the projects, follow these steps: 30 | 31 | 1. Ensure you have the required dependencies installed. You can install them using pip: 32 | 33 | ```bash 34 | pip install streamlit pandas numpy scikit-learn plotly matplotlib seaborn wordcloud mlxtend 35 | ``` 36 | 37 | 2. 
Navigate to the Hierarchical directory in your terminal. 38 | 39 | 3. Run the Streamlit app using the following command: 40 | 41 | ```bash 42 | streamlit run main.py 43 | ``` 44 | 45 | 4. Use the sidebar to select the project you want to run. 46 | 47 | ## Project Structure 48 | 49 | - `main.py`: The main entry point for running the projects. 50 | - `Hierarchical_projects/`: Contains individual project files: 51 | - `document_clustering.py`: Document clustering project. 52 | - `market_basket_analysis.py`: Market basket analysis project. 53 | 54 | ## Features 55 | 56 | - Interactive parameter tuning 57 | - Real-time visualizations 58 | - Detailed cluster analysis 59 | - Support for custom data input 60 | - Rich visualization tools including: 61 | - Dendrograms 62 | - Heatmaps 63 | - Scatter plots 64 | - Word clouds 65 | - Association rule visualizations 66 | 67 | ## Contributing 68 | 69 | Feel free to contribute to these projects by submitting pull requests or opening issues for any bugs or feature requests. 70 | 71 | ## License 72 | 73 | This project is licensed under the MIT License - see the LICENSE file for details. -------------------------------------------------------------------------------- /Hierarchical_Clustering/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the Decision_Trees_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | from Hierarchical_projects import ( 8 | document_clustering, 9 | market_basket_analysis, 10 | 11 | ) 12 | 13 | def run(): 14 | st.title("Clustering Projects") 15 | 16 | # Sidebar for project selection 17 | project = st.sidebar.selectbox( 18 | "Select a project", 19 | [ 20 | "Document Clustering", 21 | "Market Basket Analysis", 22 | 23 | ], 24 | ) 25 | 26 | # Run the selected project 27 | if project == "Document Clustering": 28 | document_clustering.run() 29 | elif project == "Market Basket Analysis": 30 | market_basket_analysis.run() 31 | 32 | 33 | if __name__ == "__main__": 34 | run() -------------------------------------------------------------------------------- /Hierarchical_Clustering/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.22.0 2 | pandas==1.5.3 3 | numpy==1.24.3 4 | scikit-learn==1.2.2 5 | plotly==5.13.1 6 | matplotlib==3.7.1 7 | seaborn==0.12.2 8 | wordcloud==1.9.2 9 | mlxtend==0.23.1 -------------------------------------------------------------------------------- /Hierarchical_Clustering/screenshots/doc_clust.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Hierarchical_Clustering/screenshots/doc_clust.png -------------------------------------------------------------------------------- /Hierarchical_Clustering/screenshots/market_basket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Hierarchical_Clustering/screenshots/market_basket.png -------------------------------------------------------------------------------- /K-Means/K_Means_projects/customer_segmentation.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | import plotly.express 
as px 5 | import plotly.graph_objects as go 6 | from sklearn.cluster import KMeans 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.decomposition import PCA 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | 12 | def run(): 13 | st.header("Customer Segmentation using K-Means Clustering") 14 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/K-Means)", unsafe_allow_html=True) 15 | 16 | # Load or generate dataset 17 | uploaded_file = st.file_uploader("Upload a CSV file with customer data", type=["csv"]) 18 | if uploaded_file is not None: 19 | df = pd.read_csv(uploaded_file) 20 | else: 21 | st.info("Using sample customer data") 22 | # Generate sample customer data 23 | np.random.seed(42) 24 | n_customers = 1000 25 | 26 | data = { 27 | 'Customer_ID': range(1, n_customers + 1), 28 | 'Age': np.random.randint(18, 70, n_customers), 29 | 'Annual_Income': np.random.randint(20000, 150000, n_customers), 30 | 'Spending_Score': np.random.randint(1, 100, n_customers), 31 | 'Purchase_Frequency': np.random.randint(1, 50, n_customers), 32 | 'Average_Order_Value': np.random.randint(50, 500, n_customers), 33 | 'Days_Since_Last_Purchase': np.random.randint(0, 365, n_customers) 34 | } 35 | df = pd.DataFrame(data) 36 | 37 | # Display data info 38 | st.subheader("Dataset Information") 39 | st.write(f"Number of customers: {len(df)}") 40 | st.write("Sample data:") 41 | st.dataframe(df.head()) 42 | 43 | # Feature selection 44 | st.subheader("Feature Selection") 45 | features = ['Age', 'Annual_Income', 'Spending_Score', 'Purchase_Frequency', 46 | 'Average_Order_Value', 'Days_Since_Last_Purchase'] 47 | selected_features = st.multiselect("Select features for clustering", features, 48 | default=['Annual_Income', 'Spending_Score']) 49 | 50 | if len(selected_features) >= 2: 51 | # Prepare data 52 | X = df[selected_features] 53 | scaler = StandardScaler() 54 | X_scaled = scaler.fit_transform(X) 55 | 56 | # K-Means parameters 57 | st.subheader("Clustering Parameters") 58 | n_clusters = st.slider("Number of Clusters", min_value=2, max_value=6, value=4) 59 | random_state = st.slider("Random State", min_value=0, max_value=100, value=42) 60 | 61 | # Apply K-Means 62 | kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10) 63 | df['Segment'] = kmeans.fit_predict(X_scaled) 64 | 65 | # Label segments based on average income and spending score 66 | cluster_means = df.groupby('Segment')[['Annual_Income', 'Spending_Score']].mean() 67 | segment_labels = { 68 | 0: 'High Value', 69 | 1: 'Low Value', 70 | 2: 'High Potential', 71 | 3: 'At Risk' 72 | } 73 | df['Segment'] = df['Segment'].map(segment_labels) 74 | 75 | # Visualize clusters using PCA 76 | pca = PCA(n_components=2) 77 | X_pca = pca.fit_transform(X_scaled) 78 | df['PCA1'] = X_pca[:, 0] 79 | df['PCA2'] = X_pca[:, 1] 80 | 81 | # PCA Scatter plot 82 | fig = px.scatter(df, x='PCA1', y='PCA2', color='Segment', 83 | hover_data=selected_features, 84 | title='Customer Segments (PCA Visualization)') 85 | st.plotly_chart(fig) 86 | 87 | # Cluster Analysis 88 | st.subheader("Segment Analysis") 89 | for segment in df['Segment'].unique(): 90 | st.write(f"\n{segment} Customers:") 91 | segment_data = df[df['Segment'] == segment] 92 | st.write(f"Number of customers: {len(segment_data)}") 93 | 94 | # Display segment statistics 95 | stats = segment_data[selected_features].describe() 96 | st.write("Segment Statistics:") 97 | st.dataframe(stats) 98 | 99 | # Feature importance visualization 100 | 
st.subheader("Feature Importance by Segment") 101 | segment_means = df.groupby('Segment')[selected_features].mean() 102 | fig = px.bar(segment_means, title='Average Feature Values by Segment', 103 | labels={'value': 'Average Value', 'variable': 'Feature'}) 104 | st.plotly_chart(fig) 105 | 106 | # Correlation heatmap 107 | st.subheader("Feature Correlation Matrix") 108 | correlation_matrix = df[selected_features].corr() 109 | fig, ax = plt.subplots(figsize=(10, 8)) 110 | sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0) 111 | st.pyplot(fig) 112 | 113 | # Segment distribution 114 | st.subheader("Segment Distribution") 115 | segment_counts = df['Segment'].value_counts() 116 | fig = px.pie(values=segment_counts.values, names=segment_counts.index, 117 | title='Customer Segment Distribution') 118 | st.plotly_chart(fig) 119 | 120 | # Interactive prediction 121 | st.subheader("Predict Segment for New Customer") 122 | input_values = {} 123 | for feature in selected_features: 124 | if feature == 'Age': 125 | input_values[feature] = st.number_input(feature, min_value=18, max_value=70, value=35) 126 | elif feature == 'Annual_Income': 127 | input_values[feature] = st.number_input(feature, min_value=20000, max_value=150000, value=50000) 128 | elif feature == 'Spending_Score': 129 | input_values[feature] = st.number_input(feature, min_value=1, max_value=100, value=50) 130 | elif feature == 'Purchase_Frequency': 131 | input_values[feature] = st.number_input(feature, min_value=1, max_value=50, value=10) 132 | elif feature == 'Average_Order_Value': 133 | input_values[feature] = st.number_input(feature, min_value=50, max_value=500, value=100) 134 | elif feature == 'Days_Since_Last_Purchase': 135 | input_values[feature] = st.number_input(feature, min_value=0, max_value=365, value=30) 136 | 137 | if st.button("Predict Customer Segment"): 138 | # Create input array with only the selected features 139 | new_customer = np.array([[input_values[feature] for feature in selected_features]]) 140 | new_customer_scaled = scaler.transform(new_customer) 141 | prediction = kmeans.predict(new_customer_scaled)[0] 142 | segment = segment_labels[prediction] 143 | st.success(f"Predicted Customer Segment: {segment}") 144 | 145 | if __name__ == "__main__": 146 | run() -------------------------------------------------------------------------------- /K-Means/K_Means_projects/loan_approval.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | import plotly.express as px 5 | import plotly.graph_objects as go 6 | from sklearn.cluster import KMeans 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.decomposition import PCA 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | 12 | def run(): 13 | st.header("Loan Approval Analysis using K-Means Clustering") 14 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/K-Means)", unsafe_allow_html=True) 15 | 16 | # Load or generate dataset 17 | uploaded_file = st.file_uploader("Upload a CSV file with loan data", type=["csv"]) 18 | if uploaded_file is not None: 19 | df = pd.read_csv(uploaded_file) 20 | else: 21 | st.info("Using sample loan data") 22 | # Generate sample loan data 23 | np.random.seed(42) 24 | n_applicants = 1000 25 | 26 | data = { 27 | 'Applicant_ID': range(1, n_applicants + 1), 28 | 'Annual_Income': np.random.randint(30000, 120000, n_applicants), 29 | 'Credit_Score': 
np.random.randint(300, 850, n_applicants), 30 | 'Loan_Amount': np.random.randint(5000, 50000, n_applicants), 31 | 'Debt_to_Income_Ratio': np.random.uniform(0.1, 0.5, n_applicants), 32 | 'Employment_Years': np.random.randint(0, 30, n_applicants), 33 | 'Age': np.random.randint(18, 65, n_applicants) 34 | } 35 | df = pd.DataFrame(data) 36 | 37 | # Display data info 38 | st.subheader("Dataset Information") 39 | st.write(f"Number of applicants: {len(df)}") 40 | st.write("Sample data:") 41 | st.dataframe(df.head()) 42 | 43 | # Feature selection 44 | st.subheader("Feature Selection") 45 | features = ['Annual_Income', 'Credit_Score', 'Loan_Amount', 'Debt_to_Income_Ratio', 46 | 'Employment_Years', 'Age'] 47 | selected_features = st.multiselect("Select features for clustering", features, 48 | default=['Annual_Income', 'Credit_Score', 'Loan_Amount', 'Debt_to_Income_Ratio']) 49 | 50 | if len(selected_features) >= 2: 51 | # Prepare data 52 | X = df[selected_features] 53 | scaler = StandardScaler() 54 | X_scaled = scaler.fit_transform(X) 55 | 56 | # K-Means parameters 57 | st.subheader("Clustering Parameters") 58 | n_clusters = st.slider("Number of Clusters", min_value=2, max_value=6, value=3) 59 | random_state = st.slider("Random State", min_value=0, max_value=100, value=42) 60 | 61 | # Apply K-Means 62 | kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10) 63 | df['Risk_Category'] = kmeans.fit_predict(X_scaled) 64 | 65 | # Label clusters based on average credit score and income 66 | cluster_means = df.groupby('Risk_Category')[['Credit_Score', 'Annual_Income']].mean() 67 | risk_order = cluster_means['Credit_Score'].rank(ascending=False).astype(int) - 1 68 | risk_labels = {i: f"{['Low', 'Medium', 'High'][j]} Risk" 69 | for i, j in risk_order.items()} 70 | df['Risk_Category'] = df['Risk_Category'].map(risk_labels) 71 | 72 | # Visualize clusters using PCA 73 | pca = PCA(n_components=2) 74 | X_pca = pca.fit_transform(X_scaled) 75 | df['PCA1'] = X_pca[:, 0] 76 | df['PCA2'] = X_pca[:, 1] 77 | 78 | # PCA Scatter plot 79 | fig = px.scatter(df, x='PCA1', y='PCA2', color='Risk_Category', 80 | hover_data=selected_features, 81 | title='Loan Approval Clusters (PCA Visualization)') 82 | st.plotly_chart(fig) 83 | 84 | # Cluster Analysis 85 | st.subheader("Cluster Analysis") 86 | for risk in df['Risk_Category'].unique(): 87 | st.write(f"\n{risk} Applicants:") 88 | cluster_data = df[df['Risk_Category'] == risk] 89 | st.write(f"Number of applicants: {len(cluster_data)}") 90 | 91 | # Display cluster statistics 92 | stats = cluster_data[selected_features].describe() 93 | st.write("Cluster Statistics:") 94 | st.dataframe(stats) 95 | 96 | # Feature importance visualization 97 | st.subheader("Feature Importance by Cluster") 98 | cluster_means = df.groupby('Risk_Category')[selected_features].mean() 99 | fig = px.bar(cluster_means, title='Average Feature Values by Risk Category', 100 | labels={'value': 'Average Value', 'variable': 'Feature'}) 101 | st.plotly_chart(fig) 102 | 103 | # Correlation heatmap 104 | st.subheader("Feature Correlation Matrix") 105 | correlation_matrix = df[selected_features].corr() 106 | fig, ax = plt.subplots(figsize=(10, 8)) 107 | sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0) 108 | st.pyplot(fig) 109 | 110 | # Interactive prediction 111 | st.subheader("Predict Risk Category for New Applicant") 112 | col1, col2 = st.columns(2) 113 | with col1: 114 | income = st.number_input("Annual Income", min_value=30000, max_value=120000, value=50000) 115 | 
credit_score = st.number_input("Credit Score", min_value=300, max_value=850, value=700) 116 | with col2: 117 | loan_amount = st.number_input("Loan Amount", min_value=5000, max_value=50000, value=20000) 118 | dti_ratio = st.number_input("Debt-to-Income Ratio", min_value=0.1, max_value=0.5, value=0.3) 119 | 120 | if st.button("Predict Risk Category"): 121 | new_applicant = np.array([[income, credit_score, loan_amount, dti_ratio]]) 122 | new_applicant_scaled = scaler.transform(new_applicant) 123 | prediction = kmeans.predict(new_applicant_scaled)[0] 124 | risk_category = risk_labels[prediction] 125 | st.success(f"Predicted Risk Category: {risk_category}") 126 | 127 | if __name__ == "__main__": 128 | run() -------------------------------------------------------------------------------- /K-Means/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the Decision_Trees_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | from K_Means_projects import ( 8 | loan_approval, 9 | customer_segmentation 10 | ) 11 | 12 | def run(): 13 | st.title("K-Means Clustering Projects") 14 | 15 | # Sidebar for project selection 16 | project = st.sidebar.selectbox( 17 | "Select a project", 18 | [ 19 | "Loan Approval", 20 | "Customer Segmentation" 21 | ], 22 | ) 23 | 24 | # Run the selected project 25 | if project == "Loan Approval": 26 | loan_approval.run() 27 | elif project == "Customer Segmentation": 28 | customer_segmentation.run() 29 | 30 | if __name__ == "__main__": 31 | run() 32 | -------------------------------------------------------------------------------- /K-Means/readme.md: -------------------------------------------------------------------------------- 1 | # K-Means Projects 2 | 3 | This repository contains various K-Means clustering projects implemented in Python. Each project demonstrates the application of K-Means clustering to solve real-world problems using datasets. 4 | 5 | ## Project Structure 6 | 7 | ``` 8 | K-Means/ 9 | ├── main.py 10 | ├── requirements.txt 11 | ├── K_Means_projects/ 12 | │ ├── customer_segmentation.py 13 | │ ├── loan_approval.py 14 | ``` 15 | 16 | ### Key Files 17 | - **`main.py`**: The main entry point for running the Streamlit app. 18 | - **`requirements.txt`**: Contains the dependencies required to run the project. 19 | - **`K_Means_projects/`**: Contains individual project scripts. 20 | 21 | ## Projects Included 22 | 23 | 1. **Customer Segmentation** 24 | Segments customers into different groups based on their behavior and characteristics using K-Means clustering. 25 | 26 | **Screenshots:** 27 | ![Customer Segmentation](screenshots/cust_seg.png) 28 | 29 | - Interactive parameter tuning 30 | - Cluster visualization 31 | - Customer group analysis 32 | 33 | 2. **Loan Approval Clustering** 34 | Groups loan applications into clusters based on various features using K-Means clustering. 35 | 36 | **Screenshots:** 37 | ![Loan Approval Clustering](screenshots/loan.png) 38 | 39 | - Risk assessment 40 | - Cluster analysis 41 | - Interactive visualization 42 | 43 | ## How to Run 44 | 45 | 1. Clone the repository: 46 | ```bash 47 | git clone https://github.com/benasphy/ML_projects.git 48 | cd K-Means 49 | ``` 50 | 51 | 2. Install dependencies: 52 | ```bash 53 | pip install -r requirements.txt 54 | ``` 55 | 56 | 3. Run the Streamlit app: 57 | ```bash 58 | streamlit run main.py 59 | ``` 60 | 61 | 4. 
Select a project from the sidebar to explore its functionality. 62 | 63 | ## Requirements 64 | 65 | The project requires the following Python libraries: 66 | - `streamlit` 67 | - `numpy` 68 | - `pandas` 69 | - `scikit-learn` 70 | - `matplotlib` 71 | - `plotly` 72 | 73 | ## Datasets 74 | 75 | - **`customer_data.csv`**: Contains customer behavior data for segmentation. 76 | - **`loan_data.csv`**: Contains loan application data for clustering. 77 | 78 | ## License 79 | 80 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. 81 | 82 | ## Acknowledgments 83 | 84 | - Datasets used in this project are sourced from publicly available repositories. 85 | - Special thanks to the contributors of the Python libraries used in this project. 86 | 87 | --- 88 | Feel free to contribute to this repository by submitting issues or pull requests. 89 | -------------------------------------------------------------------------------- /K-Means/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.22.0 2 | pandas==1.5.3 3 | numpy==1.24.3 4 | scikit-learn==1.2.2 5 | plotly==5.13.1 6 | matplotlib==3.7.1 7 | seaborn==0.12.2 8 | scipy 9 | -------------------------------------------------------------------------------- /K-Means/screenshots/cust_seg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/K-Means/screenshots/cust_seg.png -------------------------------------------------------------------------------- /K-Means/screenshots/loan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/K-Means/screenshots/loan.png -------------------------------------------------------------------------------- /KNN/KNN_projects/TShirt_size.csv: -------------------------------------------------------------------------------- 1 | Height (in cms),Weight (in kgs),T Shirt Size 2 | 158,58,M 3 | 158,59,M 4 | 158,63,M 5 | 160,59,M 6 | 160,60,M 7 | 163,60,M 8 | 163,61,M 9 | 160,64,L 10 | 163,64,L 11 | 165,61,L 12 | 165,62,L 13 | 165,65,L 14 | 168,62,L 15 | 168,63,L 16 | 168,66,L 17 | 170,63,L 18 | 170,64,L 19 | 170,68,L 20 | -------------------------------------------------------------------------------- /KNN/KNN_projects/netflix_titles.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/KNN/KNN_projects/netflix_titles.csv -------------------------------------------------------------------------------- /KNN/KNN_projects/tshirt_size_prediction.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | from sklearn.neighbors import KNeighborsClassifier 6 | from sklearn.preprocessing import LabelEncoder, StandardScaler 7 | from sklearn.model_selection import train_test_split, cross_val_score 8 | from sklearn.metrics import classification_report, confusion_matrix 9 | import plotly.express as px 10 | import plotly.graph_objects as go 11 | import seaborn as sns 12 | import matplotlib.pyplot as plt 13 | 14 | def run(): 15 | st.header("T-Shirt Size Prediction using KNN") 16 | st.markdown("[View this project on 
GitHub](https://github.com/benasphy/ML_projects/tree/main/KNN)", unsafe_allow_html=True) 17 | 18 | # Load dataset 19 | uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"]) 20 | if uploaded_file is not None: 21 | df = pd.read_csv(uploaded_file) 22 | else: 23 | st.info("Using default dataset: TShirt_size.csv") 24 | df = pd.read_csv(os.path.join(os.path.dirname(__file__), "TShirt_size.csv")) 25 | 26 | # Display dataset info 27 | st.subheader("Dataset Overview") 28 | col1, col2 = st.columns(2) 29 | with col1: 30 | st.write("Dataset Shape:", df.shape) 31 | st.write("Number of Samples:", len(df)) 32 | with col2: 33 | size_dist = df["T Shirt Size"].value_counts() 34 | fig = px.pie(values=size_dist.values, names=size_dist.index, 35 | title='T-Shirt Size Distribution') 36 | st.plotly_chart(fig) 37 | 38 | # Data Analysis 39 | st.subheader("Data Analysis") 40 | 41 | # Height and Weight Distribution 42 | col1, col2 = st.columns(2) 43 | with col1: 44 | fig = px.box(df, x="T Shirt Size", y="Height (in cms)", 45 | title='Height Distribution by Size') 46 | st.plotly_chart(fig) 47 | 48 | with col2: 49 | fig = px.box(df, x="T Shirt Size", y="Weight (in kgs)", 50 | title='Weight Distribution by Size') 51 | st.plotly_chart(fig) 52 | 53 | # Scatter plot with size distribution 54 | fig = px.scatter(df, x="Height (in cms)", y="Weight (in kgs)", 55 | color="T Shirt Size", 56 | title='Height vs Weight by T-Shirt Size') 57 | st.plotly_chart(fig) 58 | 59 | # Data preprocessing 60 | encoder = LabelEncoder() 61 | df["T Shirt Size"] = encoder.fit_transform(df["T Shirt Size"]) 62 | 63 | X = df[["Height (in cms)", "Weight (in kgs)"]] 64 | y = df["T Shirt Size"] 65 | 66 | # Split data 67 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 68 | 69 | # Scaling 70 | scaler = StandardScaler() 71 | scaler.fit(X_train) 72 | X_train = scaler.transform(X_train) 73 | X_test = scaler.transform(X_test) 74 | 75 | # Train model 76 | model = KNeighborsClassifier(n_neighbors=3, metric="manhattan") 77 | model.fit(X_train, y_train) 78 | 79 | # Model evaluation 80 | st.subheader("Model Performance") 81 | y_pred = model.predict(X_test) 82 | 83 | # Display metrics 84 | col1, col2, col3 = st.columns(3) 85 | with col1: 86 | scores = cross_val_score(model, X, y, cv=5, scoring='precision_weighted') 87 | st.metric("Average Precision", f"{scores.mean():.2%}") 88 | with col2: 89 | st.metric("Standard Deviation", f"{scores.std():.2%}") 90 | with col3: 91 | accuracy = (y_pred == y_test).mean() 92 | st.metric("Test Accuracy", f"{accuracy:.2%}") 93 | 94 | # Confusion Matrix 95 | st.subheader("Confusion Matrix") 96 | cm = confusion_matrix(y_test, y_pred) 97 | fig = px.imshow(cm, 98 | labels=dict(x="Predicted", y="Actual", color="Count"), 99 | x=['Medium', 'Large'], 100 | y=['Medium', 'Large'], 101 | text_auto=True, 102 | aspect="auto") 103 | st.plotly_chart(fig) 104 | 105 | # KNN Visualization 106 | st.subheader("KNN Decision Boundaries") 107 | 108 | # Create a mesh grid 109 | h_min, h_max = X["Height (in cms)"].min() - 1, X["Height (in cms)"].max() + 1 110 | w_min, w_max = X["Weight (in kgs)"].min() - 1, X["Weight (in kgs)"].max() + 1 111 | h_grid = np.arange(h_min, h_max, 0.5) 112 | w_grid = np.arange(w_min, w_max, 0.5) 113 | hh, ww = np.meshgrid(h_grid, w_grid) 114 | 115 | # Predict for each point in the grid 116 | grid_points = np.c_[hh.ravel(), ww.ravel()] 117 | grid_points_scaled = scaler.transform(grid_points) 118 | grid_predictions = model.predict(grid_points_scaled) 119 | 120 | # Plot 
decision boundaries 121 | fig = px.scatter(x=grid_points[:, 0], y=grid_points[:, 1], 122 | color=grid_predictions, 123 | title='KNN Decision Boundaries') 124 | fig.add_scatter(x=X["Height (in cms)"], y=X["Weight (in kgs)"], 125 | mode='markers', 126 | marker=dict(color=y, symbol='circle'), 127 | name='Training Data') 128 | st.plotly_chart(fig) 129 | 130 | # Prediction interface 131 | st.subheader("Predict T-Shirt Size") 132 | col1, col2 = st.columns(2) 133 | with col1: 134 | height = st.number_input("Height (in cms):", min_value=140, max_value=200, value=170) 135 | with col2: 136 | weight = st.number_input("Weight (in kgs):", min_value=40, max_value=120, value=70) 137 | 138 | if st.button("Predict Size"): 139 | new_sample = np.array([height, weight]).reshape(1, -1) 140 | new_sample_scaled = scaler.transform(new_sample) 141 | prediction = model.predict(new_sample_scaled)[0] 142 | probabilities = model.predict_proba(new_sample_scaled)[0] 143 | 144 | size_mapping = {0: "Large", 1: "Medium"} 145 | predicted_size = size_mapping[prediction] 146 | 147 | # Display prediction 148 | col1, col2 = st.columns(2) 149 | with col1: 150 | st.metric("Predicted Size", predicted_size) 151 | with col2: 152 | st.metric("Confidence", f"{max(probabilities):.2%}") 153 | 154 | # Visualize prediction probabilities 155 | fig = go.Figure(data=[ 156 | go.Bar(x=['Large', 'Medium'], 157 | y=probabilities, 158 | text=[f'{p:.2%}' for p in probabilities], 159 | textposition='auto', 160 | ) 161 | ]) 162 | fig.update_layout(title='Prediction Probabilities', 163 | xaxis_title='Size', 164 | yaxis_title='Probability') 165 | st.plotly_chart(fig) 166 | 167 | # Show nearest neighbors 168 | st.subheader("Nearest Neighbors") 169 | distances, indices = model.kneighbors(new_sample_scaled) 170 | 171 | neighbors_df = pd.DataFrame({ 172 | 'Height (cm)': X.iloc[indices[0]]["Height (in cms)"], 173 | 'Weight (kg)': X.iloc[indices[0]]["Weight (in kgs)"], 174 | 'Size': [size_mapping[y.iloc[i]] for i in indices[0]], 175 | 'Distance': distances[0] 176 | }) 177 | 178 | fig = px.scatter(neighbors_df, x='Height (cm)', y='Weight (kg)', 179 | color='Size', 180 | size='Distance', 181 | title='Nearest Neighbors', 182 | hover_data=['Distance']) 183 | fig.add_scatter(x=[height], y=[weight], 184 | mode='markers', 185 | marker=dict(color='red', symbol='star', size=15), 186 | name='Your Measurements') 187 | st.plotly_chart(fig) 188 | 189 | if __name__ == "__main__": 190 | run() -------------------------------------------------------------------------------- /KNN/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the KNN_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | 8 | from KNN_projects import ( 9 | movie_recommendation, 10 | tshirt_size_prediction, 11 | ) 12 | 13 | def run(): 14 | st.title("KNN Projects") 15 | 16 | # Sidebar for project selection 17 | project = st.sidebar.selectbox( 18 | "Select a project", 19 | [ 20 | "Movie Recommendation", 21 | "T-Shirt Size Prediction", 22 | ], 23 | ) 24 | 25 | # Run the selected project 26 | if project == "Movie Recommendation": 27 | movie_recommendation.run() 28 | elif project == "T-Shirt Size Prediction": 29 | tshirt_size_prediction.run() 30 | 31 | if __name__ == "__main__": 32 | run() 33 | -------------------------------------------------------------------------------- /KNN/readme.md: 
-------------------------------------------------------------------------------- 1 | # KNN Projects 2 | 3 | This folder contains various projects that utilize the K-Nearest Neighbors (KNN) algorithm for different applications. Each project is designed to demonstrate the use of KNN in machine learning tasks. 4 | 5 | ## Projects 6 | 7 | 1. **Movie Recommendation System**: Recommends similar movies based on content features using KNN. 8 | 9 | **Screenshots:** 10 | ![Movie Recommendation 1](screenshots/movie1.png) 11 | ![Movie Recommendation 2](screenshots/movie2.png) 12 | 2. **T-Shirt Size Prediction**: Predicts T-shirt sizes based on height and weight measurements using KNN. 13 | 14 | **Screenshots:** 15 | ![T-Shirt Size Prediction](screenshots/t-shirt.png) 16 | 17 | ## How to Run 18 | 19 | To run any of the projects, follow these steps: 20 | 21 | 1. Ensure you have the required dependencies installed. You can install them using pip: 22 | 23 | ```bash 24 | pip install streamlit pandas numpy scikit-learn 25 | ``` 26 | 27 | 2. Navigate to the KNN directory in your terminal. 28 | 29 | 3. Run the Streamlit app using the following command: 30 | 31 | ```bash 32 | streamlit run main.py 33 | ``` 34 | 35 | 4. Use the sidebar to select the project you want to run. 36 | 37 | ## Project Structure 38 | 39 | - `main.py`: The main entry point for running the projects. 40 | - `KNN_projects/`: Contains individual project files: 41 | - `movie_recommendation.py`: Movie recommendation system project. 42 | - `tshirt_size_prediction.py`: T-shirt size prediction project. 43 | 44 | ## Data 45 | 46 | Each project uses its own dataset, which is either uploaded by the user or loaded from a default CSV file located in the `KNN_projects/` directory. 47 | 48 | ## Contributing 49 | 50 | Feel free to contribute to these projects by submitting pull requests or opening issues for any bugs or feature requests. 51 | 52 | ## License 53 | 54 | This project is licensed under the MIT License - see the LICENSE file for details. 
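## Worked Example

The T-shirt project boils down to a handful of scikit-learn calls. The snippet below is a minimal, self-contained sketch of the same pipeline; it uses a small hypothetical sample instead of `TShirt_size.csv`, so the numbers are illustrative only, but it mirrors the steps in `tshirt_size_prediction.py` (label encoding, feature scaling, and a Manhattan-distance KNN):

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Hypothetical measurements: height (cm), weight (kg), and the labelled size.
X = np.array([[158, 58], [160, 60], [163, 61], [170, 68], [175, 76], [180, 80]])
sizes = np.array(["M", "M", "M", "L", "L", "L"])

encoder = LabelEncoder()
y = encoder.fit_transform(sizes)      # alphabetical order: "L" -> 0, "M" -> 1

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)    # put height and weight on comparable scales

model = KNeighborsClassifier(n_neighbors=3, metric="manhattan")
model.fit(X_scaled, y)

# Predict the size for a new person (168 cm, 65 kg).
new_sample = scaler.transform([[168, 65]])
predicted = encoder.inverse_transform(model.predict(new_sample))[0]
print("Predicted size:", predicted)
```

In the app the scaler is fit on the training split only and then reused for the test split and for user input, which is the pattern to follow whenever the dataset is large enough to hold out a test set.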
55 | -------------------------------------------------------------------------------- /KNN/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.22.0 2 | pandas==1.5.3 3 | numpy==1.24.3 4 | scikit-learn==1.2.2 5 | plotly 6 | scipy 7 | seaborn 8 | wordcloud 9 | -------------------------------------------------------------------------------- /KNN/screenshots/movie1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/KNN/screenshots/movie1.png -------------------------------------------------------------------------------- /KNN/screenshots/movie2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/KNN/screenshots/movie2.png -------------------------------------------------------------------------------- /KNN/screenshots/t-shirt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/KNN/screenshots/t-shirt.png -------------------------------------------------------------------------------- /Linear_Regression/Linear_regression_projects/Salary_dataset.csv: -------------------------------------------------------------------------------- 1 | ,YearsExperience,Salary 2 | 0,1.2000000000000002,39344.0 3 | 1,1.4000000000000001,46206.0 4 | 2,1.6,37732.0 5 | 3,2.1,43526.0 6 | 4,2.3000000000000003,39892.0 7 | 5,3.0,56643.0 8 | 6,3.1,60151.0 9 | 7,3.3000000000000003,54446.0 10 | 8,3.3000000000000003,64446.0 11 | 9,3.8000000000000003,57190.0 12 | 10,4.0,63219.0 13 | 11,4.1,55795.0 14 | 12,4.1,56958.0 15 | 13,4.199999999999999,57082.0 16 | 14,4.6,61112.0 17 | 15,5.0,67939.0 18 | 16,5.199999999999999,66030.0 19 | 17,5.3999999999999995,83089.0 20 | 18,6.0,81364.0 21 | 19,6.1,93941.0 22 | 20,6.8999999999999995,91739.0 23 | 21,7.199999999999999,98274.0 24 | 22,8.0,101303.0 25 | 23,8.299999999999999,113813.0 26 | 24,8.799999999999999,109432.0 27 | 25,9.1,105583.0 28 | 26,9.6,116970.0 29 | 27,9.7,112636.0 30 | 28,10.4,122392.0 31 | 29,10.6,121873.0 32 | -------------------------------------------------------------------------------- /Linear_Regression/Linear_regression_projects/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Linear_Regression/Linear_regression_projects/house_price_prediction.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.linear_model import LinearRegression 5 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error 6 | import plotly.express as px 7 | import plotly.graph_objects as go 8 | from scipy import stats 9 | 10 | def calculate_residuals(y_true, y_pred): 11 | """Calculate and return residuals.""" 12 | return y_true - y_pred 13 | 14 | def run(): 15 | st.header("House Price Prediction") 16 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Linear_Regression)", unsafe_allow_html=True) 17 | 18 | # Example data 19 | house_sizes = np.array([1400, 1600, 1700, 1875, 1100, 1550, 2350, 2450, 1425, 1700]) 20 | house_prices = np.array([245, 
312, 279, 308, 199, 219, 405, 324, 319, 255]) 21 | 22 | # Create DataFrame 23 | df = pd.DataFrame({ 24 | 'Size': house_sizes, 25 | 'Price': house_prices 26 | }) 27 | 28 | # Data Overview 29 | st.subheader("Data Overview") 30 | col1, col2 = st.columns(2) 31 | 32 | with col1: 33 | st.write("**Dataset Information:**") 34 | st.write(f"Number of houses: {len(df)}") 35 | st.write("\n**Basic Statistics:**") 36 | st.write(df.describe().round(2)) 37 | 38 | with col2: 39 | # Distribution plots 40 | fig = px.box(df, title='Price and Size Distributions') 41 | st.plotly_chart(fig) 42 | 43 | # Data Analysis 44 | st.subheader("Data Analysis") 45 | 46 | # Correlation analysis 47 | correlation = df['Size'].corr(df['Price']) 48 | st.write(f"**Correlation between Size and Price:** {correlation:.3f}") 49 | 50 | # Scatter plot with trend line 51 | fig = px.scatter(df, x='Size', y='Price', 52 | title='House Size vs Price', 53 | labels={'Size': 'House Size (sq. ft.)', 54 | 'Price': 'Price ($1000)'}) 55 | fig.add_trace(go.Scatter(x=df['Size'], 56 | y=stats.linregress(df['Size'], df['Price'])[0] * df['Size'] + 57 | stats.linregress(df['Size'], df['Price'])[1], 58 | mode='lines', 59 | name='Trend Line')) 60 | st.plotly_chart(fig) 61 | 62 | # Reshape data 63 | X = house_sizes.reshape(-1, 1) 64 | y = house_prices 65 | 66 | # Train model 67 | model = LinearRegression() 68 | model.fit(X, y) 69 | 70 | # Model Evaluation 71 | st.subheader("Model Evaluation") 72 | y_pred = model.predict(X) 73 | 74 | col1, col2, col3 = st.columns(3) 75 | with col1: 76 | st.metric("R² Score", f"{r2_score(y, y_pred):.3f}") 77 | with col2: 78 | st.metric("RMSE", f"${np.sqrt(mean_squared_error(y, y_pred)):.2f}K") 79 | with col3: 80 | st.metric("MAE", f"${mean_absolute_error(y, y_pred):.2f}K") 81 | 82 | # Residual Analysis 83 | st.subheader("Residual Analysis") 84 | residuals = calculate_residuals(y, y_pred) 85 | 86 | col1, col2 = st.columns(2) 87 | with col1: 88 | # Residuals vs Predicted 89 | fig = px.scatter(x=y_pred, y=residuals, 90 | title='Residuals vs Predicted Values', 91 | labels={'x': 'Predicted Price ($1000)', 92 | 'y': 'Residuals'}) 93 | fig.add_hline(y=0, line_dash="dash", line_color="red") 94 | st.plotly_chart(fig) 95 | 96 | with col2: 97 | # Residuals distribution 98 | fig = px.histogram(residuals, 99 | title='Residuals Distribution', 100 | labels={'value': 'Residuals'}) 101 | st.plotly_chart(fig) 102 | 103 | # Prediction Interface 104 | st.subheader("Price Prediction") 105 | 106 | col1, col2 = st.columns(2) 107 | with col1: 108 | new_size = st.number_input("Enter house size (sq. 
ft.):", 109 | min_value=500, 110 | max_value=10000, 111 | value=2000, 112 | step=100) 113 | 114 | # Calculate prediction 115 | prediction = model.predict([[new_size]])[0] 116 | confidence_interval = 1.96 * np.sqrt(mean_squared_error(y, y_pred)) 117 | 118 | st.metric("Predicted Price", 119 | f"${prediction:.2f}K", 120 | f"±${confidence_interval:.2f}K") 121 | 122 | with col2: 123 | # Prediction visualization 124 | fig = go.Figure() 125 | 126 | # Add actual data points 127 | fig.add_trace(go.Scatter( 128 | x=df['Size'], 129 | y=df['Price'], 130 | mode='markers', 131 | name='Actual Data', 132 | marker=dict(color='blue') 133 | )) 134 | 135 | # Add regression line 136 | x_range = np.linspace(min(df['Size']), max(df['Size']), 100) 137 | y_range = model.predict(x_range.reshape(-1, 1)) 138 | fig.add_trace(go.Scatter( 139 | x=x_range, 140 | y=y_range, 141 | mode='lines', 142 | name='Regression Line', 143 | line=dict(color='red') 144 | )) 145 | 146 | # Add prediction point 147 | fig.add_trace(go.Scatter( 148 | x=[new_size], 149 | y=[prediction], 150 | mode='markers', 151 | name='Prediction', 152 | marker=dict(color='green', size=12) 153 | )) 154 | 155 | # Add confidence interval 156 | fig.add_trace(go.Scatter( 157 | x=x_range, 158 | y=y_range + confidence_interval, 159 | mode='lines', 160 | line=dict(width=0), 161 | showlegend=False 162 | )) 163 | fig.add_trace(go.Scatter( 164 | x=x_range, 165 | y=y_range - confidence_interval, 166 | mode='lines', 167 | line=dict(width=0), 168 | fill='tonexty', 169 | name='95% Confidence Interval' 170 | )) 171 | 172 | fig.update_layout( 173 | title='House Price Prediction', 174 | xaxis_title='House Size (sq. ft.)', 175 | yaxis_title='Price ($1000)', 176 | showlegend=True 177 | ) 178 | st.plotly_chart(fig) 179 | 180 | # Model Information 181 | st.subheader("Model Information") 182 | st.write(f"**Slope (Price per sq. ft.):** ${model.coef_[0]:.2f}") 183 | st.write(f"**Intercept:** ${model.intercept_:.2f}") 184 | st.write(f"**Equation:** Price = ${model.coef_[0]:.2f} × Size + ${model.intercept_:.2f}") 185 | 186 | if __name__ == "__main__": 187 | run() -------------------------------------------------------------------------------- /Linear_Regression/README.md: -------------------------------------------------------------------------------- 1 | # Linear Regression Projects 2 | 3 | This repository contains various Linear Regression projects implemented in Python. Each project demonstrates the application of Linear Regression to solve real-world problems using datasets. 4 | 5 | ## Project Structure 6 | 7 | ``` 8 | LinearRegression/ 9 | ├── main.py 10 | ├── requirements.txt 11 | ├── Linear_regression_projects/ 12 | │ ├── messi_goal_prediction.py 13 | │ ├── house_price_prediction.py 14 | │ ├── study_hours_exam_prediction.py 15 | │ ├── normal_equation_vs_gradient_descent.py 16 | │ ├── salary_prediction.py 17 | │ ├── diabetes.csv 18 | ``` 19 | 20 | ### Key Files 21 | - **`main.py`**: The main entry point for running the Streamlit app. 22 | - **`requirements.txt`**: Contains the dependencies required to run the project. 23 | - **`Linear_regression_projects/`**: Contains individual project scripts and datasets. 24 | 25 | ## Projects Included 26 | 27 | 1. **Messi Goal Prediction** 28 | Predicts the number of goals Messi will score based on the number of matches played using Linear Regression. 
29 | 30 | **Screenshots:** 31 | ![Messi Goal Distribution](screenshots/leo1.png) 32 | ![Goal Prediction Model](screenshots/leo2.png) 33 | ![Model Performance Metrics](screenshots/leo3.png) 34 | 35 | 2. **House Price Prediction** 36 | Predicts house prices based on their sizes using Linear Regression. 37 | 38 | **Screenshots:** 39 | ![House Price Prediction](screenshots/house1.png) 40 | 41 | 3. **Study Hours and Exam Prediction** 42 | Predicts exam scores based on the number of hours studied using Linear Regression. 43 | 44 | **Screenshots:** 45 | ![Score Prediction 1](screenshots/score1.png) 46 | ![Score Prediction 2](screenshots/score2.png) 47 | 48 | 4. **Normal Equation vs Gradient Descent** 49 | Compares the performance of the normal equation and gradient descent methods for solving linear regression. 50 | 51 | **Screenshots:** 52 | ![Normal vs Gradient 1](screenshots/norm_grad1.png) 53 | ![Normal vs Gradient 2](screenshots/norm_grad2.png) 54 | 55 | 5. **Salary Prediction** 56 | Predicts salary based on years of experience using Linear Regression. 57 | 58 | **Screenshots:** 59 | ![Salary Prediction](screenshots/salary_pred.png) 60 | -------------------------------------------------------------------------------- /Linear_Regression/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the Linear_regression_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | 8 | from Linear_regression_projects import ( 9 | messi_goal_prediction, 10 | house_price_prediction, 11 | study_hours_exam_prediction, 12 | normal_equation_vs_gradient_descent, 13 | salary_prediction, 14 | ) 15 | 16 | def run(): 17 | st.title("Linear Regression Projects") 18 | 19 | # Sidebar for project selection 20 | project = st.sidebar.selectbox( 21 | "Select a project", 22 | [ 23 | "Messi Goal Prediction", 24 | "House Price Prediction", 25 | "Study Hours and Exam Prediction", 26 | "Normal Equation vs Gradient Descent", 27 | "Salary Prediction", 28 | ], 29 | ) 30 | 31 | # Run the selected project 32 | if project == "Messi Goal Prediction": 33 | messi_goal_prediction.run() 34 | elif project == "House Price Prediction": 35 | house_price_prediction.run() 36 | elif project == "Study Hours and Exam Prediction": 37 | study_hours_exam_prediction.run() 38 | elif project == "Normal Equation vs Gradient Descent": 39 | normal_equation_vs_gradient_descent.run() 40 | elif project == "Salary Prediction": 41 | salary_prediction.run() 42 | 43 | if __name__ == "__main__": 44 | run() -------------------------------------------------------------------------------- /Linear_Regression/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/requirements.txt -------------------------------------------------------------------------------- /Linear_Regression/screenshots/house1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/house1.png -------------------------------------------------------------------------------- /Linear_Regression/screenshots/leo1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/leo1.png -------------------------------------------------------------------------------- /Linear_Regression/screenshots/leo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/leo2.png -------------------------------------------------------------------------------- /Linear_Regression/screenshots/leo3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/leo3.png -------------------------------------------------------------------------------- /Linear_Regression/screenshots/norm_grad1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/norm_grad1.png -------------------------------------------------------------------------------- /Linear_Regression/screenshots/norm_grad2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/norm_grad2.png -------------------------------------------------------------------------------- /Linear_Regression/screenshots/salary_pred.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/salary_pred.png -------------------------------------------------------------------------------- /Linear_Regression/screenshots/score1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/score1.png -------------------------------------------------------------------------------- /Linear_Regression/screenshots/score2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/score2.png -------------------------------------------------------------------------------- /Logistic_Regression/Logistic_Regression_projects/diabetes_prediction.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 7 | import plotly.express as px 8 | import plotly.graph_objects as go 9 | from sklearn.preprocessing import StandardScaler 10 | import seaborn as sns 11 | import matplotlib.pyplot as plt 12 | 13 | def run(): 14 | st.header("Diabetes Prediction using Logistic Regression") 15 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Logistic_Regression)", unsafe_allow_html=True) 16 | 17 | # Load dataset 18 | df = 
pd.read_csv("Logistic_Regression/Logistic_Regression_projects/diabetes.csv") 19 | 20 | # Display dataset info 21 | st.subheader("Dataset Overview") 22 | col1, col2 = st.columns(2) 23 | with col1: 24 | st.write("Dataset Shape:", df.shape) 25 | st.write("Features:", ", ".join(df.columns[:-1])) 26 | st.write("Target: Outcome (0: No Diabetes, 1: Diabetes)") 27 | with col2: 28 | st.write("Class Distribution:") 29 | class_dist = df['Outcome'].value_counts() 30 | fig = px.pie(values=class_dist.values, names=['No Diabetes', 'Diabetes'], 31 | title='Diabetes Distribution') 32 | st.plotly_chart(fig) 33 | 34 | # Feature selection 35 | st.subheader("Feature Selection") 36 | selected_features = st.multiselect( 37 | "Select features for prediction", 38 | df.columns[:-1], 39 | default=['Glucose', 'BMI', 'Age'] 40 | ) 41 | 42 | if selected_features: 43 | # Prepare data 44 | X = df[selected_features] 45 | y = df['Outcome'] 46 | 47 | # Split data 48 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 49 | 50 | # Scale features 51 | scaler = StandardScaler() 52 | X_train_scaled = scaler.fit_transform(X_train) 53 | X_test_scaled = scaler.transform(X_test) 54 | 55 | # Train model 56 | model = LogisticRegression(max_iter=1000) 57 | model.fit(X_train_scaled, y_train) 58 | 59 | # Model evaluation 60 | st.subheader("Model Performance") 61 | y_pred = model.predict(X_test_scaled) 62 | accuracy = accuracy_score(y_test, y_pred) 63 | 64 | # Display metrics 65 | col1, col2, col3 = st.columns(3) 66 | with col1: 67 | st.metric("Accuracy", f"{accuracy:.2%}") 68 | with col2: 69 | st.metric("Precision", f"{classification_report(y_test, y_pred, output_dict=True)['1']['precision']:.2%}") 70 | with col3: 71 | st.metric("Recall", f"{classification_report(y_test, y_pred, output_dict=True)['1']['recall']:.2%}") 72 | 73 | # Confusion Matrix 74 | st.subheader("Confusion Matrix") 75 | cm = confusion_matrix(y_test, y_pred) 76 | fig = px.imshow(cm, 77 | labels=dict(x="Predicted", y="Actual", color="Count"), 78 | x=['No Diabetes', 'Diabetes'], 79 | y=['No Diabetes', 'Diabetes'], 80 | text_auto=True, 81 | aspect="auto") 82 | st.plotly_chart(fig) 83 | 84 | # Feature Importance 85 | st.subheader("Feature Importance") 86 | importance = pd.DataFrame({ 87 | 'Feature': selected_features, 88 | 'Importance': np.abs(model.coef_[0]) 89 | }) 90 | fig = px.bar(importance, x='Feature', y='Importance', 91 | title='Feature Importance in Prediction') 92 | st.plotly_chart(fig) 93 | 94 | # Interactive Prediction 95 | st.subheader("Make a Prediction") 96 | st.write("Enter patient information:") 97 | 98 | # Create input fields for selected features 99 | input_data = {} 100 | cols = st.columns(len(selected_features)) 101 | for i, feature in enumerate(selected_features): 102 | with cols[i]: 103 | input_data[feature] = st.number_input( 104 | f"{feature}", 105 | min_value=float(df[feature].min()), 106 | max_value=float(df[feature].max()), 107 | value=float(df[feature].mean()) 108 | ) 109 | 110 | if st.button("Predict"): 111 | # Scale input data 112 | input_scaled = scaler.transform(pd.DataFrame([input_data])) 113 | prediction = model.predict(input_scaled)[0] 114 | probability = model.predict_proba(input_scaled)[0] 115 | 116 | # Display prediction 117 | st.subheader("Prediction Result") 118 | col1, col2 = st.columns(2) 119 | with col1: 120 | st.metric("Prediction", "Diabetes" if prediction == 1 else "No Diabetes") 121 | with col2: 122 | st.metric("Confidence", f"{max(probability):.2%}") 123 | 124 | # Visualize prediction 
probability 125 | fig = go.Figure(data=[ 126 | go.Bar(x=['No Diabetes', 'Diabetes'], 127 | y=probability, 128 | text=[f'{p:.2%}' for p in probability], 129 | textposition='auto', 130 | ) 131 | ]) 132 | fig.update_layout(title='Prediction Probabilities') 133 | st.plotly_chart(fig) 134 | 135 | # Data Visualization 136 | st.subheader("Data Analysis") 137 | selected_feature = st.selectbox("Select feature to analyze", selected_features) 138 | 139 | # Distribution plot 140 | fig = px.histogram(df, x=selected_feature, color='Outcome', 141 | title=f'Distribution of {selected_feature} by Diabetes Status', 142 | barmode='overlay') 143 | st.plotly_chart(fig) 144 | 145 | # Correlation heatmap 146 | st.subheader("Feature Correlations") 147 | corr = df[selected_features + ['Outcome']].corr() 148 | fig = px.imshow(corr, 149 | labels=dict(color="Correlation"), 150 | x=corr.columns, 151 | y=corr.columns, 152 | aspect="auto") 153 | st.plotly_chart(fig) 154 | 155 | if __name__ == "__main__": 156 | run() -------------------------------------------------------------------------------- /Logistic_Regression/Logistic_Regression_projects/rock_vs_mine.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 7 | import plotly.express as px 8 | import plotly.graph_objects as go 9 | from sklearn.preprocessing import StandardScaler 10 | import seaborn as sns 11 | import matplotlib.pyplot as plt 12 | 13 | def run(): 14 | st.header("Rock vs Mine Classification using Logistic Regression") 15 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Logistic_Regression)", unsafe_allow_html=True) 16 | 17 | # Load dataset 18 | df = pd.read_csv("Logistic_Regression/Logistic_Regression_projects/Copy of sonar data.csv") 19 | 20 | # Display dataset info 21 | st.subheader("Dataset Overview") 22 | col1, col2 = st.columns(2) 23 | with col1: 24 | st.write("Dataset Shape:", df.shape) 25 | st.write("Features: 60 frequency bands") 26 | st.write("Target: R (Rock) or M (Mine)") 27 | with col2: 28 | st.write("Class Distribution:") 29 | class_dist = df['R'].value_counts() 30 | fig = px.pie(values=class_dist.values, names=['Rock', 'Mine'], 31 | title='Rock vs Mine Distribution') 32 | st.plotly_chart(fig) 33 | 34 | # Feature selection 35 | st.subheader("Feature Selection") 36 | n_features = st.slider("Number of Features to Use", min_value=5, max_value=60, value=20) 37 | 38 | # Select top features based on variance 39 | feature_vars = df.iloc[:, :-1].var() 40 | selected_features = feature_vars.nlargest(n_features).index.tolist() 41 | 42 | # Prepare data 43 | X = df[selected_features] 44 | y = df['R'].map({'R': 0, 'M': 1}) # Convert to binary 45 | 46 | # Split data 47 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 48 | 49 | # Scale features 50 | scaler = StandardScaler() 51 | X_train_scaled = scaler.fit_transform(X_train) 52 | X_test_scaled = scaler.transform(X_test) 53 | 54 | # Train model 55 | model = LogisticRegression(max_iter=1000) 56 | model.fit(X_train_scaled, y_train) 57 | 58 | # Model evaluation 59 | st.subheader("Model Performance") 60 | y_pred = model.predict(X_test_scaled) 61 | accuracy = accuracy_score(y_test, y_pred) 62 | 63 | # Display metrics 64 | 
col1, col2, col3 = st.columns(3) 65 | with col1: 66 | st.metric("Accuracy", f"{accuracy:.2%}") 67 | with col2: 68 | st.metric("Precision", f"{classification_report(y_test, y_pred, output_dict=True)['1']['precision']:.2%}") 69 | with col3: 70 | st.metric("Recall", f"{classification_report(y_test, y_pred, output_dict=True)['1']['recall']:.2%}") 71 | 72 | # Confusion Matrix 73 | st.subheader("Confusion Matrix") 74 | cm = confusion_matrix(y_test, y_pred) 75 | fig = px.imshow(cm, 76 | labels=dict(x="Predicted", y="Actual", color="Count"), 77 | x=['Rock', 'Mine'], 78 | y=['Rock', 'Mine'], 79 | text_auto=True, 80 | aspect="auto") 81 | st.plotly_chart(fig) 82 | 83 | # Feature Importance 84 | st.subheader("Feature Importance") 85 | importance = pd.DataFrame({ 86 | 'Feature': selected_features, 87 | 'Importance': np.abs(model.coef_[0]) 88 | }) 89 | fig = px.bar(importance, x='Feature', y='Importance', 90 | title='Feature Importance in Prediction') 91 | st.plotly_chart(fig) 92 | 93 | # Interactive Prediction 94 | st.subheader("Make a Prediction") 95 | st.write("Enter frequency band values:") 96 | 97 | # Create input fields for selected features 98 | input_data = {} 99 | cols = st.columns(3) 100 | for i, feature in enumerate(selected_features): 101 | with cols[i % 3]: 102 | input_data[feature] = st.number_input( 103 | f"{feature}", 104 | min_value=float(df[feature].min()), 105 | max_value=float(df[feature].max()), 106 | value=float(df[feature].mean()) 107 | ) 108 | 109 | if st.button("Predict"): 110 | # Scale input data 111 | input_scaled = scaler.transform(pd.DataFrame([input_data])) 112 | prediction = model.predict(input_scaled)[0] 113 | probability = model.predict_proba(input_scaled)[0] 114 | 115 | # Display prediction 116 | st.subheader("Prediction Result") 117 | col1, col2 = st.columns(2) 118 | with col1: 119 | st.metric("Prediction", "Mine" if prediction == 1 else "Rock") 120 | with col2: 121 | st.metric("Confidence", f"{max(probability):.2%}") 122 | 123 | # Visualize prediction probability 124 | fig = go.Figure(data=[ 125 | go.Bar(x=['Rock', 'Mine'], 126 | y=probability, 127 | text=[f'{p:.2%}' for p in probability], 128 | textposition='auto', 129 | ) 130 | ]) 131 | fig.update_layout(title='Prediction Probabilities') 132 | st.plotly_chart(fig) 133 | 134 | # Data Visualization 135 | st.subheader("Data Analysis") 136 | 137 | # PCA for dimensionality reduction 138 | from sklearn.decomposition import PCA 139 | pca = PCA(n_components=2) 140 | X_pca = pca.fit_transform(X) 141 | 142 | # Plot PCA results 143 | fig = px.scatter( 144 | x=X_pca[:, 0], y=X_pca[:, 1], 145 | color=df['R'].map({'R': 'Rock', 'M': 'Mine'}), 146 | title='PCA Visualization of Rock vs Mine Data', 147 | labels={'x': 'First Principal Component', 'y': 'Second Principal Component'} 148 | ) 149 | st.plotly_chart(fig) 150 | 151 | # Feature correlation heatmap 152 | st.subheader("Feature Correlations") 153 | corr = df[selected_features].corr() 154 | fig = px.imshow(corr, 155 | labels=dict(color="Correlation"), 156 | x=corr.columns, 157 | y=corr.columns, 158 | aspect="auto") 159 | st.plotly_chart(fig) 160 | 161 | # Frequency band analysis 162 | st.subheader("Frequency Band Analysis") 163 | selected_band = st.selectbox("Select frequency band to analyze", selected_features) 164 | 165 | fig = px.box(df, x='R', y=selected_band, 166 | title=f'Distribution of {selected_band} by Class', 167 | labels={'R': 'Class', selected_band: 'Value'}) 168 | st.plotly_chart(fig) 169 | 170 | if __name__ == "__main__": 171 | run() 
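The sonar script selects the `n_features` highest-variance bands by hand and then scales them before fitting the classifier. The sketch below shows an equivalent arrangement using a scikit-learn `Pipeline` on synthetic data (an illustration of the pattern, not a drop-in replacement for the project code): chaining selection, scaling, and the model ensures every step is fit only on the training fold, which also makes cross-validation straightforward.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Synthetic stand-in for the 60-band sonar data.
X, y = make_classification(n_samples=200, n_features=60, n_informative=10, random_state=42)

def variance_score(X, y):
    # Rank features by variance, mirroring feature_vars.nlargest(n_features).
    return np.var(X, axis=0)

pipe = Pipeline([
    ("select", SelectKBest(score_func=variance_score, k=20)),
    ("scale", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000)),
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipe.fit(X_train, y_train)
print(f"Test accuracy: {pipe.score(X_test, y_test):.2%}")
```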
-------------------------------------------------------------------------------- /Logistic_Regression/Logistic_Regression_projects/simple_hiv_prediction.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 7 | import plotly.express as px 8 | import plotly.graph_objects as go 9 | from sklearn.preprocessing import StandardScaler 10 | import seaborn as sns 11 | import matplotlib.pyplot as plt 12 | 13 | def generate_sample_data(): 14 | np.random.seed(42) 15 | n_samples = 1000 16 | 17 | # Generate features 18 | age = np.random.normal(35, 10, n_samples) 19 | age = np.clip(age, 18, 65) 20 | 21 | risk_factors = np.random.choice(['Low', 'Medium', 'High'], n_samples, p=[0.6, 0.3, 0.1]) 22 | sexual_activity = np.random.choice(['None', 'Protected', 'Unprotected'], n_samples, p=[0.3, 0.5, 0.2]) 23 | drug_use = np.random.choice(['None', 'Past', 'Current'], n_samples, p=[0.7, 0.2, 0.1]) 24 | 25 | # Create DataFrame 26 | df = pd.DataFrame({ 27 | 'Age': age, 28 | 'RiskFactor': risk_factors, 29 | 'SexualActivity': sexual_activity, 30 | 'DrugUse': drug_use 31 | }) 32 | 33 | # Generate target (HIV status) with some patterns 34 | base_prob = 0.05 35 | risk_effect = { 36 | 'Low': 0.5, 37 | 'Medium': 1.0, 38 | 'High': 2.0 39 | } 40 | activity_effect = { 41 | 'None': 0.3, 42 | 'Protected': 0.7, 43 | 'Unprotected': 1.5 44 | } 45 | drug_effect = { 46 | 'None': 0.5, 47 | 'Past': 1.2, 48 | 'Current': 1.8 49 | } 50 | 51 | # Calculate probability of HIV 52 | prob = base_prob * \ 53 | df['RiskFactor'].map(risk_effect) * \ 54 | df['SexualActivity'].map(activity_effect) * \ 55 | df['DrugUse'].map(drug_effect) * \ 56 | (1 + (df['Age'] - 35) / 100) 57 | 58 | # Generate actual HIV status 59 | df['HIV'] = np.random.binomial(1, prob) 60 | 61 | return df 62 | 63 | def run(): 64 | st.header("HIV Risk Prediction using Logistic Regression") 65 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Logistic_Regression)", unsafe_allow_html=True) 66 | 67 | # Generate sample data 68 | df = generate_sample_data() 69 | 70 | # Display dataset info 71 | st.subheader("Dataset Overview") 72 | col1, col2 = st.columns(2) 73 | with col1: 74 | st.write("Dataset Shape:", df.shape) 75 | st.write("Features:", ", ".join(df.columns[:-1])) 76 | st.write("Target: HIV Status (0: Negative, 1: Positive)") 77 | with col2: 78 | st.write("Class Distribution:") 79 | class_dist = df['HIV'].value_counts() 80 | fig = px.pie(values=class_dist.values, names=['Negative', 'Positive'], 81 | title='HIV Status Distribution') 82 | st.plotly_chart(fig) 83 | 84 | # Prepare data 85 | X = pd.get_dummies(df[['Age', 'RiskFactor', 'SexualActivity', 'DrugUse']]) 86 | y = df['HIV'] 87 | 88 | # Split data 89 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 90 | 91 | # Scale features 92 | scaler = StandardScaler() 93 | X_train_scaled = scaler.fit_transform(X_train) 94 | X_test_scaled = scaler.transform(X_test) 95 | 96 | # Train model 97 | model = LogisticRegression(max_iter=1000) 98 | model.fit(X_train_scaled, y_train) 99 | 100 | # Model evaluation 101 | st.subheader("Model Performance") 102 | y_pred = model.predict(X_test_scaled) 103 | accuracy = accuracy_score(y_test, y_pred) 104 | 105 | # 
Display metrics 106 | col1, col2, col3 = st.columns(3) 107 | with col1: 108 | st.metric("Accuracy", f"{accuracy:.2%}") 109 | with col2: 110 | st.metric("Precision", f"{classification_report(y_test, y_pred, output_dict=True)['1']['precision']:.2%}") 111 | with col3: 112 | st.metric("Recall", f"{classification_report(y_test, y_pred, output_dict=True)['1']['recall']:.2%}") 113 | 114 | # Confusion Matrix 115 | st.subheader("Confusion Matrix") 116 | cm = confusion_matrix(y_test, y_pred) 117 | fig = px.imshow(cm, 118 | labels=dict(x="Predicted", y="Actual", color="Count"), 119 | x=['Negative', 'Positive'], 120 | y=['Negative', 'Positive'], 121 | text_auto=True, 122 | aspect="auto") 123 | st.plotly_chart(fig) 124 | 125 | # Feature Importance 126 | st.subheader("Feature Importance") 127 | importance = pd.DataFrame({ 128 | 'Feature': X.columns, 129 | 'Importance': np.abs(model.coef_[0]) 130 | }) 131 | fig = px.bar(importance, x='Feature', y='Importance', 132 | title='Feature Importance in Prediction') 133 | st.plotly_chart(fig) 134 | 135 | # Interactive Prediction 136 | st.subheader("Make a Prediction") 137 | st.write("Enter patient information:") 138 | 139 | col1, col2 = st.columns(2) 140 | with col1: 141 | age = st.slider("Age", 18, 65, 35) 142 | risk_factor = st.selectbox("Risk Factor", df['RiskFactor'].unique()) 143 | with col2: 144 | sexual_activity = st.selectbox("Sexual Activity", df['SexualActivity'].unique()) 145 | drug_use = st.selectbox("Drug Use", df['DrugUse'].unique()) 146 | 147 | if st.button("Predict"): 148 | # Prepare input data 149 | input_data = pd.DataFrame({ 150 | 'Age': [age], 151 | 'RiskFactor': [risk_factor], 152 | 'SexualActivity': [sexual_activity], 153 | 'DrugUse': [drug_use] 154 | }) 155 | 156 | # One-hot encode categorical variables 157 | input_encoded = pd.get_dummies(input_data) 158 | # Ensure all columns from training data are present 159 | for col in X.columns: 160 | if col not in input_encoded.columns: 161 | input_encoded[col] = 0 162 | input_encoded = input_encoded[X.columns] 163 | 164 | # Scale input data 165 | input_scaled = scaler.transform(input_encoded) 166 | 167 | # Make prediction 168 | prediction = model.predict(input_scaled)[0] 169 | probability = model.predict_proba(input_scaled)[0] 170 | 171 | # Display prediction 172 | st.subheader("Prediction Result") 173 | col1, col2 = st.columns(2) 174 | with col1: 175 | st.metric("Prediction", "Positive" if prediction == 1 else "Negative") 176 | with col2: 177 | st.metric("Risk Probability", f"{probability[1]:.2%}") 178 | 179 | # Visualize prediction probability 180 | fig = go.Figure(data=[ 181 | go.Bar(x=['Negative', 'Positive'], 182 | y=probability, 183 | text=[f'{p:.2%}' for p in probability], 184 | textposition='auto', 185 | ) 186 | ]) 187 | fig.update_layout(title='Prediction Probabilities') 188 | st.plotly_chart(fig) 189 | 190 | # Data Visualization 191 | st.subheader("Data Analysis") 192 | 193 | # Age distribution by HIV status 194 | fig = px.histogram(df, x='Age', color='HIV', 195 | title='Age Distribution by HIV Status', 196 | barmode='overlay', 197 | labels={'HIV': 'HIV Status'}) 198 | st.plotly_chart(fig) 199 | 200 | # Risk factor analysis 201 | fig = px.box(df, x='RiskFactor', y='Age', color='HIV', 202 | title='Age Distribution by Risk Factor and HIV Status') 203 | st.plotly_chart(fig) 204 | 205 | # Sexual activity analysis 206 | fig = px.sunburst(df, path=['SexualActivity', 'HIV'], 207 | title='HIV Status by Sexual Activity') 208 | st.plotly_chart(fig) 209 | 210 | # Drug use analysis 211 | fig = 
px.treemap(df, path=['DrugUse', 'HIV'], 212 | title='HIV Status by Drug Use') 213 | st.plotly_chart(fig) 214 | 215 | if __name__ == "__main__": 216 | run() -------------------------------------------------------------------------------- /Logistic_Regression/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the Logistic_Regression_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | 8 | from Logistic_Regression_projects import ( 9 | diabetes_prediction, 10 | rock_vs_mine, 11 | simple_hiv_prediction, 12 | ) 13 | 14 | def run(): 15 | st.title("Logistic Regression Projects") 16 | 17 | # Sidebar for project selection 18 | project = st.sidebar.selectbox( 19 | "Select a project", 20 | [ 21 | "Diabetes Prediction", 22 | "Rock vs Mine", 23 | "Simple HIV Prediction", 24 | ], 25 | ) 26 | 27 | # Run the selected project 28 | if project == "Diabetes Prediction": 29 | diabetes_prediction.run() 30 | elif project == "Rock vs Mine": 31 | rock_vs_mine.run() 32 | elif project == "Simple HIV Prediction": 33 | simple_hiv_prediction.run() 34 | 35 | if __name__ == "__main__": 36 | run() -------------------------------------------------------------------------------- /Logistic_Regression/readme.md: -------------------------------------------------------------------------------- 1 | # Logistic Regression Projects 2 | 3 | This repository contains various Logistic Regression projects implemented in Python. Each project demonstrates the application of Logistic Regression to solve real-world problems using datasets. 4 | 5 | ## Project Structure 6 | 7 | ``` 8 | Logistic_Regression/ 9 | ├── main.py 10 | ├── requirements.txt 11 | ├── Logistic_Regression_projects/ 12 | │ ├── diabetes_prediction.py 13 | │ ├── rock_vs_mine.py 14 | │ ├── simple_hiv_prediction.py 15 | │ ├── diabetes.csv 16 | │ ├── Copy of sonar data.csv 17 | ``` 18 | 19 | ### Key Files 20 | - **`main.py`**: The main entry point for running the Streamlit app. 21 | - **`requirements.txt`**: Contains the dependencies required to run the project. 22 | - **`Logistic_Regression_projects/`**: Contains individual project scripts and datasets. 23 | 24 | ## Projects Included 25 | 26 | 1. **Diabetes Prediction** 27 | Predicts the likelihood of diabetes based on health metrics such as glucose level, blood pressure, BMI, etc. 28 | Dataset: `diabetes.csv` 29 | 30 | **Screenshots:** 31 | ![Diabetes Prediction 1](screenshots/diab1.png) 32 | ![Diabetes Prediction 2](screenshots/diab2.png) 33 | 34 | 2. **Rock vs Mine Classification** 35 | Classifies sonar signals as either "Rock" or "Mine" using Logistic Regression. 36 | Dataset: `Copy of sonar data.csv` 37 | 38 | **Screenshots:** 39 | ![Rock vs Mine](screenshots/rock_mine.png) 40 | 41 | 3. **Simple HIV Prediction** 42 | Predicts HIV risk based on features such as age, risk factor, sexual activity, and drug use. 43 | Dataset: Synthetic data (generated in the script). 44 | 45 | **Screenshots:** 46 | ![HIV Prediction](screenshots/hiv.png) 47 | 48 | ## How to Run 49 | 50 | 1. Clone the repository: 51 | ```bash 52 | git clone https://github.com/benasphy/ML_projects.git 53 | cd Logistic_Regression 54 | ``` 55 | 56 | 2. Install dependencies: 57 | ```bash 58 | pip install -r requirements.txt 59 | ``` 60 | 61 | 3. Run the Streamlit app: 62 | ```bash 63 | streamlit run main.py 64 | ``` 65 | 66 | 4. Select a project from the sidebar to explore its functionality.
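## Implementation Note

Each app scores a single user-supplied row against a model trained on a wider feature matrix. `simple_hiv_prediction.py` does this by one-hot encoding the input and then looping over the training columns to back-fill any that are missing. The sketch below (hypothetical column values, not the app's exact data) shows a more compact equivalent using `DataFrame.reindex`:

```python
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Hypothetical training data with one categorical feature.
train = pd.DataFrame({
    "Age": [25, 40, 33, 52],
    "RiskFactor": ["Low", "High", "Medium", "Low"],
    "HIV": [0, 1, 0, 0],
})
X = pd.get_dummies(train[["Age", "RiskFactor"]])
y = train["HIV"]

scaler = StandardScaler()
model = LogisticRegression(max_iter=1000).fit(scaler.fit_transform(X), y)

# One-hot encode a single user row, then align it to the training columns
# in one call; missing dummy columns are filled with 0.
new_row = pd.DataFrame({"Age": [30], "RiskFactor": ["Medium"]})
new_encoded = pd.get_dummies(new_row).reindex(columns=X.columns, fill_value=0)

proba = model.predict_proba(scaler.transform(new_encoded))[0]
print(f"Predicted risk probability: {proba[1]:.2%}")
```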
67 | 68 | ## Requirements 69 | 70 | The project requires the following Python libraries: 71 | - `streamlit` 72 | - `numpy` 73 | - `pandas` 74 | - `scikit-learn` 75 | 76 | ## Datasets 77 | 78 | - **`diabetes.csv`**: Contains data for predicting diabetes outcomes based on various health metrics. 79 | - **`Copy of sonar data.csv`**: Contains sonar data for classification tasks. 80 | 81 | ## Screenshots 82 | 83 | Screenshots of each project are included in the project descriptions above. 84 | 85 | ## License 86 | 87 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. 88 | 89 | ## Acknowledgments 90 | 91 | - Datasets used in this project are sourced from publicly available repositories. 92 | - Special thanks to the contributors of the Python libraries used in this project. 93 | 94 | --- 95 | Feel free to contribute to this repository by submitting issues or pull requests. 96 | -------------------------------------------------------------------------------- /Logistic_Regression/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | numpy 3 | pandas 4 | scikit-learn 5 | plotly 6 | scipy 7 | seaborn -------------------------------------------------------------------------------- /Logistic_Regression/screenshots/diab1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Logistic_Regression/screenshots/diab1.png -------------------------------------------------------------------------------- /Logistic_Regression/screenshots/diab2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Logistic_Regression/screenshots/diab2.png -------------------------------------------------------------------------------- /Logistic_Regression/screenshots/hiv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Logistic_Regression/screenshots/hiv.png -------------------------------------------------------------------------------- /Logistic_Regression/screenshots/rock_mine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Logistic_Regression/screenshots/rock_mine.png -------------------------------------------------------------------------------- /Naive_Bayes/Naive_Bayes_projects/fake_news_prediction.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 8 | import plotly.express as px 9 | import plotly.graph_objects as go 10 | from collections import Counter 11 | import re 12 | from wordcloud import WordCloud 13 | import matplotlib.pyplot as plt 14 | 15 | def
preprocess_text(text): 16 | """Clean and preprocess text data.""" 17 | # Convert to lowercase 18 | text = str(text).lower() 19 | # Remove special characters and digits 20 | text = re.sub(r'[^a-zA-Z\s]', '', text) 21 | # Remove extra whitespace 22 | text = re.sub(r'\s+', ' ', text).strip() 23 | return text 24 | 25 | def generate_wordcloud(texts, title): 26 | """Generate and display a word cloud.""" 27 | wordcloud = WordCloud(width=800, height=400, 28 | background_color='white', 29 | min_font_size=10).generate(' '.join(texts)) 30 | 31 | fig, ax = plt.subplots(figsize=(10, 5)) 32 | ax.imshow(wordcloud) 33 | ax.axis('off') 34 | ax.set_title(title) 35 | return fig 36 | 37 | def run(): 38 | st.header("Fake News Prediction") 39 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/Naive_Bayes)", unsafe_allow_html=True) 40 | 41 | # Load dataset 42 | uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"]) 43 | if uploaded_file is not None: 44 | df = pd.read_csv(uploaded_file) 45 | X = df[['title', 'news_url', 'source_domain', 'tweet_num']].apply(lambda x: ' '.join(x.astype(str)), axis=1) 46 | y = df['real'] 47 | else: 48 | st.info("Using default dataset: FakeNewsNet.csv") 49 | df = pd.read_csv("Naive_Bayes/Naive_Bayes_projects/FakeNewsNet.csv") 50 | X = df[['title', 'news_url', 'source_domain', 'tweet_num']].apply(lambda x: ' '.join(x.astype(str)), axis=1) 51 | y = df['real'] 52 | 53 | # Display dataset info 54 | st.subheader("Dataset Overview") 55 | col1, col2 = st.columns(2) 56 | with col1: 57 | st.write("Dataset Shape:", df.shape) 58 | st.write("Number of Articles:", len(df)) 59 | with col2: 60 | class_dist = df['real'].value_counts() 61 | fig = px.pie(values=class_dist.values, names=['Fake', 'Real'], 62 | title='News Distribution') 63 | st.plotly_chart(fig) 64 | 65 | # Text Analysis 66 | st.subheader("Text Analysis") 67 | 68 | # Preprocess text 69 | df['cleaned_text'] = X.apply(preprocess_text) 70 | 71 | # Text length analysis 72 | df['text_length'] = X.str.len() 73 | 74 | # Text length distribution 75 | fig = px.box(df, x='real', y='text_length', 76 | title='Text Length Distribution by Category', 77 | labels={'real': 'Category', 'text_length': 'Text Length'}) 78 | st.plotly_chart(fig) 79 | 80 | # Vectorize text 81 | vectorizer = CountVectorizer(stop_words='english', max_features=5000) 82 | X_vec = vectorizer.fit_transform(X) 83 | 84 | # Split data 85 | X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42) 86 | 87 | # Train model 88 | model = MultinomialNB() 89 | model.fit(X_train, y_train) 90 | 91 | # Evaluate model 92 | y_pred = model.predict(X_test) 93 | accuracy = accuracy_score(y_test, y_pred) 94 | 95 | # Display metrics 96 | st.subheader("Model Performance") 97 | col1, col2, col3 = st.columns(3) 98 | with col1: 99 | st.metric("Accuracy", f"{accuracy:.2%}") 100 | with col2: 101 | st.metric("Real News Precision", 102 | f"{classification_report(y_test, y_pred, output_dict=True)['1']['precision']:.2%}") 103 | with col3: 104 | st.metric("Real News Recall", 105 | f"{classification_report(y_test, y_pred, output_dict=True)['1']['recall']:.2%}") 106 | 107 | # Confusion Matrix 108 | st.subheader("Confusion Matrix") 109 | cm = confusion_matrix(y_test, y_pred) 110 | fig = px.imshow(cm, 111 | labels=dict(x="Predicted", y="Actual", color="Count"), 112 | x=['Fake', 'Real'], 113 | y=['Fake', 'Real'], 114 | text_auto=True, 115 | aspect="auto") 116 | st.plotly_chart(fig) 117 | 118 | # Feature Importance 119 | 
st.subheader("Most Important Words") 120 | feature_importance = pd.DataFrame({ 121 | 'Word': vectorizer.get_feature_names_out(), 122 | 'Importance': model.feature_log_prob_[1] - model.feature_log_prob_[0] 123 | }) 124 | feature_importance = feature_importance.sort_values('Importance', ascending=False) 125 | 126 | col1, col2 = st.columns(2) 127 | with col1: 128 | # Most indicative of real news 129 | fig = px.bar(feature_importance.head(10), x='Word', y='Importance', 130 | title='Top Words Indicating Real News') 131 | st.plotly_chart(fig) 132 | 133 | with col2: 134 | # Most indicative of fake news 135 | fig = px.bar(feature_importance.tail(10), x='Word', y='Importance', 136 | title='Top Words Indicating Fake News') 137 | st.plotly_chart(fig) 138 | 139 | # Predict custom input 140 | st.subheader("Test a News Article") 141 | news_text = st.text_area("Enter news article text:", height=200) 142 | 143 | if st.button("Check News"): 144 | if news_text: 145 | # Preprocess and vectorize input 146 | cleaned_input = preprocess_text(news_text) 147 | input_vectorized = vectorizer.transform([cleaned_input]) 148 | 149 | # Make prediction 150 | prediction = model.predict(input_vectorized)[0] 151 | probabilities = model.predict_proba(input_vectorized)[0] 152 | 153 | # Display prediction 154 | col1, col2 = st.columns(2) 155 | with col1: 156 | st.metric("Prediction", "Real News" if prediction == 1 else "Fake News") 157 | with col2: 158 | st.metric("Confidence", f"{max(probabilities):.2%}") 159 | 160 | # Visualize prediction probabilities 161 | fig = go.Figure(data=[ 162 | go.Bar(x=['Fake News', 'Real News'], 163 | y=probabilities, 164 | text=[f'{p:.2%}' for p in probabilities], 165 | textposition='auto', 166 | ) 167 | ]) 168 | fig.update_layout(title='Prediction Probabilities', 169 | xaxis_title='Category', 170 | yaxis_title='Probability') 171 | st.plotly_chart(fig) 172 | 173 | # Text Analysis 174 | st.subheader("Text Analysis") 175 | col1, col2 = st.columns(2) 176 | with col1: 177 | st.write("**Text Length:**", len(news_text)) 178 | st.write("**Word Count:**", len(cleaned_input.split())) 179 | with col2: 180 | # Get top contributing words 181 | words = cleaned_input.split() 182 | word_scores = [] 183 | for word in set(words): 184 | if word in vectorizer.vocabulary_: 185 | idx = vectorizer.vocabulary_[word] 186 | score = model.feature_log_prob_[1][idx] - model.feature_log_prob_[0][idx] 187 | word_scores.append((word, score)) 188 | 189 | if word_scores: 190 | word_scores.sort(key=lambda x: abs(x[1]), reverse=True) 191 | st.write("**Top Contributing Words:**") 192 | for word, score in word_scores[:5]: 193 | indicator = "→ Real News" if score > 0 else "→ Fake News" 194 | st.write(f"- {word} {indicator}") 195 | 196 | if __name__ == "__main__": 197 | run() -------------------------------------------------------------------------------- /Naive_Bayes/Naive_Bayes_projects/weather_prediction.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.naive_bayes import GaussianNB 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 7 | import plotly.express as px 8 | import plotly.graph_objects as go 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | 12 | def generate_sample_data(): 13 | np.random.seed(42) 14 | n_samples = 1000 15 | 16 | # Generate weather features 17 | temperature = 
np.random.normal(25, 5, n_samples) # Mean 25°C, std 5°C 18 | humidity = np.random.normal(60, 15, n_samples) # Mean 60%, std 15% 19 | pressure = np.random.normal(1013, 5, n_samples) # Mean 1013 hPa, std 5 hPa 20 | wind_speed = np.random.exponential(5, n_samples) # Mean 5 m/s 21 | 22 | # Create DataFrame 23 | df = pd.DataFrame({ 24 | 'Temperature': temperature, 25 | 'Humidity': humidity, 26 | 'Pressure': pressure, 27 | 'Wind_Speed': wind_speed 28 | }) 29 | 30 | # Generate weather conditions based on features 31 | def determine_weather(row): 32 | if row['Temperature'] > 30 and row['Humidity'] > 70: 33 | return 'Stormy' 34 | elif row['Temperature'] < 20 and row['Humidity'] > 80: 35 | return 'Rainy' 36 | elif row['Temperature'] > 25 and row['Humidity'] < 50: 37 | return 'Sunny' 38 | else: 39 | return 'Cloudy' 40 | 41 | df['Weather'] = df.apply(determine_weather, axis=1) 42 | 43 | return df 44 | 45 | def run(): 46 | st.header("Weather Prediction using Naive Bayes") 47 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Naive_Bayes)", unsafe_allow_html=True) 48 | 49 | # Generate sample data 50 | df = generate_sample_data() 51 | 52 | # Display dataset info 53 | st.subheader("Dataset Overview") 54 | col1, col2 = st.columns(2) 55 | with col1: 56 | st.write("Dataset Shape:", df.shape) 57 | st.write("Features:", ", ".join(df.columns[:-1])) 58 | with col2: 59 | st.write("Weather Distribution:") 60 | weather_dist = df['Weather'].value_counts() 61 | fig = px.pie(values=weather_dist.values, names=weather_dist.index, 62 | title='Weather Conditions Distribution') 63 | st.plotly_chart(fig) 64 | 65 | # Feature Analysis 66 | st.subheader("Feature Analysis") 67 | 68 | # Temperature distribution by weather 69 | fig = px.box(df, x='Weather', y='Temperature', 70 | title='Temperature Distribution by Weather Condition') 71 | st.plotly_chart(fig) 72 | 73 | # Humidity distribution by weather 74 | fig = px.box(df, x='Weather', y='Humidity', 75 | title='Humidity Distribution by Weather Condition') 76 | st.plotly_chart(fig) 77 | 78 | # Feature correlations 79 | st.subheader("Feature Correlations") 80 | numeric_cols = ['Temperature', 'Humidity', 'Pressure', 'Wind_Speed'] 81 | corr = df[numeric_cols].corr() 82 | fig = px.imshow(corr, 83 | labels=dict(x="Features", y="Features", color="Correlation"), 84 | x=numeric_cols, 85 | y=numeric_cols, 86 | text_auto=True, 87 | aspect="auto") 88 | st.plotly_chart(fig) 89 | 90 | # Prepare data for modeling 91 | X = df[['Temperature', 'Humidity', 'Pressure', 'Wind_Speed']] 92 | y = df['Weather'] 93 | 94 | # Split data 95 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 96 | 97 | # Train model 98 | model = GaussianNB() 99 | model.fit(X_train, y_train) 100 | 101 | # Model evaluation 102 | st.subheader("Model Performance") 103 | y_pred = model.predict(X_test) 104 | accuracy = accuracy_score(y_test, y_pred) 105 | 106 | # Display metrics 107 | col1, col2, col3 = st.columns(3) 108 | with col1: 109 | st.metric("Accuracy", f"{accuracy:.2%}") 110 | with col2: 111 | st.metric("Macro Precision", 112 | f"{classification_report(y_test, y_pred, output_dict=True)['macro avg']['precision']:.2%}") 113 | with col3: 114 | st.metric("Macro Recall", 115 | f"{classification_report(y_test, y_pred, output_dict=True)['macro avg']['recall']:.2%}") 116 | 117 | # Confusion Matrix 118 | st.subheader("Confusion Matrix") 119 | cm = confusion_matrix(y_test, y_pred) 120 | fig = px.imshow(cm, 121 | labels=dict(x="Predicted", 
y="Actual", color="Count"), 122 | x=sorted(df['Weather'].unique()), 123 | y=sorted(df['Weather'].unique()), 124 | text_auto=True, 125 | aspect="auto") 126 | st.plotly_chart(fig) 127 | 128 | # Feature Distributions 129 | st.subheader("Feature Distributions by Weather Condition") 130 | 131 | # Create subplot for feature distributions 132 | fig = plt.figure(figsize=(12, 8)) 133 | for i, feature in enumerate(numeric_cols, 1): 134 | plt.subplot(2, 2, i) 135 | for weather in df['Weather'].unique(): 136 | sns.kdeplot(data=df[df['Weather'] == weather][feature], label=weather) 137 | plt.title(f'{feature} Distribution') 138 | plt.legend() 139 | plt.tight_layout() 140 | st.pyplot(fig) 141 | 142 | # Interactive Prediction 143 | st.subheader("Make a Weather Prediction") 144 | st.write("Enter weather conditions:") 145 | 146 | col1, col2 = st.columns(2) 147 | with col1: 148 | temperature = st.slider("Temperature (°C)", float(df['Temperature'].min()), 149 | float(df['Temperature'].max()), 25.0) 150 | humidity = st.slider("Humidity (%)", float(df['Humidity'].min()), 151 | float(df['Humidity'].max()), 60.0) 152 | with col2: 153 | pressure = st.slider("Pressure (hPa)", float(df['Pressure'].min()), 154 | float(df['Pressure'].max()), 1013.0) 155 | wind_speed = st.slider("Wind Speed (m/s)", float(df['Wind_Speed'].min()), 156 | float(df['Wind_Speed'].max()), 5.0) 157 | 158 | if st.button("Predict Weather"): 159 | # Prepare input data 160 | input_data = np.array([[temperature, humidity, pressure, wind_speed]]) 161 | 162 | # Make prediction 163 | prediction = model.predict(input_data)[0] 164 | probabilities = model.predict_proba(input_data)[0] 165 | 166 | # Display prediction 167 | st.subheader("Prediction Result") 168 | col1, col2 = st.columns(2) 169 | with col1: 170 | st.metric("Predicted Weather", prediction) 171 | with col2: 172 | st.metric("Confidence", f"{max(probabilities):.2%}") 173 | 174 | # Visualize prediction probabilities 175 | fig = go.Figure(data=[ 176 | go.Bar(x=sorted(df['Weather'].unique()), 177 | y=probabilities, 178 | text=[f'{p:.2%}' for p in probabilities], 179 | textposition='auto', 180 | ) 181 | ]) 182 | fig.update_layout(title='Weather Condition Probabilities', 183 | xaxis_title='Weather Condition', 184 | yaxis_title='Probability') 185 | st.plotly_chart(fig) 186 | 187 | # Weather Condition Characteristics 188 | st.subheader("Typical Characteristics of Predicted Weather") 189 | weather_stats = df[df['Weather'] == prediction].describe() 190 | for feature in numeric_cols: 191 | st.write(f"**{feature}**:") 192 | st.write(f"- Average: {weather_stats[feature]['mean']:.2f}") 193 | st.write(f"- Range: {weather_stats[feature]['min']:.2f} to {weather_stats[feature]['max']:.2f}") 194 | 195 | if __name__ == "__main__": 196 | run() -------------------------------------------------------------------------------- /Naive_Bayes/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the Linear_regression_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | from Naive_Bayes_projects import ( 8 | weather_prediction, 9 | spam_detection_nb, 10 | fake_news_prediction, 11 | ) 12 | 13 | def run(): 14 | st.title("Naive Bayes Projects") 15 | 16 | # Sidebar for project selection 17 | project = st.sidebar.selectbox( 18 | "Select a project", 19 | [ 20 | "Weather Prediction", 21 | "Spam Detection", 22 | "Fake News Prediction", 23 | ], 24 | ) 25 | 26 | # Run the 
selected project 27 | if project == "Weather Prediction": 28 | weather_prediction.run() 29 | elif project == "Spam Detection": 30 | spam_detection_nb.run() 31 | elif project == "Fake News Prediction": 32 | fake_news_prediction.run() 33 | 34 | if __name__ == "__main__": 35 | run() -------------------------------------------------------------------------------- /Naive_Bayes/readme.md: -------------------------------------------------------------------------------- 1 | # Naive Bayes Projects 2 | 3 | This folder contains various projects that utilize the Naive Bayes algorithm for different applications. Each project is designed to demonstrate the use of Naive Bayes in machine learning tasks. 4 | 5 | ## Projects 6 | 7 | 1. **Weather Prediction**: Predicts weather conditions using historical data. 8 | 9 | **Screenshots:** 10 | ![Weather Prediction](screenshots/weath.png) 11 | 2. **Spam Detection**: Classifies emails as spam or not spam using text data. 12 | 13 | **Screenshots:** 14 | ![Spam Detection](screenshots/spamde.png) 15 | 3. **Fake News Prediction**: Detects fake news articles using features like title, news URL, source domain, and tweet number. 16 | 17 | **Screenshots:** 18 | ![Fake News Prediction](screenshots/fake_news.png) 19 | 20 | ## How to Run 21 | 22 | To run any of the projects, follow these steps: 23 | 24 | 1. Ensure you have the required dependencies installed. You can install them using pip: 25 | 26 | ```bash 27 | pip install streamlit pandas scikit-learn 28 | ``` 29 | 30 | 2. Navigate to the Naive_Bayes directory in your terminal. 31 | 32 | 3. Run the Streamlit app using the following command: 33 | 34 | ```bash 35 | streamlit run main.py 36 | ``` 37 | 38 | 4. Use the sidebar to select the project you want to run. 39 | 40 | ## Project Structure 41 | 42 | - `main.py`: The main entry point for running the projects. 43 | - `Naive_Bayes_projects/`: Contains individual project files: 44 | - `weather_prediction.py`: Weather prediction project. 45 | - `spam_detection_nb.py`: Spam detection project. 46 | - `fake_news_prediction.py`: Fake news prediction project. 47 | 48 | ## Data 49 | 50 | Each project uses its own dataset, which is either uploaded by the user or loaded from a default CSV file located in the `Naive_Bayes_projects/` directory. 51 | 52 | ## Contributing 53 | 54 | Feel free to contribute to these projects by submitting pull requests or opening issues for any bugs or feature requests. 55 | 56 | ## License 57 | 58 | This project is licensed under the MIT License - see the LICENSE file for details. 
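For quick reference, all three apps follow the same scikit-learn pattern: fit a Naive Bayes classifier on a feature matrix, then call `predict`/`predict_proba` from the Streamlit UI. A minimal, self-contained sketch of that pattern is shown below; the synthetic data and column names are illustrative only and are not taken from the project files.

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Illustrative synthetic data: two numeric features and a class label
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "Temperature": rng.normal(25, 5, 500),
    "Humidity": rng.normal(60, 15, 500),
})
df["Weather"] = np.where(df["Humidity"] > 70, "Rainy", "Sunny")

X_train, X_test, y_train, y_test = train_test_split(
    df[["Temperature", "Humidity"]], df["Weather"], test_size=0.2, random_state=42
)

model = GaussianNB()  # fits one Gaussian per feature per class
model.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, model.predict(X_test)))
```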
59 | -------------------------------------------------------------------------------- /Naive_Bayes/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | numpy 3 | pandas 4 | scikit-learn 5 | plotly 6 | scipy 7 | seaborn -------------------------------------------------------------------------------- /Naive_Bayes/screenshots/fake_news.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Naive_Bayes/screenshots/fake_news.png -------------------------------------------------------------------------------- /Naive_Bayes/screenshots/spamde.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Naive_Bayes/screenshots/spamde.png -------------------------------------------------------------------------------- /Naive_Bayes/screenshots/weath.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Naive_Bayes/screenshots/weath.png -------------------------------------------------------------------------------- /Poisson_Regression/Poisson_Regression_projects/competition_awards_data.csv: -------------------------------------------------------------------------------- 1 | Awards,Math Score 2 | 0,43 3 | 0,38 4 | 0,41 5 | 0,33 6 | 0,39 7 | 0,43 8 | 0,35 9 | 0,41 10 | 0,36 11 | 0,38 12 | 0,60 13 | 0,30 14 | 0,32 15 | 0,30 16 | 0,37 17 | 0,44 18 | 0,45 19 | 0,44 20 | 0,37 21 | 0,43 22 | 0,34 23 | 0,40 24 | 0,34 25 | 0,38 26 | 0,32 27 | 0,42 28 | 1,64 29 | 1,60 30 | 0,39 31 | 1,62 32 | 0,35 33 | 0,37 34 | 1,50 35 | 0,40 36 | 1,65 37 | 1,68 38 | 0,45 39 | 0,35 40 | 0,36 41 | 0,37 42 | 0,31 43 | 0,31 44 | 0,30 45 | 0,42 46 | 0,42 47 | 1,62 48 | 0,38 49 | 0,39 50 | 0,47 51 | 1,65 52 | 0,33 53 | 0,34 54 | 0,31 55 | 3,89 56 | 0,30 57 | 0,44 58 | 1,70 59 | 1,68 60 | 0,38 61 | 0,44 62 | 0,30 63 | 1,70 64 | 0,39 65 | 1,66 66 | 0,30 67 | 1,61 68 | 0,37 69 | 0,33 70 | 1,64 71 | 0,30 72 | 2,83 73 | 0,43 74 | 0,35 75 | 0,30 76 | 1,59 77 | 0,47 78 | 0,35 79 | 0,39 80 | 0,32 81 | 0,31 82 | 0,38 83 | 0,33 84 | 1,62 85 | 0,39 86 | 0,38 87 | 0,30 88 | 1,66 89 | 0,41 90 | 0,42 91 | 0,31 92 | 0,34 93 | 0,48 94 | 0,37 95 | 0,30 96 | 0,40 97 | 0,41 98 | 1,69 99 | 0,42 100 | 1,63 101 | 0,40 102 | 0,30 103 | 0,38 104 | 0,34 105 | 0,30 106 | 0,32 107 | 0,35 108 | 0,38 109 | 1,70 110 | 0,33 111 | 0,36 112 | 1,63 113 | 1,66 114 | 2,86 115 | 0,34 116 | 1,63 117 | 0,40 118 | 1,72 119 | 0,40 120 | 0,47 121 | 2,86 122 | 1,67 123 | 3,88 124 | 1,64 125 | 0,39 126 | 0,37 127 | 1,63 128 | 1,72 129 | 0,30 130 | 0,38 131 | 1,67 132 | 0,31 133 | 1,62 134 | 0,44 135 | 0,42 136 | 0,36 137 | 0,30 138 | 3,89 139 | 1,71 140 | 0,35 141 | 0,33 142 | 0,42 143 | 3,80 144 | 0,40 145 | 0,30 146 | 1,67 147 | 1,59 148 | 0,43 149 | 0,40 150 | 0,41 151 | 3,88 152 | 0,30 153 | 0,45 154 | 0,30 155 | 2,82 156 | 0,47 157 | 1,70 158 | 5,90 159 | 0,33 160 | 0,30 161 | 1,63 162 | 0,36 163 | 0,34 164 | 2,87 165 | 4,90 166 | 1,66 167 | 0,35 168 | 1,63 169 | 1,70 170 | 3,90 171 | 1,61 172 | 1,67 173 | 3,89 174 | 2,86 175 | 0,39 176 | 0,30 177 | 1,67 178 | 1,68 179 | 3,88 180 | 2,83 181 | 0,33 182 | 6,90 183 | 4,90 184 | 1,70 185 | 0,39 186 | 2,87 187 | 2,88 188 | 1,59 189 | 2,86 190 | 1,65 191 | 1,61 192 | 0,46 193 | 1,62 194 | 2,81 
195 | 5,91 196 | 1,70 197 | 2,85 198 | 2,86 199 | 1,66 200 | 0,41 201 | 3,89 202 | -------------------------------------------------------------------------------- /Poisson_Regression/Poisson_Regression_projects/no_of_car_accident.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.linear_model import PoissonRegressor 6 | from sklearn.metrics import mean_squared_error, r2_score 7 | import plotly.express as px 8 | import plotly.graph_objects as go 9 | from sklearn.preprocessing import StandardScaler 10 | import seaborn as sns 11 | import matplotlib.pyplot as plt 12 | 13 | def generate_sample_data(): 14 | np.random.seed(42) 15 | n_samples = 1000 16 | 17 | # Generate features 18 | traffic_volume = np.random.normal(5000, 2000, n_samples) 19 | traffic_volume = np.clip(traffic_volume, 1000, 10000) 20 | 21 | weather_conditions = np.random.choice(['Clear', 'Rainy', 'Snowy', 'Foggy'], n_samples, p=[0.6, 0.2, 0.1, 0.1]) 22 | time_of_day = np.random.choice(['Morning', 'Afternoon', 'Evening', 'Night'], n_samples, p=[0.3, 0.3, 0.2, 0.2]) 23 | road_type = np.random.choice(['Highway', 'Urban', 'Rural'], n_samples, p=[0.4, 0.4, 0.2]) 24 | 25 | # Create DataFrame 26 | df = pd.DataFrame({ 27 | 'TrafficVolume': traffic_volume, 28 | 'WeatherCondition': weather_conditions, 29 | 'TimeOfDay': time_of_day, 30 | 'RoadType': road_type 31 | }) 32 | 33 | # Generate target (number of accidents) with some patterns 34 | base_rate = 0.001 35 | weather_effect = { 36 | 'Clear': 1.0, 37 | 'Rainy': 1.5, 38 | 'Snowy': 2.0, 39 | 'Foggy': 1.8 40 | } 41 | time_effect = { 42 | 'Morning': 1.2, 43 | 'Afternoon': 1.0, 44 | 'Evening': 1.5, 45 | 'Night': 1.8 46 | } 47 | road_effect = { 48 | 'Highway': 1.0, 49 | 'Urban': 1.5, 50 | 'Rural': 0.8 51 | } 52 | 53 | # Calculate expected number of accidents 54 | expected_accidents = base_rate * \ 55 | df['TrafficVolume'] * \ 56 | df['WeatherCondition'].map(weather_effect) * \ 57 | df['TimeOfDay'].map(time_effect) * \ 58 | df['RoadType'].map(road_effect) 59 | 60 | # Generate actual number of accidents using Poisson distribution 61 | df['Accidents'] = np.random.poisson(expected_accidents) 62 | 63 | return df 64 | 65 | def run(): 66 | st.header("Car Accident Prediction using Poisson Regression") 67 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Poisson_Regression)", unsafe_allow_html=True) 68 | 69 | # Generate sample data 70 | df = generate_sample_data() 71 | 72 | # Display dataset info 73 | st.subheader("Dataset Overview") 74 | col1, col2 = st.columns(2) 75 | with col1: 76 | st.write("Dataset Shape:", df.shape) 77 | st.write("Features:", ", ".join(df.columns[:-1])) 78 | st.write("Target: Number of Accidents") 79 | with col2: 80 | st.write("Accident Statistics:") 81 | st.write(f"Mean: {df['Accidents'].mean():.2f}") 82 | st.write(f"Max: {df['Accidents'].max()}") 83 | st.write(f"Min: {df['Accidents'].min()}") 84 | 85 | # Data distribution visualization 86 | fig = px.histogram(df, x='Accidents', 87 | title='Distribution of Number of Accidents', 88 | nbins=30) 89 | st.plotly_chart(fig) 90 | 91 | # Prepare data 92 | X = pd.get_dummies(df[['TrafficVolume', 'WeatherCondition', 'TimeOfDay', 'RoadType']]) 93 | y = df['Accidents'] 94 | 95 | # Split data 96 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 97 | 98 | # Scale features 99 | 
scaler = StandardScaler() 100 | X_train_scaled = scaler.fit_transform(X_train) 101 | X_test_scaled = scaler.transform(X_test) 102 | 103 | # Train model 104 | model = PoissonRegressor(alpha=0.1) 105 | model.fit(X_train_scaled, y_train) 106 | 107 | # Model evaluation 108 | st.subheader("Model Performance") 109 | y_pred = model.predict(X_test_scaled) 110 | mse = mean_squared_error(y_test, y_pred) 111 | r2 = r2_score(y_test, y_pred) 112 | 113 | # Display metrics 114 | col1, col2 = st.columns(2) 115 | with col1: 116 | st.metric("Mean Squared Error", f"{mse:.2f}") 117 | with col2: 118 | st.metric("R² Score", f"{r2:.2%}") 119 | 120 | # Actual vs Predicted Plot 121 | fig = px.scatter(x=y_test, y=y_pred, 122 | labels={'x': 'Actual Accidents', 'y': 'Predicted Accidents'}, 123 | title='Actual vs Predicted Accidents') 124 | fig.add_trace(go.Scatter(x=[0, max(y_test)], y=[0, max(y_test)], 125 | mode='lines', name='Perfect Prediction')) 126 | st.plotly_chart(fig) 127 | 128 | # Feature Importance 129 | st.subheader("Feature Importance") 130 | importance = pd.DataFrame({ 131 | 'Feature': X.columns, 132 | 'Importance': np.abs(model.coef_) 133 | }) 134 | fig = px.bar(importance, x='Feature', y='Importance', 135 | title='Feature Importance in Prediction') 136 | st.plotly_chart(fig) 137 | 138 | # Interactive Prediction 139 | st.subheader("Make a Prediction") 140 | st.write("Enter traffic conditions:") 141 | 142 | col1, col2 = st.columns(2) 143 | with col1: 144 | traffic_volume = st.slider("Traffic Volume", 1000, 10000, 5000) 145 | weather = st.selectbox("Weather Condition", df['WeatherCondition'].unique()) 146 | with col2: 147 | time = st.selectbox("Time of Day", df['TimeOfDay'].unique()) 148 | road = st.selectbox("Road Type", df['RoadType'].unique()) 149 | 150 | if st.button("Predict"): 151 | # Prepare input data 152 | input_data = pd.DataFrame({ 153 | 'TrafficVolume': [traffic_volume], 154 | 'WeatherCondition': [weather], 155 | 'TimeOfDay': [time], 156 | 'RoadType': [road] 157 | }) 158 | 159 | # One-hot encode categorical variables 160 | input_encoded = pd.get_dummies(input_data) 161 | # Ensure all columns from training data are present 162 | for col in X.columns: 163 | if col not in input_encoded.columns: 164 | input_encoded[col] = 0 165 | input_encoded = input_encoded[X.columns] 166 | 167 | # Scale input data 168 | input_scaled = scaler.transform(input_encoded) 169 | 170 | # Make prediction 171 | prediction = model.predict(input_scaled)[0] 172 | 173 | # Display prediction 174 | st.subheader("Prediction Result") 175 | st.metric("Expected Number of Accidents", f"{prediction:.1f}") 176 | 177 | # Visualize prediction with confidence interval 178 | fig = go.Figure() 179 | fig.add_trace(go.Bar( 180 | x=['Predicted Accidents'], 181 | y=[prediction], 182 | error_y=dict(type='data', array=[np.sqrt(prediction)], visible=True), 183 | name='Prediction' 184 | )) 185 | fig.update_layout(title='Predicted Accidents with 95% Confidence Interval') 186 | st.plotly_chart(fig) 187 | 188 | # Data Analysis 189 | st.subheader("Data Analysis") 190 | 191 | # Weather impact 192 | fig = px.box(df, x='WeatherCondition', y='Accidents', 193 | title='Accident Distribution by Weather Condition') 194 | st.plotly_chart(fig) 195 | 196 | # Time of day impact 197 | fig = px.box(df, x='TimeOfDay', y='Accidents', 198 | title='Accident Distribution by Time of Day') 199 | st.plotly_chart(fig) 200 | 201 | # Road type impact 202 | fig = px.box(df, x='RoadType', y='Accidents', 203 | title='Accident Distribution by Road Type') 204 | 
st.plotly_chart(fig) 205 | 206 | # Traffic volume vs accidents 207 | fig = px.scatter(df, x='TrafficVolume', y='Accidents', 208 | color='WeatherCondition', 209 | title='Traffic Volume vs Accidents by Weather Condition') 210 | st.plotly_chart(fig) 211 | 212 | if __name__ == "__main__": 213 | run() -------------------------------------------------------------------------------- /Poisson_Regression/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the Poisson_Regression_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | 8 | from Poisson_Regression_projects import ( 9 | competition_award, 10 | no_of_car_accident, 11 | ) 12 | 13 | def run(): 14 | st.title("Poisson Regression Projects") 15 | 16 | # Sidebar for project selection 17 | project = st.sidebar.selectbox( 18 | "Select a project", 19 | [ 20 | "Competition Award Prediction", 21 | "Number of Car Accidents Prediction", 22 | ], 23 | ) 24 | 25 | # Run the selected project 26 | if project == "Competition Award Prediction": 27 | competition_award.run() 28 | elif project == "Number of Car Accidents Prediction": 29 | no_of_car_accident.run() 30 | 31 | if __name__ == "__main__": 32 | run() -------------------------------------------------------------------------------- /Poisson_Regression/readme.md: -------------------------------------------------------------------------------- 1 | # Poisson Regression Projects 2 | 3 | This repository contains various Poisson Regression projects implemented in Python. Each project demonstrates the application of Poisson Regression to solve real-world problems using datasets. 4 | 5 | ## Project Structure 6 | 7 | ``` 8 | Poisson_Regression/ 9 | ├── main.py 10 | ├── requirements.txt 11 | ├── Poisson_Regression_projects/ 12 | │ ├── competition_award.py 13 | │ ├── no_of_car_accident.py 14 | │ ├── competition_awards_data.csv 15 | ``` 16 | 17 | ### Key Files 18 | - **`main.py`**: The main entry point for running the Streamlit app. 19 | - **`requirements.txt`**: Contains the dependencies required to run the project. 20 | - **`Poisson_Regression_projects/`**: Contains individual project scripts and datasets. 21 | 22 | ## Projects Included 23 | 24 | 1. **Competition Award Prediction** 25 | Predicts the number of awards a student will receive based on their math scores using Poisson Regression. 26 | Dataset: `competition_awards_data.csv` 27 | 28 | **Screenshots:** 29 | ![Competition Award Prediction](screenshots/comp.png) 30 | 31 | 2. **Number of Car Accidents Prediction** 32 | Predicts the number of car accidents based on traffic volume, weather condition, time of day, and road type using Poisson Regression. 33 | Dataset: Synthetic data (generated in the script). 34 | 35 | **Screenshots:** 36 | ![Car Accident Prediction](screenshots/caraccidents.png) 37 | 38 | ## How to Run 39 | 40 | 1. Clone the repository: 41 | ```bash 42 | git clone https://github.com/benasphy/ML_projects.git 43 | cd Poisson_Regression 44 | ``` 45 | 46 | 2. Install dependencies: 47 | ```bash 48 | pip install -r requirements.txt 49 | ``` 50 | 51 | 3. Run the Streamlit app: 52 | ```bash 53 | streamlit run main.py 54 | ``` 55 | 56 | 4. Select a project from the sidebar to explore its functionality.
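For orientation, the modelling core shared by both projects is scikit-learn's `PoissonRegressor`, which fits a log-linear model to a count-valued target. A minimal sketch on synthetic data follows; the single feature and rate used here are illustrative, not the projects' actual columns.

```python
import numpy as np
from sklearn.linear_model import PoissonRegressor

# Synthetic counts whose expected value grows with a single feature x
rng = np.random.default_rng(42)
x = rng.uniform(0, 10, size=(500, 1))
y = rng.poisson(lam=np.exp(0.3 * x[:, 0]))

model = PoissonRegressor(alpha=0.1)  # log link with L2 penalty strength alpha
model.fit(x, y)
print(model.predict([[5.0]]))  # expected count at x = 5
```

The car-accident app follows the same idea but one-hot encodes its categorical inputs and standardizes the features before fitting.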
57 | 58 | ## Requirements 59 | 60 | The project requires the following Python libraries: 61 | - `streamlit` 62 | - `numpy` 63 | - `pandas` 64 | - `scikit-learn` 65 | 66 | ## Datasets 67 | 68 | - **`competition_awards_data.csv`**: Contains data for predicting the number of awards based on math scores. 69 | 70 | ## Screenshots 71 | 72 | Add screenshots of the Streamlit app interface here to showcase the projects. 73 | 74 | ## License 75 | 76 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. 77 | 78 | ## Acknowledgments 79 | 80 | - Datasets used in this project are sourced from publicly available repositories. 81 | - Special thanks to the contributors of the Python libraries used in this project. 82 | 83 | --- 84 | Feel free to contribute to this repository by submitting issues or pull requests. 85 | -------------------------------------------------------------------------------- /Poisson_Regression/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | numpy 3 | pandas 4 | scikit-learn 5 | plotly 6 | scipy 7 | seaborn -------------------------------------------------------------------------------- /Poisson_Regression/screenshots/caraccident.png: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Poisson_Regression/screenshots/caraccidents.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Poisson_Regression/screenshots/caraccidents.png -------------------------------------------------------------------------------- /Poisson_Regression/screenshots/comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Poisson_Regression/screenshots/comp.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Projects Collection 2 | 3 | A comprehensive collection of machine learning projects implemented in Python, covering various algorithms and techniques. Each project is designed to solve real-world problems using different machine learning approaches. 
4 | 5 | ## Project Categories 6 | 7 | ### Supervised Learning 8 | - **Linear Regression** 9 | - House Price Prediction 10 | - Salary Prediction 11 | - Study Hours vs Exam Score Prediction 12 | - Messi Goal Prediction 13 | - Normal Equation vs Gradient Descent Implementation 14 | 15 | - **Logistic Regression** 16 | - Diabetes Prediction 17 | - Rock vs Mine Classification 18 | - Simple HIV Prediction 19 | 20 | - **Naive Bayes** 21 | - Fake News Detection 22 | - Spam Detection 23 | - Weather Prediction 24 | 25 | - **Support Vector Machine (SVM)** 26 | - Breast Cancer Prediction 27 | - Spam Detection 28 | 29 | - **K-Nearest Neighbors (KNN)** 30 | - Movie Recommendation System 31 | - T-Shirt Size Prediction 32 | 33 | - **Decision Trees** 34 | - Gym Decision Tree 35 | - Gini Impurity Implementation 36 | 37 | ### Unsupervised Learning 38 | - **Clustering** 39 | - **K-Means** 40 | - Customer Segmentation 41 | - Loan Approval Clustering 42 | 43 | - **Gaussian Mixture Models (GMM)** 44 | - Customer Segmentation 45 | - Image Color Segmentation 46 | 47 | - **DBSCAN/HDBSCAN** 48 | - Customer Behavior Analysis 49 | - Anomaly Detection 50 | 51 | - **Hierarchical Clustering** 52 | - Document Clustering 53 | - Market Basket Analysis 54 | 55 | - **Fuzzy C-Means** 56 | - Customer Profiling 57 | - Image Segmentation 58 | 59 | ### Other Techniques 60 | - **Dimensionality Reduction** 61 | - Feature Selection 62 | - Image Compression 63 | 64 | - **Association Rule Learning** 65 | - Market Basket Analysis 66 | - Recommendation System 67 | 68 | - **Poisson Regression** 69 | - Competition Award Prediction 70 | - Car Accident Prediction 71 | 72 | ## Project Structure 73 | 74 | Each project category has its own directory containing: 75 | - `main.py`: Main entry point for running the Streamlit app 76 | - `requirements.txt`: Required Python packages 77 | - Project-specific files and datasets 78 | - Detailed README.md with project documentation 79 | 80 | ## Getting Started 81 | 82 | 1. Clone the repository: 83 | ```bash 84 | git clone https://github.com/benasphy/ML_projects.git 85 | cd ML_projects 86 | ``` 87 | 88 | 2. Install dependencies for a specific project: 89 | ```bash 90 | cd 91 | pip install -r requirements.txt 92 | ``` 93 | 94 | 3. Run the Streamlit app: 95 | ```bash 96 | streamlit run main.py 97 | ``` 98 | 99 | ## Common Requirements 100 | 101 | Most projects require these Python libraries: 102 | - `streamlit` 103 | - `numpy` 104 | - `pandas` 105 | - `scikit-learn` 106 | - `matplotlib` 107 | - `plotly` 108 | 109 | Additional requirements are specified in each project's `requirements.txt` file. 110 | 111 | ## Features 112 | 113 | - Interactive web interfaces using Streamlit 114 | - Real-time data visualization 115 | - Model evaluation and metrics 116 | - Custom dataset support 117 | - Comprehensive documentation 118 | - Clean and modular code structure 119 | 120 | ## Contributing 121 | 122 | Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change. 123 | 124 | ## License 125 | 126 | This project is licensed under the MIT License - see the LICENSE file for details. 
127 | 128 | ## Acknowledgments 129 | 130 | - Datasets used in these projects are sourced from publicly available repositories 131 | - Special thanks to the contributors of the Python libraries used in these projects 132 | - Inspired by various machine learning courses and tutorials 133 | 134 | --- 135 | Feel free to star the repository if you find it useful! -------------------------------------------------------------------------------- /SVM/SVM_projects/breast_cancer_prediction.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.svm import SVC 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.model_selection import train_test_split, cross_val_score 7 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 8 | from sklearn.datasets import load_breast_cancer 9 | import plotly.express as px 10 | import plotly.graph_objects as go 11 | import seaborn as sns 12 | import matplotlib.pyplot as plt 13 | from sklearn.decomposition import PCA 14 | 15 | def run(): 16 | st.header("Breast Cancer Prediction using SVM") 17 | st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/SVM)", unsafe_allow_html=True) 18 | 19 | # Load dataset 20 | data = load_breast_cancer() 21 | df = pd.DataFrame(data.data, columns=data.feature_names) 22 | df['target'] = data.target 23 | 24 | # Display dataset info 25 | st.subheader("Dataset Overview") 26 | col1, col2 = st.columns(2) 27 | with col1: 28 | st.write("Dataset Shape:", df.shape) 29 | st.write("Number of Samples:", len(df)) 30 | with col2: 31 | target_dist = df['target'].value_counts() 32 | fig = px.pie(values=target_dist.values, 33 | names=['Benign', 'Malignant'], 34 | title='Diagnosis Distribution') 35 | st.plotly_chart(fig) 36 | 37 | # Data Analysis 38 | st.subheader("Data Analysis") 39 | 40 | # Feature distributions 41 | st.write("Feature Distributions by Diagnosis") 42 | selected_feature = st.selectbox("Select Feature to View:", data.feature_names) 43 | 44 | fig = px.box(df, x='target', y=selected_feature, 45 | title=f'{selected_feature} Distribution by Diagnosis', 46 | labels={'target': 'Diagnosis', selected_feature: selected_feature}) 47 | st.plotly_chart(fig) 48 | 49 | # PCA Visualization 50 | st.subheader("Data Visualization (PCA)") 51 | pca = PCA(n_components=2) 52 | X_pca = pca.fit_transform(df.drop('target', axis=1)) 53 | 54 | pca_df = pd.DataFrame({ 55 | 'PC1': X_pca[:, 0], 56 | 'PC2': X_pca[:, 1], 57 | 'Diagnosis': ['Benign' if x == 1 else 'Malignant' for x in df['target']] 58 | }) 59 | 60 | fig = px.scatter(pca_df, x='PC1', y='PC2', color='Diagnosis', 61 | title='PCA Visualization of Breast Cancer Data') 62 | st.plotly_chart(fig) 63 | 64 | # Data preprocessing 65 | X = df.drop('target', axis=1) 66 | y = df['target'] 67 | 68 | # Split data 69 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 70 | 71 | # Scaling 72 | scaler = StandardScaler() 73 | X_train_scaled = scaler.fit_transform(X_train) 74 | X_test_scaled = scaler.transform(X_test) 75 | 76 | # Train model 77 | model = SVC(kernel='rbf', probability=True) 78 | model.fit(X_train_scaled, y_train) 79 | 80 | # Model evaluation 81 | st.subheader("Model Performance") 82 | y_pred = model.predict(X_test_scaled) 83 | accuracy = accuracy_score(y_test, y_pred) 84 | 85 | # Display metrics 86 | col1, col2, col3 = st.columns(3) 87 | with col1: 88 | 
st.metric("Accuracy", f"{accuracy:.2%}") 89 | with col2: 90 | cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5) 91 | st.metric("Cross-validation Score", f"{cv_scores.mean():.2%}") 92 | with col3: 93 | st.metric("Cross-validation Std", f"{cv_scores.std():.2%}") 94 | 95 | # Confusion Matrix 96 | st.subheader("Confusion Matrix") 97 | cm = confusion_matrix(y_test, y_pred) 98 | fig = px.imshow(cm, 99 | labels=dict(x="Predicted", y="Actual", color="Count"), 100 | x=['Malignant', 'Benign'], 101 | y=['Malignant', 'Benign'], 102 | text_auto=True, 103 | aspect="auto") 104 | st.plotly_chart(fig) 105 | 106 | # Classification Report 107 | st.subheader("Detailed Classification Report") 108 | report = classification_report(y_test, y_pred, output_dict=True) 109 | report_df = pd.DataFrame(report).transpose() 110 | st.dataframe(report_df) 111 | 112 | # Feature Importance 113 | st.subheader("Feature Importance") 114 | # For SVM with RBF kernel, we'll use permutation importance 115 | from sklearn.inspection import permutation_importance 116 | result = permutation_importance(model, X_test_scaled, y_test, n_repeats=10, random_state=42) 117 | 118 | feature_importance = pd.DataFrame({ 119 | 'Feature': data.feature_names, 120 | 'Importance': result.importances_mean 121 | }).sort_values('Importance', ascending=False) 122 | 123 | fig = px.bar(feature_importance.head(10), x='Feature', y='Importance', 124 | title='Top 10 Most Important Features') 125 | st.plotly_chart(fig) 126 | 127 | # Prediction interface 128 | st.subheader("Predict Breast Cancer") 129 | 130 | # Create input fields for each feature 131 | input_data = {} 132 | cols = st.columns(3) 133 | for i, feature in enumerate(data.feature_names): 134 | with cols[i % 3]: 135 | input_data[feature] = st.number_input( 136 | f"{feature}", 137 | min_value=float(df[feature].min()), 138 | max_value=float(df[feature].max()), 139 | value=float(df[feature].mean()) 140 | ) 141 | 142 | if st.button("Predict"): 143 | # Prepare input data 144 | input_df = pd.DataFrame([input_data]) 145 | input_scaled = scaler.transform(input_df) 146 | 147 | # Make prediction 148 | prediction = model.predict(input_scaled)[0] 149 | probabilities = model.predict_proba(input_scaled)[0] 150 | 151 | # Display prediction 152 | col1, col2 = st.columns(2) 153 | with col1: 154 | st.metric("Prediction", "Benign" if prediction == 1 else "Malignant") 155 | with col2: 156 | st.metric("Confidence", f"{max(probabilities):.2%}") 157 | 158 | # Visualize prediction probabilities 159 | fig = go.Figure(data=[ 160 | go.Bar(x=['Malignant', 'Benign'], 161 | y=probabilities, 162 | text=[f'{p:.2%}' for p in probabilities], 163 | textposition='auto', 164 | ) 165 | ]) 166 | fig.update_layout(title='Prediction Probabilities', 167 | xaxis_title='Diagnosis', 168 | yaxis_title='Probability') 169 | st.plotly_chart(fig) 170 | 171 | # Feature Analysis 172 | st.subheader("Feature Analysis") 173 | 174 | # Compare input values with dataset statistics 175 | comparison_df = pd.DataFrame({ 176 | 'Feature': data.feature_names, 177 | 'Your Value': input_data.values(), 178 | 'Dataset Mean': df.drop('target', axis=1).mean(), 179 | 'Dataset Std': df.drop('target', axis=1).std() 180 | }) 181 | 182 | # Calculate z-scores 183 | comparison_df['Z-Score'] = (comparison_df['Your Value'] - comparison_df['Dataset Mean']) / comparison_df['Dataset Std'] 184 | 185 | # Plot feature comparison 186 | fig = px.bar(comparison_df.head(10), x='Feature', y='Z-Score', 187 | title='Feature Comparison (Z-Scores)', 188 | color='Z-Score', 189 | 
color_continuous_scale=['red', 'white', 'green']) 190 | fig.add_hline(y=0, line_dash="dash", line_color="black") 191 | st.plotly_chart(fig) 192 | 193 | if __name__ == "__main__": 194 | run() -------------------------------------------------------------------------------- /SVM/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | from pathlib import Path 4 | 5 | # Ensure the SVM_projects folder is in the Python path 6 | sys.path.append(str(Path(__file__).parent)) 7 | 8 | from SVM_projects import ( 9 | spam_detection, 10 | breast_cancer_prediction, 11 | ) 12 | 13 | def run(): 14 | st.title("SVM Projects") 15 | 16 | # Sidebar for project selection 17 | project = st.sidebar.selectbox( 18 | "Select a project", 19 | [ 20 | "Spam Detection", 21 | "Breast Cancer Prediction", 22 | ], 23 | ) 24 | 25 | # Run the selected project 26 | if project == "Spam Detection": 27 | spam_detection.run() 28 | elif project == "Breast Cancer Prediction": 29 | breast_cancer_prediction.run() 30 | 31 | if __name__ == "__main__": 32 | run() 33 | -------------------------------------------------------------------------------- /SVM/readme.md: -------------------------------------------------------------------------------- 1 | # SVM Projects 2 | 3 | This folder contains various projects that utilize the Support Vector Machine (SVM) algorithm for different applications. Each project is designed to demonstrate the use of SVM in machine learning tasks. 4 | 5 | ## Projects 6 | 7 | 1. **Spam Detection**: Classifies emails as spam or not spam using SVM. 8 | 9 | **Screenshots:** 10 | ![Spam Detection](screenshots/spam.png) 11 | 2. **Breast Cancer Prediction**: Predicts whether a breast cancer tumor is benign or malignant using SVM. 12 | 13 | **Screenshots:** 14 | ![Breast Cancer Prediction](screenshots/breast.png) 15 | 16 | ## How to Run 17 | 18 | To run any of the projects, follow these steps: 19 | 20 | 1. Ensure you have the required dependencies installed. You can install them using pip: 21 | 22 | ```bash 23 | pip install streamlit pandas numpy scikit-learn 24 | ``` 25 | 26 | 2. Navigate to the SVM directory in your terminal. 27 | 28 | 3. Run the Streamlit app using the following command: 29 | 30 | ```bash 31 | streamlit run main.py 32 | ``` 33 | 34 | 4. Use the sidebar to select the project you want to run. 35 | 36 | ## Project Structure 37 | 38 | - `main.py`: The main entry point for running the projects. 39 | - `SVM_projects/`: Contains individual project files: 40 | - `spam_detection.py`: Spam detection project. 41 | - `breast_cancer_prediction.py`: Breast cancer prediction project. 42 | 43 | ## Data 44 | 45 | - The spam detection project uses the spam.csv dataset 46 | - The breast cancer prediction project uses the built-in breast cancer dataset from scikit-learn 47 | 48 | ## Contributing 49 | 50 | Feel free to contribute to these projects by submitting pull requests or opening issues for any bugs or feature requests. 51 | 52 | ## License 53 | 54 | This project is licensed under the MIT License - see the LICENSE file for details. 
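For reference, the modelling step both apps share is an `SVC` combined with feature scaling; the breast-cancer app additionally uses an RBF kernel with `probability=True` so it can report class probabilities. A minimal, self-contained sketch of that setup is below (the pipeline shown here is an illustrative simplification, not the exact code in `SVM_projects/`):

```python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features, then fit an RBF-kernel SVM with probability estimates enabled
clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", probability=True))
clf.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))
print("Class probabilities:", clf.predict_proba(X_test[:1]))
```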
55 | -------------------------------------------------------------------------------- /SVM/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.22.0 2 | pandas==1.5.3 3 | numpy==1.24.3 4 | scikit-learn==1.2.2 5 | plotly 6 | scipy 7 | seaborn 8 | -------------------------------------------------------------------------------- /SVM/screenshots/breast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/SVM/screenshots/breast.png -------------------------------------------------------------------------------- /SVM/screenshots/spam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/SVM/screenshots/spam.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import importlib 3 | 4 | def run(): 5 | st.title("Machine Learning Algorithms") 6 | 7 | # List of available algorithm folders 8 | algorithms = [ 9 | "Linear Regression", 10 | "Logistic Regression", 11 | "Decision Trees", 12 | "Poisson Regression", 13 | "Support Vector Machines", 14 | "K-Nearest Neighbors", 15 | "Naive Bayes", 16 | "GMM", 17 | "Hierarchical Clustering", 18 | "DBSCAN & HDBSCAN", 19 | "Fuzzy C-Means", 20 | "Association Rule Learning", 21 | "K-Means Clustering", 22 | "Dimensionality Reduction", 23 | ] 24 | 25 | # Sidebar to select an algorithm 26 | selected_algorithm = st.sidebar.selectbox("Select an Algorithm", algorithms) 27 | 28 | # Map algorithm names to module paths 29 | algorithm_modules = { 30 | "Linear Regression": "Linear_Regression.main", 31 | "Logistic Regression": "Logistic_Regression.main", 32 | "Decision Trees": "Decision_Trees.main", 33 | "Poisson Regression": "Poisson_Regression.main", 34 | "Support Vector Machines": "SVM.main", 35 | "K-Nearest Neighbors": "KNN.main", 36 | "Naive Bayes": "Naive_Bayes.main", 37 | "GMM": "GMM.main", 38 | "Hierarchical Clustering": "Hierarchical_Clustering.main", 39 | "DBSCAN & HDBSCAN": "DBSCAN_HDBSCAN.main", 40 | "Fuzzy C-Means": "Fuzzy_C_Means.main", 41 | "Association Rule Learning": "Association_Rule_Learning.main", 42 | "K-Means Clustering": "K-Means.main", 43 | "Dimensionality Reduction": "Dimensionality_Reduction.main" 44 | } 45 | 46 | st.write(f"You selected: {selected_algorithm}") 47 | st.write("The selected algorithm's app will appear below.") 48 | 49 | # Dynamically import and run the selected module's run() function 50 | module_path = algorithm_modules[selected_algorithm] 51 | try: 52 | module = importlib.import_module(module_path) 53 | module.run() 54 | except Exception as e: 55 | st.error(f"Failed to load {selected_algorithm} app: {e}") 56 | 57 | if __name__ == "__main__": 58 | run() -------------------------------------------------------------------------------- /packages.txt: -------------------------------------------------------------------------------- 1 | libgl1-mesa-glx 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | numpy 3 | pandas 4 | scikit-learn 5 | plotly 6 | scipy 7 | seaborn 8 | wordcloud 9 | opencv-python 10 | hdbscan 11 | scikit-image 
12 | mlxtend --------------------------------------------------------------------------------