├── .devcontainer
    └── devcontainer.json
├── .gitignore
├── Association_Rule_Learning
    ├── Association_Rule_Learning_projects
    │   ├── market_basket_analysis.py
    │   └── recommendation_system.py
    ├── README.md
    ├── main.py
    ├── requirements.txt
    └── screenshots
    │   ├── feature_sel.png
    │   ├── img_comp.png
    │   ├── mark_bask.png
    │   └── recomm.png
├── DBSCAN_HDBSCAN
    ├── DBSCAN_HDBSCAN_projects
    │   ├── anomaly_detection.py
    │   └── customer_behavior_analysis.py
    ├── main.py
    ├── readme.md
    ├── requirements.txt
    └── screenshots
    │   ├── anom_det.png
    │   └── customer_behavior.png
├── Decision_Trees
    ├── Decision_Trees_projects
    │   ├── gini_impurity_implementation.py
    │   └── gym_decision_tree.py
    ├── main.py
    ├── readme.md
    ├── requirements.txt
    └── screenshots
    │   ├── gini.png
    │   └── gym1.png
├── Dimensionality_Reduction
    ├── Dimensionality_Reduction_projects
    │   ├── feature_selection.py
    │   └── image_compression.py
    ├── README.md
    ├── main.py
    ├── requirements.txt
    └── screenshots
    │   ├── feature_sel.png
    │   └── img_comp.png
├── Fuzzy_C_Means
    ├── Fuzzy_C_Means_projects
    │   ├── customer_profiling.py
    │   └── image_segmentation.py
    ├── README.md
    ├── main.py
    ├── requirements.txt
    └── screenshots
    │   ├── cust_prof.png
    │   └── imag_seg.png
├── GMM
    ├── GMM_projects
    │   ├── customer_segmentation.py
    │   └── image_color_segmentation.py
    ├── main.py
    ├── readme.md
    ├── requirements.txt
    └── screenshots
    │   ├── cust.png
    │   └── image_clust.png
├── Hierarchical_Clustering
    ├── Hierarchical_projects
    │   ├── document_clustering.py
    │   └── market_basket_analysis.py
    ├── README.md
    ├── main.py
    ├── requirements.txt
    └── screenshots
    │   ├── doc_clust.png
    │   └── market_basket.png
├── K-Means
    ├── K_Means_projects
    │   ├── customer_segmentation.py
    │   └── loan_approval.py
    ├── main.py
    ├── readme.md
    ├── requirements.txt
    └── screenshots
    │   ├── cust_seg.png
    │   └── loan.png
├── KNN
    ├── KNN_projects
    │   ├── TShirt_size.csv
    │   ├── movie_recommendation.py
    │   ├── netflix_titles.csv
    │   └── tshirt_size_prediction.py
    ├── main.py
    ├── readme.md
    ├── requirements.txt
    └── screenshots
    │   ├── movie1.png
    │   ├── movie2.png
    │   └── t-shirt.png
├── Linear_Regression
    ├── Linear_regression_projects
    │   ├── Salary_dataset.csv
    │   ├── __init__.py
    │   ├── house_price_prediction.py
    │   ├── messi_goal_prediction.py
    │   ├── normal_equation_vs_gradient_descent.py
    │   ├── salary_prediction.py
    │   └── study_hours_exam_prediction.py
    ├── README.md
    ├── main.py
    ├── requirements.txt
    └── screenshots
    │   ├── house1.png
    │   ├── leo1.png
    │   ├── leo2.png
    │   ├── leo3.png
    │   ├── norm_grad1.png
    │   ├── norm_grad2.png
    │   ├── salary_pred.png
    │   ├── score1.png
    │   └── score2.png
├── Logistic_Regression
    ├── Logistic_Regression_projects
    │   ├── Copy of sonar data.csv
    │   ├── diabetes.csv
    │   ├── diabetes_prediction.py
    │   ├── rock_vs_mine.py
    │   └── simple_hiv_prediction.py
    ├── main.py
    ├── readme.md
    ├── requirements.txt
    └── screenshots
    │   ├── diab1.png
    │   ├── diab2.png
    │   ├── hiv.png
    │   └── rock_mine.png
├── Naive_Bayes
    ├── Naive_Bayes_projects
    │   ├── FakeNewsNet.csv
    │   ├── fake_news_detection.py
    │   ├── fake_news_prediction.py
    │   ├── spam.csv
    │   ├── spam_detection_nb.py
    │   └── weather_prediction.py
    ├── main.py
    ├── readme.md
    ├── requirements.txt
    └── screenshots
    │   ├── fake_news.png
    │   ├── spamde.png
    │   └── weath.png
├── Poisson_Regression
    ├── Poisson_Regression_projects
    │   ├── competition_award.py
    │   ├── competition_awards_data.csv
    │   └── no_of_car_accident.py
    ├── main.py
    ├── readme.md
    ├── requirements.txt
    └── screenshots
    │   ├── caraccident.png
    │   ├── caraccidents.png
    │   └── comp.png
├── README.md
├── SVM
    ├── SVM_projects
    │   ├── breast_cancer_prediction.py
    │   └── spam_detection.py
    ├── main.py
    ├── readme.md
    ├── requirements.txt
    └── screenshots
    │   ├── breast.png
    │   └── spam.png
├── main.py
├── packages.txt
└── requirements.txt


/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Python 3",
 3 |   // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
 4 |   "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
 5 |   "customizations": {
 6 |     "codespaces": {
 7 |       "openFiles": [
 8 |         "README.md",
 9 |         "main.py"
10 |       ]
11 |     },
12 |     "vscode": {
13 |       "settings": {},
14 |       "extensions": [
15 |         "ms-python.python",
16 |         "ms-python.vscode-pylance"
17 |       ]
18 |     }
19 |   },
20 |   "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
21 |   "postAttachCommand": {
22 |     "server": "streamlit run main.py --server.enableCORS false --server.enableXsrfProtection false"
23 |   },
24 |   "portsAttributes": {
25 |     "8501": {
26 |       "label": "Application",
27 |       "onAutoForward": "openPreview"
28 |     }
29 |   },
30 |   "forwardPorts": [
31 |     8501
32 |   ]
33 | }


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | *.so
 6 | .Python
 7 | build/
 8 | develop-eggs/
 9 | dist/
10 | downloads/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | 
23 | # Virtual Environment
24 | venv/
25 | ENV/
26 | env/
27 | 
28 | # IDE
29 | .idea/
30 | .vscode/
31 | *.swp
32 | *.swo
33 | 
34 | # OS
35 | .DS_Store
36 | .DS_Store?
37 | ._*
38 | .Spotlight-V100
39 | .Trashes
40 | ehthumbs.db
41 | Thumbs.db
42 | 
43 | # Jupyter Notebook
44 | .ipynb_checkpoints
45 | 
46 | # Streamlit
47 | .streamlit/
48 | 
49 | # Logs
50 | *.log 


--------------------------------------------------------------------------------
/Association_Rule_Learning/Association_Rule_Learning_projects/market_basket_analysis.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from mlxtend.frequent_patterns import apriori, association_rules
  5 | import plotly.express as px
  6 | import plotly.graph_objects as go
  7 | import networkx as nx
  8 | import matplotlib.pyplot as plt
  9 | 
 10 | def generate_sample_data(n_transactions=1000):
 11 |     # Define product categories and their items with common combinations
 12 |     categories = {
 13 |         'Dairy': ['Milk', 'Cheese', 'Yogurt', 'Butter', 'Cream'],
 14 |         'Bakery': ['Bread', 'Cake', 'Cookies', 'Muffins', 'Pastries'],
 15 |         'Produce': ['Apples', 'Bananas', 'Oranges', 'Tomatoes', 'Lettuce'],
 16 |         'Meat': ['Chicken', 'Beef', 'Pork', 'Fish', 'Sausage'],
 17 |         'Snacks': ['Chips', 'Crackers', 'Nuts', 'Popcorn', 'Candy']
 18 |     }
 19 |     
 20 |     # Define common item combinations
 21 |     common_combinations = [
 22 |         ['Milk', 'Bread'],
 23 |         ['Cheese', 'Bread'],
 24 |         ['Chicken', 'Lettuce'],
 25 |         ['Chips', 'Soda'],
 26 |         ['Cookies', 'Milk'],
 27 |         ['Apples', 'Bananas'],
 28 |         ['Beef', 'Tomatoes'],
 29 |         ['Crackers', 'Cheese']
 30 |     ]
 31 |     
 32 |     # Generate transactions
 33 |     transactions = []
 34 |     for _ in range(n_transactions):
 35 |         transaction = []
 36 |         
 37 |         # 70% chance to include a common combination
 38 |         if np.random.random() < 0.7:
 39 |             # Randomly select a combination index
 40 |             combo_idx = np.random.randint(0, len(common_combinations))
 41 |             transaction.extend(common_combinations[combo_idx])
 42 |         
 43 |         # Add 1-3 random items
 44 |         n_additional = np.random.randint(1, 4)
 45 |         for _ in range(n_additional):
 46 |             category = np.random.choice(list(categories.keys()))
 47 |             item = np.random.choice(categories[category])
 48 |             if item not in transaction:  # Avoid duplicates
 49 |                 transaction.append(item)
 50 |         
 51 |         transactions.append(transaction)
 52 |     
 53 |     return transactions
 54 | 
 55 | def format_rule(rule):
 56 |     """Convert frozenset to string for display"""
 57 |     antecedents = ', '.join(list(rule['antecedents']))
 58 |     consequents = ', '.join(list(rule['consequents']))
 59 |     return f"{antecedents} → {consequents}"
 60 | 
 61 | def run():
 62 |     st.header("Market Basket Analysis")
 63 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Association_Rule_Learning)", unsafe_allow_html=True)
 64 | 
 65 |     # Load or generate dataset
 66 |     uploaded_file = st.file_uploader("Upload a CSV file with transaction data", type=["csv"])
 67 |     if uploaded_file is not None:
 68 |         df = pd.read_csv(uploaded_file)
 69 |         # Convert to list of transactions
 70 |         transactions = df.values.tolist()
 71 |     else:
 72 |         st.info("Using sample transaction data")
 73 |         transactions = generate_sample_data()
 74 | 
 75 |     # Convert transactions to one-hot encoded DataFrame
 76 |     unique_items = list(set(item for transaction in transactions for item in transaction))
 77 |     df = pd.DataFrame([[1 if item in transaction else 0 for item in unique_items] 
 78 |                       for transaction in transactions], columns=unique_items)
 79 | 
 80 |     # Display data info
 81 |     st.subheader("Dataset Information")
 82 |     st.write(f"Number of transactions: {len(transactions)}")
 83 |     st.write(f"Number of unique items: {len(unique_items)}")
 84 |     st.write("Sample transactions:")
 85 |     st.dataframe(df.head())
 86 | 
 87 |     # Parameters
 88 |     st.subheader("Association Rule Parameters")
 89 |     min_support = st.slider("Minimum Support", min_value=0.001, max_value=0.5, value=0.005, step=0.001)
 90 |     min_confidence = st.slider("Minimum Confidence", min_value=0.1, max_value=1.0, value=0.2, step=0.1)
 91 |     min_lift = st.slider("Minimum Lift", min_value=1.0, max_value=5.0, value=1.1, step=0.1)
 92 | 
 93 |     if st.button("Generate Rules"):
 94 |         try:
 95 |             # Generate frequent itemsets
 96 |             frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True)
 97 |             
 98 |             if len(frequent_itemsets) == 0:
 99 |                 st.warning("No frequent itemsets found. Try lowering the minimum support threshold.")
100 |                 return
101 |             
102 |             # Generate rules
103 |             rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
104 |             rules = rules[rules['lift'] >= min_lift]
105 |             
106 |             if len(rules) > 0:
107 |                 # Create a copy of rules with formatted strings for display
108 |                 display_rules = rules.copy()
109 |                 display_rules['rule'] = display_rules.apply(format_rule, axis=1)
110 |                 
111 |                 # Display rules
112 |                 st.subheader("Association Rules")
113 |                 st.dataframe(display_rules[['rule', 'support', 'confidence', 'lift']])
114 |                 
115 |                 # Visualize support vs confidence
116 |                 fig = px.scatter(rules, x="support", y="confidence", 
117 |                                size="lift", color="lift",
118 |                                hover_data=["antecedents", "consequents"],
119 |                                title="Support vs Confidence")
120 |                 st.plotly_chart(fig)
121 |                 
122 |                 # Network visualization
123 |                 st.subheader("Rule Network")
124 |                 G = nx.Graph()
125 |                 
126 |                 # Add nodes and edges
127 |                 for _, rule in rules.iterrows():
128 |                     antecedents = list(rule['antecedents'])[0]
129 |                     consequents = list(rule['consequents'])[0]
130 |                     G.add_edge(antecedents, consequents, weight=rule['lift'])
131 |                 
132 |                 # Create plot
133 |                 plt.figure(figsize=(12, 8))
134 |                 pos = nx.spring_layout(G)
135 |                 nx.draw(G, pos, with_labels=True, node_color='lightblue', 
136 |                        node_size=1500, font_size=10, font_weight='bold')
137 |                 st.pyplot(plt)
138 |                 
139 |                 # Top rules by lift
140 |                 st.subheader("Top Rules by Lift")
141 |                 top_rules = rules.sort_values('lift', ascending=False).head(5)
142 |                 for _, rule in top_rules.iterrows():
143 |                     antecedents = list(rule['antecedents'])[0]
144 |                     consequents = list(rule['consequents'])[0]
145 |                     st.write(f"If {antecedents} → {consequents}")
146 |                     st.write(f"Support: {rule['support']:.3f}, Confidence: {rule['confidence']:.3f}, Lift: {rule['lift']:.3f}")
147 |                     st.write("---")
148 |             else:
149 |                 st.warning("No rules found with the current parameters. Try adjusting the thresholds.")
150 |         except Exception as e:
151 |             st.error(f"An error occurred: {str(e)}")
152 |             st.info("Try adjusting the parameters or using different data.")
153 | 
154 | if __name__ == "__main__":
155 |     run() 


--------------------------------------------------------------------------------
/Association_Rule_Learning/Association_Rule_Learning_projects/recommendation_system.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from mlxtend.frequent_patterns import apriori, association_rules
  5 | import plotly.express as px
  6 | import plotly.graph_objects as go
  7 | import networkx as nx
  8 | import matplotlib.pyplot as plt
  9 | 
 10 | def generate_sample_data(n_users=1000):
 11 |     # Define product categories and their items
 12 |     categories = {
 13 |         'Electronics': ['Smartphone', 'Laptop', 'Tablet', 'Headphones', 'Camera'],
 14 |         'Books': ['Fiction', 'Non-Fiction', 'Biography', 'Science', 'History'],
 15 |         'Movies': ['Action', 'Comedy', 'Drama', 'Sci-Fi', 'Documentary'],
 16 |         'Music': ['Pop', 'Rock', 'Classical', 'Jazz', 'Hip-Hop'],
 17 |         'Games': ['Action', 'Strategy', 'Puzzle', 'Sports', 'RPG']
 18 |     }
 19 |     
 20 |     # Define common user preferences
 21 |     common_preferences = [
 22 |         ['Smartphone', 'Headphones'],
 23 |         ['Laptop', 'Tablet'],
 24 |         ['Fiction', 'Biography'],
 25 |         ['Action', 'Sci-Fi'],
 26 |         ['Pop', 'Rock'],
 27 |         ['Action', 'Strategy'],
 28 |         ['Comedy', 'Drama'],
 29 |         ['Classical', 'Jazz']
 30 |     ]
 31 |     
 32 |     # Generate user interactions
 33 |     interactions = []
 34 |     for _ in range(n_users):
 35 |         user_interactions = []
 36 |         
 37 |         # 80% chance to include a common preference
 38 |         if np.random.random() < 0.8:
 39 |             pref_idx = np.random.randint(0, len(common_preferences))
 40 |             user_interactions.extend(common_preferences[pref_idx])
 41 |         
 42 |         # Add 2-4 random items
 43 |         n_additional = np.random.randint(2, 5)
 44 |         for _ in range(n_additional):
 45 |             category = np.random.choice(list(categories.keys()))
 46 |             item = np.random.choice(categories[category])
 47 |             if item not in user_interactions:  # Avoid duplicates
 48 |                 user_interactions.append(item)
 49 |         
 50 |         interactions.append(user_interactions)
 51 |     
 52 |     return interactions
 53 | 
 54 | def format_rule(rule):
 55 |     """Convert frozenset to string for display"""
 56 |     antecedents = ', '.join(list(rule['antecedents']))
 57 |     consequents = ', '.join(list(rule['consequents']))
 58 |     return f"{antecedents} → {consequents}"
 59 | 
 60 | def run():
 61 |     st.header("Recommendation System using Association Rules")
 62 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Association_Rule_Learning)", unsafe_allow_html=True)
 63 | 
 64 |     # Load or generate dataset
 65 |     uploaded_file = st.file_uploader("Upload a CSV file with user-item interactions", type=["csv"])
 66 |     if uploaded_file is not None:
 67 |         df = pd.read_csv(uploaded_file)
 68 |         # Convert to list of interactions
 69 |         interactions = df.values.tolist()
 70 |     else:
 71 |         st.info("Using sample user-item interaction data")
 72 |         interactions = generate_sample_data()
 73 | 
 74 |     # Convert interactions to one-hot encoded DataFrame
 75 |     unique_items = list(set(item for interaction in interactions for item in interaction))
 76 |     df = pd.DataFrame([[1 if item in interaction else 0 for item in unique_items] 
 77 |                       for interaction in interactions], columns=unique_items)
 78 | 
 79 |     # Display data info
 80 |     st.subheader("Dataset Information")
 81 |     st.write(f"Number of users: {len(interactions)}")
 82 |     st.write(f"Number of unique items: {len(unique_items)}")
 83 |     st.write("Sample interactions:")
 84 |     st.dataframe(df.head())
 85 | 
 86 |     # Parameters
 87 |     st.subheader("Association Rule Parameters")
 88 |     min_support = st.slider("Minimum Support", min_value=0.001, max_value=0.5, value=0.003, step=0.001)
 89 |     min_confidence = st.slider("Minimum Confidence", min_value=0.1, max_value=1.0, value=0.15, step=0.05)
 90 |     min_lift = st.slider("Minimum Lift", min_value=1.0, max_value=5.0, value=1.1, step=0.1)
 91 | 
 92 |     if st.button("Generate Recommendations"):
 93 |         try:
 94 |             # Generate frequent itemsets
 95 |             frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True)
 96 |             
 97 |             if len(frequent_itemsets) == 0:
 98 |                 st.warning("No frequent itemsets found. Try lowering the minimum support threshold.")
 99 |                 return
100 |             
101 |             # Generate rules
102 |             rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
103 |             rules = rules[rules['lift'] >= min_lift]
104 |             
105 |             if len(rules) > 0:
106 |                 # Create a copy of rules with formatted strings for display
107 |                 display_rules = rules.copy()
108 |                 display_rules['rule'] = display_rules.apply(format_rule, axis=1)
109 |                 
110 |                 # Display rules
111 |                 st.subheader("Association Rules")
112 |                 st.dataframe(display_rules[['rule', 'support', 'confidence', 'lift']])
113 |                 
114 |                 # Visualize support vs confidence
115 |                 fig = px.scatter(rules, x="support", y="confidence", 
116 |                                size="lift", color="lift",
117 |                                hover_data=["antecedents", "consequents"],
118 |                                title="Support vs Confidence")
119 |                 st.plotly_chart(fig)
120 |                 
121 |                 # Interactive recommendation
122 |                 st.subheader("Get Recommendations")
123 |                 selected_items = st.multiselect("Select items you like:", unique_items)
124 |                 
125 |                 if selected_items:
126 |                     # Find rules where selected items are in antecedents
127 |                     recommendations = []
128 |                     for _, rule in rules.iterrows():
129 |                         if all(item in rule['antecedents'] for item in selected_items):
130 |                             recommendations.extend(list(rule['consequents']))
131 |                     
132 |                     if recommendations:
133 |                         # Remove duplicates and selected items
134 |                         recommendations = list(set(recommendations) - set(selected_items))
135 |                         
136 |                         # Sort by frequency
137 |                         recommendation_counts = pd.Series(recommendations).value_counts()
138 |                         
139 |                         st.write("Recommended items based on your selection:")
140 |                         for item, count in recommendation_counts.items():
141 |                             st.write(f"- {item} (recommended {count} times)")
142 |                     else:
143 |                         st.info("No specific recommendations found. Try selecting different items or adjusting the parameters.")
144 |             else:
145 |                 st.warning("No rules found with the current parameters. Try adjusting the thresholds.")
146 |         except Exception as e:
147 |             st.error(f"An error occurred: {str(e)}")
148 |             st.info("Try adjusting the parameters or using different data.")
149 | 
150 | if __name__ == "__main__":
151 |     run() 


--------------------------------------------------------------------------------
/Association_Rule_Learning/README.md:
--------------------------------------------------------------------------------
 1 | # Association Rule Learning Projects
 2 | 
 3 | This repository contains projects that demonstrate the application of Association Rule Learning algorithms in various domains.
 4 | 
 5 | ## Projects
 6 | 
 7 | ### 1. Market Basket Analysis
 8 | 
 9 | **Screenshots:**
10 | ![Market Basket Analysis](screenshots/mark_bask.png)
11 | - Interactive transaction data upload
12 | - Customizable support, confidence, and lift thresholds
13 | - Visualization of association rules
14 | - Network visualization of item relationships
15 | - Top rules analysis
16 | - Support vs Confidence scatter plot
17 | 
18 | ### 2. Recommendation System
19 | 
20 | **Screenshots:**
21 | ![Recommendation System](screenshots/recomm.png)
22 | - Interactive user-item interaction data upload
23 | - Customizable support, confidence, and lift thresholds
24 | - Visualization of item relationships
25 | - Network visualization of item connections
26 | - Personalized recommendations based on selected items
27 | - Rule-based item suggestions
28 | 
29 | ### 3. Feature Selection
30 | 
31 | **Screenshots:**
32 | ![Feature Selection](screenshots/feature_sel.png)
33 | - Association rule-based feature selection
34 | - Interactive parameter tuning
35 | - Rule visualization
36 | - Feature importance analysis
37 | 
38 | ### 4. Image Compression
39 | 
40 | **Screenshots:**
41 | ![Image Compression](screenshots/img_comp.png)
42 | - Association rule-based image compression
43 | - Visualization of compressed vs. original image
44 | - Parameter tuning
45 | - Compression ratio analysis
46 | 
47 | ## How to Run
48 | 
49 | 1. Install the required packages:
50 | ```bash
51 | pip install -r requirements.txt
52 | ```
53 | 
54 | 2. Run the Streamlit app:
55 | ```bash
56 | streamlit run main.py
57 | ```
58 | 
59 | ## Project Structure
60 | 
61 | - `main.py`: Main entry point for running the projects
62 | - `Association_Rule_Learning_projects/`: Directory containing individual project files
63 |   - `market_basket_analysis.py`: Market basket analysis using association rules
64 |   - `recommendation_system.py`: Recommendation system using association rules
65 | 
66 | ## Features
67 | 
68 | - Interactive parameter tuning
69 | - Rich visualizations using Plotly and NetworkX
70 | - Support for custom data upload
71 | - Sample data generation
72 | - Detailed analysis tools
73 | - Interactive recommendation capabilities
74 | 
75 | ## Contributing
76 | 
77 | Contributions are welcome! Please feel free to submit a Pull Request.
78 | 
79 | ## License
80 | 
81 | This project is licensed under the MIT License. 


--------------------------------------------------------------------------------
/Association_Rule_Learning/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the Decision_Trees_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | from Association_Rule_Learning_projects import (
 8 |     market_basket_analysis,
 9 |     recommendation_system
10 | )
11 | 
12 | def run():
13 |     st.title("Association Rule Learning Projects")
14 | 
15 |     # Sidebar for project selection
16 |     project = st.sidebar.selectbox(
17 |         "Select a project",
18 |         [
19 |             "Market Basket Analysis",
20 |             "Recommendation System"
21 |         ],
22 |     )
23 | 
24 |     # Run the selected project
25 |     if project == "Market Basket Analysis":
26 |         market_basket_analysis.run()
27 |     elif project == "Recommendation System":
28 |         recommendation_system.run()
29 | 
30 | if __name__ == "__main__":
31 |     run() 


--------------------------------------------------------------------------------
/Association_Rule_Learning/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.22.0
2 | pandas==1.5.3
3 | numpy==1.24.3
4 | mlxtend==0.23.1
5 | plotly==5.13.1
6 | matplotlib==3.7.1
7 | networkx==3.1
8 | scipy==1.10.1
9 | seaborn 


--------------------------------------------------------------------------------
/Association_Rule_Learning/screenshots/feature_sel.png:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/Association_Rule_Learning/screenshots/img_comp.png:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/Association_Rule_Learning/screenshots/mark_bask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Association_Rule_Learning/screenshots/mark_bask.png


--------------------------------------------------------------------------------
/Association_Rule_Learning/screenshots/recomm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Association_Rule_Learning/screenshots/recomm.png


--------------------------------------------------------------------------------
/DBSCAN_HDBSCAN/DBSCAN_HDBSCAN_projects/anomaly_detection.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | import plotly.express as px
  5 | import plotly.graph_objects as go
  6 | from sklearn.cluster import DBSCAN
  7 | from sklearn.preprocessing import StandardScaler
  8 | from sklearn.decomposition import PCA
  9 | import seaborn as sns
 10 | import matplotlib.pyplot as plt
 11 | 
 12 | def run():
 13 |     st.header("Anomaly Detection using DBSCAN")
 14 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/DBSCAN_HDBSCAN)", unsafe_allow_html=True)
 15 | 
 16 |     # Load or generate dataset
 17 |     uploaded_file = st.file_uploader("Upload a CSV file with transaction data", type=["csv"])
 18 |     if uploaded_file is not None:
 19 |         df = pd.read_csv(uploaded_file)
 20 |     else:
 21 |         st.info("Using sample transaction data")
 22 |         # Generate sample transaction data
 23 |         np.random.seed(42)
 24 |         n_transactions = 1000
 25 |         
 26 |         # Generate normal transactions
 27 |         normal_amounts = np.random.normal(100, 20, int(n_transactions * 0.95))
 28 |         normal_times = np.random.normal(12, 2, int(n_transactions * 0.95))
 29 |         
 30 |         # Generate anomalous transactions
 31 |         anomaly_amounts = np.random.uniform(500, 1000, int(n_transactions * 0.05))
 32 |         anomaly_times = np.random.uniform(0, 24, int(n_transactions * 0.05))
 33 |         
 34 |         # Combine normal and anomalous data
 35 |         amounts = np.concatenate([normal_amounts, anomaly_amounts])
 36 |         times = np.concatenate([normal_times, anomaly_times])
 37 |         
 38 |         data = {
 39 |             'Transaction_ID': range(1, n_transactions + 1),
 40 |             'Amount': amounts,
 41 |             'Time': times,
 42 |             'Location_X': np.random.normal(0, 1, n_transactions),
 43 |             'Location_Y': np.random.normal(0, 1, n_transactions),
 44 |             'Merchant_Category': np.random.choice(['Retail', 'Food', 'Travel', 'Other'], n_transactions)
 45 |         }
 46 |         df = pd.DataFrame(data)
 47 | 
 48 |     # Display data info
 49 |     st.subheader("Dataset Information")
 50 |     st.write(f"Number of transactions: {len(df)}")
 51 |     st.write("Sample data:")
 52 |     st.dataframe(df.head())
 53 | 
 54 |     # Feature selection
 55 |     st.subheader("Feature Selection")
 56 |     features = ['Amount', 'Time', 'Location_X', 'Location_Y']
 57 |     selected_features = st.multiselect("Select features for anomaly detection", features, 
 58 |                                      default=['Amount', 'Time'])
 59 | 
 60 |     if len(selected_features) >= 2:
 61 |         # Prepare data
 62 |         X = df[selected_features]
 63 |         scaler = StandardScaler()
 64 |         X_scaled = scaler.fit_transform(X)
 65 | 
 66 |         # DBSCAN parameters
 67 |         st.subheader("DBSCAN Parameters")
 68 |         eps = st.slider("Epsilon (eps)", min_value=0.1, max_value=2.0, value=0.5, step=0.1)
 69 |         min_samples = st.slider("Minimum Samples", min_value=2, max_value=20, value=5)
 70 | 
 71 |         # Apply DBSCAN
 72 |         dbscan = DBSCAN(eps=eps, min_samples=min_samples)
 73 |         df['Cluster'] = dbscan.fit_predict(X_scaled)
 74 |         
 75 |         # Label clusters
 76 |         df['Anomaly'] = df['Cluster'].apply(lambda x: 'Anomaly' if x == -1 else 'Normal')
 77 | 
 78 |         # Visualize clusters using PCA
 79 |         pca = PCA(n_components=2)
 80 |         X_pca = pca.fit_transform(X_scaled)
 81 |         df['PCA1'] = X_pca[:, 0]
 82 |         df['PCA2'] = X_pca[:, 1]
 83 | 
 84 |         # PCA Scatter plot
 85 |         fig = px.scatter(df, x='PCA1', y='PCA2', color='Anomaly',
 86 |                         hover_data=selected_features,
 87 |                         title='Transaction Clusters (PCA Visualization)')
 88 |         st.plotly_chart(fig)
 89 | 
 90 |         # Anomaly Analysis
 91 |         st.subheader("Anomaly Analysis")
 92 |         anomaly_count = len(df[df['Anomaly'] == 'Anomaly'])
 93 |         st.write(f"Number of anomalies detected: {anomaly_count}")
 94 |         st.write(f"Percentage of anomalies: {(anomaly_count/len(df))*100:.2f}%")
 95 | 
 96 |         # Display anomaly statistics
 97 |         st.write("\nAnomaly Statistics:")
 98 |         anomaly_stats = df[df['Anomaly'] == 'Anomaly'][selected_features].describe()
 99 |         st.dataframe(anomaly_stats)
100 | 
101 |         # Feature importance visualization
102 |         st.subheader("Feature Distribution by Cluster")
103 |         for feature in selected_features:
104 |             fig = px.box(df, x='Anomaly', y=feature, title=f'{feature} Distribution by Cluster')
105 |             st.plotly_chart(fig)
106 | 
107 |         # Correlation heatmap
108 |         st.subheader("Feature Correlation Matrix")
109 |         correlation_matrix = df[selected_features].corr()
110 |         fig, ax = plt.subplots(figsize=(10, 8))
111 |         sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
112 |         st.pyplot(fig)
113 | 
114 |         # Interactive prediction
115 |         st.subheader("Check New Transaction")
116 |         input_values = {}
117 |         for feature in selected_features:
118 |             if feature == 'Amount':
119 |                 input_values[feature] = st.number_input(feature, min_value=0.0, max_value=1000.0, value=100.0)
120 |             elif feature == 'Time':
121 |                 input_values[feature] = st.number_input(feature, min_value=0.0, max_value=24.0, value=12.0)
122 |             elif feature == 'Location_X':
123 |                 input_values[feature] = st.number_input(feature, min_value=-3.0, max_value=3.0, value=0.0)
124 |             elif feature == 'Location_Y':
125 |                 input_values[feature] = st.number_input(feature, min_value=-3.0, max_value=3.0, value=0.0)
126 | 
127 |         if st.button("Check for Anomaly"):
128 |             # Create input array with only the selected features
129 |             new_transaction = np.array([[input_values[feature] for feature in selected_features]])
130 |             new_transaction_scaled = scaler.transform(new_transaction)
131 |             prediction = dbscan.fit_predict(np.vstack([X_scaled, new_transaction_scaled]))[-1]
132 |             result = "Anomaly" if prediction == -1 else "Normal"
133 |             st.success(f"Transaction is classified as: {result}")
134 | 
135 | if __name__ == "__main__":
136 |     run() 


--------------------------------------------------------------------------------
/DBSCAN_HDBSCAN/DBSCAN_HDBSCAN_projects/customer_behavior_analysis.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | import plotly.express as px
  5 | import plotly.graph_objects as go
  6 | import hdbscan
  7 | from sklearn.preprocessing import StandardScaler
  8 | from sklearn.decomposition import PCA
  9 | import seaborn as sns
 10 | import matplotlib.pyplot as plt
 11 | 
 12 | def run():
 13 |     st.header("Customer Behavior Analysis using HDBSCAN")
 14 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/DBSCAN_HDBSCAN)", unsafe_allow_html=True)
 15 | 
 16 |     # Load or generate dataset
 17 |     uploaded_file = st.file_uploader("Upload a CSV file with customer behavior data", type=["csv"])
 18 |     if uploaded_file is not None:
 19 |         df = pd.read_csv(uploaded_file)
 20 |     else:
 21 |         st.info("Using sample customer behavior data")
 22 |         # Generate sample customer behavior data
 23 |         np.random.seed(42)
 24 |         n_customers = 1000
 25 |         
 26 |         # Generate different customer segments
 27 |         segments = {
 28 |             'High_Value': {'size': 0.2, 'income': (80000, 150000), 'frequency': (20, 40), 'recency': (0, 30)},
 29 |             'Regular': {'size': 0.4, 'income': (40000, 80000), 'frequency': (10, 20), 'recency': (30, 90)},
 30 |             'Occasional': {'size': 0.3, 'income': (20000, 40000), 'frequency': (5, 10), 'recency': (90, 180)},
 31 |             'Inactive': {'size': 0.1, 'income': (0, 20000), 'frequency': (0, 5), 'recency': (180, 365)}
 32 |         }
 33 |         
 34 |         data = {
 35 |             'Customer_ID': range(1, n_customers + 1),
 36 |             'Annual_Income': [],
 37 |             'Purchase_Frequency': [],
 38 |             'Days_Since_Last_Purchase': [],
 39 |             'Average_Order_Value': [],
 40 |             'Website_Time_Spent': [],
 41 |             'App_Usage_Frequency': []
 42 |         }
 43 |         
 44 |         for segment, params in segments.items():
 45 |             n_segment = int(n_customers * params['size'])
 46 |             data['Annual_Income'].extend(np.random.uniform(*params['income'], n_segment))
 47 |             data['Purchase_Frequency'].extend(np.random.uniform(*params['frequency'], n_segment))
 48 |             data['Days_Since_Last_Purchase'].extend(np.random.uniform(*params['recency'], n_segment))
 49 |             data['Average_Order_Value'].extend(np.random.uniform(50, 500, n_segment))
 50 |             data['Website_Time_Spent'].extend(np.random.uniform(5, 60, n_segment))
 51 |             data['App_Usage_Frequency'].extend(np.random.uniform(1, 30, n_segment))
 52 |         
 53 |         df = pd.DataFrame(data)
 54 | 
 55 |     # Display data info
 56 |     st.subheader("Dataset Information")
 57 |     st.write(f"Number of customers: {len(df)}")
 58 |     st.write("Sample data:")
 59 |     st.dataframe(df.head())
 60 | 
 61 |     # Feature selection
 62 |     st.subheader("Feature Selection")
 63 |     features = ['Annual_Income', 'Purchase_Frequency', 'Days_Since_Last_Purchase',
 64 |                 'Average_Order_Value', 'Website_Time_Spent', 'App_Usage_Frequency']
 65 |     selected_features = st.multiselect("Select features for behavior analysis", features, 
 66 |                                      default=['Annual_Income', 'Purchase_Frequency', 'Days_Since_Last_Purchase'])
 67 | 
 68 |     if len(selected_features) >= 2:
 69 |         # Prepare data
 70 |         X = df[selected_features]
 71 |         scaler = StandardScaler()
 72 |         X_scaled = scaler.fit_transform(X)
 73 | 
 74 |         # HDBSCAN parameters
 75 |         st.subheader("HDBSCAN Parameters")
 76 |         min_cluster_size = st.slider("Minimum Cluster Size", min_value=5, max_value=50, value=15)
 77 |         min_samples = st.slider("Minimum Samples", min_value=1, max_value=20, value=5)
 78 | 
 79 |         # Apply HDBSCAN
 80 |         clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
 81 |         df['Cluster'] = clusterer.fit_predict(X_scaled)
 82 |         
 83 |         # Label clusters
 84 |         df['Segment'] = df['Cluster'].apply(lambda x: f'Segment {x}' if x != -1 else 'Noise')
 85 | 
 86 |         # Visualize clusters using PCA
 87 |         pca = PCA(n_components=2)
 88 |         X_pca = pca.fit_transform(X_scaled)
 89 |         df['PCA1'] = X_pca[:, 0]
 90 |         df['PCA2'] = X_pca[:, 1]
 91 | 
 92 |         # PCA Scatter plot
 93 |         fig = px.scatter(df, x='PCA1', y='PCA2', color='Segment',
 94 |                         hover_data=selected_features,
 95 |                         title='Customer Segments (PCA Visualization)')
 96 |         st.plotly_chart(fig)
 97 | 
 98 |         # Segment Analysis
 99 |         st.subheader("Segment Analysis")
100 |         segment_counts = df['Segment'].value_counts()
101 |         st.write("Segment Distribution:")
102 |         st.write(segment_counts)
103 |         
104 |         # Display segment statistics
105 |         for segment in df['Segment'].unique():
106 |             st.write(f"\n{segment} Customers:")
107 |             segment_data = df[df['Segment'] == segment]
108 |             st.write(f"Number of customers: {len(segment_data)}")
109 |             stats = segment_data[selected_features].describe()
110 |             st.write("Segment Statistics:")
111 |             st.dataframe(stats)
112 | 
113 |         # Feature importance visualization
114 |         st.subheader("Feature Distribution by Segment")
115 |         for feature in selected_features:
116 |             fig = px.box(df, x='Segment', y=feature, title=f'{feature} Distribution by Segment')
117 |             st.plotly_chart(fig)
118 | 
119 |         # Correlation heatmap
120 |         st.subheader("Feature Correlation Matrix")
121 |         correlation_matrix = df[selected_features].corr()
122 |         fig, ax = plt.subplots(figsize=(10, 8))
123 |         sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
124 |         st.pyplot(fig)
125 | 
126 |         # Interactive prediction
127 |         st.subheader("Analyze New Customer")
128 |         input_values = {}
129 |         for feature in selected_features:
130 |             if feature == 'Annual_Income':
131 |                 input_values[feature] = st.number_input(feature, min_value=0.0, max_value=200000.0, value=50000.0)
132 |             elif feature == 'Purchase_Frequency':
133 |                 input_values[feature] = st.number_input(feature, min_value=0.0, max_value=50.0, value=10.0)
134 |             elif feature == 'Days_Since_Last_Purchase':
135 |                 input_values[feature] = st.number_input(feature, min_value=0.0, max_value=365.0, value=30.0)
136 |             elif feature == 'Average_Order_Value':
137 |                 input_values[feature] = st.number_input(feature, min_value=0.0, max_value=1000.0, value=100.0)
138 |             elif feature == 'Website_Time_Spent':
139 |                 input_values[feature] = st.number_input(feature, min_value=0.0, max_value=120.0, value=30.0)
140 |             elif feature == 'App_Usage_Frequency':
141 |                 input_values[feature] = st.number_input(feature, min_value=0.0, max_value=50.0, value=10.0)
142 | 
143 |         if st.button("Analyze Customer"):
144 |             # Create input array with only the selected features
145 |             new_customer = np.array([[input_values[feature] for feature in selected_features]])
146 |             new_customer_scaled = scaler.transform(new_customer)
147 |             prediction = clusterer.fit_predict(np.vstack([X_scaled, new_customer_scaled]))[-1]
148 |             segment = f'Segment {prediction}' if prediction != -1 else 'Noise'
149 |             st.success(f"Customer belongs to: {segment}")
150 | 
151 | if __name__ == "__main__":
152 |     run() 


--------------------------------------------------------------------------------
/DBSCAN_HDBSCAN/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the Decision_Trees_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | from DBSCAN_HDBSCAN_projects import (
 8 |     anomaly_detection,
 9 |     customer_behavior_analysis
10 | )
11 | 
12 | def run():
13 |     st.title("DBSCAN & HDBSCAN Clustering Projects")
14 | 
15 |     # Sidebar for project selection
16 |     project = st.sidebar.selectbox(
17 |         "Select a project",
18 |         [
19 |             "Anomaly Detection",
20 |             "Customer Behavior Analysis"
21 |         ],
22 |     )
23 | 
24 |     # Run the selected project
25 |     if project == "Anomaly Detection":
26 |         anomaly_detection.run()
27 |     elif project == "Customer Behavior Analysis":
28 |         customer_behavior_analysis.run()
29 | 
30 | if __name__ == "__main__":
31 |     run() 


--------------------------------------------------------------------------------
/DBSCAN_HDBSCAN/readme.md:
--------------------------------------------------------------------------------
 1 | # DBSCAN/HDBSCAN Projects
 2 | 
 3 | This repository contains various DBSCAN and HDBSCAN clustering projects implemented in Python. Each project demonstrates the application of density-based clustering algorithms to solve real-world problems using datasets.
 4 | 
 5 | ## Project Structure
 6 | 
 7 | ```
 8 | DBSCAN_HDBSCAN/
 9 | ├── main.py
10 | ├── requirements.txt
11 | ├── DBSCAN_HDBSCAN_projects/
12 | │   ├── customer_behavior_analysis.py
13 | ```
14 | 
15 | ### Key Files
16 | - **`main.py`**: The main entry point for running the Streamlit app.
17 | - **`requirements.txt`**: Contains the dependencies required to run the project.
18 | - **`DBSCAN_HDBSCAN_projects/`**: Contains individual project scripts.
19 | 
20 | ## Projects Included
21 | 
22 | 1. **Customer Behavior Analysis**  
23 |    Analyzes customer behavior patterns using HDBSCAN clustering to identify distinct customer segments.
24 | 
25 |    **Screenshots:**
26 |    ![Customer Behavior Analysis](screenshots/customer_behavior.png)
27 | 
28 |    - Interactive parameter tuning
29 |    - Cluster visualization
30 |    - Behavior pattern analysis
31 |    - Noise point identification
32 | 
33 | 2. **Anomaly Detection**  
34 |    Detects anomalies in data using DBSCAN clustering.
35 | 
36 |    **Screenshots:**
37 |    ![Anomaly Detection](screenshots/anom_det.png)
38 | 
39 |    - Outlier detection
40 |    - Cluster visualization
41 |    - Interactive parameter tuning
42 | 
43 | ## How to Run
44 | 
45 | 1. Clone the repository:
46 |    ```bash
47 |    git clone https://github.com/benasphy/ML_projects.git
48 |    cd DBSCAN_HDBSCAN
49 |    ```
50 | 
51 | 2. Install dependencies:
52 |    ```bash
53 |    pip install -r requirements.txt
54 |    ```
55 | 
56 | 3. Run the Streamlit app:
57 |    ```bash
58 |    streamlit run main.py
59 |    ```
60 | 
61 | 4. Select a project from the sidebar to explore its functionality.
62 | 
63 | ## Requirements
64 | 
65 | The project requires the following Python libraries:
66 | - `streamlit`
67 | - `numpy`
68 | - `pandas`
69 | - `scikit-learn`
70 | - `hdbscan`
71 | - `matplotlib`
72 | - `plotly`
73 | 
74 | ## Datasets
75 | 
76 | - **`customer_behavior.csv`**: Contains customer behavior data for analysis.
77 | 
78 | ## License
79 | 
80 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
81 | 
82 | ## Acknowledgments
83 | 
84 | - Datasets used in this project are sourced from publicly available repositories.
85 | - Special thanks to the contributors of the Python libraries used in this project.
86 | 
87 | ---
88 | Feel free to contribute to this repository by submitting issues or pull requests. 


--------------------------------------------------------------------------------
/DBSCAN_HDBSCAN/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.22.0
2 | pandas==1.5.3
3 | numpy==1.24.3
4 | scikit-learn==1.2.2
5 | plotly==5.13.1
6 | matplotlib==3.7.1
7 | seaborn==0.12.2
8 | scipy==1.10.1 


--------------------------------------------------------------------------------
/DBSCAN_HDBSCAN/screenshots/anom_det.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/DBSCAN_HDBSCAN/screenshots/anom_det.png


--------------------------------------------------------------------------------
/DBSCAN_HDBSCAN/screenshots/customer_behavior.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/DBSCAN_HDBSCAN/screenshots/customer_behavior.png


--------------------------------------------------------------------------------
/Decision_Trees/Decision_Trees_projects/gym_decision_tree.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
  5 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
  6 | import plotly.express as px
  7 | import plotly.graph_objects as go
  8 | import matplotlib.pyplot as plt
  9 | import seaborn as sns
 10 | 
 11 | def generate_sample_data():
 12 |     np.random.seed(42)
 13 |     n_samples = 1000
 14 |     
 15 |     # Generate features
 16 |     energy_levels = np.random.choice(['High', 'Low'], n_samples, p=[0.4, 0.6])
 17 |     motivation_levels = np.random.choice(['Highly Motivated', 'Neutral', 'No Motivation'], 
 18 |                                        n_samples, p=[0.3, 0.4, 0.3])
 19 |     
 20 |     # Create DataFrame
 21 |     df = pd.DataFrame({
 22 |         'Energy': energy_levels,
 23 |         'Motivation': motivation_levels
 24 |     })
 25 |     
 26 |     # Generate target (gym attendance) with some patterns
 27 |     def determine_gym_attendance(row):
 28 |         if row['Energy'] == 'High' and row['Motivation'] in ['Highly Motivated', 'Neutral']:
 29 |             return 1
 30 |         elif row['Energy'] == 'Low' and row['Motivation'] == 'No Motivation':
 31 |             return 0
 32 |         else:
 33 |             # Add some randomness for other combinations
 34 |             return np.random.choice([0, 1], p=[0.7, 0.3])
 35 |     
 36 |     df['Gym'] = df.apply(determine_gym_attendance, axis=1)
 37 |     
 38 |     return df
 39 | 
 40 | def run():
 41 |     st.header("Gym Attendance Prediction using Decision Trees")
 42 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Decision_Trees)", unsafe_allow_html=True)
 43 | 
 44 |     # Generate sample data
 45 |     df = generate_sample_data()
 46 |     
 47 |     # Display dataset info
 48 |     st.subheader("Dataset Overview")
 49 |     col1, col2 = st.columns(2)
 50 |     with col1:
 51 |         st.write("Dataset Shape:", df.shape)
 52 |         st.write("Features:", ", ".join(df.columns[:-1]))
 53 |         st.write("Target: Gym Attendance (0: No, 1: Yes)")
 54 |     with col2:
 55 |         st.write("Class Distribution:")
 56 |         class_dist = df['Gym'].value_counts()
 57 |         fig = px.pie(values=class_dist.values, names=['No', 'Yes'], 
 58 |                     title='Gym Attendance Distribution')
 59 |         st.plotly_chart(fig)
 60 | 
 61 |     # Feature Analysis
 62 |     st.subheader("Feature Analysis")
 63 |     
 64 |     # Energy level impact
 65 |     fig = px.histogram(df, x='Energy', color='Gym',
 66 |                       title='Gym Attendance by Energy Level',
 67 |                       barmode='group')
 68 |     st.plotly_chart(fig)
 69 |     
 70 |     # Motivation level impact
 71 |     fig = px.histogram(df, x='Motivation', color='Gym',
 72 |                       title='Gym Attendance by Motivation Level',
 73 |                       barmode='group')
 74 |     st.plotly_chart(fig)
 75 | 
 76 |     # Prepare data
 77 |     X = pd.get_dummies(df[['Energy', 'Motivation']])
 78 |     y = df['Gym']
 79 |     
 80 |     # Train model
 81 |     model = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42)
 82 |     model.fit(X, y)
 83 |     
 84 |     # Model evaluation
 85 |     st.subheader("Model Performance")
 86 |     y_pred = model.predict(X)
 87 |     accuracy = accuracy_score(y, y_pred)
 88 |     
 89 |     # Display metrics
 90 |     col1, col2, col3 = st.columns(3)
 91 |     with col1:
 92 |         st.metric("Accuracy", f"{accuracy:.2%}")
 93 |     with col2:
 94 |         st.metric("Precision", f"{classification_report(y, y_pred, output_dict=True)['1']['precision']:.2%}")
 95 |     with col3:
 96 |         st.metric("Recall", f"{classification_report(y, y_pred, output_dict=True)['1']['recall']:.2%}")
 97 |     
 98 |     # Confusion Matrix
 99 |     st.subheader("Confusion Matrix")
100 |     cm = confusion_matrix(y, y_pred)
101 |     fig = px.imshow(cm,
102 |                    labels=dict(x="Predicted", y="Actual", color="Count"),
103 |                    x=['No', 'Yes'],
104 |                    y=['No', 'Yes'],
105 |                    text_auto=True,
106 |                    aspect="auto")
107 |     st.plotly_chart(fig)
108 |     
109 |     # Feature Importance
110 |     st.subheader("Feature Importance")
111 |     importance = pd.DataFrame({
112 |         'Feature': X.columns,
113 |         'Importance': model.feature_importances_
114 |     })
115 |     fig = px.bar(importance, x='Feature', y='Importance',
116 |                 title='Feature Importance in Prediction')
117 |     st.plotly_chart(fig)
118 |     
119 |     # Decision Tree Visualization
120 |     st.subheader("Decision Tree Structure")
121 |     fig, ax = plt.subplots(figsize=(12, 8))
122 |     plot_tree(model, feature_names=X.columns, class_names=['No', 'Yes'], 
123 |              filled=True, rounded=True, fontsize=10)
124 |     st.pyplot(fig)
125 |     
126 |     # Interactive Prediction
127 |     st.subheader("Make a Prediction")
128 |     st.write("Enter your current state:")
129 |     
130 |     col1, col2 = st.columns(2)
131 |     with col1:
132 |         energy = st.selectbox("Energy Level:", df['Energy'].unique())
133 |     with col2:
134 |         motivation = st.selectbox("Motivation Level:", df['Motivation'].unique())
135 |     
136 |     if st.button("Predict"):
137 |         # Prepare input data
138 |         input_data = pd.DataFrame({
139 |             'Energy': [energy],
140 |             'Motivation': [motivation]
141 |         })
142 |         
143 |         # One-hot encode categorical variables
144 |         input_encoded = pd.get_dummies(input_data)
145 |         # Ensure all columns from training data are present
146 |         for col in X.columns:
147 |             if col not in input_encoded.columns:
148 |                 input_encoded[col] = 0
149 |         input_encoded = input_encoded[X.columns]
150 |         
151 |         # Make prediction
152 |         prediction = model.predict(input_encoded)[0]
153 |         probability = model.predict_proba(input_encoded)[0]
154 |         
155 |         # Display prediction
156 |         st.subheader("Prediction Result")
157 |         col1, col2 = st.columns(2)
158 |         with col1:
159 |             st.metric("Prediction", "Will go to the gym" if prediction == 1 else "Will not go to the gym")
160 |         with col2:
161 |             st.metric("Confidence", f"{max(probability):.2%}")
162 |         
163 |         # Visualize prediction probability
164 |         fig = go.Figure(data=[
165 |             go.Bar(x=['No', 'Yes'],
166 |                   y=probability,
167 |                   text=[f'{p:.2%}' for p in probability],
168 |                   textposition='auto',
169 |             )
170 |         ])
171 |         fig.update_layout(title='Prediction Probabilities')
172 |         st.plotly_chart(fig)
173 |     
174 |     # Data Insights
175 |     st.subheader("Data Insights")
176 |     
177 |     # Energy and Motivation combination analysis
178 |     fig = px.sunburst(df, path=['Energy', 'Motivation', 'Gym'],
179 |                      title='Gym Attendance by Energy and Motivation Levels')
180 |     st.plotly_chart(fig)
181 |     
182 |     # Success rate by feature combinations
183 |     success_rate = df.groupby(['Energy', 'Motivation'])['Gym'].mean().reset_index()
184 |     fig = px.treemap(success_rate, path=['Energy', 'Motivation'],
185 |                     values='Gym',
186 |                     title='Success Rate by Feature Combinations')
187 |     st.plotly_chart(fig)
188 | 
189 | if __name__ == "__main__":
190 |     run()


--------------------------------------------------------------------------------
/Decision_Trees/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the Decision_Trees_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | 
 8 | from Decision_Trees_projects import (
 9 |     gym_decision_tree,
10 |     gini_impurity_implementation,
11 | )
12 | 
13 | def run():
14 |     st.title("Decision Tree Projects")
15 | 
16 |     # Sidebar for project selection
17 |     project = st.sidebar.selectbox(
18 |         "Select a project",
19 |         [
20 |             "Gym Decision Tree",
21 |             "Gini Impurity Implementation",
22 |         ],
23 |     )
24 | 
25 |     # Run the selected project
26 |     if project == "Gym Decision Tree":
27 |         gym_decision_tree.run()
28 |     elif project == "Gini Impurity Implementation":
29 |         gini_impurity_implementation.run()
30 | 
31 | if __name__ == "__main__":
32 |     run()


--------------------------------------------------------------------------------
/Decision_Trees/readme.md:
--------------------------------------------------------------------------------
 1 | # Decision Tree Projects
 2 | 
 3 | This repository contains various Decision Tree projects implemented in Python. Each project demonstrates the application of Decision Trees to solve real-world problems using datasets.
 4 | 
 5 | ## Project Structure
 6 | 
 7 | ```
 8 | Decision_Tree/
 9 | ├── main.py
10 | ├── requirements.txt
11 | ├── Decision_Tree_projects/
12 | │   ├── gym_decision_tree.py
13 | │   ├── gini_impurity_implementation.py
14 | ```
15 | 
16 | ### Key Files
17 | - **`main.py`**: The main entry point for running the Streamlit app.
18 | - **`requirements.txt`**: Contains the dependencies required to run the project.
19 | - **`Decision_Tree_projects/`**: Contains individual project scripts.
20 | 
21 | ## Projects Included
22 | 
23 | 1. **Gym Decision Tree**  
24 |    Predicts whether a person will go to the gym based on their energy level and motivation using a Decision Tree.  
25 |    - Visualizes the decision tree.
26 |    - Allows user input to predict gym attendance.
27 | 
28 |    **Screenshots:**
29 |    ![Gym Decision Tree](screenshots/gym1.png)
30 | 
31 | 2. **Gini Impurity Implementation**  
32 |    Demonstrates the use of Gini Impurity to build a Decision Tree for predicting gym attendance.  
33 |    - Visualizes the decision tree.
34 |    - Allows user input to predict gym attendance.
35 | 
36 |    **Screenshots:**
37 |    ![Gini Impurity Implementation](screenshots/gini.png)
38 | 
39 | ## How to Run
40 | 
41 | 1. Clone the repository:
42 |    ```bash
43 |    git clone https://github.com/benasphy/ML_projects.git
44 |    cd Decision_Tree
45 |    ```
46 | 
47 | 2. Install dependencies:
48 |    ```bash
49 |    pip install -r requirements.txt
50 |    ```
51 | 
52 | 3. Run the Streamlit app:
53 |    ```bash
54 |    streamlit run main.py
55 |    ```
56 | 
57 | 4. Select a project from the sidebar to explore its functionality.
58 | 
59 | ## Requirements
60 | 
61 | The project requires the following Python libraries:
62 | - `streamlit`
63 | - `numpy`
64 | - `pandas`
65 | - `scikit-learn`
66 | - `matplotlib`
67 | 
68 | ## Screenshots
69 | 
70 | Add screenshots of the Streamlit app interface here to showcase the projects.
71 | 
72 | ## License
73 | 
74 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
75 | 
76 | ## Acknowledgments
77 | 
78 | - Datasets used in this project are synthetic and created for demonstration purposes.
79 | - Special thanks to the contributors of the Python libraries used in this project.
80 | 
81 | ---
82 | Feel free to contribute to this repository by submitting issues or pull requests.
83 | 


--------------------------------------------------------------------------------
/Decision_Trees/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | numpy
3 | pandas
4 | scikit-learn
5 | plotly
6 | scipy
7 | seaborn
8 | matplotlib


--------------------------------------------------------------------------------
/Decision_Trees/screenshots/gini.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Decision_Trees/screenshots/gini.png


--------------------------------------------------------------------------------
/Decision_Trees/screenshots/gym1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Decision_Trees/screenshots/gym1.png


--------------------------------------------------------------------------------
/Dimensionality_Reduction/Dimensionality_Reduction_projects/feature_selection.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.decomposition import PCA
  5 | from sklearn.manifold import TSNE
  6 | from sklearn.preprocessing import StandardScaler
  7 | import plotly.express as px
  8 | import plotly.graph_objects as go
  9 | import seaborn as sns
 10 | import matplotlib.pyplot as plt
 11 | 
 12 | def generate_sample_data(n_samples=1000):
 13 |     # Generate features with different patterns
 14 |     np.random.seed(42)
 15 |     
 16 |     # Generate correlated features
 17 |     x1 = np.random.normal(0, 1, n_samples)
 18 |     x2 = x1 + np.random.normal(0, 0.5, n_samples)
 19 |     x3 = x1 - x2 + np.random.normal(0, 0.3, n_samples)
 20 |     
 21 |     # Generate independent features
 22 |     x4 = np.random.normal(0, 1, n_samples)
 23 |     x5 = np.random.normal(0, 1, n_samples)
 24 |     
 25 |     # Generate target variable
 26 |     y = 2*x1 + 3*x2 - x3 + np.random.normal(0, 0.5, n_samples)
 27 |     
 28 |     # Create DataFrame
 29 |     data = {
 30 |         'Feature1': x1,
 31 |         'Feature2': x2,
 32 |         'Feature3': x3,
 33 |         'Feature4': x4,
 34 |         'Feature5': x5,
 35 |         'Target': y
 36 |     }
 37 |     
 38 |     return pd.DataFrame(data)
 39 | 
 40 | def run():
 41 |     st.header("Feature Selection using Dimensionality Reduction")
 42 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Dimensionality_Reduction)", unsafe_allow_html=True)
 43 | 
 44 |     # Load or generate dataset
 45 |     uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
 46 |     if uploaded_file is not None:
 47 |         df = pd.read_csv(uploaded_file)
 48 |     else:
 49 |         st.info("Using sample dataset")
 50 |         df = generate_sample_data()
 51 | 
 52 |     # Display data info
 53 |     st.subheader("Dataset Information")
 54 |     st.write(f"Number of samples: {len(df)}")
 55 |     st.write(f"Number of features: {len(df.columns) - 1}")  # Excluding target
 56 |     st.write("Sample data:")
 57 |     st.dataframe(df.head())
 58 | 
 59 |     # Feature selection
 60 |     st.subheader("Feature Selection")
 61 |     features = [col for col in df.columns if col != 'Target']
 62 |     selected_features = st.multiselect("Select features for analysis", features, default=features)
 63 | 
 64 |     if len(selected_features) >= 2:
 65 |         # Prepare data
 66 |         X = df[selected_features]
 67 |         y = df['Target'] if 'Target' in df.columns else None
 68 |         
 69 |         # Scale data
 70 |         scaler = StandardScaler()
 71 |         X_scaled = scaler.fit_transform(X)
 72 | 
 73 |         # Dimensionality reduction methods
 74 |         st.subheader("Dimensionality Reduction Methods")
 75 |         method = st.selectbox(
 76 |             "Select method",
 77 |             ["PCA", "t-SNE"]
 78 |         )
 79 | 
 80 |         if method == "PCA":
 81 |             # PCA parameters
 82 |             n_components = st.slider("Number of Components", min_value=1, max_value=len(selected_features), value=2)
 83 |             
 84 |             # Apply PCA
 85 |             pca = PCA(n_components=n_components)
 86 |             X_pca = pca.fit_transform(X_scaled)
 87 |             
 88 |             # Create DataFrame with PCA results
 89 |             pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(n_components)])
 90 |             if y is not None:
 91 |                 pca_df['Target'] = y
 92 |             
 93 |             # Plot PCA results
 94 |             st.subheader("PCA Results")
 95 |             
 96 |             # Scatter plot
 97 |             fig = px.scatter(pca_df, x='PC1', y='PC2', color='Target' if y is not None else None,
 98 |                            title="PCA Visualization",
 99 |                            labels={'PC1': f'PC1 ({pca.explained_variance_ratio_[0]:.2%})',
100 |                                  'PC2': f'PC2 ({pca.explained_variance_ratio_[1]:.2%})'})
101 |             st.plotly_chart(fig)
102 |             
103 |             # Explained variance
104 |             fig = px.bar(x=range(1, len(pca.explained_variance_ratio_) + 1),
105 |                         y=pca.explained_variance_ratio_,
106 |                         title="Explained Variance by Component",
107 |                         labels={'x': 'Component', 'y': 'Explained Variance'})
108 |             st.plotly_chart(fig)
109 |             
110 |             # Cumulative explained variance
111 |             cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
112 |             fig = px.line(x=range(1, len(cumulative_variance) + 1),
113 |                          y=cumulative_variance,
114 |                          title="Cumulative Explained Variance",
115 |                          labels={'x': 'Number of Components', 'y': 'Cumulative Explained Variance'})
116 |             st.plotly_chart(fig)
117 |             
118 |             # Feature importance
119 |             st.subheader("Feature Importance")
120 |             feature_importance = pd.DataFrame(
121 |                 pca.components_.T,
122 |                 columns=[f'PC{i+1}' for i in range(n_components)],
123 |                 index=selected_features
124 |             )
125 |             st.dataframe(feature_importance)
126 |             
127 |             # Plot feature importance
128 |             fig, ax = plt.subplots(figsize=(10, 6))
129 |             sns.heatmap(feature_importance, annot=True, cmap='coolwarm', center=0, ax=ax)
130 |             st.pyplot(fig)
131 | 
132 |         else:  # t-SNE
133 |             # t-SNE parameters
134 |             perplexity = st.slider("Perplexity", min_value=5, max_value=50, value=30)
135 |             learning_rate = st.slider("Learning Rate", min_value=10, max_value=1000, value=200)
136 |             
137 |             # Apply t-SNE
138 |             tsne = TSNE(n_components=2, perplexity=perplexity, learning_rate=learning_rate)
139 |             X_tsne = tsne.fit_transform(X_scaled)
140 |             
141 |             # Create DataFrame with t-SNE results
142 |             tsne_df = pd.DataFrame(X_tsne, columns=['t-SNE1', 't-SNE2'])
143 |             if y is not None:
144 |                 tsne_df['Target'] = y
145 |             
146 |             # Plot t-SNE results
147 |             st.subheader("t-SNE Results")
148 |             fig = px.scatter(tsne_df, x='t-SNE1', y='t-SNE2', color='Target' if y is not None else None,
149 |                            title="t-SNE Visualization")
150 |             st.plotly_chart(fig)
151 |             
152 |             # Feature correlation
153 |             st.subheader("Feature Correlation")
154 |             correlation_matrix = df[selected_features].corr()
155 |             fig, ax = plt.subplots(figsize=(10, 8))
156 |             sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
157 |             st.pyplot(fig)
158 | 
159 | if __name__ == "__main__":
160 |     run() 


--------------------------------------------------------------------------------
/Dimensionality_Reduction/Dimensionality_Reduction_projects/image_compression.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import numpy as np
  3 | import pandas as pd
  4 | from sklearn.decomposition import PCA
  5 | import plotly.express as px
  6 | import plotly.graph_objects as go
  7 | from skimage import io
  8 | import matplotlib.pyplot as plt
  9 | from PIL import Image
 10 | import io as io_lib
 11 | 
 12 | def compress_image(image, n_components):
 13 |     # Reshape image to 2D array
 14 |     h, w, d = image.shape
 15 |     image_2d = image.reshape(h * w, d)
 16 |     
 17 |     # Apply PCA
 18 |     pca = PCA(n_components=min(n_components, d))
 19 |     compressed = pca.fit_transform(image_2d)
 20 |     
 21 |     # Reconstruct image
 22 |     reconstructed = pca.inverse_transform(compressed)
 23 |     reconstructed = reconstructed.reshape(h, w, d)
 24 |     
 25 |     # Clip values to valid range
 26 |     reconstructed = np.clip(reconstructed, 0, 1)
 27 |     
 28 |     # Calculate actual compressed size
 29 |     # For each pixel, we store:
 30 |     # 1. n_components values (compressed data)
 31 |     # 2. mean vector (d values)
 32 |     # 3. component vectors (n_components * d values)
 33 |     compressed_size = (h * w * n_components +  # compressed data
 34 |                       d +                      # mean vector
 35 |                       n_components * d)        # component vectors
 36 |     
 37 |     return reconstructed, pca.explained_variance_ratio_, compressed_size
 38 | 
 39 | def run():
 40 |     st.header("Image Compression using PCA")
 41 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Dimensionality_Reduction)", unsafe_allow_html=True)
 42 | 
 43 |     # File uploader
 44 |     uploaded_file = st.file_uploader("Upload an image", type=['jpg', 'jpeg', 'png'])
 45 |     
 46 |     if uploaded_file is not None:
 47 |         # Read image
 48 |         image = io.imread(uploaded_file)
 49 |         
 50 |         # Display original image
 51 |         st.subheader("Original Image")
 52 |         st.image(image, use_column_width=True)
 53 |         
 54 |         # Convert to float and normalize
 55 |         image_float = image.astype(np.float32) / 255.0
 56 |         
 57 |         # Parameters
 58 |         st.subheader("Compression Parameters")
 59 |         max_components = min(image.shape[2], 3)  # Limit to number of color channels
 60 |         n_components = st.slider("Number of Components", min_value=1, max_value=max_components, value=1)
 61 |         
 62 |         if st.button("Compress Image"):
 63 |             # Compress image
 64 |             compressed_image, explained_variance, compressed_size = compress_image(image_float, n_components)
 65 |             
 66 |             # Convert back to uint8 for display
 67 |             compressed_display = (compressed_image * 255).astype(np.uint8)
 68 |             
 69 |             # Display compressed image
 70 |             st.subheader("Compressed Image")
 71 |             st.image(compressed_display, use_column_width=True)
 72 |             
 73 |             # Display compression statistics
 74 |             st.subheader("Compression Statistics")
 75 |             original_size = image.size  # Number of pixels * number of channels
 76 |             compression_ratio = original_size / compressed_size
 77 |             st.write(f"Original Size: {original_size/1024:.1f} KB")
 78 |             st.write(f"Compressed Size: {compressed_size/1024:.1f} KB")
 79 |             st.write(f"Compression Ratio: {compression_ratio:.1f}x")
 80 |             st.write(f"Components Used: {n_components}")
 81 |             st.write(f"Explained Variance: {np.sum(explained_variance)*100:.1f}%")
 82 |             
 83 |             # Display explained variance
 84 |             st.subheader("Explained Variance")
 85 |             fig = go.Figure()
 86 |             fig.add_trace(go.Bar(
 87 |                 x=list(range(1, len(explained_variance) + 1)),
 88 |                 y=explained_variance,
 89 |                 name='Individual'
 90 |             ))
 91 |             fig.add_trace(go.Scatter(
 92 |                 x=list(range(1, len(explained_variance) + 1)),
 93 |                 y=np.cumsum(explained_variance),
 94 |                 name='Cumulative',
 95 |                 mode='lines+markers'
 96 |             ))
 97 |             fig.update_layout(
 98 |                 title='Explained Variance by Component',
 99 |                 xaxis_title='Component',
100 |                 yaxis_title='Explained Variance',
101 |                 showlegend=True
102 |             )
103 |             st.plotly_chart(fig)
104 |             
105 |             # Download compressed image
106 |             compressed_pil = Image.fromarray(compressed_display)
107 |             img_byte_arr = io_lib.BytesIO()
108 |             compressed_pil.save(img_byte_arr, format='PNG')
109 |             img_byte_arr = img_byte_arr.getvalue()
110 |             st.download_button(
111 |                 label="Download Compressed Image",
112 |                 data=img_byte_arr,
113 |                 file_name="compressed_image.png",
114 |                 mime="image/png"
115 |             )
116 | 
117 | if __name__ == "__main__":
118 |     run() 


--------------------------------------------------------------------------------
/Dimensionality_Reduction/README.md:
--------------------------------------------------------------------------------
 1 | # Dimensionality Reduction Projects
 2 | 
 3 | This repository contains projects that demonstrate the application of various dimensionality reduction techniques in different domains.
 4 | 
 5 | ## Projects
 6 | 
 7 | ### 1. Image Compression
 8 | 
 9 | **Screenshots:**
10 | ![Image Compression](screenshots/img_comp.png)
11 | - Interactive image upload and processing
12 | - PCA-based image compression
13 | - Adjustable number of components
14 | - Compression statistics and analysis
15 | - Explained variance visualization
16 | - Color channel analysis
17 | - Download compressed images
18 | 
19 | ### 2. Feature Selection
20 | 
21 | **Screenshots:**
22 | ![Feature Selection](screenshots/feature_sel.png)
23 | - Interactive dataset upload
24 | - Multiple dimensionality reduction methods (PCA, t-SNE)
25 | - Feature importance analysis
26 | - Explained variance visualization
27 | - Interactive parameter tuning
28 | - Correlation analysis
29 | - Visual exploration of reduced dimensions
30 | 
31 | ## How to Run
32 | 
33 | 1. Install the required packages:
34 | ```bash
35 | pip install -r requirements.txt
36 | ```
37 | 
38 | 2. Run the Streamlit app:
39 | ```bash
40 | streamlit run main.py
41 | ```
42 | 
43 | ## Project Structure
44 | 
45 | - `main.py`: Main entry point for running the projects
46 | - `Dimensionality_Reduction_projects/`: Directory containing individual project files
47 |   - `image_compression.py`: Image compression using PCA
48 |   - `feature_selection.py`: Feature selection using PCA and t-SNE
49 | 
50 | ## Features
51 | 
52 | - Interactive parameter tuning
53 | - Rich visualizations using Plotly
54 | - Support for custom data upload
55 | - Sample data generation
56 | - Detailed analysis tools
57 | - Multiple dimensionality reduction methods
58 | - Comprehensive visualizations
59 | 
60 | ## Contributing
61 | 
62 | Contributions are welcome! Please feel free to submit a Pull Request.
63 | 
64 | ## License
65 | 
66 | This project is licensed under the MIT License. 


--------------------------------------------------------------------------------
/Dimensionality_Reduction/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the Decision_Trees_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | from Dimensionality_Reduction_projects import (
 8 |     image_compression,
 9 |     feature_selection
10 | )
11 | 
12 | def run():
13 |     st.title("Dimensionality Reduction Projects")
14 | 
15 |     # Sidebar for project selection
16 |     project = st.sidebar.selectbox(
17 |         "Select a project",
18 |         [
19 |             "Image Compression",
20 |             "Feature Selection"
21 |         ],
22 |     )
23 | 
24 |     # Run the selected project
25 |     if project == "Image Compression":
26 |         image_compression.run()
27 |     elif project == "Feature Selection":
28 |         feature_selection.run()
29 | 
30 | if __name__ == "__main__":
31 |     run() 


--------------------------------------------------------------------------------
/Dimensionality_Reduction/requirements.txt:
--------------------------------------------------------------------------------
 1 | streamlit==1.22.0
 2 | pandas==1.5.3
 3 | numpy==1.24.3
 4 | scikit-learn==1.2.2
 5 | plotly==5.13.1
 6 | matplotlib==3.7.1
 7 | seaborn==0.12.2
 8 | scikit-image==0.20.0
 9 | pillow==9.5.0
10 | scipy==1.10.1 
11 | scikit-image


--------------------------------------------------------------------------------
/Dimensionality_Reduction/screenshots/feature_sel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Dimensionality_Reduction/screenshots/feature_sel.png


--------------------------------------------------------------------------------
/Dimensionality_Reduction/screenshots/img_comp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Dimensionality_Reduction/screenshots/img_comp.png


--------------------------------------------------------------------------------
/Fuzzy_C_Means/Fuzzy_C_Means_projects/image_segmentation.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import numpy as np
  3 | import pandas as pd
  4 | from sklearn.decomposition import PCA
  5 | import plotly.express as px
  6 | import plotly.graph_objects as go
  7 | from skimage import io
  8 | import matplotlib.pyplot as plt
  9 | from PIL import Image
 10 | import io as io_lib
 11 | 
 12 | def fuzzy_c_means(data, n_clusters, m=2, max_iter=100, error=1e-5):
 13 |     # Initialize membership matrix randomly
 14 |     n_samples = data.shape[0]
 15 |     membership = np.random.random((n_samples, n_clusters))
 16 |     membership = membership / membership.sum(axis=1)[:, np.newaxis]
 17 |     
 18 |     # Initialize cluster centers
 19 |     centers = np.zeros((n_clusters, data.shape[1]))
 20 |     
 21 |     # Iterate until convergence
 22 |     for _ in range(max_iter):
 23 |         # Update cluster centers
 24 |         for j in range(n_clusters):
 25 |             centers[j] = np.sum(membership[:, j:j+1] ** m * data, axis=0) / np.sum(membership[:, j:j+1] ** m)
 26 |         
 27 |         # Update membership matrix
 28 |         old_membership = membership.copy()
 29 |         
 30 |         # Calculate distances between data points and centers
 31 |         distances = np.zeros((n_samples, n_clusters))
 32 |         for j in range(n_clusters):
 33 |             distances[:, j] = np.sum((data - centers[j]) ** 2, axis=1)
 34 |         
 35 |         # Update membership values
 36 |         for j in range(n_clusters):
 37 |             membership[:, j] = 1 / np.sum((distances[:, j:j+1] / distances) ** (1/(m-1)), axis=1)
 38 |         
 39 |         # Check convergence
 40 |         if np.max(np.abs(membership - old_membership)) < error:
 41 |             break
 42 |     
 43 |     return membership, centers
 44 | 
 45 | def run():
 46 |     st.header("Image Segmentation using Fuzzy C-Means")
 47 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Fuzzy_C_Means)", unsafe_allow_html=True)
 48 | 
 49 |     # File uploader
 50 |     uploaded_file = st.file_uploader("Upload an image", type=['jpg', 'jpeg', 'png'])
 51 |     
 52 |     if uploaded_file is not None:
 53 |         # Read image
 54 |         image = io.imread(uploaded_file)
 55 |         
 56 |         # Display original image
 57 |         st.subheader("Original Image")
 58 |         st.image(image, use_column_width=True)
 59 |         
 60 |         # Convert to LAB color space for better segmentation
 61 |         if len(image.shape) == 3:  # Color image
 62 |             # Convert to float and normalize
 63 |             image_float = image.astype(np.float32) / 255.0
 64 |             
 65 |             # Parameters
 66 |             st.subheader("Segmentation Parameters")
 67 |             n_segments = st.slider("Number of Segments", min_value=2, max_value=8, value=4)
 68 |             fuzziness = st.slider("Fuzziness Parameter (m)", min_value=1.1, max_value=3.0, value=2.0, step=0.1)
 69 |             
 70 |             if st.button("Segment Image"):
 71 |                 # Reshape image for clustering
 72 |                 h, w, d = image_float.shape
 73 |                 image_2d = image_float.reshape(h * w, d)
 74 |                 
 75 |                 # Apply Fuzzy C-Means
 76 |                 membership, centers = fuzzy_c_means(image_2d, n_segments, m=fuzziness)
 77 |                 
 78 |                 # Get segment labels
 79 |                 labels = np.argmax(membership, axis=1)
 80 |                 
 81 |                 # Create segmented image
 82 |                 segmented = centers[labels].reshape(h, w, d)
 83 |                 segmented = np.clip(segmented * 255, 0, 255).astype(np.uint8)
 84 |                 
 85 |                 # Display segmented image
 86 |                 st.subheader("Segmented Image")
 87 |                 st.image(segmented, use_column_width=True)
 88 |                 
 89 |                 # Display membership maps
 90 |                 st.subheader("Membership Maps")
 91 |                 fig, axes = plt.subplots(1, n_segments, figsize=(15, 5))
 92 |                 for i in range(n_segments):
 93 |                     membership_map = membership[:, i].reshape(h, w)
 94 |                     axes[i].imshow(membership_map, cmap='viridis')
 95 |                     axes[i].set_title(f'Segment {i+1}')
 96 |                     axes[i].axis('off')
 97 |                 st.pyplot(fig)
 98 |                 
 99 |                 # Display segment statistics
100 |                 st.subheader("Segment Statistics")
101 |                 for i in range(n_segments):
102 |                     segment_size = np.sum(labels == i)
103 |                     segment_percentage = (segment_size / (h * w)) * 100
104 |                     st.write(f"Segment {i+1}: {segment_size} pixels ({segment_percentage:.1f}%)")
105 |                 
106 |                 # Display segment colors
107 |                 st.subheader("Segment Colors")
108 |                 fig, ax = plt.subplots(figsize=(10, 2))
109 |                 for i in range(n_segments):
110 |                     color = centers[i]
111 |                     # Convert color to RGB tuple for matplotlib
112 |                     rgb_color = tuple(color)
113 |                     ax.bar(i, 1, color=rgb_color)
114 |                 ax.set_xticks(range(n_segments))
115 |                 ax.set_xticklabels([f'Segment {i+1}' for i in range(n_segments)])
116 |                 ax.set_yticks([])
117 |                 st.pyplot(fig)
118 |                 
119 |                 # Download segmented image
120 |                 segmented_pil = Image.fromarray(segmented)
121 |                 img_byte_arr = io_lib.BytesIO()
122 |                 segmented_pil.save(img_byte_arr, format='PNG')
123 |                 img_byte_arr = img_byte_arr.getvalue()
124 |                 st.download_button(
125 |                     label="Download Segmented Image",
126 |                     data=img_byte_arr,
127 |                     file_name="segmented_image.png",
128 |                     mime="image/png"
129 |                 )
130 |         else:
131 |             st.warning("Please upload a color image.")
132 | 
133 | if __name__ == "__main__":
134 |     run() 


--------------------------------------------------------------------------------
/Fuzzy_C_Means/README.md:
--------------------------------------------------------------------------------
 1 | # Fuzzy C-Means Projects
 2 | 
 3 | This repository contains projects that demonstrate the application of Fuzzy C-Means clustering algorithm in various domains.
 4 | 
 5 | ## Projects
 6 | 
 7 | ### 1. Image Segmentation
 8 | 
 9 | **Screenshots:**
10 | ![Image Segmentation](screenshots/imag_seg.png)
11 | - Interactive image upload and processing
12 | - Customizable number of segments
13 | - Adjustable fuzziness parameter
14 | - Visualization of membership maps
15 | - Segment statistics and analysis
16 | - Color distribution analysis
17 | 
18 | ### 2. Customer Profiling
19 | 
20 | **Screenshots:**
21 | ![Customer Profiling](screenshots/cust_prof.png)
22 | - Interactive feature selection
23 | - Dynamic cluster number adjustment
24 | - Adjustable fuzziness parameter
25 | - PCA visualization with hover data
26 | - Detailed segment analysis
27 | - Feature importance visualization
28 | - Correlation heatmap
29 | - Interactive prediction for new customers
30 | 
31 | ## How to Run
32 | 
33 | 1. Install the required packages:
34 | ```bash
35 | pip install -r requirements.txt
36 | ```
37 | 
38 | 2. Run the Streamlit app:
39 | ```bash
40 | streamlit run main.py
41 | ```
42 | 
43 | ## Project Structure
44 | 
45 | - `main.py`: Main entry point for running the projects
46 | - `Fuzzy_C_Means_projects/`: Directory containing individual project files
47 |   - `image_segmentation.py`: Image segmentation using Fuzzy C-Means
48 |   - `customer_profiling.py`: Customer profiling using Fuzzy C-Means
49 | 
50 | ## Features
51 | 
52 | - Interactive parameter tuning
53 | - Rich visualizations using Plotly
54 | - Support for custom data upload
55 | - Sample data generation
56 | - Detailed analysis tools
57 | - Interactive prediction capabilities
58 | 
59 | ## Contributing
60 | 
61 | Contributions are welcome! Please feel free to submit a Pull Request.
62 | 
63 | ## License
64 | 
65 | This project is licensed under the MIT License. 


--------------------------------------------------------------------------------
/Fuzzy_C_Means/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the Decision_Trees_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | from Fuzzy_C_Means_projects import (
 8 |     image_segmentation,
 9 |     customer_profiling
10 | )
11 | 
12 | def run():
13 |     st.title("Fuzzy C-Means Clustering Projects")
14 | 
15 |     # Sidebar for project selection
16 |     project = st.sidebar.selectbox(
17 |         "Select a project",
18 |         [
19 |             "Image Segmentation",
20 |             "Customer Profiling"
21 |         ],
22 |     )
23 | 
24 |     # Run the selected project
25 |     if project == "Image Segmentation":
26 |         image_segmentation.run()
27 |     elif project == "Customer Profiling":
28 |         customer_profiling.run()
29 | 
30 | if __name__ == "__main__":
31 |     run() 


--------------------------------------------------------------------------------
/Fuzzy_C_Means/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.22.0
2 | pandas==1.5.3
3 | numpy==1.24.3
4 | scikit-learn==1.2.2
5 | plotly==5.13.1
6 | matplotlib==3.7.1
7 | seaborn==0.12.2
8 | scikit-image==0.20.0
9 | scipy==1.10.1 


--------------------------------------------------------------------------------
/Fuzzy_C_Means/screenshots/cust_prof.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Fuzzy_C_Means/screenshots/cust_prof.png


--------------------------------------------------------------------------------
/Fuzzy_C_Means/screenshots/imag_seg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Fuzzy_C_Means/screenshots/imag_seg.png


--------------------------------------------------------------------------------
/GMM/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the Linear_regression_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | from GMM_projects import (
 8 |     customer_segmentation,
 9 |     image_color_segmentation,
10 | )
11 | 
12 | def run():
13 |     st.title("GMM Projects")
14 | 
15 |     # Sidebar for project selection
16 |     project = st.sidebar.selectbox(
17 |         "Select a project",
18 |         [
19 |             "Customer Segmentation",
20 |             "Image Color Segmentation",
21 |         ],
22 |     )
23 | 
24 |     # Run the selected project
25 |     if project == "Customer Segmentation":
26 |         customer_segmentation.run()
27 |     elif project == "Image Color Segmentation":
28 |         image_color_segmentation.run()
29 | 
30 | if __name__ == "__main__":
31 |     run()
32 | 


--------------------------------------------------------------------------------
/GMM/readme.md:
--------------------------------------------------------------------------------
 1 | # GMM Projects
 2 | 
 3 | This folder contains various projects that utilize Gaussian Mixture Models (GMM) for different applications. Each project is designed to demonstrate the use of GMM in machine learning tasks with interactive visualizations.
 4 | 
 5 | ## Projects
 6 | 
 7 | 1. **Customer Segmentation**: Clusters customers based on their characteristics using GMM. Features include:
 8 | 
 9 |    **Screenshots:**
10 |    ![Customer Segmentation](screenshots/cust.png)
11 |    - Interactive parameter tuning
12 |    - 2D and 3D visualizations
13 |    - Cluster analysis and interpretation
14 |    - Model evaluation metrics
15 | 
16 | 2. **Image Color Segmentation**: Segments images into color clusters using GMM. Features include:
17 | 
18 |    **Screenshots:**
19 |    ![Image Color Segmentation](screenshots/image_clust.png)
20 |    - Interactive image upload
21 |    - Color cluster visualization
22 |    - 3D color space analysis
23 |    - Cluster information and statistics
24 | 
25 | ## How to Run
26 | 
27 | To run any of the projects, follow these steps:
28 | 
29 | 1. Ensure you have the required dependencies installed. You can install them using pip:
30 | 
31 |    ```bash
32 |    pip install streamlit pandas numpy scikit-learn plotly pillow
33 |    ```
34 | 
35 | 2. Navigate to the GMM directory in your terminal.
36 | 
37 | 3. Run the Streamlit app using the following command:
38 | 
39 |    ```bash
40 |    streamlit run main.py
41 |    ```
42 | 
43 | 4. Use the sidebar to select the project you want to run.
44 | 
45 | ## Project Structure
46 | 
47 | - `main.py`: The main entry point for running the projects.
48 | - `GMM_projects/`: Contains individual project files:
49 |   - `customer_segmentation.py`: Customer segmentation project.
50 |   - `image_color_segmentation.py`: Image color segmentation project.
51 | 
52 | ## Features
53 | 
54 | - Interactive parameter tuning
55 | - Real-time visualizations
56 | - Model evaluation metrics
57 | - Detailed cluster analysis
58 | - Support for custom data input
59 | 
60 | ## Contributing
61 | 
62 | Feel free to contribute to these projects by submitting pull requests or opening issues for any bugs or feature requests.
63 | 
64 | ## License
65 | 
66 | This project is licensed under the MIT License - see the LICENSE file for details.
67 | 


--------------------------------------------------------------------------------
/GMM/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.22.0
2 | pandas==1.5.3
3 | numpy==1.24.3
4 | scikit-learn==1.2.2
5 | plotly==5.13.1
6 | pillow==9.5.0
7 | opencv-python
8 | 


--------------------------------------------------------------------------------
/GMM/screenshots/cust.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/GMM/screenshots/cust.png


--------------------------------------------------------------------------------
/GMM/screenshots/image_clust.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/GMM/screenshots/image_clust.png


--------------------------------------------------------------------------------
/Hierarchical_Clustering/Hierarchical_projects/document_clustering.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.feature_extraction.text import TfidfVectorizer
  5 | from sklearn.cluster import AgglomerativeClustering
  6 | from sklearn.metrics.pairwise import cosine_similarity
  7 | import plotly.express as px
  8 | import plotly.graph_objects as go
  9 | from scipy.cluster.hierarchy import dendrogram, linkage
 10 | import matplotlib.pyplot as plt
 11 | import seaborn as sns
 12 | 
 13 | def run():
 14 |     st.header("Document Clustering using Hierarchical Clustering")
 15 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Hierarchical_Clustering)", unsafe_allow_html=True)
 16 | 
 17 |     # Load dataset
 18 |     uploaded_file = st.file_uploader("Upload a CSV file with text documents", type=["csv"])
 19 |     if uploaded_file is not None:
 20 |         df = pd.read_csv(uploaded_file)
 21 |         # Assuming the text column is named 'text'
 22 |         texts = df['text'].values
 23 |     else:
 24 |         st.info("Using sample document data")
 25 |         # Generate sample documents
 26 |         texts = [
 27 |             "Machine learning is a subset of artificial intelligence",
 28 |             "Deep learning uses neural networks with multiple layers",
 29 |             "Natural language processing helps computers understand text",
 30 |             "Computer vision enables machines to interpret images",
 31 |             "Data science combines statistics and programming",
 32 |             "Big data refers to large and complex datasets",
 33 |             "Cloud computing provides on-demand computing resources",
 34 |             "Internet of Things connects physical devices to the internet",
 35 |             "Cybersecurity protects systems from digital attacks",
 36 |             "Blockchain is a distributed ledger technology",
 37 |             "Quantum computing uses quantum bits for calculations",
 38 |             "Augmented reality overlays digital content on the real world",
 39 |             "Virtual reality creates immersive digital environments",
 40 |             "Robotics combines mechanical and electronic systems",
 41 |             "5G technology enables faster wireless communication"
 42 |         ]
 43 | 
 44 |     # Text preprocessing and vectorization
 45 |     vectorizer = TfidfVectorizer(stop_words='english')
 46 |     X = vectorizer.fit_transform(texts)
 47 | 
 48 |     # Hierarchical Clustering parameters
 49 |     st.subheader("Clustering Parameters")
 50 |     n_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
 51 |     linkage_method = st.selectbox("Linkage Method", ['ward', 'complete', 'average', 'single'])
 52 | 
 53 |     # Perform hierarchical clustering
 54 |     clustering = AgglomerativeClustering(
 55 |         n_clusters=n_clusters,
 56 |         linkage=linkage_method
 57 |     )
 58 |     clusters = clustering.fit_predict(X.toarray())
 59 | 
 60 |     # Create dendrogram
 61 |     st.subheader("Dendrogram")
 62 |     Z = linkage(X.toarray(), method=linkage_method)
 63 |     fig, ax = plt.subplots(figsize=(10, 7))
 64 |     dendrogram(Z, truncate_mode='level', p=5)
 65 |     plt.title('Hierarchical Clustering Dendrogram')
 66 |     plt.xlabel('Sample Index')
 67 |     plt.ylabel('Distance')
 68 |     st.pyplot(fig)
 69 | 
 70 |     # Document similarity matrix
 71 |     st.subheader("Document Similarity Matrix")
 72 |     similarity_matrix = cosine_similarity(X)
 73 |     fig, ax = plt.subplots(figsize=(10, 8))
 74 |     sns.heatmap(similarity_matrix, cmap='viridis')
 75 |     plt.title('Document Similarity Matrix')
 76 |     st.pyplot(fig)
 77 | 
 78 |     # Cluster visualization using PCA
 79 |     from sklearn.decomposition import PCA
 80 |     pca = PCA(n_components=2)
 81 |     X_pca = pca.fit_transform(X.toarray())
 82 |     
 83 |     # Create DataFrame for visualization
 84 |     viz_df = pd.DataFrame({
 85 |         'PC1': X_pca[:, 0],
 86 |         'PC2': X_pca[:, 1],
 87 |         'Cluster': clusters,
 88 |         'Document': [f"Doc {i+1}" for i in range(len(texts))]
 89 |     })
 90 | 
 91 |     # 2D Scatter plot
 92 |     fig = px.scatter(viz_df, x='PC1', y='PC2', color='Cluster',
 93 |                     hover_data=['Document'],
 94 |                     title='Document Clusters (PCA Visualization)')
 95 |     st.plotly_chart(fig)
 96 | 
 97 |     # Cluster analysis
 98 |     st.subheader("Cluster Analysis")
 99 |     for cluster in range(n_clusters):
100 |         st.write(f"\nCluster {cluster}:")
101 |         cluster_docs = [texts[i] for i in range(len(texts)) if clusters[i] == cluster]
102 |         st.write(f"Number of documents: {len(cluster_docs)}")
103 |         st.write("Documents in this cluster:")
104 |         for doc in cluster_docs:
105 |             st.write(f"- {doc}")
106 | 
107 |     # Cluster statistics
108 |     st.subheader("Cluster Statistics")
109 |     cluster_sizes = pd.Series(clusters).value_counts().sort_index()
110 |     fig = px.bar(x=cluster_sizes.index, y=cluster_sizes.values,
111 |                  title='Number of Documents per Cluster',
112 |                  labels={'x': 'Cluster', 'y': 'Number of Documents'})
113 |     st.plotly_chart(fig)
114 | 
115 |     # Word clouds for each cluster
116 |     st.subheader("Word Clouds by Cluster")
117 |     from wordcloud import WordCloud
118 |     
119 |     for cluster in range(n_clusters):
120 |         cluster_docs = [texts[i] for i in range(len(texts)) if clusters[i] == cluster]
121 |         if cluster_docs:
122 |             text = ' '.join(cluster_docs)
123 |             wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
124 |             fig, ax = plt.subplots(figsize=(10, 5))
125 |             ax.imshow(wordcloud, interpolation='bilinear')
126 |             ax.axis('off')
127 |             plt.title(f'Word Cloud - Cluster {cluster}')
128 |             st.pyplot(fig)
129 | 
130 | if __name__ == "__main__":
131 |     run() 


--------------------------------------------------------------------------------
/Hierarchical_Clustering/Hierarchical_projects/market_basket_analysis.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.cluster import AgglomerativeClustering
  5 | from sklearn.preprocessing import StandardScaler
  6 | import plotly.express as px
  7 | import plotly.graph_objects as go
  8 | from scipy.cluster.hierarchy import dendrogram, linkage
  9 | import matplotlib.pyplot as plt
 10 | import seaborn as sns
 11 | 
 12 | def run():
 13 |     st.header("Market Basket Analysis using Hierarchical Clustering")
 14 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Hierarchical_Clustering)", unsafe_allow_html=True)
 15 | 
 16 |     # Load dataset
 17 |     uploaded_file = st.file_uploader("Upload a CSV file with purchase data", type=["csv"])
 18 |     if uploaded_file is not None:
 19 |         df = pd.read_csv(uploaded_file)
 20 |     else:
 21 |         st.info("Using sample market basket data")
 22 |         # Generate sample market basket data
 23 |         np.random.seed(42)
 24 |         n_transactions = 1000
 25 |         n_items = 20
 26 |         
 27 |         # Generate random purchase patterns
 28 |         data = np.random.binomial(1, 0.3, (n_transactions, n_items))
 29 |         
 30 |         # Create item names
 31 |         item_names = [f'Item_{i+1}' for i in range(n_items)]
 32 |         
 33 |         # Create DataFrame
 34 |         df = pd.DataFrame(data, columns=item_names)
 35 | 
 36 |     # Display data info
 37 |     st.subheader("Dataset Information")
 38 |     st.write(f"Number of transactions: {len(df)}")
 39 |     st.write(f"Number of items: {len(df.columns)}")
 40 |     st.write("Sample data:")
 41 |     st.dataframe(df.head())
 42 | 
 43 |     # Hierarchical Clustering parameters
 44 |     st.subheader("Clustering Parameters")
 45 |     n_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
 46 |     linkage_method = st.selectbox("Linkage Method", ['ward', 'complete', 'average', 'single'])
 47 | 
 48 |     # Prepare data for clustering
 49 |     X = df.values
 50 |     scaler = StandardScaler()
 51 |     X_scaled = scaler.fit_transform(X)
 52 | 
 53 |     # Perform hierarchical clustering
 54 |     clustering = AgglomerativeClustering(
 55 |         n_clusters=n_clusters,
 56 |         linkage=linkage_method
 57 |     )
 58 |     clusters = clustering.fit_predict(X_scaled)
 59 | 
 60 |     # Create dendrogram
 61 |     st.subheader("Dendrogram")
 62 |     Z = linkage(X_scaled, method=linkage_method)
 63 |     fig, ax = plt.subplots(figsize=(10, 7))
 64 |     dendrogram(Z, truncate_mode='level', p=5)
 65 |     plt.title('Hierarchical Clustering Dendrogram')
 66 |     plt.xlabel('Sample Index')
 67 |     plt.ylabel('Distance')
 68 |     st.pyplot(fig)
 69 | 
 70 |     # Item correlation matrix
 71 |     st.subheader("Item Correlation Matrix")
 72 |     correlation_matrix = df.corr()
 73 |     fig, ax = plt.subplots(figsize=(12, 10))
 74 |     sns.heatmap(correlation_matrix, cmap='coolwarm', center=0)
 75 |     plt.title('Item Correlation Matrix')
 76 |     st.pyplot(fig)
 77 | 
 78 |     # Cluster visualization using PCA
 79 |     from sklearn.decomposition import PCA
 80 |     pca = PCA(n_components=2)
 81 |     X_pca = pca.fit_transform(X_scaled)
 82 |     
 83 |     # Create DataFrame for visualization
 84 |     viz_df = pd.DataFrame({
 85 |         'PC1': X_pca[:, 0],
 86 |         'PC2': X_pca[:, 1],
 87 |         'Cluster': clusters
 88 |     })
 89 | 
 90 |     # 2D Scatter plot
 91 |     fig = px.scatter(viz_df, x='PC1', y='PC2', color='Cluster',
 92 |                     title='Transaction Clusters (PCA Visualization)')
 93 |     st.plotly_chart(fig)
 94 | 
 95 |     # Cluster analysis
 96 |     st.subheader("Cluster Analysis")
 97 |     for cluster in range(n_clusters):
 98 |         st.write(f"\nCluster {cluster}:")
 99 |         cluster_data = df[clusters == cluster]
100 |         st.write(f"Number of transactions: {len(cluster_data)}")
101 |         
102 |         # Calculate item frequencies in this cluster
103 |         item_freq = cluster_data.mean().sort_values(ascending=False)
104 |         top_items = item_freq[item_freq > 0.1]  # Show items that appear in more than 10% of transactions
105 |         
106 |         st.write("Top items in this cluster:")
107 |         for item, freq in top_items.items():
108 |             st.write(f"- {item}: {freq:.1%} of transactions")
109 | 
110 |     # Cluster statistics
111 |     st.subheader("Cluster Statistics")
112 |     cluster_sizes = pd.Series(clusters).value_counts().sort_index()
113 |     fig = px.bar(x=cluster_sizes.index, y=cluster_sizes.values,
114 |                  title='Number of Transactions per Cluster',
115 |                  labels={'x': 'Cluster', 'y': 'Number of Transactions'})
116 |     st.plotly_chart(fig)
117 | 
118 |     # Item frequency by cluster
119 |     st.subheader("Item Frequency by Cluster")
120 |     cluster_item_freq = df.groupby(clusters).mean()
121 |     
122 |     # Create heatmap
123 |     fig, ax = plt.subplots(figsize=(12, 8))
124 |     sns.heatmap(cluster_item_freq, cmap='YlOrRd')
125 |     plt.title('Item Frequency by Cluster')
126 |     st.pyplot(fig)
127 | 
128 |     # Association rules
129 |     st.subheader("Association Rules")
130 |     from mlxtend.frequent_patterns import apriori, association_rules
131 |     
132 |     # Generate frequent itemsets
133 |     frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
134 |     
135 |     # Generate rules
136 |     rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
137 |     
138 |     if not rules.empty:
139 |         st.write("Top Association Rules:")
140 |         # Convert frozensets to strings for display
141 |         display_rules = rules.copy()
142 |         display_rules['antecedents'] = display_rules['antecedents'].apply(lambda x: ', '.join(list(x)))
143 |         display_rules['consequents'] = display_rules['consequents'].apply(lambda x: ', '.join(list(x)))
144 |         st.dataframe(display_rules.head())
145 |         
146 |         # Create a copy of rules for visualization with string versions of frozensets
147 |         viz_rules = rules.copy()
148 |         viz_rules['antecedents_str'] = viz_rules['antecedents'].apply(lambda x: ', '.join(list(x)))
149 |         viz_rules['consequents_str'] = viz_rules['consequents'].apply(lambda x: ', '.join(list(x)))
150 |         
151 |         # Visualize rules
152 |         fig = px.scatter(viz_rules, x='support', y='confidence',
153 |                         size='lift', color='lift',
154 |                         hover_data=['antecedents_str', 'consequents_str'],
155 |                         title='Association Rules Visualization')
156 |         st.plotly_chart(fig)
157 | 
158 | if __name__ == "__main__":
159 |     run() 


--------------------------------------------------------------------------------
/Hierarchical_Clustering/README.md:
--------------------------------------------------------------------------------
 1 | # Hierarchical Clustering Projects
 2 | 
 3 | This folder contains various projects that utilize Hierarchical Clustering for different applications. Each project is designed to demonstrate the use of hierarchical clustering in machine learning tasks with interactive visualizations.
 4 | 
 5 | ## Projects
 6 | 
 7 | 1. **Document Clustering**: Clusters text documents based on their content using hierarchical clustering. Features include:
 8 | 
 9 |    **Screenshots:**
10 |    ![Document Clustering](screenshots/doc_clust.png)
11 |    - Interactive parameter tuning
12 |    - Dendrogram visualization
13 |    - Document similarity matrix
14 |    - Word cloud visualization
15 |    - Cluster analysis and interpretation
16 | 
17 | 2. **Market Basket Analysis**: Analyzes shopping patterns using hierarchical clustering. Features include:
18 | 
19 |    **Screenshots:**
20 |    ![Market Basket Analysis](screenshots/market_basket.png)
21 |    - Transaction clustering
22 |    - Item correlation analysis
23 |    - Association rules mining
24 |    - Interactive visualizations
25 |    - Cluster analysis and interpretation
26 | 
27 | ## How to Run
28 | 
29 | To run any of the projects, follow these steps:
30 | 
31 | 1. Ensure you have the required dependencies installed. You can install them using pip:
32 | 
33 |    ```bash
34 |    pip install streamlit pandas numpy scikit-learn plotly matplotlib seaborn wordcloud mlxtend
35 |    ```
36 | 
37 | 2. Navigate to the Hierarchical directory in your terminal.
38 | 
39 | 3. Run the Streamlit app using the following command:
40 | 
41 |    ```bash
42 |    streamlit run main.py
43 |    ```
44 | 
45 | 4. Use the sidebar to select the project you want to run.
46 | 
47 | ## Project Structure
48 | 
49 | - `main.py`: The main entry point for running the projects.
50 | - `Hierarchical_projects/`: Contains individual project files:
51 |   - `document_clustering.py`: Document clustering project.
52 |   - `market_basket_analysis.py`: Market basket analysis project.
53 | 
54 | ## Features
55 | 
56 | - Interactive parameter tuning
57 | - Real-time visualizations
58 | - Detailed cluster analysis
59 | - Support for custom data input
60 | - Rich visualization tools including:
61 |   - Dendrograms
62 |   - Heatmaps
63 |   - Scatter plots
64 |   - Word clouds
65 |   - Association rule visualizations
66 | 
67 | ## Contributing
68 | 
69 | Feel free to contribute to these projects by submitting pull requests or opening issues for any bugs or feature requests.
70 | 
71 | ## License
72 | 
73 | This project is licensed under the MIT License - see the LICENSE file for details. 


--------------------------------------------------------------------------------
/Hierarchical_Clustering/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the Decision_Trees_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | from Hierarchical_projects import (
 8 |     document_clustering,
 9 |     market_basket_analysis,
10 | 
11 | )
12 | 
13 | def run():
14 |     st.title("Clustering Projects")
15 | 
16 |     # Sidebar for project selection
17 |     project = st.sidebar.selectbox(
18 |         "Select a project",
19 |         [
20 |             "Document Clustering",
21 |             "Market Basket Analysis",
22 |             
23 |         ],
24 |     )
25 | 
26 |     # Run the selected project
27 |     if project == "Document Clustering":
28 |         document_clustering.run()
29 |     elif project == "Market Basket Analysis":
30 |         market_basket_analysis.run()
31 |     
32 | 
33 | if __name__ == "__main__":
34 |     run() 


--------------------------------------------------------------------------------
/Hierarchical_Clustering/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.22.0
2 | pandas==1.5.3
3 | numpy==1.24.3
4 | scikit-learn==1.2.2
5 | plotly==5.13.1
6 | matplotlib==3.7.1
7 | seaborn==0.12.2
8 | wordcloud==1.9.2
9 | mlxtend==0.23.1 


--------------------------------------------------------------------------------
/Hierarchical_Clustering/screenshots/doc_clust.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Hierarchical_Clustering/screenshots/doc_clust.png


--------------------------------------------------------------------------------
/Hierarchical_Clustering/screenshots/market_basket.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Hierarchical_Clustering/screenshots/market_basket.png


--------------------------------------------------------------------------------
/K-Means/K_Means_projects/customer_segmentation.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | import plotly.express as px
  5 | import plotly.graph_objects as go
  6 | from sklearn.cluster import KMeans
  7 | from sklearn.preprocessing import StandardScaler
  8 | from sklearn.decomposition import PCA
  9 | import seaborn as sns
 10 | import matplotlib.pyplot as plt
 11 | 
 12 | def run():
 13 |     st.header("Customer Segmentation using K-Means Clustering")
 14 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/K-Means)", unsafe_allow_html=True)
 15 | 
 16 |     # Load or generate dataset
 17 |     uploaded_file = st.file_uploader("Upload a CSV file with customer data", type=["csv"])
 18 |     if uploaded_file is not None:
 19 |         df = pd.read_csv(uploaded_file)
 20 |     else:
 21 |         st.info("Using sample customer data")
 22 |         # Generate sample customer data
 23 |         np.random.seed(42)
 24 |         n_customers = 1000
 25 |         
 26 |         data = {
 27 |             'Customer_ID': range(1, n_customers + 1),
 28 |             'Age': np.random.randint(18, 70, n_customers),
 29 |             'Annual_Income': np.random.randint(20000, 150000, n_customers),
 30 |             'Spending_Score': np.random.randint(1, 100, n_customers),
 31 |             'Purchase_Frequency': np.random.randint(1, 50, n_customers),
 32 |             'Average_Order_Value': np.random.randint(50, 500, n_customers),
 33 |             'Days_Since_Last_Purchase': np.random.randint(0, 365, n_customers)
 34 |         }
 35 |         df = pd.DataFrame(data)
 36 | 
 37 |     # Display data info
 38 |     st.subheader("Dataset Information")
 39 |     st.write(f"Number of customers: {len(df)}")
 40 |     st.write("Sample data:")
 41 |     st.dataframe(df.head())
 42 | 
 43 |     # Feature selection
 44 |     st.subheader("Feature Selection")
 45 |     features = ['Age', 'Annual_Income', 'Spending_Score', 'Purchase_Frequency', 
 46 |                 'Average_Order_Value', 'Days_Since_Last_Purchase']
 47 |     selected_features = st.multiselect("Select features for clustering", features, 
 48 |                                      default=['Annual_Income', 'Spending_Score'])
 49 | 
 50 |     if len(selected_features) >= 2:
 51 |         # Prepare data
 52 |         X = df[selected_features]
 53 |         scaler = StandardScaler()
 54 |         X_scaled = scaler.fit_transform(X)
 55 | 
 56 |         # K-Means parameters
 57 |         st.subheader("Clustering Parameters")
 58 |         n_clusters = st.slider("Number of Clusters", min_value=2, max_value=6, value=4)
 59 |         random_state = st.slider("Random State", min_value=0, max_value=100, value=42)
 60 | 
 61 |         # Apply K-Means
 62 |         kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
 63 |         df['Segment'] = kmeans.fit_predict(X_scaled)
 64 | 
 65 |         # Label segments based on average income and spending score
 66 |         cluster_means = df.groupby('Segment')[['Annual_Income', 'Spending_Score']].mean()
 67 |         segment_labels = {
 68 |             0: 'High Value',
 69 |             1: 'Low Value',
 70 |             2: 'High Potential',
 71 |             3: 'At Risk'
 72 |         }
 73 |         df['Segment'] = df['Segment'].map(segment_labels)
 74 | 
 75 |         # Visualize clusters using PCA
 76 |         pca = PCA(n_components=2)
 77 |         X_pca = pca.fit_transform(X_scaled)
 78 |         df['PCA1'] = X_pca[:, 0]
 79 |         df['PCA2'] = X_pca[:, 1]
 80 | 
 81 |         # PCA Scatter plot
 82 |         fig = px.scatter(df, x='PCA1', y='PCA2', color='Segment',
 83 |                         hover_data=selected_features,
 84 |                         title='Customer Segments (PCA Visualization)')
 85 |         st.plotly_chart(fig)
 86 | 
 87 |         # Cluster Analysis
 88 |         st.subheader("Segment Analysis")
 89 |         for segment in df['Segment'].unique():
 90 |             st.write(f"\n{segment} Customers:")
 91 |             segment_data = df[df['Segment'] == segment]
 92 |             st.write(f"Number of customers: {len(segment_data)}")
 93 |             
 94 |             # Display segment statistics
 95 |             stats = segment_data[selected_features].describe()
 96 |             st.write("Segment Statistics:")
 97 |             st.dataframe(stats)
 98 | 
 99 |         # Feature importance visualization
100 |         st.subheader("Feature Importance by Segment")
101 |         segment_means = df.groupby('Segment')[selected_features].mean()
102 |         fig = px.bar(segment_means, title='Average Feature Values by Segment',
103 |                     labels={'value': 'Average Value', 'variable': 'Feature'})
104 |         st.plotly_chart(fig)
105 | 
106 |         # Correlation heatmap
107 |         st.subheader("Feature Correlation Matrix")
108 |         correlation_matrix = df[selected_features].corr()
109 |         fig, ax = plt.subplots(figsize=(10, 8))
110 |         sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
111 |         st.pyplot(fig)
112 | 
113 |         # Segment distribution
114 |         st.subheader("Segment Distribution")
115 |         segment_counts = df['Segment'].value_counts()
116 |         fig = px.pie(values=segment_counts.values, names=segment_counts.index,
117 |                     title='Customer Segment Distribution')
118 |         st.plotly_chart(fig)
119 | 
120 |         # Interactive prediction
121 |         st.subheader("Predict Segment for New Customer")
122 |         input_values = {}
123 |         for feature in selected_features:
124 |             if feature == 'Age':
125 |                 input_values[feature] = st.number_input(feature, min_value=18, max_value=70, value=35)
126 |             elif feature == 'Annual_Income':
127 |                 input_values[feature] = st.number_input(feature, min_value=20000, max_value=150000, value=50000)
128 |             elif feature == 'Spending_Score':
129 |                 input_values[feature] = st.number_input(feature, min_value=1, max_value=100, value=50)
130 |             elif feature == 'Purchase_Frequency':
131 |                 input_values[feature] = st.number_input(feature, min_value=1, max_value=50, value=10)
132 |             elif feature == 'Average_Order_Value':
133 |                 input_values[feature] = st.number_input(feature, min_value=50, max_value=500, value=100)
134 |             elif feature == 'Days_Since_Last_Purchase':
135 |                 input_values[feature] = st.number_input(feature, min_value=0, max_value=365, value=30)
136 | 
137 |         if st.button("Predict Customer Segment"):
138 |             # Create input array with only the selected features
139 |             new_customer = np.array([[input_values[feature] for feature in selected_features]])
140 |             new_customer_scaled = scaler.transform(new_customer)
141 |             prediction = kmeans.predict(new_customer_scaled)[0]
142 |             segment = segment_labels[prediction]
143 |             st.success(f"Predicted Customer Segment: {segment}")
144 | 
145 | if __name__ == "__main__":
146 |     run() 


--------------------------------------------------------------------------------
/K-Means/K_Means_projects/loan_approval.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | import plotly.express as px
  5 | import plotly.graph_objects as go
  6 | from sklearn.cluster import KMeans
  7 | from sklearn.preprocessing import StandardScaler
  8 | from sklearn.decomposition import PCA
  9 | import seaborn as sns
 10 | import matplotlib.pyplot as plt
 11 | 
 12 | def run():
 13 |     st.header("Loan Approval Analysis using K-Means Clustering")
 14 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/K-Means)", unsafe_allow_html=True)
 15 | 
 16 |     # Load or generate dataset
 17 |     uploaded_file = st.file_uploader("Upload a CSV file with loan data", type=["csv"])
 18 |     if uploaded_file is not None:
 19 |         df = pd.read_csv(uploaded_file)
 20 |     else:
 21 |         st.info("Using sample loan data")
 22 |         # Generate sample loan data
 23 |         np.random.seed(42)
 24 |         n_applicants = 1000
 25 |         
 26 |         data = {
 27 |             'Applicant_ID': range(1, n_applicants + 1),
 28 |             'Annual_Income': np.random.randint(30000, 120000, n_applicants),
 29 |             'Credit_Score': np.random.randint(300, 850, n_applicants),
 30 |             'Loan_Amount': np.random.randint(5000, 50000, n_applicants),
 31 |             'Debt_to_Income_Ratio': np.random.uniform(0.1, 0.5, n_applicants),
 32 |             'Employment_Years': np.random.randint(0, 30, n_applicants),
 33 |             'Age': np.random.randint(18, 65, n_applicants)
 34 |         }
 35 |         df = pd.DataFrame(data)
 36 | 
 37 |     # Display data info
 38 |     st.subheader("Dataset Information")
 39 |     st.write(f"Number of applicants: {len(df)}")
 40 |     st.write("Sample data:")
 41 |     st.dataframe(df.head())
 42 | 
 43 |     # Feature selection
 44 |     st.subheader("Feature Selection")
 45 |     features = ['Annual_Income', 'Credit_Score', 'Loan_Amount', 'Debt_to_Income_Ratio', 
 46 |                 'Employment_Years', 'Age']
 47 |     selected_features = st.multiselect("Select features for clustering", features, 
 48 |                                      default=['Annual_Income', 'Credit_Score', 'Loan_Amount', 'Debt_to_Income_Ratio'])
 49 | 
 50 |     if len(selected_features) >= 2:
 51 |         # Prepare data
 52 |         X = df[selected_features]
 53 |         scaler = StandardScaler()
 54 |         X_scaled = scaler.fit_transform(X)
 55 | 
 56 |         # K-Means parameters
 57 |         st.subheader("Clustering Parameters")
 58 |         n_clusters = st.slider("Number of Clusters", min_value=2, max_value=6, value=3)
 59 |         random_state = st.slider("Random State", min_value=0, max_value=100, value=42)
 60 | 
 61 |         # Apply K-Means
 62 |         kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
 63 |         df['Risk_Category'] = kmeans.fit_predict(X_scaled)
 64 | 
 65 |         # Label clusters based on average credit score and income
 66 |         cluster_means = df.groupby('Risk_Category')[['Credit_Score', 'Annual_Income']].mean()
 67 |         risk_order = cluster_means['Credit_Score'].rank(ascending=False).astype(int) - 1
 68 |         risk_labels = {i: f"{['Low', 'Medium', 'High'][j]} Risk" 
 69 |                       for i, j in risk_order.items()}
 70 |         df['Risk_Category'] = df['Risk_Category'].map(risk_labels)
 71 | 
 72 |         # Visualize clusters using PCA
 73 |         pca = PCA(n_components=2)
 74 |         X_pca = pca.fit_transform(X_scaled)
 75 |         df['PCA1'] = X_pca[:, 0]
 76 |         df['PCA2'] = X_pca[:, 1]
 77 | 
 78 |         # PCA Scatter plot
 79 |         fig = px.scatter(df, x='PCA1', y='PCA2', color='Risk_Category',
 80 |                         hover_data=selected_features,
 81 |                         title='Loan Approval Clusters (PCA Visualization)')
 82 |         st.plotly_chart(fig)
 83 | 
 84 |         # Cluster Analysis
 85 |         st.subheader("Cluster Analysis")
 86 |         for risk in df['Risk_Category'].unique():
 87 |             st.write(f"\n{risk} Applicants:")
 88 |             cluster_data = df[df['Risk_Category'] == risk]
 89 |             st.write(f"Number of applicants: {len(cluster_data)}")
 90 |             
 91 |             # Display cluster statistics
 92 |             stats = cluster_data[selected_features].describe()
 93 |             st.write("Cluster Statistics:")
 94 |             st.dataframe(stats)
 95 | 
 96 |         # Feature importance visualization
 97 |         st.subheader("Feature Importance by Cluster")
 98 |         cluster_means = df.groupby('Risk_Category')[selected_features].mean()
 99 |         fig = px.bar(cluster_means, title='Average Feature Values by Risk Category',
100 |                     labels={'value': 'Average Value', 'variable': 'Feature'})
101 |         st.plotly_chart(fig)
102 | 
103 |         # Correlation heatmap
104 |         st.subheader("Feature Correlation Matrix")
105 |         correlation_matrix = df[selected_features].corr()
106 |         fig, ax = plt.subplots(figsize=(10, 8))
107 |         sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
108 |         st.pyplot(fig)
109 | 
110 |         # Interactive prediction
111 |         st.subheader("Predict Risk Category for New Applicant")
112 |         col1, col2 = st.columns(2)
113 |         with col1:
114 |             income = st.number_input("Annual Income", min_value=30000, max_value=120000, value=50000)
115 |             credit_score = st.number_input("Credit Score", min_value=300, max_value=850, value=700)
116 |         with col2:
117 |             loan_amount = st.number_input("Loan Amount", min_value=5000, max_value=50000, value=20000)
118 |             dti_ratio = st.number_input("Debt-to-Income Ratio", min_value=0.1, max_value=0.5, value=0.3)
119 | 
120 |         if st.button("Predict Risk Category"):
121 |             new_applicant = np.array([[income, credit_score, loan_amount, dti_ratio]])
122 |             new_applicant_scaled = scaler.transform(new_applicant)
123 |             prediction = kmeans.predict(new_applicant_scaled)[0]
124 |             risk_category = risk_labels[prediction]
125 |             st.success(f"Predicted Risk Category: {risk_category}")
126 | 
127 | if __name__ == "__main__":
128 |     run() 


--------------------------------------------------------------------------------
/K-Means/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the Decision_Trees_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | from K_Means_projects import (
 8 |     loan_approval,
 9 |     customer_segmentation
10 | )
11 | 
12 | def run():
13 |     st.title("K-Means Clustering Projects")
14 | 
15 |     # Sidebar for project selection
16 |     project = st.sidebar.selectbox(
17 |         "Select a project",
18 |         [
19 |             "Loan Approval",
20 |             "Customer Segmentation"
21 |         ],
22 |     )
23 | 
24 |     # Run the selected project
25 |     if project == "Loan Approval":
26 |         loan_approval.run()
27 |     elif project == "Customer Segmentation":
28 |         customer_segmentation.run()
29 | 
30 | if __name__ == "__main__":
31 |     run()
32 | 


--------------------------------------------------------------------------------
/K-Means/readme.md:
--------------------------------------------------------------------------------
 1 | # K-Means Projects
 2 | 
 3 | This repository contains various K-Means clustering projects implemented in Python. Each project demonstrates the application of K-Means clustering to solve real-world problems using datasets.
 4 | 
 5 | ## Project Structure
 6 | 
 7 | ```
 8 | K-Means/
 9 | ├── main.py
10 | ├── requirements.txt
11 | ├── K_Means_projects/
12 | │   ├── customer_segmentation.py
13 | │   ├── loan_approval.py
14 | ```
15 | 
16 | ### Key Files
17 | - **`main.py`**: The main entry point for running the Streamlit app.
18 | - **`requirements.txt`**: Contains the dependencies required to run the project.
19 | - **`K_Means_projects/`**: Contains individual project scripts.
20 | 
21 | ## Projects Included
22 | 
23 | 1. **Customer Segmentation**  
24 |    Segments customers into different groups based on their behavior and characteristics using K-Means clustering.
25 | 
26 |    **Screenshots:**
27 |    ![Customer Segmentation](screenshots/cust_seg.png)
28 | 
29 |    - Interactive parameter tuning
30 |    - Cluster visualization
31 |    - Customer group analysis
32 | 
33 | 2. **Loan Approval Clustering**  
34 |    Groups loan applications into clusters based on various features using K-Means clustering.
35 | 
36 |    **Screenshots:**
37 |    ![Loan Approval Clustering](screenshots/loan.png)
38 | 
39 |    - Risk assessment
40 |    - Cluster analysis
41 |    - Interactive visualization
42 | 
43 | ## How to Run
44 | 
45 | 1. Clone the repository:
46 |    ```bash
47 |    git clone https://github.com/benasphy/ML_projects.git
48 |    cd K-Means
49 |    ```
50 | 
51 | 2. Install dependencies:
52 |    ```bash
53 |    pip install -r requirements.txt
54 |    ```
55 | 
56 | 3. Run the Streamlit app:
57 |    ```bash
58 |    streamlit run main.py
59 |    ```
60 | 
61 | 4. Select a project from the sidebar to explore its functionality.
62 | 
63 | ## Requirements
64 | 
65 | The project requires the following Python libraries:
66 | - `streamlit`
67 | - `numpy`
68 | - `pandas`
69 | - `scikit-learn`
70 | - `matplotlib`
71 | - `plotly`
72 | 
73 | ## Datasets
74 | 
75 | - **`customer_data.csv`**: Contains customer behavior data for segmentation.
76 | - **`loan_data.csv`**: Contains loan application data for clustering.
77 | 
78 | ## License
79 | 
80 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
81 | 
82 | ## Acknowledgments
83 | 
84 | - Datasets used in this project are sourced from publicly available repositories.
85 | - Special thanks to the contributors of the Python libraries used in this project.
86 | 
87 | ---
88 | Feel free to contribute to this repository by submitting issues or pull requests.
89 | 


--------------------------------------------------------------------------------
/K-Means/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.22.0
2 | pandas==1.5.3
3 | numpy==1.24.3
4 | scikit-learn==1.2.2
5 | plotly==5.13.1
6 | matplotlib==3.7.1
7 | seaborn==0.12.2
8 | scipy
9 | 


--------------------------------------------------------------------------------
/K-Means/screenshots/cust_seg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/K-Means/screenshots/cust_seg.png


--------------------------------------------------------------------------------
/K-Means/screenshots/loan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/K-Means/screenshots/loan.png


--------------------------------------------------------------------------------
/KNN/KNN_projects/TShirt_size.csv:
--------------------------------------------------------------------------------
 1 | ﻿Height (in cms),Weight (in kgs),T Shirt Size
 2 | 158,58,M
 3 | 158,59,M
 4 | 158,63,M
 5 | 160,59,M
 6 | 160,60,M
 7 | 163,60,M
 8 | 163,61,M
 9 | 160,64,L
10 | 163,64,L
11 | 165,61,L
12 | 165,62,L
13 | 165,65,L
14 | 168,62,L
15 | 168,63,L
16 | 168,66,L
17 | 170,63,L
18 | 170,64,L
19 | 170,68,L
20 | 


--------------------------------------------------------------------------------
/KNN/KNN_projects/netflix_titles.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/KNN/KNN_projects/netflix_titles.csv


--------------------------------------------------------------------------------
/KNN/KNN_projects/tshirt_size_prediction.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | import os
  5 | from sklearn.neighbors import KNeighborsClassifier
  6 | from sklearn.preprocessing import LabelEncoder, StandardScaler
  7 | from sklearn.model_selection import train_test_split, cross_val_score
  8 | from sklearn.metrics import classification_report, confusion_matrix
  9 | import plotly.express as px
 10 | import plotly.graph_objects as go
 11 | import seaborn as sns
 12 | import matplotlib.pyplot as plt
 13 | 
 14 | def run():
 15 |     st.header("T-Shirt Size Prediction using KNN")
 16 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/KNN)", unsafe_allow_html=True)
 17 | 
 18 |     # Load dataset
 19 |     uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
 20 |     if uploaded_file is not None:
 21 |         df = pd.read_csv(uploaded_file)
 22 |     else:
 23 |         st.info("Using default dataset: TShirt_size.csv")
 24 |         df = pd.read_csv(os.path.join(os.path.dirname(__file__), "TShirt_size.csv"))
 25 | 
 26 |     # Display dataset info
 27 |     st.subheader("Dataset Overview")
 28 |     col1, col2 = st.columns(2)
 29 |     with col1:
 30 |         st.write("Dataset Shape:", df.shape)
 31 |         st.write("Number of Samples:", len(df))
 32 |     with col2:
 33 |         size_dist = df["T Shirt Size"].value_counts()
 34 |         fig = px.pie(values=size_dist.values, names=size_dist.index,
 35 |                     title='T-Shirt Size Distribution')
 36 |         st.plotly_chart(fig)
 37 | 
 38 |     # Data Analysis
 39 |     st.subheader("Data Analysis")
 40 |     
 41 |     # Height and Weight Distribution
 42 |     col1, col2 = st.columns(2)
 43 |     with col1:
 44 |         fig = px.box(df, x="T Shirt Size", y="Height (in cms)",
 45 |                     title='Height Distribution by Size')
 46 |         st.plotly_chart(fig)
 47 |     
 48 |     with col2:
 49 |         fig = px.box(df, x="T Shirt Size", y="Weight (in kgs)",
 50 |                     title='Weight Distribution by Size')
 51 |         st.plotly_chart(fig)
 52 | 
 53 |     # Scatter plot with size distribution
 54 |     fig = px.scatter(df, x="Height (in cms)", y="Weight (in kgs)",
 55 |                     color="T Shirt Size",
 56 |                     title='Height vs Weight by T-Shirt Size')
 57 |     st.plotly_chart(fig)
 58 | 
 59 |     # Data preprocessing
 60 |     encoder = LabelEncoder()
 61 |     df["T Shirt Size"] = encoder.fit_transform(df["T Shirt Size"])
 62 |     
 63 |     X = df[["Height (in cms)", "Weight (in kgs)"]]
 64 |     y = df["T Shirt Size"]
 65 | 
 66 |     # Split data
 67 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
 68 | 
 69 |     # Scaling
 70 |     scaler = StandardScaler()
 71 |     scaler.fit(X_train)
 72 |     X_train = scaler.transform(X_train)
 73 |     X_test = scaler.transform(X_test)
 74 | 
 75 |     # Train model
 76 |     model = KNeighborsClassifier(n_neighbors=3, metric="manhattan")
 77 |     model.fit(X_train, y_train)
 78 | 
 79 |     # Model evaluation
 80 |     st.subheader("Model Performance")
 81 |     y_pred = model.predict(X_test)
 82 |     
 83 |     # Display metrics
 84 |     col1, col2, col3 = st.columns(3)
 85 |     with col1:
 86 |         scores = cross_val_score(model, X, y, cv=5, scoring='precision_weighted')
 87 |         st.metric("Average Precision", f"{scores.mean():.2%}")
 88 |     with col2:
 89 |         st.metric("Standard Deviation", f"{scores.std():.2%}")
 90 |     with col3:
 91 |         accuracy = (y_pred == y_test).mean()
 92 |         st.metric("Test Accuracy", f"{accuracy:.2%}")
 93 | 
 94 |     # Confusion Matrix
 95 |     st.subheader("Confusion Matrix")
 96 |     cm = confusion_matrix(y_test, y_pred)
 97 |     fig = px.imshow(cm,
 98 |                    labels=dict(x="Predicted", y="Actual", color="Count"),
 99 |                    x=['Medium', 'Large'],
100 |                    y=['Medium', 'Large'],
101 |                    text_auto=True,
102 |                    aspect="auto")
103 |     st.plotly_chart(fig)
104 | 
105 |     # KNN Visualization
106 |     st.subheader("KNN Decision Boundaries")
107 |     
108 |     # Create a mesh grid
109 |     h_min, h_max = X["Height (in cms)"].min() - 1, X["Height (in cms)"].max() + 1
110 |     w_min, w_max = X["Weight (in kgs)"].min() - 1, X["Weight (in kgs)"].max() + 1
111 |     h_grid = np.arange(h_min, h_max, 0.5)
112 |     w_grid = np.arange(w_min, w_max, 0.5)
113 |     hh, ww = np.meshgrid(h_grid, w_grid)
114 |     
115 |     # Predict for each point in the grid
116 |     grid_points = np.c_[hh.ravel(), ww.ravel()]
117 |     grid_points_scaled = scaler.transform(grid_points)
118 |     grid_predictions = model.predict(grid_points_scaled)
119 |     
120 |     # Plot decision boundaries
121 |     fig = px.scatter(x=grid_points[:, 0], y=grid_points[:, 1],
122 |                     color=grid_predictions,
123 |                     title='KNN Decision Boundaries')
124 |     fig.add_scatter(x=X["Height (in cms)"], y=X["Weight (in kgs)"],
125 |                    mode='markers',
126 |                    marker=dict(color=y, symbol='circle'),
127 |                    name='Training Data')
128 |     st.plotly_chart(fig)
129 | 
130 |     # Prediction interface
131 |     st.subheader("Predict T-Shirt Size")
132 |     col1, col2 = st.columns(2)
133 |     with col1:
134 |         height = st.number_input("Height (in cms):", min_value=140, max_value=200, value=170)
135 |     with col2:
136 |         weight = st.number_input("Weight (in kgs):", min_value=40, max_value=120, value=70)
137 | 
138 |     if st.button("Predict Size"):
139 |         new_sample = np.array([height, weight]).reshape(1, -1)
140 |         new_sample_scaled = scaler.transform(new_sample)
141 |         prediction = model.predict(new_sample_scaled)[0]
142 |         probabilities = model.predict_proba(new_sample_scaled)[0]
143 |         
144 |         size_mapping = {0: "Large", 1: "Medium"}
145 |         predicted_size = size_mapping[prediction]
146 |         
147 |         # Display prediction
148 |         col1, col2 = st.columns(2)
149 |         with col1:
150 |             st.metric("Predicted Size", predicted_size)
151 |         with col2:
152 |             st.metric("Confidence", f"{max(probabilities):.2%}")
153 |         
154 |         # Visualize prediction probabilities
155 |         fig = go.Figure(data=[
156 |             go.Bar(x=['Large', 'Medium'],
157 |                   y=probabilities,
158 |                   text=[f'{p:.2%}' for p in probabilities],
159 |                   textposition='auto',
160 |             )
161 |         ])
162 |         fig.update_layout(title='Prediction Probabilities',
163 |                         xaxis_title='Size',
164 |                         yaxis_title='Probability')
165 |         st.plotly_chart(fig)
166 |         
167 |         # Show nearest neighbors
168 |         st.subheader("Nearest Neighbors")
169 |         distances, indices = model.kneighbors(new_sample_scaled)
170 |         
171 |         neighbors_df = pd.DataFrame({
172 |             'Height (cm)': X.iloc[indices[0]]["Height (in cms)"],
173 |             'Weight (kg)': X.iloc[indices[0]]["Weight (in kgs)"],
174 |             'Size': [size_mapping[y.iloc[i]] for i in indices[0]],
175 |             'Distance': distances[0]
176 |         })
177 |         
178 |         fig = px.scatter(neighbors_df, x='Height (cm)', y='Weight (kg)',
179 |                         color='Size',
180 |                         size='Distance',
181 |                         title='Nearest Neighbors',
182 |                         hover_data=['Distance'])
183 |         fig.add_scatter(x=[height], y=[weight],
184 |                        mode='markers',
185 |                        marker=dict(color='red', symbol='star', size=15),
186 |                        name='Your Measurements')
187 |         st.plotly_chart(fig)
188 | 
189 | if __name__ == "__main__":
190 |     run() 


--------------------------------------------------------------------------------
/KNN/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the KNN_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | 
 8 | from KNN_projects import (
 9 |     movie_recommendation,
10 |     tshirt_size_prediction,
11 | )
12 | 
13 | def run():
14 |     st.title("KNN Projects")
15 | 
16 |     # Sidebar for project selection
17 |     project = st.sidebar.selectbox(
18 |         "Select a project",
19 |         [
20 |             "Movie Recommendation",
21 |             "T-Shirt Size Prediction",
22 |         ],
23 |     )
24 | 
25 |     # Run the selected project
26 |     if project == "Movie Recommendation":
27 |         movie_recommendation.run()
28 |     elif project == "T-Shirt Size Prediction":
29 |         tshirt_size_prediction.run()
30 | 
31 | if __name__ == "__main__":
32 |     run()
33 | 


--------------------------------------------------------------------------------
/KNN/readme.md:
--------------------------------------------------------------------------------
 1 | # KNN Projects
 2 | 
 3 | This folder contains various projects that utilize the K-Nearest Neighbors (KNN) algorithm for different applications. Each project is designed to demonstrate the use of KNN in machine learning tasks.
 4 | 
 5 | ## Projects
 6 | 
 7 | 1. **Movie Recommendation System**: Recommends similar movies based on content features using KNN.
 8 | 
 9 |    **Screenshots:**
10 |    ![Movie Recommendation 1](screenshots/movie1.png)
11 |    ![Movie Recommendation 2](screenshots/movie2.png)
12 | 2. **T-Shirt Size Prediction**: Predicts T-shirt sizes based on height and weight measurements using KNN.
13 | 
14 |    **Screenshots:**
15 |    ![T-Shirt Size Prediction](screenshots/t-shirt.png)
16 | 
17 | ## How to Run
18 | 
19 | To run any of the projects, follow these steps:
20 | 
21 | 1. Ensure you have the required dependencies installed. You can install them using pip:
22 | 
23 |    ```bash
24 |    pip install streamlit pandas numpy scikit-learn
25 |    ```
26 | 
27 | 2. Navigate to the KNN directory in your terminal.
28 | 
29 | 3. Run the Streamlit app using the following command:
30 | 
31 |    ```bash
32 |    streamlit run main.py
33 |    ```
34 | 
35 | 4. Use the sidebar to select the project you want to run.
36 | 
37 | ## Project Structure
38 | 
39 | - `main.py`: The main entry point for running the projects.
40 | - `KNN_projects/`: Contains individual project files:
41 |   - `movie_recommendation.py`: Movie recommendation system project.
42 |   - `tshirt_size_prediction.py`: T-shirt size prediction project.
43 | 
44 | ## Data
45 | 
46 | Each project uses its own dataset, which is either uploaded by the user or loaded from a default CSV file located in the `KNN_projects/` directory.
47 | 
48 | ## Contributing
49 | 
50 | Feel free to contribute to these projects by submitting pull requests or opening issues for any bugs or feature requests.
51 | 
52 | ## License
53 | 
54 | This project is licensed under the MIT License - see the LICENSE file for details.
55 | 


--------------------------------------------------------------------------------
/KNN/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.22.0
2 | pandas==1.5.3
3 | numpy==1.24.3
4 | scikit-learn==1.2.2
5 | plotly
6 | scipy
7 | seaborn
8 | wordcloud
9 | 


--------------------------------------------------------------------------------
/KNN/screenshots/movie1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/KNN/screenshots/movie1.png


--------------------------------------------------------------------------------
/KNN/screenshots/movie2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/KNN/screenshots/movie2.png


--------------------------------------------------------------------------------
/KNN/screenshots/t-shirt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/KNN/screenshots/t-shirt.png


--------------------------------------------------------------------------------
/Linear_Regression/Linear_regression_projects/Salary_dataset.csv:
--------------------------------------------------------------------------------
 1 | ,YearsExperience,Salary
 2 | 0,1.2000000000000002,39344.0
 3 | 1,1.4000000000000001,46206.0
 4 | 2,1.6,37732.0
 5 | 3,2.1,43526.0
 6 | 4,2.3000000000000003,39892.0
 7 | 5,3.0,56643.0
 8 | 6,3.1,60151.0
 9 | 7,3.3000000000000003,54446.0
10 | 8,3.3000000000000003,64446.0
11 | 9,3.8000000000000003,57190.0
12 | 10,4.0,63219.0
13 | 11,4.1,55795.0
14 | 12,4.1,56958.0
15 | 13,4.199999999999999,57082.0
16 | 14,4.6,61112.0
17 | 15,5.0,67939.0
18 | 16,5.199999999999999,66030.0
19 | 17,5.3999999999999995,83089.0
20 | 18,6.0,81364.0
21 | 19,6.1,93941.0
22 | 20,6.8999999999999995,91739.0
23 | 21,7.199999999999999,98274.0
24 | 22,8.0,101303.0
25 | 23,8.299999999999999,113813.0
26 | 24,8.799999999999999,109432.0
27 | 25,9.1,105583.0
28 | 26,9.6,116970.0
29 | 27,9.7,112636.0
30 | 28,10.4,122392.0
31 | 29,10.6,121873.0
32 | 


--------------------------------------------------------------------------------
/Linear_Regression/Linear_regression_projects/__init__.py:
--------------------------------------------------------------------------------
1 |  


--------------------------------------------------------------------------------
/Linear_Regression/Linear_regression_projects/house_price_prediction.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import numpy as np
  3 | import pandas as pd
  4 | from sklearn.linear_model import LinearRegression
  5 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
  6 | import plotly.express as px
  7 | import plotly.graph_objects as go
  8 | from scipy import stats
  9 | 
 10 | def calculate_residuals(y_true, y_pred):
 11 |     """Calculate and return residuals."""
 12 |     return y_true - y_pred
 13 | 
 14 | def run():
 15 |     st.header("House Price Prediction")
 16 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Linear_Regression)", unsafe_allow_html=True)
 17 | 
 18 |     # Example data
 19 |     house_sizes = np.array([1400, 1600, 1700, 1875, 1100, 1550, 2350, 2450, 1425, 1700])
 20 |     house_prices = np.array([245, 312, 279, 308, 199, 219, 405, 324, 319, 255])
 21 | 
 22 |     # Create DataFrame
 23 |     df = pd.DataFrame({
 24 |         'Size': house_sizes,
 25 |         'Price': house_prices
 26 |     })
 27 | 
 28 |     # Data Overview
 29 |     st.subheader("Data Overview")
 30 |     col1, col2 = st.columns(2)
 31 |     
 32 |     with col1:
 33 |         st.write("**Dataset Information:**")
 34 |         st.write(f"Number of houses: {len(df)}")
 35 |         st.write("\n**Basic Statistics:**")
 36 |         st.write(df.describe().round(2))
 37 |     
 38 |     with col2:
 39 |         # Distribution plots
 40 |         fig = px.box(df, title='Price and Size Distributions')
 41 |         st.plotly_chart(fig)
 42 | 
 43 |     # Data Analysis
 44 |     st.subheader("Data Analysis")
 45 |     
 46 |     # Correlation analysis
 47 |     correlation = df['Size'].corr(df['Price'])
 48 |     st.write(f"**Correlation between Size and Price:** {correlation:.3f}")
 49 |     
 50 |     # Scatter plot with trend line
 51 |     fig = px.scatter(df, x='Size', y='Price',
 52 |                     title='House Size vs Price',
 53 |                     labels={'Size': 'House Size (sq. ft.)',
 54 |                            'Price': 'Price ($1000)'})
 55 |     fig.add_trace(go.Scatter(x=df['Size'],
 56 |                             y=stats.linregress(df['Size'], df['Price'])[0] * df['Size'] + 
 57 |                               stats.linregress(df['Size'], df['Price'])[1],
 58 |                             mode='lines',
 59 |                             name='Trend Line'))
 60 |     st.plotly_chart(fig)
 61 | 
 62 |     # Reshape data
 63 |     X = house_sizes.reshape(-1, 1)
 64 |     y = house_prices
 65 | 
 66 |     # Train model
 67 |     model = LinearRegression()
 68 |     model.fit(X, y)
 69 | 
 70 |     # Model Evaluation
 71 |     st.subheader("Model Evaluation")
 72 |     y_pred = model.predict(X)
 73 |     
 74 |     col1, col2, col3 = st.columns(3)
 75 |     with col1:
 76 |         st.metric("R² Score", f"{r2_score(y, y_pred):.3f}")
 77 |     with col2:
 78 |         st.metric("RMSE", f"${np.sqrt(mean_squared_error(y, y_pred)):.2f}K")
 79 |     with col3:
 80 |         st.metric("MAE", f"${mean_absolute_error(y, y_pred):.2f}K")
 81 | 
 82 |     # Residual Analysis
 83 |     st.subheader("Residual Analysis")
 84 |     residuals = calculate_residuals(y, y_pred)
 85 |     
 86 |     col1, col2 = st.columns(2)
 87 |     with col1:
 88 |         # Residuals vs Predicted
 89 |         fig = px.scatter(x=y_pred, y=residuals,
 90 |                         title='Residuals vs Predicted Values',
 91 |                         labels={'x': 'Predicted Price ($1000)',
 92 |                                'y': 'Residuals'})
 93 |         fig.add_hline(y=0, line_dash="dash", line_color="red")
 94 |         st.plotly_chart(fig)
 95 |     
 96 |     with col2:
 97 |         # Residuals distribution
 98 |         fig = px.histogram(residuals,
 99 |                           title='Residuals Distribution',
100 |                           labels={'value': 'Residuals'})
101 |         st.plotly_chart(fig)
102 | 
103 |     # Prediction Interface
104 |     st.subheader("Price Prediction")
105 |     
106 |     col1, col2 = st.columns(2)
107 |     with col1:
108 |         new_size = st.number_input("Enter house size (sq. ft.):",
109 |                                  min_value=500,
110 |                                  max_value=10000,
111 |                                  value=2000,
112 |                                  step=100)
113 |         
114 |         # Calculate prediction
115 |         prediction = model.predict([[new_size]])[0]
116 |         confidence_interval = 1.96 * np.sqrt(mean_squared_error(y, y_pred))
117 |         
118 |         st.metric("Predicted Price",
119 |                  f"${prediction:.2f}K",
120 |                  f"±${confidence_interval:.2f}K")
121 |     
122 |     with col2:
123 |         # Prediction visualization
124 |         fig = go.Figure()
125 |         
126 |         # Add actual data points
127 |         fig.add_trace(go.Scatter(
128 |             x=df['Size'],
129 |             y=df['Price'],
130 |             mode='markers',
131 |             name='Actual Data',
132 |             marker=dict(color='blue')
133 |         ))
134 |         
135 |         # Add regression line
136 |         x_range = np.linspace(min(df['Size']), max(df['Size']), 100)
137 |         y_range = model.predict(x_range.reshape(-1, 1))
138 |         fig.add_trace(go.Scatter(
139 |             x=x_range,
140 |             y=y_range,
141 |             mode='lines',
142 |             name='Regression Line',
143 |             line=dict(color='red')
144 |         ))
145 |         
146 |         # Add prediction point
147 |         fig.add_trace(go.Scatter(
148 |             x=[new_size],
149 |             y=[prediction],
150 |             mode='markers',
151 |             name='Prediction',
152 |             marker=dict(color='green', size=12)
153 |         ))
154 |         
155 |         # Add confidence interval
156 |         fig.add_trace(go.Scatter(
157 |             x=x_range,
158 |             y=y_range + confidence_interval,
159 |             mode='lines',
160 |             line=dict(width=0),
161 |             showlegend=False
162 |         ))
163 |         fig.add_trace(go.Scatter(
164 |             x=x_range,
165 |             y=y_range - confidence_interval,
166 |             mode='lines',
167 |             line=dict(width=0),
168 |             fill='tonexty',
169 |             name='95% Confidence Interval'
170 |         ))
171 |         
172 |         fig.update_layout(
173 |             title='House Price Prediction',
174 |             xaxis_title='House Size (sq. ft.)',
175 |             yaxis_title='Price ($1000)',
176 |             showlegend=True
177 |         )
178 |         st.plotly_chart(fig)
179 | 
180 |     # Model Information
181 |     st.subheader("Model Information")
182 |     st.write(f"**Slope (Price per sq. ft.):** ${model.coef_[0]:.2f}")
183 |     st.write(f"**Intercept:** ${model.intercept_:.2f}")
184 |     st.write(f"**Equation:** Price = ${model.coef_[0]:.2f} × Size + ${model.intercept_:.2f}")
185 | 
186 | if __name__ == "__main__":
187 |     run()


--------------------------------------------------------------------------------
/Linear_Regression/README.md:
--------------------------------------------------------------------------------
 1 | # Linear Regression Projects
 2 | 
 3 | This repository contains various Linear Regression projects implemented in Python. Each project demonstrates the application of Linear Regression to solve real-world problems using datasets.
 4 | 
 5 | ## Project Structure
 6 | 
 7 | ```
 8 | LinearRegression/
 9 | ├── main.py
10 | ├── requirements.txt
11 | ├── Linear_regression_projects/
12 | │   ├── messi_goal_prediction.py
13 | │   ├── house_price_prediction.py
14 | │   ├── study_hours_exam_prediction.py
15 | │   ├── normal_equation_vs_gradient_descent.py
16 | │   ├── salary_prediction.py
17 | │   ├── diabetes.csv
18 | ```
19 | 
20 | ### Key Files
21 | - **`main.py`**: The main entry point for running the Streamlit app.
22 | - **`requirements.txt`**: Contains the dependencies required to run the project.
23 | - **`Linear_regression_projects/`**: Contains individual project scripts and datasets.
24 | 
25 | ## Projects Included
26 | 
27 | 1. **Messi Goal Prediction**  
28 |    Predicts the number of goals Messi will score based on the number of matches played using Linear Regression.
29 | 
30 |    **Screenshots:**
31 |    ![Messi Goal Distribution](screenshots/leo1.png)
32 |    ![Goal Prediction Model](screenshots/leo2.png)
33 |    ![Model Performance Metrics](screenshots/leo3.png)
34 | 
35 | 2. **House Price Prediction**  
36 |    Predicts house prices based on their sizes using Linear Regression.
37 | 
38 |    **Screenshots:**
39 |    ![House Price Prediction](screenshots/house1.png)
40 | 
41 | 3. **Study Hours and Exam Prediction**  
42 |    Predicts exam scores based on the number of hours studied using Linear Regression.
43 | 
44 |    **Screenshots:**
45 |    ![Score Prediction 1](screenshots/score1.png)
46 |    ![Score Prediction 2](screenshots/score2.png)
47 | 
48 | 4. **Normal Equation vs Gradient Descent**  
49 |    Compares the performance of the normal equation and gradient descent methods for solving linear regression.
50 | 
51 |    **Screenshots:**
52 |    ![Normal vs Gradient 1](screenshots/norm_grad1.png)
53 |    ![Normal vs Gradient 2](screenshots/norm_grad2.png)
54 | 
55 | 5. **Salary Prediction**  
56 |    Predicts salary based on years of experience using Linear Regression.
57 | 
58 |    **Screenshots:**
59 |    ![Salary Prediction](screenshots/salary_pred.png)
60 | 


--------------------------------------------------------------------------------
/Linear_Regression/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the Linear_regression_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | 
 8 | from Linear_regression_projects import (
 9 |     messi_goal_prediction,
10 |     house_price_prediction,
11 |     study_hours_exam_prediction,
12 |     normal_equation_vs_gradient_descent,
13 |     salary_prediction,
14 | )
15 | 
16 | def run():
17 |     st.title("Linear Regression Projects")
18 | 
19 |     # Sidebar for project selection
20 |     project = st.sidebar.selectbox(
21 |         "Select a project",
22 |         [
23 |             "Messi Goal Prediction",
24 |             "House Price Prediction",
25 |             "Study Hours and Exam Prediction",
26 |             "Normal Equation vs Gradient Descent",
27 |             "Salary Prediction",
28 |         ],
29 |     )
30 | 
31 |     # Run the selected project
32 |     if project == "Messi Goal Prediction":
33 |         messi_goal_prediction.run()
34 |     elif project == "House Price Prediction":
35 |         house_price_prediction.run()
36 |     elif project == "Study Hours and Exam Prediction":
37 |         study_hours_exam_prediction.run()
38 |     elif project == "Normal Equation vs Gradient Descent":
39 |         normal_equation_vs_gradient_descent.run()
40 |     elif project == "Salary Prediction":
41 |         salary_prediction.run()
42 | 
43 | if __name__ == "__main__":
44 |     run()


--------------------------------------------------------------------------------
/Linear_Regression/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/requirements.txt


--------------------------------------------------------------------------------
/Linear_Regression/screenshots/house1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/house1.png


--------------------------------------------------------------------------------
/Linear_Regression/screenshots/leo1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/leo1.png


--------------------------------------------------------------------------------
/Linear_Regression/screenshots/leo2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/leo2.png


--------------------------------------------------------------------------------
/Linear_Regression/screenshots/leo3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/leo3.png


--------------------------------------------------------------------------------
/Linear_Regression/screenshots/norm_grad1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/norm_grad1.png


--------------------------------------------------------------------------------
/Linear_Regression/screenshots/norm_grad2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/norm_grad2.png


--------------------------------------------------------------------------------
/Linear_Regression/screenshots/salary_pred.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/salary_pred.png


--------------------------------------------------------------------------------
/Linear_Regression/screenshots/score1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/score1.png


--------------------------------------------------------------------------------
/Linear_Regression/screenshots/score2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Linear_Regression/screenshots/score2.png


--------------------------------------------------------------------------------
/Logistic_Regression/Logistic_Regression_projects/diabetes_prediction.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.model_selection import train_test_split
  5 | from sklearn.linear_model import LogisticRegression
  6 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
  7 | import plotly.express as px
  8 | import plotly.graph_objects as go
  9 | from sklearn.preprocessing import StandardScaler
 10 | import seaborn as sns
 11 | import matplotlib.pyplot as plt
 12 | 
 13 | def run():
 14 |     st.header("Diabetes Prediction using Logistic Regression")
 15 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Logistic_Regression)", unsafe_allow_html=True)
 16 | 
 17 |     # Load dataset
 18 |     df = pd.read_csv("Logistic_Regression/Logistic_Regression_projects/diabetes.csv")
 19 |     
 20 |     # Display dataset info
 21 |     st.subheader("Dataset Overview")
 22 |     col1, col2 = st.columns(2)
 23 |     with col1:
 24 |         st.write("Dataset Shape:", df.shape)
 25 |         st.write("Features:", ", ".join(df.columns[:-1]))
 26 |         st.write("Target: Outcome (0: No Diabetes, 1: Diabetes)")
 27 |     with col2:
 28 |         st.write("Class Distribution:")
 29 |         class_dist = df['Outcome'].value_counts()
 30 |         fig = px.pie(values=class_dist.values, names=['No Diabetes', 'Diabetes'], 
 31 |                     title='Diabetes Distribution')
 32 |         st.plotly_chart(fig)
 33 | 
 34 |     # Feature selection
 35 |     st.subheader("Feature Selection")
 36 |     selected_features = st.multiselect(
 37 |         "Select features for prediction",
 38 |         df.columns[:-1],
 39 |         default=['Glucose', 'BMI', 'Age']
 40 |     )
 41 | 
 42 |     if selected_features:
 43 |         # Prepare data
 44 |         X = df[selected_features]
 45 |         y = df['Outcome']
 46 |         
 47 |         # Split data
 48 |         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 49 |         
 50 |         # Scale features
 51 |         scaler = StandardScaler()
 52 |         X_train_scaled = scaler.fit_transform(X_train)
 53 |         X_test_scaled = scaler.transform(X_test)
 54 |         
 55 |         # Train model
 56 |         model = LogisticRegression(max_iter=1000)
 57 |         model.fit(X_train_scaled, y_train)
 58 |         
 59 |         # Model evaluation
 60 |         st.subheader("Model Performance")
 61 |         y_pred = model.predict(X_test_scaled)
 62 |         accuracy = accuracy_score(y_test, y_pred)
 63 |         
 64 |         # Display metrics
 65 |         col1, col2, col3 = st.columns(3)
 66 |         with col1:
 67 |             st.metric("Accuracy", f"{accuracy:.2%}")
 68 |         with col2:
 69 |             st.metric("Precision", f"{classification_report(y_test, y_pred, output_dict=True)['1']['precision']:.2%}")
 70 |         with col3:
 71 |             st.metric("Recall", f"{classification_report(y_test, y_pred, output_dict=True)['1']['recall']:.2%}")
 72 |         
 73 |         # Confusion Matrix
 74 |         st.subheader("Confusion Matrix")
 75 |         cm = confusion_matrix(y_test, y_pred)
 76 |         fig = px.imshow(cm,
 77 |                        labels=dict(x="Predicted", y="Actual", color="Count"),
 78 |                        x=['No Diabetes', 'Diabetes'],
 79 |                        y=['No Diabetes', 'Diabetes'],
 80 |                        text_auto=True,
 81 |                        aspect="auto")
 82 |         st.plotly_chart(fig)
 83 |         
 84 |         # Feature Importance
 85 |         st.subheader("Feature Importance")
 86 |         importance = pd.DataFrame({
 87 |             'Feature': selected_features,
 88 |             'Importance': np.abs(model.coef_[0])
 89 |         })
 90 |         fig = px.bar(importance, x='Feature', y='Importance',
 91 |                     title='Feature Importance in Prediction')
 92 |         st.plotly_chart(fig)
 93 |         
 94 |         # Interactive Prediction
 95 |         st.subheader("Make a Prediction")
 96 |         st.write("Enter patient information:")
 97 |         
 98 |         # Create input fields for selected features
 99 |         input_data = {}
100 |         cols = st.columns(len(selected_features))
101 |         for i, feature in enumerate(selected_features):
102 |             with cols[i]:
103 |                 input_data[feature] = st.number_input(
104 |                     f"{feature}",
105 |                     min_value=float(df[feature].min()),
106 |                     max_value=float(df[feature].max()),
107 |                     value=float(df[feature].mean())
108 |                 )
109 |         
110 |         if st.button("Predict"):
111 |             # Scale input data
112 |             input_scaled = scaler.transform(pd.DataFrame([input_data]))
113 |             prediction = model.predict(input_scaled)[0]
114 |             probability = model.predict_proba(input_scaled)[0]
115 |             
116 |             # Display prediction
117 |             st.subheader("Prediction Result")
118 |             col1, col2 = st.columns(2)
119 |             with col1:
120 |                 st.metric("Prediction", "Diabetes" if prediction == 1 else "No Diabetes")
121 |             with col2:
122 |                 st.metric("Confidence", f"{max(probability):.2%}")
123 |             
124 |             # Visualize prediction probability
125 |             fig = go.Figure(data=[
126 |                 go.Bar(x=['No Diabetes', 'Diabetes'],
127 |                       y=probability,
128 |                       text=[f'{p:.2%}' for p in probability],
129 |                       textposition='auto',
130 |                 )
131 |             ])
132 |             fig.update_layout(title='Prediction Probabilities')
133 |             st.plotly_chart(fig)
134 |         
135 |         # Data Visualization
136 |         st.subheader("Data Analysis")
137 |         selected_feature = st.selectbox("Select feature to analyze", selected_features)
138 |         
139 |         # Distribution plot
140 |         fig = px.histogram(df, x=selected_feature, color='Outcome',
141 |                           title=f'Distribution of {selected_feature} by Diabetes Status',
142 |                           barmode='overlay')
143 |         st.plotly_chart(fig)
144 |         
145 |         # Correlation heatmap
146 |         st.subheader("Feature Correlations")
147 |         corr = df[selected_features + ['Outcome']].corr()
148 |         fig = px.imshow(corr,
149 |                        labels=dict(color="Correlation"),
150 |                        x=corr.columns,
151 |                        y=corr.columns,
152 |                        aspect="auto")
153 |         st.plotly_chart(fig)
154 | 
155 | if __name__ == "__main__":
156 |     run()


--------------------------------------------------------------------------------
/Logistic_Regression/Logistic_Regression_projects/rock_vs_mine.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.model_selection import train_test_split
  5 | from sklearn.linear_model import LogisticRegression
  6 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
  7 | import plotly.express as px
  8 | import plotly.graph_objects as go
  9 | from sklearn.preprocessing import StandardScaler
 10 | import seaborn as sns
 11 | import matplotlib.pyplot as plt
 12 | 
 13 | def run():
 14 |     st.header("Rock vs Mine Classification using Logistic Regression")
 15 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Logistic_Regression)", unsafe_allow_html=True)
 16 | 
 17 |     # Load dataset
 18 |     df = pd.read_csv("Logistic_Regression/Logistic_Regression_projects/Copy of sonar data.csv")
 19 |     
 20 |     # Display dataset info
 21 |     st.subheader("Dataset Overview")
 22 |     col1, col2 = st.columns(2)
 23 |     with col1:
 24 |         st.write("Dataset Shape:", df.shape)
 25 |         st.write("Features: 60 frequency bands")
 26 |         st.write("Target: R (Rock) or M (Mine)")
 27 |     with col2:
 28 |         st.write("Class Distribution:")
 29 |         class_dist = df['R'].value_counts()
 30 |         fig = px.pie(values=class_dist.values, names=['Rock', 'Mine'], 
 31 |                     title='Rock vs Mine Distribution')
 32 |         st.plotly_chart(fig)
 33 | 
 34 |     # Feature selection
 35 |     st.subheader("Feature Selection")
 36 |     n_features = st.slider("Number of Features to Use", min_value=5, max_value=60, value=20)
 37 |     
 38 |     # Select top features based on variance
 39 |     feature_vars = df.iloc[:, :-1].var()
 40 |     selected_features = feature_vars.nlargest(n_features).index.tolist()
 41 | 
 42 |     # Prepare data
 43 |     X = df[selected_features]
 44 |     y = df['R'].map({'R': 0, 'M': 1})  # Convert to binary
 45 |     
 46 |     # Split data
 47 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 48 |     
 49 |     # Scale features
 50 |     scaler = StandardScaler()
 51 |     X_train_scaled = scaler.fit_transform(X_train)
 52 |     X_test_scaled = scaler.transform(X_test)
 53 |     
 54 |     # Train model
 55 |     model = LogisticRegression(max_iter=1000)
 56 |     model.fit(X_train_scaled, y_train)
 57 |     
 58 |     # Model evaluation
 59 |     st.subheader("Model Performance")
 60 |     y_pred = model.predict(X_test_scaled)
 61 |     accuracy = accuracy_score(y_test, y_pred)
 62 |     
 63 |     # Display metrics
 64 |     col1, col2, col3 = st.columns(3)
 65 |     with col1:
 66 |         st.metric("Accuracy", f"{accuracy:.2%}")
 67 |     with col2:
 68 |         st.metric("Precision", f"{classification_report(y_test, y_pred, output_dict=True)['1']['precision']:.2%}")
 69 |     with col3:
 70 |         st.metric("Recall", f"{classification_report(y_test, y_pred, output_dict=True)['1']['recall']:.2%}")
 71 |     
 72 |     # Confusion Matrix
 73 |     st.subheader("Confusion Matrix")
 74 |     cm = confusion_matrix(y_test, y_pred)
 75 |     fig = px.imshow(cm,
 76 |                    labels=dict(x="Predicted", y="Actual", color="Count"),
 77 |                    x=['Rock', 'Mine'],
 78 |                    y=['Rock', 'Mine'],
 79 |                    text_auto=True,
 80 |                    aspect="auto")
 81 |     st.plotly_chart(fig)
 82 |     
 83 |     # Feature Importance
 84 |     st.subheader("Feature Importance")
 85 |     importance = pd.DataFrame({
 86 |         'Feature': selected_features,
 87 |         'Importance': np.abs(model.coef_[0])
 88 |     })
 89 |     fig = px.bar(importance, x='Feature', y='Importance',
 90 |                 title='Feature Importance in Prediction')
 91 |     st.plotly_chart(fig)
 92 |     
 93 |     # Interactive Prediction
 94 |     st.subheader("Make a Prediction")
 95 |     st.write("Enter frequency band values:")
 96 |     
 97 |     # Create input fields for selected features
 98 |     input_data = {}
 99 |     cols = st.columns(3)
100 |     for i, feature in enumerate(selected_features):
101 |         with cols[i % 3]:
102 |             input_data[feature] = st.number_input(
103 |                 f"{feature}",
104 |                 min_value=float(df[feature].min()),
105 |                 max_value=float(df[feature].max()),
106 |                 value=float(df[feature].mean())
107 |             )
108 |     
109 |     if st.button("Predict"):
110 |         # Scale input data
111 |         input_scaled = scaler.transform(pd.DataFrame([input_data]))
112 |         prediction = model.predict(input_scaled)[0]
113 |         probability = model.predict_proba(input_scaled)[0]
114 |         
115 |         # Display prediction
116 |         st.subheader("Prediction Result")
117 |         col1, col2 = st.columns(2)
118 |         with col1:
119 |             st.metric("Prediction", "Mine" if prediction == 1 else "Rock")
120 |         with col2:
121 |             st.metric("Confidence", f"{max(probability):.2%}")
122 |         
123 |         # Visualize prediction probability
124 |         fig = go.Figure(data=[
125 |             go.Bar(x=['Rock', 'Mine'],
126 |                   y=probability,
127 |                   text=[f'{p:.2%}' for p in probability],
128 |                   textposition='auto',
129 |             )
130 |         ])
131 |         fig.update_layout(title='Prediction Probabilities')
132 |         st.plotly_chart(fig)
133 |     
134 |     # Data Visualization
135 |     st.subheader("Data Analysis")
136 |     
137 |     # PCA for dimensionality reduction
138 |     from sklearn.decomposition import PCA
139 |     pca = PCA(n_components=2)
140 |     X_pca = pca.fit_transform(X)
141 |     
142 |     # Plot PCA results
143 |     fig = px.scatter(
144 |         x=X_pca[:, 0], y=X_pca[:, 1],
145 |         color=df['R'].map({'R': 'Rock', 'M': 'Mine'}),
146 |         title='PCA Visualization of Rock vs Mine Data',
147 |         labels={'x': 'First Principal Component', 'y': 'Second Principal Component'}
148 |     )
149 |     st.plotly_chart(fig)
150 |     
151 |     # Feature correlation heatmap
152 |     st.subheader("Feature Correlations")
153 |     corr = df[selected_features].corr()
154 |     fig = px.imshow(corr,
155 |                    labels=dict(color="Correlation"),
156 |                    x=corr.columns,
157 |                    y=corr.columns,
158 |                    aspect="auto")
159 |     st.plotly_chart(fig)
160 |     
161 |     # Frequency band analysis
162 |     st.subheader("Frequency Band Analysis")
163 |     selected_band = st.selectbox("Select frequency band to analyze", selected_features)
164 |     
165 |     fig = px.box(df, x='R', y=selected_band,
166 |                 title=f'Distribution of {selected_band} by Class',
167 |                 labels={'R': 'Class', selected_band: 'Value'})
168 |     st.plotly_chart(fig)
169 | 
170 | if __name__ == "__main__":
171 |     run()


--------------------------------------------------------------------------------
/Logistic_Regression/Logistic_Regression_projects/simple_hiv_prediction.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.model_selection import train_test_split
  5 | from sklearn.linear_model import LogisticRegression
  6 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
  7 | import plotly.express as px
  8 | import plotly.graph_objects as go
  9 | from sklearn.preprocessing import StandardScaler
 10 | import seaborn as sns
 11 | import matplotlib.pyplot as plt
 12 | 
 13 | def generate_sample_data():
 14 |     np.random.seed(42)
 15 |     n_samples = 1000
 16 |     
 17 |     # Generate features
 18 |     age = np.random.normal(35, 10, n_samples)
 19 |     age = np.clip(age, 18, 65)
 20 |     
 21 |     risk_factors = np.random.choice(['Low', 'Medium', 'High'], n_samples, p=[0.6, 0.3, 0.1])
 22 |     sexual_activity = np.random.choice(['None', 'Protected', 'Unprotected'], n_samples, p=[0.3, 0.5, 0.2])
 23 |     drug_use = np.random.choice(['None', 'Past', 'Current'], n_samples, p=[0.7, 0.2, 0.1])
 24 |     
 25 |     # Create DataFrame
 26 |     df = pd.DataFrame({
 27 |         'Age': age,
 28 |         'RiskFactor': risk_factors,
 29 |         'SexualActivity': sexual_activity,
 30 |         'DrugUse': drug_use
 31 |     })
 32 |     
 33 |     # Generate target (HIV status) with some patterns
 34 |     base_prob = 0.05
 35 |     risk_effect = {
 36 |         'Low': 0.5,
 37 |         'Medium': 1.0,
 38 |         'High': 2.0
 39 |     }
 40 |     activity_effect = {
 41 |         'None': 0.3,
 42 |         'Protected': 0.7,
 43 |         'Unprotected': 1.5
 44 |     }
 45 |     drug_effect = {
 46 |         'None': 0.5,
 47 |         'Past': 1.2,
 48 |         'Current': 1.8
 49 |     }
 50 |     
 51 |     # Calculate probability of HIV
 52 |     prob = base_prob * \
 53 |            df['RiskFactor'].map(risk_effect) * \
 54 |            df['SexualActivity'].map(activity_effect) * \
 55 |            df['DrugUse'].map(drug_effect) * \
 56 |            (1 + (df['Age'] - 35) / 100)
 57 |     
 58 |     # Generate actual HIV status
 59 |     df['HIV'] = np.random.binomial(1, prob)
 60 |     
 61 |     return df
 62 | 
 63 | def run():
 64 |     st.header("HIV Risk Prediction using Logistic Regression")
 65 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Logistic_Regression)", unsafe_allow_html=True)
 66 | 
 67 |     # Generate sample data
 68 |     df = generate_sample_data()
 69 |     
 70 |     # Display dataset info
 71 |     st.subheader("Dataset Overview")
 72 |     col1, col2 = st.columns(2)
 73 |     with col1:
 74 |         st.write("Dataset Shape:", df.shape)
 75 |         st.write("Features:", ", ".join(df.columns[:-1]))
 76 |         st.write("Target: HIV Status (0: Negative, 1: Positive)")
 77 |     with col2:
 78 |         st.write("Class Distribution:")
 79 |         class_dist = df['HIV'].value_counts()
 80 |         fig = px.pie(values=class_dist.values, names=['Negative', 'Positive'], 
 81 |                     title='HIV Status Distribution')
 82 |         st.plotly_chart(fig)
 83 | 
 84 |     # Prepare data
 85 |     X = pd.get_dummies(df[['Age', 'RiskFactor', 'SexualActivity', 'DrugUse']])
 86 |     y = df['HIV']
 87 |     
 88 |     # Split data
 89 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 90 |     
 91 |     # Scale features
 92 |     scaler = StandardScaler()
 93 |     X_train_scaled = scaler.fit_transform(X_train)
 94 |     X_test_scaled = scaler.transform(X_test)
 95 |     
 96 |     # Train model
 97 |     model = LogisticRegression(max_iter=1000)
 98 |     model.fit(X_train_scaled, y_train)
 99 |     
100 |     # Model evaluation
101 |     st.subheader("Model Performance")
102 |     y_pred = model.predict(X_test_scaled)
103 |     accuracy = accuracy_score(y_test, y_pred)
104 |     
105 |     # Display metrics
106 |     col1, col2, col3 = st.columns(3)
107 |     with col1:
108 |         st.metric("Accuracy", f"{accuracy:.2%}")
109 |     with col2:
110 |         st.metric("Precision", f"{classification_report(y_test, y_pred, output_dict=True)['1']['precision']:.2%}")
111 |     with col3:
112 |         st.metric("Recall", f"{classification_report(y_test, y_pred, output_dict=True)['1']['recall']:.2%}")
113 |     
114 |     # Confusion Matrix
115 |     st.subheader("Confusion Matrix")
116 |     cm = confusion_matrix(y_test, y_pred)
117 |     fig = px.imshow(cm,
118 |                    labels=dict(x="Predicted", y="Actual", color="Count"),
119 |                    x=['Negative', 'Positive'],
120 |                    y=['Negative', 'Positive'],
121 |                    text_auto=True,
122 |                    aspect="auto")
123 |     st.plotly_chart(fig)
124 |     
125 |     # Feature Importance
126 |     st.subheader("Feature Importance")
127 |     importance = pd.DataFrame({
128 |         'Feature': X.columns,
129 |         'Importance': np.abs(model.coef_[0])
130 |     })
131 |     fig = px.bar(importance, x='Feature', y='Importance',
132 |                 title='Feature Importance in Prediction')
133 |     st.plotly_chart(fig)
134 |     
135 |     # Interactive Prediction
136 |     st.subheader("Make a Prediction")
137 |     st.write("Enter patient information:")
138 |     
139 |     col1, col2 = st.columns(2)
140 |     with col1:
141 |         age = st.slider("Age", 18, 65, 35)
142 |         risk_factor = st.selectbox("Risk Factor", df['RiskFactor'].unique())
143 |     with col2:
144 |         sexual_activity = st.selectbox("Sexual Activity", df['SexualActivity'].unique())
145 |         drug_use = st.selectbox("Drug Use", df['DrugUse'].unique())
146 |     
147 |     if st.button("Predict"):
148 |         # Prepare input data
149 |         input_data = pd.DataFrame({
150 |             'Age': [age],
151 |             'RiskFactor': [risk_factor],
152 |             'SexualActivity': [sexual_activity],
153 |             'DrugUse': [drug_use]
154 |         })
155 |         
156 |         # One-hot encode categorical variables
157 |         input_encoded = pd.get_dummies(input_data)
158 |         # Ensure all columns from training data are present
159 |         for col in X.columns:
160 |             if col not in input_encoded.columns:
161 |                 input_encoded[col] = 0
162 |         input_encoded = input_encoded[X.columns]
163 |         
164 |         # Scale input data
165 |         input_scaled = scaler.transform(input_encoded)
166 |         
167 |         # Make prediction
168 |         prediction = model.predict(input_scaled)[0]
169 |         probability = model.predict_proba(input_scaled)[0]
170 |         
171 |         # Display prediction
172 |         st.subheader("Prediction Result")
173 |         col1, col2 = st.columns(2)
174 |         with col1:
175 |             st.metric("Prediction", "Positive" if prediction == 1 else "Negative")
176 |         with col2:
177 |             st.metric("Risk Probability", f"{probability[1]:.2%}")
178 |         
179 |         # Visualize prediction probability
180 |         fig = go.Figure(data=[
181 |             go.Bar(x=['Negative', 'Positive'],
182 |                   y=probability,
183 |                   text=[f'{p:.2%}' for p in probability],
184 |                   textposition='auto',
185 |             )
186 |         ])
187 |         fig.update_layout(title='Prediction Probabilities')
188 |         st.plotly_chart(fig)
189 |     
190 |     # Data Visualization
191 |     st.subheader("Data Analysis")
192 |     
193 |     # Age distribution by HIV status
194 |     fig = px.histogram(df, x='Age', color='HIV',
195 |                       title='Age Distribution by HIV Status',
196 |                       barmode='overlay',
197 |                       labels={'HIV': 'HIV Status'})
198 |     st.plotly_chart(fig)
199 |     
200 |     # Risk factor analysis
201 |     fig = px.box(df, x='RiskFactor', y='Age', color='HIV',
202 |                 title='Age Distribution by Risk Factor and HIV Status')
203 |     st.plotly_chart(fig)
204 |     
205 |     # Sexual activity analysis
206 |     fig = px.sunburst(df, path=['SexualActivity', 'HIV'],
207 |                      title='HIV Status by Sexual Activity')
208 |     st.plotly_chart(fig)
209 |     
210 |     # Drug use analysis
211 |     fig = px.treemap(df, path=['DrugUse', 'HIV'],
212 |                     title='HIV Status by Drug Use')
213 |     st.plotly_chart(fig)
214 | 
215 | if __name__ == "__main__":
216 |     run()


--------------------------------------------------------------------------------
/Logistic_Regression/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the Logistic_Regression_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | 
 8 | from Logistic_Regression_projects import (
 9 |     diabetes_prediction,
10 |     rock_vs_mine,
11 |     simple_hiv_prediction,
12 | )
13 | 
14 | def run():
15 |     st.title("Logistic Regression Projects")
16 | 
17 |     # Sidebar for project selection
18 |     project = st.sidebar.selectbox(
19 |         "Select a project",
20 |         [
21 |             "Diabetes Prediction",
22 |             "Rock vs Mine",
23 |             "Simple HIV Prediction",
24 |         ],
25 |     )
26 | 
27 |     # Run the selected project
28 |     if project == "Diabetes Prediction":
29 |         diabetes_prediction.run()
30 |     elif project == "Rock vs Mine":
31 |         rock_vs_mine.run()
32 |     elif project == "Simple HIV Prediction":
33 |         simple_hiv_prediction.run()
34 | 
35 | if __name__ == "__main__":
36 |     run()


--------------------------------------------------------------------------------
/Logistic_Regression/readme.md:
--------------------------------------------------------------------------------
  1 | # Logistic Regression Projects
  2 | 
  3 | This repository contains various Logistic Regression projects implemented in Python. Each project demonstrates the application of Logistic Regression to solve real-world problems using datasets.
  4 | 
  5 | ## Project Structure
  6 | 
  7 | ```
  8 | Logistic_Regression/
  9 | ├── main.py
 10 | ├── requirements.txt
 11 | ├── Logistic_regression_projects/
 12 | │   ├── diabetes_prediction.py
 13 | │   ├── rock_vs_mine.py
 14 | │   ├── simple_hiv_prediction.py
 15 | │   ├── diabetes.csv
 16 | │   ├── sonar.csv
 17 | ```
 18 | 
 19 | ### Key Files
 20 | - **`main.py`**: The main entry point for running the Streamlit app.
 21 | - **`requirements.txt`**: Contains the dependencies required to run the project.
 22 | - **`Logistic_regression_projects/`**: Contains individual project scripts and datasets.
 23 | 
 24 | ## Projects Included
 25 | 
 26 | 1. **Diabetes Prediction**  
 27 |    Predicts the likelihood of diabetes based on health metrics such as glucose level, blood pressure, BMI, etc.  
 28 |    Dataset: `diabetes.csv`
 29 | 
 30 |    **Screenshots:**
 31 |    ![Diabetes Prediction 1](screenshots/diab1.png)
 32 |    ![Diabetes Prediction 2](screenshots/diab2.png)
 33 | 
 34 | 2. **Rock vs Mine Classification**  
 35 |    Classifies sonar signals as either "Rock" or "Mine" using Logistic Regression.  
 36 |    Dataset: `sonar.csv`
 37 | 
 38 |    **Screenshots:**
 39 |    ![Rock vs Mine](screenshots/rock_mine.png)
 40 | 
 41 | 3. **Simple HIV Prediction**  
 42 |    Predicts HIV status based on features like age, CD4 count, and viral load.  
 43 |    Dataset: Synthetic data (hardcoded in the script).
 44 | 
 45 |    **Screenshots:**
 46 |    ![HIV Prediction](screenshots/hiv.png)
 47 | 
 48 | ## How to Run
 49 | 
 50 | 1. Clone the repository:
 51 |    ```bash
 52 |    git clone https://github.com/benasphy/ML_projects.git
 53 |    cd Logistic_Regression
 54 |    ```
 55 | 
 56 | 2. Install dependencies:
 57 |    ```bash
 58 |    pip install -r requirements.txt
 59 |    ```
 60 | 
 61 | 3. Run the Streamlit app:
 62 |    ```bash
 63 |    streamlit run main.py
 64 |    ```
 65 | 
 66 | 4. Select a project from the sidebar to explore its functionality.
 67 | 
 68 | ## Requirements
 69 | 
 70 | The project requires the following Python libraries:
 71 | - `streamlit`
 72 | - `numpy`
 73 | - `pandas`
 74 | - `scikit-learn`
 75 | 
 76 | ## Datasets
 77 | 
 78 | - **`diabetes.csv`**: Contains data for predicting diabetes outcomes based on various health metrics.
 79 | - **`sonar.csv`**: Contains sonar data for classification tasks.
 80 | 
 81 | ## Screenshots
 82 | 
 83 | Add screenshots of the Streamlit app interface here to showcase the projects.
 84 | 
 85 | ## License
 86 | 
 87 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
 88 | 
 89 | ## Acknowledgments
 90 | 
 91 | - Datasets used in this project are sourced from publicly available repositories.
 92 | - Special thanks to the contributors of the Python libraries used in this project.
 93 | 
 94 | ---
 95 | Feel free to contribute to this repository by submitting issues or pull requests.
 96 | 
 97 | ### Steps to Save:
 98 | 1. Save this content as `README.md` in the `Logistic_Regression` folder.
 99 | 2. Ensure the project structure matches the one described in the `README.md`.
100 | 


--------------------------------------------------------------------------------
/Logistic_Regression/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | numpy
3 | pandas
4 | scikit-learn
5 | plotly
6 | scipy
7 | seaborn


--------------------------------------------------------------------------------
/Logistic_Regression/screenshots/diab1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Logistic_Regression/screenshots/diab1.png


--------------------------------------------------------------------------------
/Logistic_Regression/screenshots/diab2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Logistic_Regression/screenshots/diab2.png


--------------------------------------------------------------------------------
/Logistic_Regression/screenshots/hiv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Logistic_Regression/screenshots/hiv.png


--------------------------------------------------------------------------------
/Logistic_Regression/screenshots/rock_mine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Logistic_Regression/screenshots/rock_mine.png


--------------------------------------------------------------------------------
/Naive_Bayes/Naive_Bayes_projects/fake_news_prediction.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.model_selection import train_test_split
  5 | from sklearn.feature_extraction.text import CountVectorizer
  6 | from sklearn.naive_bayes import MultinomialNB
  7 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
  8 | import plotly.express as px
  9 | import plotly.graph_objects as go
 10 | from collections import Counter
 11 | import re
 12 | from wordcloud import WordCloud
 13 | import matplotlib.pyplot as plt
 14 | 
 15 | def preprocess_text(text):
 16 |     """Clean and preprocess text data."""
 17 |     # Convert to lowercase
 18 |     text = str(text).lower()
 19 |     # Remove special characters and digits
 20 |     text = re.sub(r'[^a-zA-Z\s]', '', text)
 21 |     # Remove extra whitespace
 22 |     text = re.sub(r'\s+', ' ', text).strip()
 23 |     return text
 24 | 
 25 | def generate_wordcloud(texts, title):
 26 |     """Generate and display a word cloud."""
 27 |     wordcloud = WordCloud(width=800, height=400,
 28 |                          background_color='white',
 29 |                          min_font_size=10).generate(' '.join(texts))
 30 |     
 31 |     fig, ax = plt.subplots(figsize=(10, 5))
 32 |     ax.imshow(wordcloud)
 33 |     ax.axis('off')
 34 |     ax.set_title(title)
 35 |     return fig
 36 | 
 37 | def run():
 38 |     st.header("Fake News Prediction")
 39 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/Naive_Bayes)", unsafe_allow_html=True)
 40 | 
 41 |     # Load dataset
 42 |     uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
 43 |     if uploaded_file is not None:
 44 |         df = pd.read_csv(uploaded_file)
 45 |         X = df[['title', 'news_url', 'source_domain', 'tweet_num']].apply(lambda x: ' '.join(x.astype(str)), axis=1)
 46 |         y = df['real']
 47 |     else:
 48 |         st.info("Using default dataset: FakeNewsNet.csv")
 49 |         df = pd.read_csv("Naive_Bayes/Naive_Bayes_projects/FakeNewsNet.csv")
 50 |         X = df[['title', 'news_url', 'source_domain', 'tweet_num']].apply(lambda x: ' '.join(x.astype(str)), axis=1)
 51 |         y = df['real']
 52 | 
 53 |     # Display dataset info
 54 |     st.subheader("Dataset Overview")
 55 |     col1, col2 = st.columns(2)
 56 |     with col1:
 57 |         st.write("Dataset Shape:", df.shape)
 58 |         st.write("Number of Articles:", len(df))
 59 |     with col2:
 60 |         class_dist = df['real'].value_counts()
 61 |         fig = px.pie(values=class_dist.values, names=['Fake', 'Real'],
 62 |                     title='News Distribution')
 63 |         st.plotly_chart(fig)
 64 | 
 65 |     # Text Analysis
 66 |     st.subheader("Text Analysis")
 67 |     
 68 |     # Preprocess text
 69 |     df['cleaned_text'] = X.apply(preprocess_text)
 70 |     
 71 |     # Text length analysis
 72 |     df['text_length'] = X.str.len()
 73 |     
 74 |     # Text length distribution
 75 |     fig = px.box(df, x='real', y='text_length',
 76 |                 title='Text Length Distribution by Category',
 77 |                 labels={'real': 'Category', 'text_length': 'Text Length'})
 78 |     st.plotly_chart(fig)
 79 | 
 80 |     # Vectorize text
 81 |     vectorizer = CountVectorizer(stop_words='english', max_features=5000)
 82 |     X_vec = vectorizer.fit_transform(X)
 83 | 
 84 |     # Split data
 85 |     X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)
 86 | 
 87 |     # Train model
 88 |     model = MultinomialNB()
 89 |     model.fit(X_train, y_train)
 90 | 
 91 |     # Evaluate model
 92 |     y_pred = model.predict(X_test)
 93 |     accuracy = accuracy_score(y_test, y_pred)
 94 |     
 95 |     # Display metrics
 96 |     st.subheader("Model Performance")
 97 |     col1, col2, col3 = st.columns(3)
 98 |     with col1:
 99 |         st.metric("Accuracy", f"{accuracy:.2%}")
100 |     with col2:
101 |         st.metric("Real News Precision", 
102 |                  f"{classification_report(y_test, y_pred, output_dict=True)['1']['precision']:.2%}")
103 |     with col3:
104 |         st.metric("Real News Recall",
105 |                  f"{classification_report(y_test, y_pred, output_dict=True)['1']['recall']:.2%}")
106 | 
107 |     # Confusion Matrix
108 |     st.subheader("Confusion Matrix")
109 |     cm = confusion_matrix(y_test, y_pred)
110 |     fig = px.imshow(cm,
111 |                    labels=dict(x="Predicted", y="Actual", color="Count"),
112 |                    x=['Fake', 'Real'],
113 |                    y=['Fake', 'Real'],
114 |                    text_auto=True,
115 |                    aspect="auto")
116 |     st.plotly_chart(fig)
117 | 
118 |     # Feature Importance
119 |     st.subheader("Most Important Words")
120 |     feature_importance = pd.DataFrame({
121 |         'Word': vectorizer.get_feature_names_out(),
122 |         'Importance': model.feature_log_prob_[1] - model.feature_log_prob_[0]
123 |     })
124 |     feature_importance = feature_importance.sort_values('Importance', ascending=False)
125 |     
126 |     col1, col2 = st.columns(2)
127 |     with col1:
128 |         # Most indicative of real news
129 |         fig = px.bar(feature_importance.head(10), x='Word', y='Importance',
130 |                     title='Top Words Indicating Real News')
131 |         st.plotly_chart(fig)
132 |     
133 |     with col2:
134 |         # Most indicative of fake news
135 |         fig = px.bar(feature_importance.tail(10), x='Word', y='Importance',
136 |                     title='Top Words Indicating Fake News')
137 |         st.plotly_chart(fig)
138 | 
139 |     # Predict custom input
140 |     st.subheader("Test a News Article")
141 |     news_text = st.text_area("Enter news article text:", height=200)
142 |     
143 |     if st.button("Check News"):
144 |         if news_text:
145 |             # Preprocess and vectorize input
146 |             cleaned_input = preprocess_text(news_text)
147 |             input_vectorized = vectorizer.transform([cleaned_input])
148 |             
149 |             # Make prediction
150 |             prediction = model.predict(input_vectorized)[0]
151 |             probabilities = model.predict_proba(input_vectorized)[0]
152 |             
153 |             # Display prediction
154 |             col1, col2 = st.columns(2)
155 |             with col1:
156 |                 st.metric("Prediction", "Real News" if prediction == 1 else "Fake News")
157 |             with col2:
158 |                 st.metric("Confidence", f"{max(probabilities):.2%}")
159 |             
160 |             # Visualize prediction probabilities
161 |             fig = go.Figure(data=[
162 |                 go.Bar(x=['Fake News', 'Real News'],
163 |                       y=probabilities,
164 |                       text=[f'{p:.2%}' for p in probabilities],
165 |                       textposition='auto',
166 |                 )
167 |             ])
168 |             fig.update_layout(title='Prediction Probabilities',
169 |                             xaxis_title='Category',
170 |                             yaxis_title='Probability')
171 |             st.plotly_chart(fig)
172 |             
173 |             # Text Analysis
174 |             st.subheader("Text Analysis")
175 |             col1, col2 = st.columns(2)
176 |             with col1:
177 |                 st.write("**Text Length:**", len(news_text))
178 |                 st.write("**Word Count:**", len(cleaned_input.split()))
179 |             with col2:
180 |                 # Get top contributing words
181 |                 words = cleaned_input.split()
182 |                 word_scores = []
183 |                 for word in set(words):
184 |                     if word in vectorizer.vocabulary_:
185 |                         idx = vectorizer.vocabulary_[word]
186 |                         score = model.feature_log_prob_[1][idx] - model.feature_log_prob_[0][idx]
187 |                         word_scores.append((word, score))
188 |                 
189 |                 if word_scores:
190 |                     word_scores.sort(key=lambda x: abs(x[1]), reverse=True)
191 |                     st.write("**Top Contributing Words:**")
192 |                     for word, score in word_scores[:5]:
193 |                         indicator = "→ Real News" if score > 0 else "→ Fake News"
194 |                         st.write(f"- {word} {indicator}")
195 | 
196 | if __name__ == "__main__":
197 |     run() 


--------------------------------------------------------------------------------
/Naive_Bayes/Naive_Bayes_projects/weather_prediction.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.naive_bayes import GaussianNB
  5 | from sklearn.model_selection import train_test_split
  6 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
  7 | import plotly.express as px
  8 | import plotly.graph_objects as go
  9 | import seaborn as sns
 10 | import matplotlib.pyplot as plt
 11 | 
 12 | def generate_sample_data():
 13 |     np.random.seed(42)
 14 |     n_samples = 1000
 15 |     
 16 |     # Generate weather features
 17 |     temperature = np.random.normal(25, 5, n_samples)  # Mean 25°C, std 5°C
 18 |     humidity = np.random.normal(60, 15, n_samples)    # Mean 60%, std 15%
 19 |     pressure = np.random.normal(1013, 5, n_samples)   # Mean 1013 hPa, std 5 hPa
 20 |     wind_speed = np.random.exponential(5, n_samples)  # Mean 5 m/s
 21 |     
 22 |     # Create DataFrame
 23 |     df = pd.DataFrame({
 24 |         'Temperature': temperature,
 25 |         'Humidity': humidity,
 26 |         'Pressure': pressure,
 27 |         'Wind_Speed': wind_speed
 28 |     })
 29 |     
 30 |     # Generate weather conditions based on features
 31 |     def determine_weather(row):
 32 |         if row['Temperature'] > 30 and row['Humidity'] > 70:
 33 |             return 'Stormy'
 34 |         elif row['Temperature'] < 20 and row['Humidity'] > 80:
 35 |             return 'Rainy'
 36 |         elif row['Temperature'] > 25 and row['Humidity'] < 50:
 37 |             return 'Sunny'
 38 |         else:
 39 |             return 'Cloudy'
 40 |     
 41 |     df['Weather'] = df.apply(determine_weather, axis=1)
 42 |     
 43 |     return df
 44 | 
 45 | def run():
 46 |     st.header("Weather Prediction using Naive Bayes")
 47 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Naive_Bayes)", unsafe_allow_html=True)
 48 |     
 49 |     # Generate sample data
 50 |     df = generate_sample_data()
 51 |     
 52 |     # Display dataset info
 53 |     st.subheader("Dataset Overview")
 54 |     col1, col2 = st.columns(2)
 55 |     with col1:
 56 |         st.write("Dataset Shape:", df.shape)
 57 |         st.write("Features:", ", ".join(df.columns[:-1]))
 58 |     with col2:
 59 |         st.write("Weather Distribution:")
 60 |         weather_dist = df['Weather'].value_counts()
 61 |         fig = px.pie(values=weather_dist.values, names=weather_dist.index,
 62 |                     title='Weather Conditions Distribution')
 63 |         st.plotly_chart(fig)
 64 |     
 65 |     # Feature Analysis
 66 |     st.subheader("Feature Analysis")
 67 |     
 68 |     # Temperature distribution by weather
 69 |     fig = px.box(df, x='Weather', y='Temperature',
 70 |                 title='Temperature Distribution by Weather Condition')
 71 |     st.plotly_chart(fig)
 72 |     
 73 |     # Humidity distribution by weather
 74 |     fig = px.box(df, x='Weather', y='Humidity',
 75 |                 title='Humidity Distribution by Weather Condition')
 76 |     st.plotly_chart(fig)
 77 |     
 78 |     # Feature correlations
 79 |     st.subheader("Feature Correlations")
 80 |     numeric_cols = ['Temperature', 'Humidity', 'Pressure', 'Wind_Speed']
 81 |     corr = df[numeric_cols].corr()
 82 |     fig = px.imshow(corr,
 83 |                    labels=dict(x="Features", y="Features", color="Correlation"),
 84 |                    x=numeric_cols,
 85 |                    y=numeric_cols,
 86 |                    text_auto=True,
 87 |                    aspect="auto")
 88 |     st.plotly_chart(fig)
 89 |     
 90 |     # Prepare data for modeling
 91 |     X = df[['Temperature', 'Humidity', 'Pressure', 'Wind_Speed']]
 92 |     y = df['Weather']
 93 |     
 94 |     # Split data
 95 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 96 |     
 97 |     # Train model
 98 |     model = GaussianNB()
 99 |     model.fit(X_train, y_train)
100 |     
101 |     # Model evaluation
102 |     st.subheader("Model Performance")
103 |     y_pred = model.predict(X_test)
104 |     accuracy = accuracy_score(y_test, y_pred)
105 |     
106 |     # Display metrics
107 |     col1, col2, col3 = st.columns(3)
108 |     with col1:
109 |         st.metric("Accuracy", f"{accuracy:.2%}")
110 |     with col2:
111 |         st.metric("Macro Precision", 
112 |                  f"{classification_report(y_test, y_pred, output_dict=True)['macro avg']['precision']:.2%}")
113 |     with col3:
114 |         st.metric("Macro Recall",
115 |                  f"{classification_report(y_test, y_pred, output_dict=True)['macro avg']['recall']:.2%}")
116 |     
117 |     # Confusion Matrix
118 |     st.subheader("Confusion Matrix")
119 |     cm = confusion_matrix(y_test, y_pred)
120 |     fig = px.imshow(cm,
121 |                    labels=dict(x="Predicted", y="Actual", color="Count"),
122 |                    x=sorted(df['Weather'].unique()),
123 |                    y=sorted(df['Weather'].unique()),
124 |                    text_auto=True,
125 |                    aspect="auto")
126 |     st.plotly_chart(fig)
127 |     
128 |     # Feature Distributions
129 |     st.subheader("Feature Distributions by Weather Condition")
130 |     
131 |     # Create subplot for feature distributions
132 |     fig = plt.figure(figsize=(12, 8))
133 |     for i, feature in enumerate(numeric_cols, 1):
134 |         plt.subplot(2, 2, i)
135 |         for weather in df['Weather'].unique():
136 |             sns.kdeplot(data=df[df['Weather'] == weather][feature], label=weather)
137 |         plt.title(f'{feature} Distribution')
138 |         plt.legend()
139 |     plt.tight_layout()
140 |     st.pyplot(fig)
141 |     
142 |     # Interactive Prediction
143 |     st.subheader("Make a Weather Prediction")
144 |     st.write("Enter weather conditions:")
145 |     
146 |     col1, col2 = st.columns(2)
147 |     with col1:
148 |         temperature = st.slider("Temperature (°C)", float(df['Temperature'].min()), 
149 |                               float(df['Temperature'].max()), 25.0)
150 |         humidity = st.slider("Humidity (%)", float(df['Humidity'].min()), 
151 |                            float(df['Humidity'].max()), 60.0)
152 |     with col2:
153 |         pressure = st.slider("Pressure (hPa)", float(df['Pressure'].min()), 
154 |                            float(df['Pressure'].max()), 1013.0)
155 |         wind_speed = st.slider("Wind Speed (m/s)", float(df['Wind_Speed'].min()), 
156 |                              float(df['Wind_Speed'].max()), 5.0)
157 |     
158 |     if st.button("Predict Weather"):
159 |         # Prepare input data
160 |         input_data = np.array([[temperature, humidity, pressure, wind_speed]])
161 |         
162 |         # Make prediction
163 |         prediction = model.predict(input_data)[0]
164 |         probabilities = model.predict_proba(input_data)[0]
165 |         
166 |         # Display prediction
167 |         st.subheader("Prediction Result")
168 |         col1, col2 = st.columns(2)
169 |         with col1:
170 |             st.metric("Predicted Weather", prediction)
171 |         with col2:
172 |             st.metric("Confidence", f"{max(probabilities):.2%}")
173 |         
174 |         # Visualize prediction probabilities
175 |         fig = go.Figure(data=[
176 |             go.Bar(x=sorted(df['Weather'].unique()),
177 |                   y=probabilities,
178 |                   text=[f'{p:.2%}' for p in probabilities],
179 |                   textposition='auto',
180 |             )
181 |         ])
182 |         fig.update_layout(title='Weather Condition Probabilities',
183 |                         xaxis_title='Weather Condition',
184 |                         yaxis_title='Probability')
185 |         st.plotly_chart(fig)
186 |         
187 |         # Weather Condition Characteristics
188 |         st.subheader("Typical Characteristics of Predicted Weather")
189 |         weather_stats = df[df['Weather'] == prediction].describe()
190 |         for feature in numeric_cols:
191 |             st.write(f"**{feature}**:")
192 |             st.write(f"- Average: {weather_stats[feature]['mean']:.2f}")
193 |             st.write(f"- Range: {weather_stats[feature]['min']:.2f} to {weather_stats[feature]['max']:.2f}")
194 | 
195 | if __name__ == "__main__":
196 |     run()


--------------------------------------------------------------------------------
/Naive_Bayes/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the Linear_regression_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | from Naive_Bayes_projects import (
 8 |     weather_prediction,
 9 |     spam_detection_nb,
10 |     fake_news_prediction,
11 | )
12 | 
13 | def run():
14 |     st.title("Naive Bayes Projects")
15 | 
16 |     # Sidebar for project selection
17 |     project = st.sidebar.selectbox(
18 |         "Select a project",
19 |         [
20 |             "Weather Prediction",
21 |             "Spam Detection",
22 |             "Fake News Prediction",
23 |         ],
24 |     )
25 | 
26 |     # Run the selected project
27 |     if project == "Weather Prediction":
28 |         weather_prediction.run()
29 |     elif project == "Spam Detection":
30 |         spam_detection_nb.run()
31 |     elif project == "Fake News Prediction":
32 |         fake_news_prediction.run()
33 | 
34 | if __name__ == "__main__":
35 |     run()


--------------------------------------------------------------------------------
/Naive_Bayes/readme.md:
--------------------------------------------------------------------------------
 1 | # Naive Bayes Projects
 2 | 
 3 | This folder contains various projects that utilize the Naive Bayes algorithm for different applications. Each project is designed to demonstrate the use of Naive Bayes in machine learning tasks.
 4 | 
 5 | ## Projects
 6 | 
 7 | 1. **Weather Prediction**: Predicts weather conditions using historical data.
 8 | 
 9 |    **Screenshots:**
10 |    ![Weather Prediction](screenshots/weath.png)
11 | 2. **Spam Detection**: Classifies emails as spam or not spam using text data.
12 | 
13 |    **Screenshots:**
14 |    ![Spam Detection](screenshots/spamde.png)
15 | 3. **Fake News Prediction**: Detects fake news articles using features like title, news URL, source domain, and tweet number.
16 | 
17 |    **Screenshots:**
18 |    ![Fake News Prediction](screenshots/fake_news.png)
19 | 
20 | ## How to Run
21 | 
22 | To run any of the projects, follow these steps:
23 | 
24 | 1. Ensure you have the required dependencies installed. You can install them using pip:
25 | 
26 |    ```bash
27 |    pip install streamlit pandas scikit-learn
28 |    ```
29 | 
30 | 2. Navigate to the Naive_Bayes directory in your terminal.
31 | 
32 | 3. Run the Streamlit app using the following command:
33 | 
34 |    ```bash
35 |    streamlit run main.py
36 |    ```
37 | 
38 | 4. Use the sidebar to select the project you want to run.
39 | 
40 | ## Project Structure
41 | 
42 | - `main.py`: The main entry point for running the projects.
43 | - `Naive_Bayes_projects/`: Contains individual project files:
44 |   - `weather_prediction.py`: Weather prediction project.
45 |   - `spam_detection_nb.py`: Spam detection project.
46 |   - `fake_news_prediction.py`: Fake news prediction project.
47 | 
48 | ## Data
49 | 
50 | Each project uses its own dataset, which is either uploaded by the user or loaded from a default CSV file located in the `Naive_Bayes_projects/` directory.
51 | 
52 | ## Contributing
53 | 
54 | Feel free to contribute to these projects by submitting pull requests or opening issues for any bugs or feature requests.
55 | 
56 | ## License
57 | 
58 | This project is licensed under the MIT License - see the LICENSE file for details.
59 | 


--------------------------------------------------------------------------------
/Naive_Bayes/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | numpy
3 | pandas
4 | scikit-learn
5 | plotly
6 | scipy
7 | seaborn


--------------------------------------------------------------------------------
/Naive_Bayes/screenshots/fake_news.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Naive_Bayes/screenshots/fake_news.png


--------------------------------------------------------------------------------
/Naive_Bayes/screenshots/spamde.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Naive_Bayes/screenshots/spamde.png


--------------------------------------------------------------------------------
/Naive_Bayes/screenshots/weath.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Naive_Bayes/screenshots/weath.png


--------------------------------------------------------------------------------
/Poisson_Regression/Poisson_Regression_projects/competition_awards_data.csv:
--------------------------------------------------------------------------------
  1 | Awards,Math Score
  2 | 0,43
  3 | 0,38
  4 | 0,41
  5 | 0,33
  6 | 0,39
  7 | 0,43
  8 | 0,35
  9 | 0,41
 10 | 0,36
 11 | 0,38
 12 | 0,60
 13 | 0,30
 14 | 0,32
 15 | 0,30
 16 | 0,37
 17 | 0,44
 18 | 0,45
 19 | 0,44
 20 | 0,37
 21 | 0,43
 22 | 0,34
 23 | 0,40
 24 | 0,34
 25 | 0,38
 26 | 0,32
 27 | 0,42
 28 | 1,64
 29 | 1,60
 30 | 0,39
 31 | 1,62
 32 | 0,35
 33 | 0,37
 34 | 1,50
 35 | 0,40
 36 | 1,65
 37 | 1,68
 38 | 0,45
 39 | 0,35
 40 | 0,36
 41 | 0,37
 42 | 0,31
 43 | 0,31
 44 | 0,30
 45 | 0,42
 46 | 0,42
 47 | 1,62
 48 | 0,38
 49 | 0,39
 50 | 0,47
 51 | 1,65
 52 | 0,33
 53 | 0,34
 54 | 0,31
 55 | 3,89
 56 | 0,30
 57 | 0,44
 58 | 1,70
 59 | 1,68
 60 | 0,38
 61 | 0,44
 62 | 0,30
 63 | 1,70
 64 | 0,39
 65 | 1,66
 66 | 0,30
 67 | 1,61
 68 | 0,37
 69 | 0,33
 70 | 1,64
 71 | 0,30
 72 | 2,83
 73 | 0,43
 74 | 0,35
 75 | 0,30
 76 | 1,59
 77 | 0,47
 78 | 0,35
 79 | 0,39
 80 | 0,32
 81 | 0,31
 82 | 0,38
 83 | 0,33
 84 | 1,62
 85 | 0,39
 86 | 0,38
 87 | 0,30
 88 | 1,66
 89 | 0,41
 90 | 0,42
 91 | 0,31
 92 | 0,34
 93 | 0,48
 94 | 0,37
 95 | 0,30
 96 | 0,40
 97 | 0,41
 98 | 1,69
 99 | 0,42
100 | 1,63
101 | 0,40
102 | 0,30
103 | 0,38
104 | 0,34
105 | 0,30
106 | 0,32
107 | 0,35
108 | 0,38
109 | 1,70
110 | 0,33
111 | 0,36
112 | 1,63
113 | 1,66
114 | 2,86
115 | 0,34
116 | 1,63
117 | 0,40
118 | 1,72
119 | 0,40
120 | 0,47
121 | 2,86
122 | 1,67
123 | 3,88
124 | 1,64
125 | 0,39
126 | 0,37
127 | 1,63
128 | 1,72
129 | 0,30
130 | 0,38
131 | 1,67
132 | 0,31
133 | 1,62
134 | 0,44
135 | 0,42
136 | 0,36
137 | 0,30
138 | 3,89
139 | 1,71
140 | 0,35
141 | 0,33
142 | 0,42
143 | 3,80
144 | 0,40
145 | 0,30
146 | 1,67
147 | 1,59
148 | 0,43
149 | 0,40
150 | 0,41
151 | 3,88
152 | 0,30
153 | 0,45
154 | 0,30
155 | 2,82
156 | 0,47
157 | 1,70
158 | 5,90
159 | 0,33
160 | 0,30
161 | 1,63
162 | 0,36
163 | 0,34
164 | 2,87
165 | 4,90
166 | 1,66
167 | 0,35
168 | 1,63
169 | 1,70
170 | 3,90
171 | 1,61
172 | 1,67
173 | 3,89
174 | 2,86
175 | 0,39
176 | 0,30
177 | 1,67
178 | 1,68
179 | 3,88
180 | 2,83
181 | 0,33
182 | 6,90
183 | 4,90
184 | 1,70
185 | 0,39
186 | 2,87
187 | 2,88
188 | 1,59
189 | 2,86
190 | 1,65
191 | 1,61
192 | 0,46
193 | 1,62
194 | 2,81
195 | 5,91
196 | 1,70
197 | 2,85
198 | 2,86
199 | 1,66
200 | 0,41
201 | 3,89
202 | 


--------------------------------------------------------------------------------
/Poisson_Regression/Poisson_Regression_projects/no_of_car_accident.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.model_selection import train_test_split
  5 | from sklearn.linear_model import PoissonRegressor
  6 | from sklearn.metrics import mean_squared_error, r2_score
  7 | import plotly.express as px
  8 | import plotly.graph_objects as go
  9 | from sklearn.preprocessing import StandardScaler
 10 | import seaborn as sns
 11 | import matplotlib.pyplot as plt
 12 | 
 13 | def generate_sample_data():
 14 |     np.random.seed(42)
 15 |     n_samples = 1000
 16 |     
 17 |     # Generate features
 18 |     traffic_volume = np.random.normal(5000, 2000, n_samples)
 19 |     traffic_volume = np.clip(traffic_volume, 1000, 10000)
 20 |     
 21 |     weather_conditions = np.random.choice(['Clear', 'Rainy', 'Snowy', 'Foggy'], n_samples, p=[0.6, 0.2, 0.1, 0.1])
 22 |     time_of_day = np.random.choice(['Morning', 'Afternoon', 'Evening', 'Night'], n_samples, p=[0.3, 0.3, 0.2, 0.2])
 23 |     road_type = np.random.choice(['Highway', 'Urban', 'Rural'], n_samples, p=[0.4, 0.4, 0.2])
 24 |     
 25 |     # Create DataFrame
 26 |     df = pd.DataFrame({
 27 |         'TrafficVolume': traffic_volume,
 28 |         'WeatherCondition': weather_conditions,
 29 |         'TimeOfDay': time_of_day,
 30 |         'RoadType': road_type
 31 |     })
 32 |     
 33 |     # Generate target (number of accidents) with some patterns
 34 |     base_rate = 0.001
 35 |     weather_effect = {
 36 |         'Clear': 1.0,
 37 |         'Rainy': 1.5,
 38 |         'Snowy': 2.0,
 39 |         'Foggy': 1.8
 40 |     }
 41 |     time_effect = {
 42 |         'Morning': 1.2,
 43 |         'Afternoon': 1.0,
 44 |         'Evening': 1.5,
 45 |         'Night': 1.8
 46 |     }
 47 |     road_effect = {
 48 |         'Highway': 1.0,
 49 |         'Urban': 1.5,
 50 |         'Rural': 0.8
 51 |     }
 52 |     
 53 |     # Calculate expected number of accidents
 54 |     expected_accidents = base_rate * \
 55 |                         df['TrafficVolume'] * \
 56 |                         df['WeatherCondition'].map(weather_effect) * \
 57 |                         df['TimeOfDay'].map(time_effect) * \
 58 |                         df['RoadType'].map(road_effect)
 59 |     
 60 |     # Generate actual number of accidents using Poisson distribution
 61 |     df['Accidents'] = np.random.poisson(expected_accidents)
 62 |     
 63 |     return df
 64 | 
 65 | def run():
 66 |     st.header("Car Accident Prediction using Poisson Regression")
 67 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/Poisson_Regression)", unsafe_allow_html=True)
 68 | 
 69 |     # Generate sample data
 70 |     df = generate_sample_data()
 71 |     
 72 |     # Display dataset info
 73 |     st.subheader("Dataset Overview")
 74 |     col1, col2 = st.columns(2)
 75 |     with col1:
 76 |         st.write("Dataset Shape:", df.shape)
 77 |         st.write("Features:", ", ".join(df.columns[:-1]))
 78 |         st.write("Target: Number of Accidents")
 79 |     with col2:
 80 |         st.write("Accident Statistics:")
 81 |         st.write(f"Mean: {df['Accidents'].mean():.2f}")
 82 |         st.write(f"Max: {df['Accidents'].max()}")
 83 |         st.write(f"Min: {df['Accidents'].min()}")
 84 |     
 85 |     # Data distribution visualization
 86 |     fig = px.histogram(df, x='Accidents',
 87 |                       title='Distribution of Number of Accidents',
 88 |                       nbins=30)
 89 |     st.plotly_chart(fig)
 90 | 
 91 |     # Prepare data
 92 |     X = pd.get_dummies(df[['TrafficVolume', 'WeatherCondition', 'TimeOfDay', 'RoadType']])
 93 |     y = df['Accidents']
 94 |     
 95 |     # Split data
 96 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 97 |     
 98 |     # Scale features
 99 |     scaler = StandardScaler()
100 |     X_train_scaled = scaler.fit_transform(X_train)
101 |     X_test_scaled = scaler.transform(X_test)
102 |     
103 |     # Train model
104 |     model = PoissonRegressor(alpha=0.1)
105 |     model.fit(X_train_scaled, y_train)
106 |     
107 |     # Model evaluation
108 |     st.subheader("Model Performance")
109 |     y_pred = model.predict(X_test_scaled)
110 |     mse = mean_squared_error(y_test, y_pred)
111 |     r2 = r2_score(y_test, y_pred)
112 |     
113 |     # Display metrics
114 |     col1, col2 = st.columns(2)
115 |     with col1:
116 |         st.metric("Mean Squared Error", f"{mse:.2f}")
117 |     with col2:
118 |         st.metric("R² Score", f"{r2:.2%}")
119 |     
120 |     # Actual vs Predicted Plot
121 |     fig = px.scatter(x=y_test, y=y_pred,
122 |                     labels={'x': 'Actual Accidents', 'y': 'Predicted Accidents'},
123 |                     title='Actual vs Predicted Accidents')
124 |     fig.add_trace(go.Scatter(x=[0, max(y_test)], y=[0, max(y_test)],
125 |                             mode='lines', name='Perfect Prediction'))
126 |     st.plotly_chart(fig)
127 |     
128 |     # Feature Importance
129 |     st.subheader("Feature Importance")
130 |     importance = pd.DataFrame({
131 |         'Feature': X.columns,
132 |         'Importance': np.abs(model.coef_)
133 |     })
134 |     fig = px.bar(importance, x='Feature', y='Importance',
135 |                 title='Feature Importance in Prediction')
136 |     st.plotly_chart(fig)
137 |     
138 |     # Interactive Prediction
139 |     st.subheader("Make a Prediction")
140 |     st.write("Enter traffic conditions:")
141 |     
142 |     col1, col2 = st.columns(2)
143 |     with col1:
144 |         traffic_volume = st.slider("Traffic Volume", 1000, 10000, 5000)
145 |         weather = st.selectbox("Weather Condition", df['WeatherCondition'].unique())
146 |     with col2:
147 |         time = st.selectbox("Time of Day", df['TimeOfDay'].unique())
148 |         road = st.selectbox("Road Type", df['RoadType'].unique())
149 |     
150 |     if st.button("Predict"):
151 |         # Prepare input data
152 |         input_data = pd.DataFrame({
153 |             'TrafficVolume': [traffic_volume],
154 |             'WeatherCondition': [weather],
155 |             'TimeOfDay': [time],
156 |             'RoadType': [road]
157 |         })
158 |         
159 |         # One-hot encode categorical variables
160 |         input_encoded = pd.get_dummies(input_data)
161 |         # Ensure all columns from training data are present
162 |         for col in X.columns:
163 |             if col not in input_encoded.columns:
164 |                 input_encoded[col] = 0
165 |         input_encoded = input_encoded[X.columns]
166 |         
167 |         # Scale input data
168 |         input_scaled = scaler.transform(input_encoded)
169 |         
170 |         # Make prediction
171 |         prediction = model.predict(input_scaled)[0]
172 |         
173 |         # Display prediction
174 |         st.subheader("Prediction Result")
175 |         st.metric("Expected Number of Accidents", f"{prediction:.1f}")
176 |         
177 |         # Visualize prediction with confidence interval
178 |         fig = go.Figure()
179 |         fig.add_trace(go.Bar(
180 |             x=['Predicted Accidents'],
181 |             y=[prediction],
182 |             error_y=dict(type='data', array=[np.sqrt(prediction)], visible=True),
183 |             name='Prediction'
184 |         ))
185 |         fig.update_layout(title='Predicted Accidents with 95% Confidence Interval')
186 |         st.plotly_chart(fig)
187 |     
188 |     # Data Analysis
189 |     st.subheader("Data Analysis")
190 |     
191 |     # Weather impact
192 |     fig = px.box(df, x='WeatherCondition', y='Accidents',
193 |                 title='Accident Distribution by Weather Condition')
194 |     st.plotly_chart(fig)
195 |     
196 |     # Time of day impact
197 |     fig = px.box(df, x='TimeOfDay', y='Accidents',
198 |                 title='Accident Distribution by Time of Day')
199 |     st.plotly_chart(fig)
200 |     
201 |     # Road type impact
202 |     fig = px.box(df, x='RoadType', y='Accidents',
203 |                 title='Accident Distribution by Road Type')
204 |     st.plotly_chart(fig)
205 |     
206 |     # Traffic volume vs accidents
207 |     fig = px.scatter(df, x='TrafficVolume', y='Accidents',
208 |                     color='WeatherCondition',
209 |                     title='Traffic Volume vs Accidents by Weather Condition')
210 |     st.plotly_chart(fig)
211 | 
212 | if __name__ == "__main__":
213 |     run()


--------------------------------------------------------------------------------
/Poisson_Regression/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the Poisson_Regression_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | 
 8 | from Poisson_Regression_projects import (
 9 |     competition_award,
10 |     no_of_car_accident,
11 | )
12 | 
13 | def run():
14 |     st.title("Poisson Regression Projects")
15 | 
16 |     # Sidebar for project selection
17 |     project = st.sidebar.selectbox(
18 |         "Select a project",
19 |         [
20 |             "Competition Award Prediction",
21 |             "Number of Car Accidents Prediction",
22 |         ],
23 |     )
24 | 
25 |     # Run the selected project
26 |     if project == "Competition Award Prediction":
27 |         competition_award.run()
28 |     elif project == "Number of Car Accidents Prediction":
29 |         no_of_car_accident.run()
30 | 
31 | if __name__ == "__main__":
32 |     run()


--------------------------------------------------------------------------------
/Poisson_Regression/readme.md:
--------------------------------------------------------------------------------
 1 | # Poisson Regression Projects
 2 | 
 3 | This repository contains various Poisson Regression projects implemented in Python. Each project demonstrates the application of Poisson Regression to solve real-world problems using datasets.
 4 | 
 5 | ## Project Structure
 6 | 
 7 | ```
 8 | Poisson_Regression/
 9 | ├── main.py
10 | ├── requirements.txt
11 | ├── Poisson_regression_projects/
12 | │   ├── competition_award.py
13 | │   ├── no_of_car_accident.py
14 | │   ├── competition_awards_data.csv
15 | ```
16 | 
17 | ### Key Files
18 | - **`main.py`**: The main entry point for running the Streamlit app.
19 | - **`requirements.txt`**: Contains the dependencies required to run the project.
20 | - **`Poisson_regression_projects/`**: Contains individual project scripts and datasets.
21 | 
22 | ## Projects Included
23 | 
24 | 1. **Competition Award Prediction**  
25 |    Predicts the number of awards a student will receive based on their math scores using Poisson Regression.  
26 |    Dataset: `competition_awards_data.csv`
27 | 
28 |    **Screenshots:**
29 |    ![Competition Award Prediction](screenshots/comp.png)
30 | 
31 | 2. **Number of Car Accidents Prediction**  
32 |    Predicts the number of car accidents based on average speed, traffic density, and road conditions using Poisson Regression.  
33 |    Dataset: Synthetic data (hardcoded in the script).
34 | 
35 |    **Screenshots:**
36 |    ![Car Accident Prediction](screenshots/caraccidents.png)
37 | 
38 | ## How to Run
39 | 
40 | 1. Clone the repository:
41 |    ```bash
42 |    git clone https://github.com/benasphy/ML_projects.git
43 |    cd Poisson_Regression
44 |    ```
45 | 
46 | 2. Install dependencies:
47 |    ```bash
48 |    pip install -r requirements.txt
49 |    ```
50 | 
51 | 3. Run the Streamlit app:
52 |    ```bash
53 |    streamlit run main.py
54 |    ```
55 | 
56 | 4. Select a project from the sidebar to explore its functionality.
57 | 
58 | ## Requirements
59 | 
60 | The project requires the following Python libraries:
61 | - `streamlit`
62 | - `numpy`
63 | - `pandas`
64 | - `scikit-learn`
65 | 
66 | ## Datasets
67 | 
68 | - **`competition_awards_data.csv`**: Contains data for predicting the number of awards based on math scores.
69 | 
70 | ## Screenshots
71 | 
72 | Add screenshots of the Streamlit app interface here to showcase the projects.
73 | 
74 | ## License
75 | 
76 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
77 | 
78 | ## Acknowledgments
79 | 
80 | - Datasets used in this project are sourced from publicly available repositories.
81 | - Special thanks to the contributors of the Python libraries used in this project.
82 | 
83 | ---
84 | Feel free to contribute to this repository by submitting issues or pull requests.
85 | 


--------------------------------------------------------------------------------
/Poisson_Regression/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | numpy
3 | pandas
4 | scikit-learn
5 | plotly
6 | scipy
7 | seaborn


--------------------------------------------------------------------------------
/Poisson_Regression/screenshots/caraccident.png:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/Poisson_Regression/screenshots/caraccidents.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Poisson_Regression/screenshots/caraccidents.png


--------------------------------------------------------------------------------
/Poisson_Regression/screenshots/comp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/Poisson_Regression/screenshots/comp.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Machine Learning Projects Collection
  2 | 
  3 | A comprehensive collection of machine learning projects implemented in Python, covering various algorithms and techniques. Each project is designed to solve real-world problems using different machine learning approaches.
  4 | 
  5 | ## Project Categories
  6 | 
  7 | ### Supervised Learning
  8 | - **Linear Regression**
  9 |   - House Price Prediction
 10 |   - Salary Prediction
 11 |   - Study Hours vs Exam Score Prediction
 12 |   - Messi Goal Prediction
 13 |   - Normal Equation vs Gradient Descent Implementation
 14 | 
 15 | - **Logistic Regression**
 16 |   - Diabetes Prediction
 17 |   - Rock vs Mine Classification
 18 |   - Simple HIV Prediction
 19 | 
 20 | - **Naive Bayes**
 21 |   - Fake News Detection
 22 |   - Spam Detection
 23 |   - Weather Prediction
 24 | 
 25 | - **Support Vector Machine (SVM)**
 26 |   - Breast Cancer Prediction
 27 |   - Spam Detection
 28 | 
 29 | - **K-Nearest Neighbors (KNN)**
 30 |   - Movie Recommendation System
 31 |   - T-Shirt Size Prediction
 32 | 
 33 | - **Decision Trees**
 34 |   - Gym Decision Tree
 35 |   - Gini Impurity Implementation
 36 | 
 37 | ### Unsupervised Learning
 38 | - **Clustering**
 39 |   - **K-Means**
 40 |     - Customer Segmentation
 41 |     - Loan Approval Clustering
 42 | 
 43 |   - **Gaussian Mixture Models (GMM)**
 44 |     - Customer Segmentation
 45 |     - Image Color Segmentation
 46 | 
 47 |   - **DBSCAN/HDBSCAN**
 48 |     - Customer Behavior Analysis
 49 |     - Anomaly Detection
 50 | 
 51 |   - **Hierarchical Clustering**
 52 |     - Document Clustering
 53 |     - Market Basket Analysis
 54 | 
 55 |   - **Fuzzy C-Means**
 56 |     - Customer Profiling
 57 |     - Image Segmentation
 58 | 
 59 | ### Other Techniques
 60 | - **Dimensionality Reduction**
 61 |   - Feature Selection
 62 |   - Image Compression
 63 | 
 64 | - **Association Rule Learning**
 65 |   - Market Basket Analysis
 66 |   - Recommendation System
 67 | 
 68 | - **Poisson Regression**
 69 |   - Competition Award Prediction
 70 |   - Car Accident Prediction
 71 | 
 72 | ## Project Structure
 73 | 
 74 | Each project category has its own directory containing:
 75 | - `main.py`: Main entry point for running the Streamlit app
 76 | - `requirements.txt`: Required Python packages
 77 | - Project-specific files and datasets
 78 | - Detailed README.md with project documentation
 79 | 
 80 | ## Getting Started
 81 | 
 82 | 1. Clone the repository:
 83 |    ```bash
 84 |    git clone https://github.com/benasphy/ML_projects.git
 85 |    cd ML_projects
 86 |    ```
 87 | 
 88 | 2. Install dependencies for a specific project:
 89 |    ```bash
 90 |    cd <project_directory>
 91 |    pip install -r requirements.txt
 92 |    ```
 93 | 
 94 | 3. Run the Streamlit app:
 95 |    ```bash
 96 |    streamlit run main.py
 97 |    ```
 98 | 
 99 | ## Common Requirements
100 | 
101 | Most projects require these Python libraries:
102 | - `streamlit`
103 | - `numpy`
104 | - `pandas`
105 | - `scikit-learn`
106 | - `matplotlib`
107 | - `plotly`
108 | 
109 | Additional requirements are specified in each project's `requirements.txt` file.
110 | 
111 | ## Features
112 | 
113 | - Interactive web interfaces using Streamlit
114 | - Real-time data visualization
115 | - Model evaluation and metrics
116 | - Custom dataset support
117 | - Comprehensive documentation
118 | - Clean and modular code structure
119 | 
120 | ## Contributing
121 | 
122 | Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
123 | 
124 | ## License
125 | 
126 | This project is licensed under the MIT License - see the LICENSE file for details.
127 | 
128 | ## Acknowledgments
129 | 
130 | - Datasets used in these projects are sourced from publicly available repositories
131 | - Special thanks to the contributors of the Python libraries used in these projects
132 | - Inspired by various machine learning courses and tutorials
133 | 
134 | ---
135 | Feel free to star the repository if you find it useful! 


--------------------------------------------------------------------------------
/SVM/SVM_projects/breast_cancer_prediction.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.svm import SVC
  5 | from sklearn.preprocessing import StandardScaler
  6 | from sklearn.model_selection import train_test_split, cross_val_score
  7 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
  8 | from sklearn.datasets import load_breast_cancer
  9 | import plotly.express as px
 10 | import plotly.graph_objects as go
 11 | import seaborn as sns
 12 | import matplotlib.pyplot as plt
 13 | from sklearn.decomposition import PCA
 14 | 
 15 | def run():
 16 |     st.header("Breast Cancer Prediction using SVM")
 17 |     st.markdown("[View this project on GitHub](https://github.com/benasphy/ML_projects/tree/main/SVM)", unsafe_allow_html=True)
 18 | 
 19 |     # Load dataset
 20 |     data = load_breast_cancer()
 21 |     df = pd.DataFrame(data.data, columns=data.feature_names)
 22 |     df['target'] = data.target
 23 | 
 24 |     # Display dataset info
 25 |     st.subheader("Dataset Overview")
 26 |     col1, col2 = st.columns(2)
 27 |     with col1:
 28 |         st.write("Dataset Shape:", df.shape)
 29 |         st.write("Number of Samples:", len(df))
 30 |     with col2:
 31 |         target_dist = df['target'].value_counts()
 32 |         fig = px.pie(values=target_dist.values, 
 33 |                     names=['Benign', 'Malignant'],
 34 |                     title='Diagnosis Distribution')
 35 |         st.plotly_chart(fig)
 36 | 
 37 |     # Data Analysis
 38 |     st.subheader("Data Analysis")
 39 |     
 40 |     # Feature distributions
 41 |     st.write("Feature Distributions by Diagnosis")
 42 |     selected_feature = st.selectbox("Select Feature to View:", data.feature_names)
 43 |     
 44 |     fig = px.box(df, x='target', y=selected_feature,
 45 |                 title=f'{selected_feature} Distribution by Diagnosis',
 46 |                 labels={'target': 'Diagnosis', selected_feature: selected_feature})
 47 |     st.plotly_chart(fig)
 48 | 
 49 |     # PCA Visualization
 50 |     st.subheader("Data Visualization (PCA)")
 51 |     pca = PCA(n_components=2)
 52 |     X_pca = pca.fit_transform(df.drop('target', axis=1))
 53 |     
 54 |     pca_df = pd.DataFrame({
 55 |         'PC1': X_pca[:, 0],
 56 |         'PC2': X_pca[:, 1],
 57 |         'Diagnosis': ['Benign' if x == 1 else 'Malignant' for x in df['target']]
 58 |     })
 59 |     
 60 |     fig = px.scatter(pca_df, x='PC1', y='PC2', color='Diagnosis',
 61 |                     title='PCA Visualization of Breast Cancer Data')
 62 |     st.plotly_chart(fig)
 63 | 
 64 |     # Data preprocessing
 65 |     X = df.drop('target', axis=1)
 66 |     y = df['target']
 67 | 
 68 |     # Split data
 69 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 70 | 
 71 |     # Scaling
 72 |     scaler = StandardScaler()
 73 |     X_train_scaled = scaler.fit_transform(X_train)
 74 |     X_test_scaled = scaler.transform(X_test)
 75 | 
 76 |     # Train model
 77 |     model = SVC(kernel='rbf', probability=True)
 78 |     model.fit(X_train_scaled, y_train)
 79 | 
 80 |     # Model evaluation
 81 |     st.subheader("Model Performance")
 82 |     y_pred = model.predict(X_test_scaled)
 83 |     accuracy = accuracy_score(y_test, y_pred)
 84 |     
 85 |     # Display metrics
 86 |     col1, col2, col3 = st.columns(3)
 87 |     with col1:
 88 |         st.metric("Accuracy", f"{accuracy:.2%}")
 89 |     with col2:
 90 |         cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
 91 |         st.metric("Cross-validation Score", f"{cv_scores.mean():.2%}")
 92 |     with col3:
 93 |         st.metric("Cross-validation Std", f"{cv_scores.std():.2%}")
 94 | 
 95 |     # Confusion Matrix
 96 |     st.subheader("Confusion Matrix")
 97 |     cm = confusion_matrix(y_test, y_pred)
 98 |     fig = px.imshow(cm,
 99 |                    labels=dict(x="Predicted", y="Actual", color="Count"),
100 |                    x=['Malignant', 'Benign'],
101 |                    y=['Malignant', 'Benign'],
102 |                    text_auto=True,
103 |                    aspect="auto")
104 |     st.plotly_chart(fig)
105 | 
106 |     # Classification Report
107 |     st.subheader("Detailed Classification Report")
108 |     report = classification_report(y_test, y_pred, output_dict=True)
109 |     report_df = pd.DataFrame(report).transpose()
110 |     st.dataframe(report_df)
111 | 
112 |     # Feature Importance
113 |     st.subheader("Feature Importance")
114 |     # For SVM with RBF kernel, we'll use permutation importance
115 |     from sklearn.inspection import permutation_importance
116 |     result = permutation_importance(model, X_test_scaled, y_test, n_repeats=10, random_state=42)
117 |     
118 |     feature_importance = pd.DataFrame({
119 |         'Feature': data.feature_names,
120 |         'Importance': result.importances_mean
121 |     }).sort_values('Importance', ascending=False)
122 |     
123 |     fig = px.bar(feature_importance.head(10), x='Feature', y='Importance',
124 |                 title='Top 10 Most Important Features')
125 |     st.plotly_chart(fig)
126 | 
127 |     # Prediction interface
128 |     st.subheader("Predict Breast Cancer")
129 |     
130 |     # Create input fields for each feature
131 |     input_data = {}
132 |     cols = st.columns(3)
133 |     for i, feature in enumerate(data.feature_names):
134 |         with cols[i % 3]:
135 |             input_data[feature] = st.number_input(
136 |                 f"{feature}",
137 |                 min_value=float(df[feature].min()),
138 |                 max_value=float(df[feature].max()),
139 |                 value=float(df[feature].mean())
140 |             )
141 | 
142 |     if st.button("Predict"):
143 |         # Prepare input data
144 |         input_df = pd.DataFrame([input_data])
145 |         input_scaled = scaler.transform(input_df)
146 |         
147 |         # Make prediction
148 |         prediction = model.predict(input_scaled)[0]
149 |         probabilities = model.predict_proba(input_scaled)[0]
150 |         
151 |         # Display prediction
152 |         col1, col2 = st.columns(2)
153 |         with col1:
154 |             st.metric("Prediction", "Benign" if prediction == 1 else "Malignant")
155 |         with col2:
156 |             st.metric("Confidence", f"{max(probabilities):.2%}")
157 |         
158 |         # Visualize prediction probabilities
159 |         fig = go.Figure(data=[
160 |             go.Bar(x=['Malignant', 'Benign'],
161 |                   y=probabilities,
162 |                   text=[f'{p:.2%}' for p in probabilities],
163 |                   textposition='auto',
164 |             )
165 |         ])
166 |         fig.update_layout(title='Prediction Probabilities',
167 |                         xaxis_title='Diagnosis',
168 |                         yaxis_title='Probability')
169 |         st.plotly_chart(fig)
170 |         
171 |         # Feature Analysis
172 |         st.subheader("Feature Analysis")
173 |         
174 |         # Compare input values with dataset statistics
175 |         comparison_df = pd.DataFrame({
176 |             'Feature': data.feature_names,
177 |             'Your Value': input_data.values(),
178 |             'Dataset Mean': df.drop('target', axis=1).mean(),
179 |             'Dataset Std': df.drop('target', axis=1).std()
180 |         })
181 |         
182 |         # Calculate z-scores
183 |         comparison_df['Z-Score'] = (comparison_df['Your Value'] - comparison_df['Dataset Mean']) / comparison_df['Dataset Std']
184 |         
185 |         # Plot feature comparison
186 |         fig = px.bar(comparison_df.head(10), x='Feature', y='Z-Score',
187 |                     title='Feature Comparison (Z-Scores)',
188 |                     color='Z-Score',
189 |                     color_continuous_scale=['red', 'white', 'green'])
190 |         fig.add_hline(y=0, line_dash="dash", line_color="black")
191 |         st.plotly_chart(fig)
192 | 
193 | if __name__ == "__main__":
194 |     run() 


--------------------------------------------------------------------------------
/SVM/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | # Ensure the SVM_projects folder is in the Python path
 6 | sys.path.append(str(Path(__file__).parent))
 7 | 
 8 | from SVM_projects import (
 9 |     spam_detection,
10 |     breast_cancer_prediction,
11 | )
12 | 
13 | def run():
14 |     st.title("SVM Projects")
15 | 
16 |     # Sidebar for project selection
17 |     project = st.sidebar.selectbox(
18 |         "Select a project",
19 |         [
20 |             "Spam Detection",
21 |             "Breast Cancer Prediction",
22 |         ],
23 |     )
24 | 
25 |     # Run the selected project
26 |     if project == "Spam Detection":
27 |         spam_detection.run()
28 |     elif project == "Breast Cancer Prediction":
29 |         breast_cancer_prediction.run()
30 | 
31 | if __name__ == "__main__":
32 |     run()
33 | 


--------------------------------------------------------------------------------
/SVM/readme.md:
--------------------------------------------------------------------------------
 1 | # SVM Projects
 2 | 
 3 | This folder contains various projects that utilize the Support Vector Machine (SVM) algorithm for different applications. Each project is designed to demonstrate the use of SVM in machine learning tasks.
 4 | 
 5 | ## Projects
 6 | 
 7 | 1. **Spam Detection**: Classifies emails as spam or not spam using SVM.
 8 | 
 9 |    **Screenshots:**
10 |    ![Spam Detection](screenshots/spam.png)
11 | 2. **Breast Cancer Prediction**: Predicts whether a breast cancer tumor is benign or malignant using SVM.
12 | 
13 |    **Screenshots:**
14 |    ![Breast Cancer Prediction](screenshots/breast.png)
15 | 
16 | ## How to Run
17 | 
18 | To run any of the projects, follow these steps:
19 | 
20 | 1. Ensure you have the required dependencies installed. You can install them using pip:
21 | 
22 |    ```bash
23 |    pip install streamlit pandas numpy scikit-learn
24 |    ```
25 | 
26 | 2. Navigate to the SVM directory in your terminal.
27 | 
28 | 3. Run the Streamlit app using the following command:
29 | 
30 |    ```bash
31 |    streamlit run main.py
32 |    ```
33 | 
34 | 4. Use the sidebar to select the project you want to run.
35 | 
36 | ## Project Structure
37 | 
38 | - `main.py`: The main entry point for running the projects.
39 | - `SVM_projects/`: Contains individual project files:
40 |   - `spam_detection.py`: Spam detection project.
41 |   - `breast_cancer_prediction.py`: Breast cancer prediction project.
42 | 
43 | ## Data
44 | 
45 | - The spam detection project uses the spam.csv dataset
46 | - The breast cancer prediction project uses the built-in breast cancer dataset from scikit-learn
47 | 
48 | ## Contributing
49 | 
50 | Feel free to contribute to these projects by submitting pull requests or opening issues for any bugs or feature requests.
51 | 
52 | ## License
53 | 
54 | This project is licensed under the MIT License - see the LICENSE file for details.
55 | 


--------------------------------------------------------------------------------
/SVM/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.22.0
2 | pandas==1.5.3
3 | numpy==1.24.3
4 | scikit-learn==1.2.2
5 | plotly
6 | scipy
7 | seaborn
8 | 


--------------------------------------------------------------------------------
/SVM/screenshots/breast.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/SVM/screenshots/breast.png


--------------------------------------------------------------------------------
/SVM/screenshots/spam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benasphy/ML_projects/1c06098a2ad4d73e6080b717c6ea77b7853fd53c/SVM/screenshots/spam.png


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import importlib
 3 | 
 4 | def run():
 5 |     st.title("Machine Learning Algorithms")
 6 | 
 7 |     # List of available algorithm folders
 8 |     algorithms = [
 9 |         "Linear Regression",
10 |         "Logistic Regression",
11 |         "Decision Trees",
12 |         "Poisson Regression",
13 |         "Support Vector Machines",
14 |         "K-Nearest Neighbors",
15 |         "Naive Bayes",
16 |         "GMM",
17 |         "Hierarchical Clustering",
18 |         "DBSCAN & HDBSCAN",
19 |         "Fuzzy C-Means",
20 |         "Association Rule Learning",
21 |         "K-Means Clustering",
22 |         "Dimensionality Reduction",
23 |     ]
24 | 
25 |     # Sidebar to select an algorithm
26 |     selected_algorithm = st.sidebar.selectbox("Select an Algorithm", algorithms)
27 | 
28 |     # Map algorithm names to module paths
29 |     algorithm_modules = {
30 |         "Linear Regression": "Linear_Regression.main",
31 |         "Logistic Regression": "Logistic_Regression.main",
32 |         "Decision Trees": "Decision_Trees.main",
33 |         "Poisson Regression": "Poisson_Regression.main",
34 |         "Support Vector Machines": "SVM.main",
35 |         "K-Nearest Neighbors": "KNN.main",
36 |         "Naive Bayes": "Naive_Bayes.main",
37 |         "GMM": "GMM.main",
38 |         "Hierarchical Clustering": "Hierarchical_Clustering.main",
39 |         "DBSCAN & HDBSCAN": "DBSCAN_HDBSCAN.main",
40 |         "Fuzzy C-Means": "Fuzzy_C_Means.main",
41 |         "Association Rule Learning": "Association_Rule_Learning.main",
42 |         "K-Means Clustering": "K-Means.main",
43 |         "Dimensionality Reduction": "Dimensionality_Reduction.main"
44 |     }
45 | 
46 |     st.write(f"You selected: {selected_algorithm}")
47 |     st.write("The selected algorithm's app will appear below.")
48 | 
49 |     # Dynamically import and run the selected module's run() function
50 |     module_path = algorithm_modules[selected_algorithm]
51 |     try:
52 |         module = importlib.import_module(module_path)
53 |         module.run()
54 |     except Exception as e:
55 |         st.error(f"Failed to load {selected_algorithm} app: {e}")
56 | 
57 | if __name__ == "__main__":
58 |     run()


--------------------------------------------------------------------------------
/packages.txt:
--------------------------------------------------------------------------------
1 | libgl1-mesa-glx
2 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | streamlit
 2 | numpy
 3 | pandas
 4 | scikit-learn
 5 | plotly
 6 | scipy
 7 | seaborn
 8 | wordcloud 
 9 | opencv-python
10 | hdbscan
11 | scikit-image
12 | mlxtend


--------------------------------------------------------------------------------