├── LICENSE ├── Machine Learning Foundations ├── 02 Unsupervised Learning │ ├── 02 Dimensionality Reduction │ │ ├── 02 t-Distributed Stochastic Neighbor Embedding (t-SNE) │ │ │ └── tSNE.py │ │ ├── 03 Linear Discriminant Analysis (LDA) │ │ │ └── LDA.py │ │ └── 01 Principal Component Analysis (PCA) │ │ │ └── PCA.py │ ├── 01 Clustering │ │ ├── 04 Mean Shift │ │ │ └── MeanShift.py │ │ ├── 01 K-Means Clustering │ │ │ └── KMeansClustering.py │ │ ├── 03 DBSCAN │ │ │ └── DBSCAN.py │ │ └── 02 Hierarchical Clustering │ │ │ └── HierarchicalClustering.py │ └── 03 Association Rules │ │ ├── 02 FP-Growth │ │ └── FPGrowth.py │ │ └── 01 Apriori Algorithm │ │ └── AprioriAlgorithm.py ├── 03 ML Pipelines │ ├── 03 Model Selection │ │ ├── 02 K-Fold Cross-Validation │ │ │ └── KFoldCrossValidation.py │ │ ├── 04 Grid Search │ │ │ └── GridSearch.py │ │ ├── 01 Train-Test Split │ │ │ └── TrainTestSplit.py │ │ ├── 05 Random Search │ │ │ └── RandomSearch.py │ │ └── 03 Stratified K-Fold │ │ │ └── StratifiedKFold.py │ ├── 05 Deployment │ │ ├── 01 Model Serialization │ │ │ └── ModelSerialization.py │ │ └── 02 API Integration │ │ │ └── APIIntegration.py │ ├── 01 Data Preprocessing │ │ ├── 05 Outlier Detection │ │ │ └── OutlierDetection.py │ │ ├── 03 Standardization │ │ │ └── Standardization.py │ │ ├── 01 Feature Scaling │ │ │ └── FeatureScaling.py │ │ ├── 06 Encoding Categorical Variables │ │ │ └── EncodingCategoricalVariables.py │ │ ├── 02 Normalization │ │ │ └── Normalization.py │ │ └── 04 Handling Missing Values │ │ │ └── HandlingMissingValues.py │ ├── 04 Model Evaluation │ │ ├── 01 Bias-Variance Tradeoff │ │ │ └── BiasVarianceTradeoff.py │ │ ├── 02 Overfitting │ │ │ └── Overfitting.py │ │ └── 03 Underfitting │ │ │ └── Underfitting.py │ └── 02 Feature Engineering │ │ ├── 01 Feature Selection │ │ └── FeatureSelection.py │ │ ├── 02 Polynomial Features │ │ └── PolynomialFeatures.py │ │ ├── 04 Binning │ │ └── Binning.py │ │ └── 03 Interaction Terms │ │ └── InteractionTerms.py ├── 01 Supervised Learning │ ├── 03 Evaluation Metrics │ │ ├── 01 Regression Metrics │ │ │ └── RegressionMetrics.py │ │ └── 02 Classification Metrics │ │ │ └── ClassificationMetrics.py │ ├── 01 Regression │ │ ├── 04 Lasso Regression │ │ │ └── LassoRegression.py │ │ ├── 03 Ridge Regression │ │ │ └── RidgeRegression.py │ │ ├── 02 Polynomial Regression │ │ │ └── PolynomialRegression.py │ │ └── 01 Linear Regression │ │ │ └── LinearRegression.py │ └── 02 Classification │ │ ├── 04 Naive Bayes │ │ └── NaiveBayes.py │ │ ├── 05 K-Nearest Neighbors (KNN) │ │ └── KNN.py │ │ ├── 02 Decision Trees │ │ └── DecisionTrees.py │ │ ├── 01 Logistic Regression │ │ └── LogisticRegression.py │ │ ├── 06 Support Vector Machines (SVM) │ │ └── SVM.py │ │ └── 03 Random Forest │ │ └── RandomForest.py └── 04 Ensemble Methods │ ├── 01 Bagging │ ├── 02 Random Forest │ │ └── RandomForest.py │ └── 01 Bootstrap Aggregating │ │ └── BootstrapAggregating.py │ └── 02 Boosting │ ├── 01 AdaBoost │ └── AdaBoost.py │ └── 02 Gradient Boosting │ └── GradientBoosting.py ├── README.md └── Machine Learning Interview Questions with Ans └── README.md /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 rohanmistry231 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Machine Learning Foundations/02 Unsupervised Learning/02 Dimensionality Reduction/02 t-Distributed Stochastic Neighbor Embedding (t-SNE)/tSNE.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.manifold import TSNE 6 | from sklearn.preprocessing import StandardScaler 7 | import seaborn as sns 8 | 9 | # t-Distributed Stochastic Neighbor Embedding (t-SNE) 10 | # This script demonstrates t-SNE for dimensionality reduction on the Iris dataset. 11 | 12 | # Tasks: 13 | # 1. Load the Iris dataset. 14 | # 2. Standardize the features. 15 | # 3. Apply t-SNE to reduce to 2 dimensions. 16 | # 4. Visualize the reduced data. 17 | 18 | # Step 1: Load data 19 | iris = load_iris() 20 | X = iris.data 21 | y = iris.target 22 | data = pd.DataFrame(X, columns=iris.feature_names) 23 | data['Target'] = y 24 | 25 | # Step 2: Standardize features 26 | scaler = StandardScaler() 27 | X_scaled = scaler.fit_transform(X) 28 | 29 | # Step 3: Apply t-SNE 30 | tsne = TSNE(n_components=2, random_state=42) 31 | X_tsne = tsne.fit_transform(X_scaled) 32 | 33 | # Step 4: Visualize reduced data 34 | plt.figure(figsize=(10, 6)) 35 | sns.scatterplot(x=X_tsne[:, 0], y=X_tsne[:, 1], hue=iris.target_names[y], palette='viridis', s=100) 36 | plt.xlabel('t-SNE Component 1') 37 | plt.ylabel('t-SNE Component 2') 38 | plt.title('t-SNE: Iris Dataset Reduced to 2D') 39 | plt.grid(True) 40 | plt.savefig('tsne.png') 41 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/02 Unsupervised Learning/02 Dimensionality Reduction/03 Linear Discriminant Analysis (LDA)/LDA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 6 | from sklearn.preprocessing import StandardScaler 7 | import seaborn as sns 8 | 9 | # Linear Discriminant Analysis (LDA) 10 | # This script demonstrates LDA for supervised dimensionality reduction on the Iris dataset. 11 | 12 | # Tasks: 13 | # 1. Load the Iris dataset. 14 | # 2. Standardize the features. 15 | # 3. Apply LDA to reduce to 2 dimensions (since we have 3 classes). 16 | # 4. Visualize the reduced data. 
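# Note: LDA is supervised and can keep at most (n_classes - 1) components, so two is the
# ceiling for the 3-class Iris data. A minimal sketch (names with a _demo suffix are
# illustrative, not part of the original script) for checking how much between-class
# variance each discriminant captures:
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X_demo, y_demo = load_iris(return_X_y=True)
lda_demo = LinearDiscriminantAnalysis(n_components=2).fit(X_demo, y_demo)
print('Variance explained per discriminant:', lda_demo.explained_variance_ratio_.round(3))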
17 | 18 | # Step 1: Load data 19 | iris = load_iris() 20 | X = iris.data 21 | y = iris.target 22 | data = pd.DataFrame(X, columns=iris.feature_names) 23 | data['Target'] = y 24 | 25 | # Step 2: Standardize features 26 | scaler = StandardScaler() 27 | X_scaled = scaler.fit_transform(X) 28 | 29 | # Step 3: Apply LDA 30 | lda = LinearDiscriminantAnalysis(n_components=2) 31 | X_lda = lda.fit_transform(X_scaled, y) 32 | 33 | # Step 4: Visualize reduced data 34 | plt.figure(figsize=(10, 6)) 35 | sns.scatterplot(x=X_lda[:, 0], y=X_lda[:, 1], hue=iris.target_names[y], palette='viridis', s=100) 36 | plt.xlabel('LDA Component 1') 37 | plt.ylabel('LDA Component 2') 38 | plt.title('LDA: Iris Dataset Reduced to 2D') 39 | plt.grid(True) 40 | plt.savefig('lda.png') 41 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/02 Unsupervised Learning/01 Clustering/04 Mean Shift/MeanShift.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_blobs 5 | from sklearn.cluster import MeanShift 6 | from sklearn.metrics import silhouette_score 7 | import seaborn as sns 8 | 9 | # Mean Shift Clustering 10 | # This script demonstrates Mean Shift clustering. 11 | 12 | # Tasks: 13 | # 1. Generate synthetic clustering data. 14 | # 2. Apply Mean Shift clustering. 15 | # 3. Evaluate clustering performance using silhouette score. 16 | # 4. Visualize clusters. 17 | 18 | # Step 1: Generate synthetic data 19 | np.random.seed(42) 20 | X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=1.0, random_state=42) 21 | 22 | # Convert to DataFrame 23 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 24 | 25 | # Step 2: Apply Mean Shift clustering 26 | mean_shift = MeanShift() 27 | labels = mean_shift.fit_predict(X) 28 | cluster_centers = mean_shift.cluster_centers_ 29 | 30 | # Step 3: Evaluate performance 31 | silhouette = silhouette_score(X, labels) 32 | print(f'Silhouette Score: {silhouette:.2f}') 33 | 34 | # Step 4: Visualize clusters 35 | plt.figure(figsize=(10, 6)) 36 | sns.scatterplot(x=data['Feature_1'], y=data['Feature_2'], hue=labels, palette='viridis', s=100) 37 | plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='red', marker='x', s=200, linewidths=3, label='Cluster Centers') 38 | plt.xlabel('Feature 1') 39 | plt.ylabel('Feature 2') 40 | plt.title('Mean Shift Clustering') 41 | plt.legend() 42 | plt.grid(True) 43 | plt.savefig('mean_shift_clustering.png') 44 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/03 Model Selection/02 K-Fold Cross-Validation/KFoldCrossValidation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.model_selection import cross_val_score 7 | import seaborn as sns 8 | 9 | # K-Fold Cross-Validation 10 | # This script demonstrates K-Fold Cross-Validation. 11 | 12 | # Tasks: 13 | # 1. Load the Iris dataset. 14 | # 2. Apply K-Fold Cross-Validation (k=5). 15 | # 3. Train a Logistic Regression model. 16 | # 4. Evaluate performance (mean accuracy and standard deviation). 17 | # 5. Visualize cross-validation scores. 
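# Note: when cv is an integer and the estimator is a classifier, cross_val_score uses
# stratified folds under the hood. To run plain K-Fold, pass an explicit splitter;
# shuffling matters here because the Iris rows are ordered by class. A minimal sketch
# under those assumptions (_demo names are illustrative):
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

X_demo, y_demo = load_iris(return_X_y=True)
plain_kfold = KFold(n_splits=5, shuffle=True, random_state=42)
demo_scores = cross_val_score(LogisticRegression(max_iter=1000), X_demo, y_demo, cv=plain_kfold)
print('Shuffled plain K-Fold accuracies:', demo_scores.round(2))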
18 | 19 | # Step 1: Load data 20 | iris = load_iris() 21 | X = iris.data 22 | y = iris.target 23 | 24 | # Step 2: Apply K-Fold Cross-Validation 25 | model = LogisticRegression(random_state=42) 26 | k = 5 27 | scores = cross_val_score(model, X, y, cv=k, scoring='accuracy') 28 | 29 | # Step 3: Evaluate performance 30 | mean_accuracy = np.mean(scores) 31 | std_accuracy = np.std(scores) 32 | print(f'K-Fold CV Scores: {scores}') 33 | print(f'Mean Accuracy: {mean_accuracy:.2f}') 34 | print(f'Standard Deviation: {std_accuracy:.2f}') 35 | 36 | # Step 4: Visualize CV scores 37 | plt.figure(figsize=(10, 6)) 38 | sns.barplot(x=np.arange(1, k+1), y=scores, palette='viridis') 39 | plt.axhline(mean_accuracy, color='red', linestyle='--', label=f'Mean Accuracy: {mean_accuracy:.2f}') 40 | plt.xlabel('Fold') 41 | plt.ylabel('Accuracy') 42 | plt.title('K-Fold Cross-Validation Scores') 43 | plt.legend() 44 | plt.grid(True) 45 | plt.savefig('kfold_cross_validation.png') 46 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/02 Unsupervised Learning/01 Clustering/01 K-Means Clustering/KMeansClustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_blobs 5 | from sklearn.cluster import KMeans 6 | from sklearn.metrics import silhouette_score 7 | import seaborn as sns 8 | 9 | # K-Means Clustering 10 | # This script demonstrates K-Means Clustering on synthetic data. 11 | 12 | # Tasks: 13 | # 1. Generate synthetic clustering data. 14 | # 2. Apply K-Means clustering with a chosen number of clusters. 15 | # 3. Evaluate clustering performance using silhouette score. 16 | # 4. Visualize clusters and centroids. 17 | 18 | # Step 1: Generate synthetic data 19 | np.random.seed(42) 20 | X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=1.0, random_state=42) 21 | 22 | # Convert to DataFrame 23 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 24 | 25 | # Step 2: Apply K-Means clustering 26 | n_clusters = 4 27 | kmeans = KMeans(n_clusters=n_clusters, random_state=42) 28 | kmeans.fit(X) 29 | labels = kmeans.labels_ 30 | centroids = kmeans.cluster_centers_ 31 | 32 | # Step 3: Evaluate performance 33 | silhouette = silhouette_score(X, labels) 34 | print(f'Silhouette Score: {silhouette:.2f}') 35 | 36 | # Step 4: Visualize clusters 37 | plt.figure(figsize=(10, 6)) 38 | sns.scatterplot(x=data['Feature_1'], y=data['Feature_2'], hue=labels, palette='viridis', s=100) 39 | plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='x', s=200, linewidths=3, label='Centroids') 40 | plt.xlabel('Feature 1') 41 | plt.ylabel('Feature 2') 42 | plt.title('K-Means Clustering') 43 | plt.legend() 44 | plt.grid(True) 45 | plt.savefig('kmeans_clustering.png') 46 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/02 Unsupervised Learning/01 Clustering/03 DBSCAN/DBSCAN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_moons 5 | from sklearn.cluster import DBSCAN 6 | from sklearn.metrics import silhouette_score 7 | import seaborn as sns 8 | 9 | # DBSCAN (Density-Based Spatial Clustering of Applications with Noise) 10 | # This script demonstrates DBSCAN on non-spherical data. 
11 | 12 | # Tasks: 13 | # 1. Generate synthetic moon-shaped data. 14 | # 2. Apply DBSCAN clustering. 15 | # 3. Evaluate clustering performance using silhouette score (if applicable). 16 | # 4. Visualize clusters and noise points. 17 | 18 | # Step 1: Generate synthetic data 19 | np.random.seed(42) 20 | X, y_true = make_moons(n_samples=300, noise=0.05, random_state=42) 21 | 22 | # Convert to DataFrame 23 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 24 | 25 | # Step 2: Apply DBSCAN clustering 26 | dbscan = DBSCAN(eps=0.3, min_samples=5) 27 | labels = dbscan.fit_predict(X) 28 | 29 | # Step 3: Evaluate performance 30 | # Silhouette score only if there are at least 2 clusters and no noise (-1 labels) 31 | if len(set(labels)) > 1 and -1 not in labels: 32 | silhouette = silhouette_score(X, labels) 33 | print(f'Silhouette Score: {silhouette:.2f}') 34 | else: 35 | print('Silhouette Score: Not applicable due to noise or single cluster') 36 | 37 | # Step 4: Visualize clusters 38 | plt.figure(figsize=(10, 6)) 39 | sns.scatterplot(x=data['Feature_1'], y=data['Feature_2'], hue=labels, palette='viridis', s=100) 40 | plt.xlabel('Feature 1') 41 | plt.ylabel('Feature 2') 42 | plt.title('DBSCAN Clustering (Noise points in black)') 43 | plt.grid(True) 44 | plt.savefig('dbscan_clustering.png') 45 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/03 Model Selection/04 Grid Search/GridSearch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.svm import SVC 6 | from sklearn.model_selection import GridSearchCV 7 | import seaborn as sns 8 | 9 | # Grid Search 10 | # This script demonstrates Grid Search for hyperparameter tuning. 11 | 12 | # Tasks: 13 | # 1. Load the Iris dataset. 14 | # 2. Define a parameter grid for SVC. 15 | # 3. Perform Grid Search with cross-validation. 16 | # 4. Evaluate the best model (accuracy). 17 | # 5. Visualize hyperparameter performance. 
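# Note: if preprocessing such as scaling is part of the workflow, it should be wrapped in a
# Pipeline so it is re-fit inside every cross-validation fold rather than on the full data.
# A minimal sketch (the step name 'svc' and the reduced grid are illustrative):
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X_demo, y_demo = load_iris(return_X_y=True)
pipe_demo = Pipeline([('scale', StandardScaler()), ('svc', SVC())])
grid_demo = {'svc__C': [0.1, 1, 10], 'svc__kernel': ['linear', 'rbf']}
search_demo = GridSearchCV(pipe_demo, grid_demo, cv=5, scoring='accuracy').fit(X_demo, y_demo)
print(search_demo.best_params_, round(search_demo.best_score_, 2))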
18 | 19 | # Step 1: Load data 20 | iris = load_iris() 21 | X = iris.data 22 | y = iris.target 23 | 24 | # Step 2: Define parameter grid 25 | param_grid = { 26 | 'C': [0.1, 1, 10], 27 | 'kernel': ['linear', 'rbf'], 28 | 'gamma': ['scale', 'auto'] 29 | } 30 | 31 | # Step 3: Perform Grid Search 32 | model = SVC(random_state=42) 33 | grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy') 34 | grid_search.fit(X, y) 35 | 36 | # Step 4: Evaluate best model 37 | best_model = grid_search.best_estimator_ 38 | best_score = grid_search.best_score_ 39 | print(f'Best Parameters: {grid_search.best_params_}') 40 | print(f'Best Cross-Validation Accuracy: {best_score:.2f}') 41 | 42 | # Step 5: Visualize hyperparameter performance 43 | results = pd.DataFrame(grid_search.cv_results_) 44 | pivot_table = results.pivot_table(values='mean_test_score', index='param_C', columns='param_kernel') 45 | 46 | plt.figure(figsize=(10, 6)) 47 | sns.heatmap(pivot_table, annot=True, cmap='viridis', fmt='.2f') 48 | plt.xlabel('Kernel') 49 | plt.ylabel('C') 50 | plt.title('Grid Search: Mean Test Accuracy') 51 | plt.savefig('grid_search.png') 52 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/03 Model Selection/01 Train-Test Split/TrainTestSplit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import accuracy_score 7 | from sklearn.model_selection import train_test_split 8 | import seaborn as sns 9 | 10 | # Train-Test Split 11 | # This script demonstrates splitting data into training and testing sets. 12 | 13 | # Tasks: 14 | # 1. Load the Iris dataset. 15 | # 2. Split data into training and testing sets. 16 | # 3. Train a Logistic Regression model. 17 | # 4. Evaluate performance (accuracy). 18 | # 5. Visualize training vs testing data distribution. 
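# Aside: for classification data, passing stratify=y keeps the class proportions the same in
# the train and test splits, which avoids under-representing a class in a small test set.
# A minimal sketch (_demo names are illustrative):
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X_demo, y_demo = load_iris(return_X_y=True)
_, _, y_tr_demo, y_te_demo = train_test_split(X_demo, y_demo, test_size=0.2,
                                              stratify=y_demo, random_state=42)
print('Train class counts:', np.bincount(y_tr_demo))
print('Test class counts:', np.bincount(y_te_demo))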
19 | 20 | # Step 1: Load data 21 | iris = load_iris() 22 | X = iris.data 23 | y = iris.target 24 | data = pd.DataFrame(X, columns=iris.feature_names) 25 | data['Target'] = y 26 | 27 | # Step 2: Split data 28 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 29 | 30 | # Step 3: Train Logistic Regression 31 | model = LogisticRegression(random_state=42) 32 | model.fit(X_train, y_train) 33 | 34 | # Step 4: Evaluate performance 35 | y_pred = model.predict(X_test) 36 | accuracy = accuracy_score(y_test, y_pred) 37 | print(f'Accuracy: {accuracy:.2f}') 38 | 39 | # Step 5: Visualize data distribution (using first feature as example) 40 | plt.figure(figsize=(10, 6)) 41 | sns.histplot(data=X_train[:, 0], color='blue', label='Training Data', alpha=0.5, bins=20) 42 | sns.histplot(data=X_test[:, 0], color='red', label='Testing Data', alpha=0.5, bins=20) 43 | plt.xlabel(iris.feature_names[0]) 44 | plt.ylabel('Count') 45 | plt.title('Train-Test Split: Distribution of First Feature') 46 | plt.legend() 47 | plt.grid(True) 48 | plt.savefig('train_test_split.png') 49 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/05 Deployment/01 Model Serialization/ModelSerialization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.datasets import load_iris 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.model_selection import train_test_split 6 | import joblib 7 | import matplotlib.pyplot as plt 8 | import seaborn as sns 9 | 10 | # Model Serialization 11 | # This script demonstrates saving and loading a trained model. 12 | 13 | # Tasks: 14 | # 1. Load the Iris dataset. 15 | # 2. Train a Logistic Regression model. 16 | # 3. Save the model using joblib. 17 | # 4. Load the model and make predictions. 18 | # 5. Visualize prediction results. 
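# Aside: joblib/pickle files are generally only safe to load with the same library versions
# they were written with, so storing version metadata next to the estimator helps detect
# mismatches at load time. A minimal sketch (the bundle layout and file name are illustrative
# assumptions, not part of the original script):
import joblib
import sklearn
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = load_iris(return_X_y=True)
clf_demo = LogisticRegression(max_iter=1000).fit(X_demo, y_demo)
joblib.dump({'model': clf_demo, 'sklearn_version': sklearn.__version__}, 'model_bundle_demo.joblib')
bundle_demo = joblib.load('model_bundle_demo.joblib')
print(bundle_demo['sklearn_version'], bundle_demo['model'].predict(X_demo[:1]))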
19 | 20 | # Step 1: Load data 21 | iris = load_iris() 22 | X = iris.data 23 | y = iris.target 24 | data = pd.DataFrame(X, columns=iris.feature_names) 25 | data['Target'] = y 26 | 27 | # Step 2: Train Logistic Regression 28 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 29 | model = LogisticRegression(random_state=42) 30 | model.fit(X_train, y_train) 31 | 32 | # Step 3: Save the model 33 | joblib.dump(model, 'logistic_model.pkl') 34 | 35 | # Step 4: Load the model and predict 36 | loaded_model = joblib.load('logistic_model.pkl') 37 | y_pred = loaded_model.predict(X_test) 38 | 39 | # Evaluate performance 40 | accuracy = loaded_model.score(X_test, y_test) 41 | print(f'Accuracy of Loaded Model: {accuracy:.2f}') 42 | 43 | # Step 5: Visualize predictions 44 | plt.figure(figsize=(10, 6)) 45 | sns.scatterplot(x=X_test[:, 0], y=X_test[:, 1], hue=iris.target_names[y_pred], style=iris.target_names[y_test], s=100) 46 | plt.xlabel(iris.feature_names[0]) 47 | plt.ylabel(iris.feature_names[1]) 48 | plt.title('Model Serialization: Predictions from Loaded Model') 49 | plt.grid(True) 50 | plt.savefig('model_serialization.png') 51 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/03 Model Selection/05 Random Search/RandomSearch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.svm import SVC 6 | from sklearn.model_selection import RandomizedSearchCV 7 | from scipy.stats import uniform, randint 8 | import seaborn as sns 9 | 10 | # Random Search 11 | # This script demonstrates Random Search for hyperparameter tuning. 12 | 13 | # Tasks: 14 | # 1. Load the Iris dataset. 15 | # 2. Define a parameter distribution for SVC. 16 | # 3. Perform Random Search with cross-validation. 17 | # 4. Evaluate the best model (accuracy). 18 | # 5. Visualize hyperparameter performance. 
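# Aside: because C and gamma act on a multiplicative scale, sampling them from a log-uniform
# distribution usually covers the useful range better than uniform(0.1, 10). A minimal sketch
# (assumes scipy >= 1.4 for loguniform; _demo names are illustrative):
from scipy.stats import loguniform
from sklearn.datasets import load_iris
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

X_demo, y_demo = load_iris(return_X_y=True)
dist_demo = {'C': loguniform(1e-2, 1e2), 'gamma': loguniform(1e-4, 1e1)}
rs_demo = RandomizedSearchCV(SVC(kernel='rbf'), dist_demo, n_iter=15, cv=5, random_state=42)
rs_demo.fit(X_demo, y_demo)
print(rs_demo.best_params_, round(rs_demo.best_score_, 2))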
19 | 20 | # Step 1: Load data 21 | iris = load_iris() 22 | X = iris.data 23 | y = iris.target 24 | 25 | # Step 2: Define parameter distribution 26 | param_dist = { 27 | 'C': uniform(0.1, 10), 28 | 'kernel': ['linear', 'rbf'], 29 | 'gamma': ['scale', 'auto'] 30 | } 31 | 32 | # Step 3: Perform Random Search 33 | model = SVC(random_state=42) 34 | random_search = RandomizedSearchCV(model, param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42) 35 | random_search.fit(X, y) 36 | 37 | # Step 4: Evaluate best model 38 | best_model = random_search.best_estimator_ 39 | best_score = random_search.best_score_ 40 | print(f'Best Parameters: {random_search.best_params_}') 41 | print(f'Best Cross-Validation Accuracy: {best_score:.2f}') 42 | 43 | # Step 5: Visualize hyperparameter performance 44 | results = pd.DataFrame(random_search.cv_results_) 45 | plt.figure(figsize=(10, 6)) 46 | sns.scatterplot(data=results, x='param_C', y='mean_test_score', hue='param_kernel', style='param_gamma', size='mean_test_score') 47 | plt.xlabel('C') 48 | plt.ylabel('Mean Test Accuracy') 49 | plt.title('Random Search: Hyperparameter Performance') 50 | plt.grid(True) 51 | plt.savefig('random_search.png') 52 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/02 Unsupervised Learning/01 Clustering/02 Hierarchical Clustering/HierarchicalClustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_blobs 5 | from sklearn.cluster import AgglomerativeClustering 6 | from sklearn.metrics import silhouette_score 7 | from scipy.cluster.hierarchy import dendrogram, linkage 8 | import seaborn as sns 9 | 10 | # Hierarchical Clustering 11 | # This script demonstrates Hierarchical Clustering with a dendrogram. 12 | 13 | # Tasks: 14 | # 1. Generate synthetic clustering data. 15 | # 2. Apply Hierarchical Clustering (Agglomerative). 16 | # 3. Evaluate clustering performance using silhouette score. 17 | # 4. Visualize clusters and dendrogram. 
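# Aside: the same linkage matrix used for the dendrogram can be cut into flat cluster labels
# with scipy's fcluster, a common alternative to refitting AgglomerativeClustering for a new
# cluster count. A minimal sketch (_demo names are illustrative):
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=4, cluster_std=1.0, random_state=42)
Z_demo = linkage(X_demo, method='ward')
labels_demo = fcluster(Z_demo, t=4, criterion='maxclust')  # cut the tree into 4 clusters
print('Cluster sizes:', np.bincount(labels_demo)[1:])      # fcluster labels start at 1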
18 | 19 | # Step 1: Generate synthetic data 20 | np.random.seed(42) 21 | X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=1.0, random_state=42) 22 | 23 | # Convert to DataFrame 24 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 25 | 26 | # Step 2: Apply Hierarchical Clustering 27 | n_clusters = 4 28 | hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward') 29 | labels = hierarchical.fit_predict(X) 30 | 31 | # Step 3: Evaluate performance 32 | silhouette = silhouette_score(X, labels) 33 | print(f'Silhouette Score: {silhouette:.2f}') 34 | 35 | # Step 4: Visualize clusters and dendrogram 36 | plt.figure(figsize=(12, 10)) 37 | 38 | # Clusters 39 | plt.subplot(2, 1, 1) 40 | sns.scatterplot(x=data['Feature_1'], y=data['Feature_2'], hue=labels, palette='viridis', s=100) 41 | plt.xlabel('Feature 1') 42 | plt.ylabel('Feature 2') 43 | plt.title('Hierarchical Clustering') 44 | 45 | # Dendrogram 46 | plt.subplot(2, 1, 2) 47 | Z = linkage(X, method='ward') 48 | dendrogram(Z) 49 | plt.title('Dendrogram') 50 | plt.xlabel('Sample Index') 51 | plt.ylabel('Distance') 52 | plt.tight_layout() 53 | plt.savefig('hierarchical_clustering.png') 54 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/03 Model Selection/03 Stratified K-Fold/StratifiedKFold.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.model_selection import StratifiedKFold 7 | import seaborn as sns 8 | 9 | # Stratified K-Fold Cross-Validation 10 | # This script demonstrates Stratified K-Fold Cross-Validation for imbalanced classes. 11 | 12 | # Tasks: 13 | # 1. Load the Iris dataset. 14 | # 2. Apply Stratified K-Fold Cross-Validation (k=5). 15 | # 3. Train a Logistic Regression model. 16 | # 4. Evaluate performance (mean accuracy and standard deviation). 17 | # 5. Visualize cross-validation scores. 
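# Aside: the point of stratification is that every fold keeps roughly the same class
# proportions as the full dataset. A minimal sketch that makes this visible by counting
# test-set classes per fold (_demo names are illustrative):
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold

X_demo, y_demo = load_iris(return_X_y=True)
skf_demo = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold_demo, (_, test_idx_demo) in enumerate(skf_demo.split(X_demo, y_demo), start=1):
    print(f'Fold {fold_demo} test class counts:', np.bincount(y_demo[test_idx_demo]))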
18 | 19 | # Step 1: Load data 20 | iris = load_iris() 21 | X = iris.data 22 | y = iris.target 23 | 24 | # Step 2: Apply Stratified K-Fold Cross-Validation 25 | model = LogisticRegression(random_state=42) 26 | k = 5 27 | skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42) 28 | scores = [] 29 | for train_idx, test_idx in skf.split(X, y): 30 | X_train, X_test = X[train_idx], X[test_idx] 31 | y_train, y_test = y[train_idx], y[test_idx] 32 | model.fit(X_train, y_train) 33 | score = model.score(X_test, y_test) 34 | scores.append(score) 35 | 36 | # Step 3: Evaluate performance 37 | mean_accuracy = np.mean(scores) 38 | std_accuracy = np.std(scores) 39 | print(f'Stratified K-Fold CV Scores: {scores}') 40 | print(f'Mean Accuracy: {mean_accuracy:.2f}') 41 | print(f'Standard Deviation: {std_accuracy:.2f}') 42 | 43 | # Step 4: Visualize CV scores 44 | plt.figure(figsize=(10, 6)) 45 | sns.barplot(x=np.arange(1, k+1), y=scores, palette='viridis') 46 | plt.axhline(mean_accuracy, color='red', linestyle='--', label=f'Mean Accuracy: {mean_accuracy:.2f}') 47 | plt.xlabel('Fold') 48 | plt.ylabel('Accuracy') 49 | plt.title('Stratified K-Fold Cross-Validation Scores') 50 | plt.legend() 51 | plt.grid(True) 52 | plt.savefig('stratified_kfold.png') 53 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/02 Unsupervised Learning/02 Dimensionality Reduction/01 Principal Component Analysis (PCA)/PCA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.decomposition import PCA 6 | from sklearn.preprocessing import StandardScaler 7 | import seaborn as sns 8 | 9 | # Principal Component Analysis (PCA) 10 | # This script demonstrates PCA for dimensionality reduction on the Iris dataset. 11 | 12 | # Tasks: 13 | # 1. Load the Iris dataset. 14 | # 2. Standardize the features. 15 | # 3. Apply PCA to reduce to 2 dimensions. 16 | # 4. Evaluate explained variance. 17 | # 5. Visualize the reduced data. 
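# Note: instead of hard-coding the number of components, PCA accepts a float between 0 and 1
# and keeps just enough components to reach that fraction of explained variance. A minimal
# sketch (_demo names are illustrative):
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X_demo, _ = load_iris(return_X_y=True)
pca_demo = PCA(n_components=0.95).fit(StandardScaler().fit_transform(X_demo))
print('Components kept for 95% variance:', pca_demo.n_components_)
print('Cumulative variance:', pca_demo.explained_variance_ratio_.cumsum().round(3))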
18 | 19 | # Step 1: Load data 20 | iris = load_iris() 21 | X = iris.data 22 | y = iris.target 23 | data = pd.DataFrame(X, columns=iris.feature_names) 24 | data['Target'] = y 25 | 26 | # Step 2: Standardize features 27 | scaler = StandardScaler() 28 | X_scaled = scaler.fit_transform(X) 29 | 30 | # Step 3: Apply PCA 31 | pca = PCA(n_components=2) 32 | X_pca = pca.fit_transform(X_scaled) 33 | 34 | # Step 4: Evaluate explained variance 35 | explained_variance = pca.explained_variance_ratio_ 36 | print(f'Explained Variance Ratio: {explained_variance}') 37 | print(f'Total Explained Variance: {sum(explained_variance):.2f}') 38 | 39 | # Step 5: Visualize reduced data 40 | plt.figure(figsize=(10, 6)) 41 | sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=iris.target_names[y], palette='viridis', s=100) 42 | plt.xlabel('Principal Component 1') 43 | plt.ylabel('Principal Component 2') 44 | plt.title('PCA: Iris Dataset Reduced to 2D') 45 | plt.grid(True) 46 | plt.savefig('pca.png') 47 | plt.close() 48 | 49 | # Scree plot for explained variance 50 | plt.figure(figsize=(8, 5)) 51 | plt.bar(range(1, len(explained_variance) + 1), explained_variance, alpha=0.6, color='blue') 52 | plt.xlabel('Principal Component') 53 | plt.ylabel('Explained Variance Ratio') 54 | plt.title('Scree Plot') 55 | plt.grid(True) 56 | plt.savefig('pca_scree.png') 57 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/01 Supervised Learning/03 Evaluation Metrics/01 Regression Metrics/RegressionMetrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.linear_model import LinearRegression 5 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 6 | from sklearn.model_selection import train_test_split 7 | 8 | # Regression Evaluation Metrics 9 | # This script demonstrates MSE, MAE, and R² for regression. 10 | 11 | # Tasks: 12 | # 1. Generate synthetic regression data. 13 | # 2. Split data into training and testing sets. 14 | # 3. Train a Linear Regression model. 15 | # 4. Calculate MSE, MAE, and R². 16 | # 5. Visualize actual vs predicted values. 
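# Note: the metrics below reduce to a few numpy one-liners, and RMSE (the square root of MSE)
# is often reported as well because it is in the same units as y. A tiny worked example with
# made-up numbers (_demo names are illustrative):
import numpy as np

y_true_demo = np.array([3.0, 5.0, 7.5])
y_pred_demo = np.array([2.5, 5.5, 7.0])
mse_demo = np.mean((y_true_demo - y_pred_demo) ** 2)            # MSE
rmse_demo = np.sqrt(mse_demo)                                   # RMSE, same units as y
mae_demo = np.mean(np.abs(y_true_demo - y_pred_demo))           # MAE
r2_demo = 1 - np.sum((y_true_demo - y_pred_demo) ** 2) / np.sum((y_true_demo - y_true_demo.mean()) ** 2)
print(f'MSE={mse_demo:.2f} RMSE={rmse_demo:.2f} MAE={mae_demo:.2f} R2={r2_demo:.2f}')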
17 | 18 | # Step 1: Generate synthetic data 19 | np.random.seed(42) 20 | X = np.random.rand(100, 1) * 10 21 | y = 2 * X.flatten() + 1 + np.random.randn(100) * 2 22 | 23 | # Convert to DataFrame 24 | data = pd.DataFrame({'X': X.flatten(), 'y': y}) 25 | 26 | # Step 2: Split data 27 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 28 | 29 | # Step 3: Train Linear Regression model 30 | model = LinearRegression() 31 | model.fit(X_train, y_train) 32 | 33 | # Step 4: Make predictions and calculate metrics 34 | y_pred = model.predict(X_test) 35 | mse = mean_squared_error(y_test, y_pred) 36 | mae = mean_absolute_error(y_test, y_pred) 37 | r2 = r2_score(y_test, y_pred) 38 | 39 | print(f'Mean Squared Error (MSE): {mse:.2f}') 40 | print(f'Mean Absolute Error (MAE): {mae:.2f}') 41 | print(f'R² Score: {r2:.2f}') 42 | 43 | # Step 5: Visualize actual vs predicted 44 | plt.figure(figsize=(10, 6)) 45 | plt.scatter(y_test, y_pred, color='blue', label='Predicted vs Actual') 46 | plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', label='Perfect Prediction') 47 | plt.xlabel('Actual Values') 48 | plt.ylabel('Predicted Values') 49 | plt.title('Regression Metrics: Actual vs Predicted') 50 | plt.legend() 51 | plt.grid(True) 52 | plt.savefig('regression_metrics.png') 53 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/01 Data Preprocessing/05 Outlier Detection/OutlierDetection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.ensemble import IsolationForest 6 | import seaborn as sns 7 | 8 | # Outlier Detection 9 | # This script demonstrates outlier detection using Isolation Forest. 10 | 11 | # Tasks: 12 | # 1. Load the Iris dataset and introduce synthetic outliers. 13 | # 2. Apply Isolation Forest to detect outliers. 14 | # 3. Evaluate the number of detected outliers. 15 | # 4. Visualize outliers in the dataset.
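# Aside: besides the hard -1/1 labels used below, Isolation Forest exposes continuous anomaly
# scores, which make it possible to rank points or pick a threshold by hand rather than
# committing to a contamination rate. A minimal sketch (_demo names are illustrative):
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import IsolationForest

X_demo, _ = load_iris(return_X_y=True)
iso_demo = IsolationForest(random_state=42).fit(X_demo)
scores_demo = iso_demo.score_samples(X_demo)   # higher = more normal
print('Five most anomalous row indices:', np.argsort(scores_demo)[:5])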
16 | 17 | # Step 1: Load data and introduce outliers 18 | iris = load_iris() 19 | X = iris.data 20 | data = pd.DataFrame(X, columns=iris.feature_names) 21 | 22 | # Introduce 5% outliers 23 | np.random.seed(42) 24 | n_outliers = int(0.05 * X.shape[0]) 25 | outliers = np.random.uniform(low=-10, high=10, size=(n_outliers, X.shape[1])) 26 | X_with_outliers = np.vstack([X, outliers]) 27 | data_with_outliers = pd.DataFrame(X_with_outliers, columns=iris.feature_names) 28 | 29 | # Step 2: Apply Isolation Forest 30 | iso_forest = IsolationForest(contamination=0.05, random_state=42) 31 | outlier_labels = iso_forest.fit_predict(X_with_outliers) 32 | # -1 indicates outlier, 1 indicates inlier 33 | outliers_detected = X_with_outliers[outlier_labels == -1] 34 | inliers = X_with_outliers[outlier_labels == 1] 35 | 36 | # Step 3: Evaluate 37 | n_outliers_detected = len(outliers_detected) 38 | print(f'Number of Outliers Detected: {n_outliers_detected}') 39 | 40 | # Step 4: Visualize outliers (using first two features for 2D plot) 41 | plt.figure(figsize=(10, 6)) 42 | sns.scatterplot(x=data_with_outliers.iloc[:, 0], y=data_with_outliers.iloc[:, 1], hue=outlier_labels, palette={1: 'blue', -1: 'red'}, s=100) 43 | plt.xlabel(iris.feature_names[0]) 44 | plt.ylabel(iris.feature_names[1]) 45 | plt.title('Outlier Detection with Isolation Forest') 46 | plt.legend(title='1 = inlier, -1 = outlier')  # keep the hue labels; renaming them by position can swap the classes 47 | plt.grid(True) 48 | plt.savefig('outlier_detection.png') 49 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/04 Model Evaluation/01 Bias-Variance Tradeoff/BiasVarianceTradeoff.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.metrics import mean_squared_error 7 | from sklearn.model_selection import train_test_split 8 | 9 | # Bias-Variance Tradeoff 10 | # This script demonstrates the bias-variance tradeoff using polynomial regression. 11 | 12 | # Tasks: 13 | # 1. Generate synthetic non-linear data. 14 | # 2. Train polynomial regression models with varying degrees. 15 | # 3. Evaluate training and testing errors. 16 | # 4. Visualize bias-variance tradeoff.
17 | 18 | # Step 1: Generate synthetic data 19 | np.random.seed(42) 20 | X = np.sort(5 * np.random.rand(100, 1), axis=0) 21 | y = np.sin(X).ravel() + np.random.randn(100) * 0.1 22 | 23 | # Step 2: Split data 24 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 25 | 26 | # Step 3: Train models with varying polynomial degrees 27 | degrees = [1, 3, 10] 28 | train_errors = [] 29 | test_errors = [] 30 | 31 | for degree in degrees: 32 | poly = PolynomialFeatures(degree=degree) 33 | X_train_poly = poly.fit_transform(X_train) 34 | X_test_poly = poly.transform(X_test) 35 | 36 | model = LinearRegression() 37 | model.fit(X_train_poly, y_train) 38 | 39 | y_train_pred = model.predict(X_train_poly) 40 | y_test_pred = model.predict(X_test_poly) 41 | 42 | train_errors.append(mean_squared_error(y_train, y_train_pred)) 43 | test_errors.append(mean_squared_error(y_test, y_test_pred)) 44 | 45 | # Step 4: Visualize bias-variance tradeoff 46 | plt.figure(figsize=(10, 6)) 47 | plt.plot(degrees, train_errors, marker='o', label='Training Error', color='blue') 48 | plt.plot(degrees, test_errors, marker='o', label='Testing Error', color='red') 49 | plt.xlabel('Polynomial Degree') 50 | plt.ylabel('Mean Squared Error') 51 | plt.title('Bias-Variance Tradeoff') 52 | plt.legend() 53 | plt.grid(True) 54 | plt.savefig('bias_variance_tradeoff.png') 55 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/01 Supervised Learning/01 Regression/04 Lasso Regression/LassoRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.linear_model import Lasso 5 | from sklearn.metrics import mean_squared_error, r2_score 6 | from sklearn.model_selection import train_test_split 7 | 8 | # Lasso Regression 9 | # This script demonstrates Lasso Regression with synthetic data for feature selection. 10 | 11 | # Tasks: 12 | # 1. Generate synthetic data with some irrelevant features. 13 | # 2. Split data into training and testing sets. 14 | # 3. Train a Lasso Regression model with L1 regularization. 15 | # 4. Make predictions and evaluate performance (MSE, R²). 16 | # 5. Visualize feature coefficients to show sparsity. 
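# Aside: the regularization strength alpha is usually chosen by cross-validation rather than
# fixed by hand; LassoCV searches a path of alphas and keeps the best one. A minimal sketch on
# data generated the same way as below (_demo names are illustrative):
import numpy as np
from sklearn.linear_model import LassoCV

rng_demo = np.random.RandomState(42)
X_demo = rng_demo.randn(100, 10)
y_demo = 3 * X_demo[:, 0] + 2 * X_demo[:, 1] - 1.5 * X_demo[:, 2] + rng_demo.randn(100) * 0.5
lasso_cv_demo = LassoCV(cv=5, random_state=42).fit(X_demo, y_demo)
print('Alpha chosen by CV:', round(lasso_cv_demo.alpha_, 4))
print('Indices of non-zero coefficients:', np.flatnonzero(lasso_cv_demo.coef_))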
17 | 18 | # Step 1: Generate synthetic data 19 | np.random.seed(42) 20 | n_samples, n_features = 100, 10 21 | X = np.random.randn(n_samples, n_features) 22 | # Only first 3 features are relevant 23 | y = 3 * X[:, 0] + 2 * X[:, 1] - 1.5 * X[:, 2] + np.random.randn(n_samples) * 0.5 24 | 25 | # Convert to DataFrame 26 | data = pd.DataFrame(X, columns=[f'Feature_{i}' for i in range(n_features)]) 27 | data['Target'] = y 28 | 29 | # Step 2: Split data 30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 31 | 32 | # Step 3: Train Lasso Regression model 33 | lasso = Lasso(alpha=0.1) # Regularization strength 34 | lasso.fit(X_train, y_train) 35 | 36 | # Step 4: Make predictions 37 | y_pred = lasso.predict(X_test) 38 | 39 | # Evaluate performance 40 | mse = mean_squared_error(y_test, y_pred) 41 | r2 = r2_score(y_test, y_pred) 42 | 43 | print(f'Mean Squared Error: {mse:.2f}') 44 | print(f'R² Score: {r2:.2f}') 45 | print('Feature Coefficients:', lasso.coef_) 46 | 47 | # Step 5: Visualize feature coefficients 48 | plt.figure(figsize=(10, 6)) 49 | plt.bar(range(n_features), lasso.coef_, color='blue') 50 | plt.xticks(range(n_features), [f'Feature_{i}' for i in range(n_features)]) 51 | plt.xlabel('Features') 52 | plt.ylabel('Coefficient Value') 53 | plt.title('Lasso Regression: Feature Coefficients') 54 | plt.grid(True) 55 | plt.savefig('lasso_regression.png') 56 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/02 Unsupervised Learning/03 Association Rules/02 FP-Growth/FPGrowth.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | from mlxtend.frequent_patterns import fpgrowth 4 | from mlxtend.frequent_patterns import association_rules 5 | import seaborn as sns 6 | 7 | # FP-Growth Algorithm 8 | # This script demonstrates the FP-Growth algorithm for association rule mining. 9 | 10 | # Tasks: 11 | # 1. Create synthetic transactional data. 12 | # 2. Apply the FP-Growth algorithm to find frequent itemsets. 13 | # 3. Generate association rules. 14 | # 4. Evaluate rules using support, confidence, and lift. 15 | # 5. Visualize rule metrics. 
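# Aside: mlxtend ships a TransactionEncoder that performs the one-hot encoding done manually
# in Step 1 and keeps the column order reproducible. A minimal sketch with a tiny made-up
# basket list (_demo names are illustrative):
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

baskets_demo = [['Bread', 'Milk'], ['Bread', 'Butter'], ['Milk', 'Butter', 'Cheese']]
te_demo = TransactionEncoder()
onehot_demo = pd.DataFrame(te_demo.fit(baskets_demo).transform(baskets_demo),
                           columns=te_demo.columns_)
print(onehot_demo)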
16 | 17 | # Step 1: Create synthetic transactional data 18 | transactions = [ 19 | ['Bread', 'Milk', 'Eggs'], 20 | ['Bread', 'Butter', 'Eggs'], 21 | ['Milk', 'Butter', 'Cheese'], 22 | ['Bread', 'Milk', 'Butter'], 23 | ['Bread', 'Milk', 'Eggs', 'Cheese'], 24 | ['Milk', 'Cheese'], 25 | ['Bread', 'Eggs'], 26 | ['Butter', 'Cheese'], 27 | ['Bread', 'Milk', 'Butter', 'Eggs'], 28 | ['Milk', 'Butter'] 29 | ] 30 | 31 | # Convert to one-hot encoded DataFrame 32 | items = set(item for transaction in transactions for item in transaction) 33 | data = pd.DataFrame([[item in transaction for item in items] for transaction in transactions], columns=items) 34 | 35 | # Step 2: Apply FP-Growth algorithm 36 | frequent_itemsets = fpgrowth(data, min_support=0.3, use_colnames=True) 37 | 38 | # Step 3: Generate association rules 39 | rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6) 40 | rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']] 41 | 42 | # Step 4: Evaluate rules 43 | print('Frequent Itemsets:') 44 | print(frequent_itemsets) 45 | print('\nAssociation Rules:') 46 | print(rules) 47 | 48 | # Step 5: Visualize rule metrics 49 | plt.figure(figsize=(10, 6)) 50 | sns.scatterplot(data=rules, x='support', y='confidence', size='lift', hue='lift', palette='viridis') 51 | plt.xlabel('Support') 52 | plt.ylabel('Confidence') 53 | plt.title('FP-Growth: Association Rules (Size and Color by Lift)') 54 | plt.grid(True) 55 | plt.savefig('fpgrowth_rules.png') 56 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/02 Unsupervised Learning/03 Association Rules/01 Apriori Algorithm/AprioriAlgorithm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | from mlxtend.frequent_patterns import apriori 4 | from mlxtend.frequent_patterns import association_rules 5 | import seaborn as sns 6 | 7 | # Apriori Algorithm 8 | # This script demonstrates the Apriori algorithm for association rule mining. 9 | 10 | # Tasks: 11 | # 1. Create synthetic transactional data. 12 | # 2. Apply the Apriori algorithm to find frequent itemsets. 13 | # 3. Generate association rules. 14 | # 4. Evaluate rules using support, confidence, and lift. 15 | # 5. Visualize rule metrics. 
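# Note: the metrics reported by association_rules have simple definitions, and writing one rule
# out by hand makes them concrete. For a candidate rule {Bread} -> {Milk} over N transactions:
# support = count(Bread and Milk)/N, confidence = support / support(Bread),
# lift = confidence / support(Milk). A minimal sketch on the same baskets used in Step 1 below
# (_demo names are illustrative):
transactions_demo = [
    {'Bread', 'Milk', 'Eggs'}, {'Bread', 'Butter', 'Eggs'}, {'Milk', 'Butter', 'Cheese'},
    {'Bread', 'Milk', 'Butter'}, {'Bread', 'Milk', 'Eggs', 'Cheese'}, {'Milk', 'Cheese'},
    {'Bread', 'Eggs'}, {'Butter', 'Cheese'}, {'Bread', 'Milk', 'Butter', 'Eggs'}, {'Milk', 'Butter'},
]
n_demo = len(transactions_demo)
sup_bread = sum('Bread' in t for t in transactions_demo) / n_demo
sup_milk = sum('Milk' in t for t in transactions_demo) / n_demo
sup_both = sum({'Bread', 'Milk'} <= t for t in transactions_demo) / n_demo
conf_demo = sup_both / sup_bread
lift_demo = conf_demo / sup_milk
print(f'support={sup_both:.2f} confidence={conf_demo:.2f} lift={lift_demo:.2f}')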
16 | 17 | # Step 1: Create synthetic transactional data 18 | transactions = [ 19 | ['Bread', 'Milk', 'Eggs'], 20 | ['Bread', 'Butter', 'Eggs'], 21 | ['Milk', 'Butter', 'Cheese'], 22 | ['Bread', 'Milk', 'Butter'], 23 | ['Bread', 'Milk', 'Eggs', 'Cheese'], 24 | ['Milk', 'Cheese'], 25 | ['Bread', 'Eggs'], 26 | ['Butter', 'Cheese'], 27 | ['Bread', 'Milk', 'Butter', 'Eggs'], 28 | ['Milk', 'Butter'] 29 | ] 30 | 31 | # Convert to one-hot encoded DataFrame 32 | items = set(item for transaction in transactions for item in transaction) 33 | data = pd.DataFrame([[item in transaction for item in items] for transaction in transactions], columns=items) 34 | 35 | # Step 2: Apply Apriori algorithm 36 | frequent_itemsets = apriori(data, min_support=0.3, use_colnames=True) 37 | 38 | # Step 3: Generate association rules 39 | rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6) 40 | rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']] 41 | 42 | # Step 4: Evaluate rules 43 | print('Frequent Itemsets:') 44 | print(frequent_itemsets) 45 | print('\nAssociation Rules:') 46 | print(rules) 47 | 48 | # Step 5: Visualize rule metrics 49 | plt.figure(figsize=(10, 6)) 50 | sns.scatterplot(data=rules, x='support', y='confidence', size='lift', hue='lift', palette='viridis') 51 | plt.xlabel('Support') 52 | plt.ylabel('Confidence') 53 | plt.title('Apriori: Association Rules (Size and Color by Lift)') 54 | plt.grid(True) 55 | plt.savefig('apriori_rules.png') 56 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/01 Supervised Learning/01 Regression/03 Ridge Regression/RidgeRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.linear_model import Ridge 5 | from sklearn.metrics import mean_squared_error, r2_score 6 | from sklearn.model_selection import train_test_split 7 | 8 | # Ridge Regression 9 | # This script demonstrates Ridge Regression with synthetic data to handle multicollinearity. 10 | 11 | # Tasks: 12 | # 1. Generate synthetic data with correlated features. 13 | # 2. Split data into training and testing sets. 14 | # 3. Train a Ridge Regression model with regularization. 15 | # 4. Make predictions and evaluate performance (MSE, R²). 16 | # 5. Visualize feature coefficients. 
17 | 18 | # Step 1: Generate synthetic data 19 | np.random.seed(42) 20 | n_samples, n_features = 100, 5 21 | X = np.random.randn(n_samples, n_features) 22 | # Introduce multicollinearity 23 | X[:, 1] = X[:, 0] + np.random.randn(n_samples) * 0.1 # Correlated feature 24 | y = 3 * X[:, 0] + 2 * X[:, 1] + np.random.randn(n_samples) * 0.5 25 | 26 | # Convert to DataFrame 27 | data = pd.DataFrame(X, columns=[f'Feature_{i}' for i in range(n_features)]) 28 | data['Target'] = y 29 | 30 | # Step 2: Split data 31 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 32 | 33 | # Step 3: Train Ridge Regression model 34 | ridge = Ridge(alpha=1.0) # Regularization strength 35 | ridge.fit(X_train, y_train) 36 | 37 | # Step 4: Make predictions 38 | y_pred = ridge.predict(X_test) 39 | 40 | # Evaluate performance 41 | mse = mean_squared_error(y_test, y_pred) 42 | r2 = r2_score(y_test, y_pred) 43 | 44 | print(f'Mean Squared Error: {mse:.2f}') 45 | print(f'R² Score: {r2:.2f}') 46 | print('Feature Coefficients:', ridge.coef_) 47 | 48 | # Step 5: Visualize feature coefficients 49 | plt.figure(figsize=(10, 6)) 50 | plt.bar(range(n_features), ridge.coef_, color='blue') 51 | plt.xticks(range(n_features), [f'Feature_{i}' for i in range(n_features)]) 52 | plt.xlabel('Features') 53 | plt.ylabel('Coefficient Value') 54 | plt.title('Ridge Regression: Feature Coefficients') 55 | plt.grid(True) 56 | plt.savefig('ridge_regression.png') 57 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/01 Supervised Learning/02 Classification/04 Naive Bayes/NaiveBayes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_classification 5 | from sklearn.naive_bayes import GaussianNB 6 | from sklearn.metrics import accuracy_score, classification_report 7 | from sklearn.model_selection import train_test_split 8 | 9 | # Naive Bayes 10 | # This script demonstrates Gaussian Naive Bayes classification. 11 | 12 | # Tasks: 13 | # 1. Generate synthetic classification data. 14 | # 2. Split data into training and testing sets. 15 | # 3. Train a Gaussian Naive Bayes model. 16 | # 4. Make predictions and evaluate performance (accuracy, classification report). 17 | # 5. Visualize decision boundary. 
18 | 19 | # Step 1: Generate synthetic data 20 | X, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=42)  # informative + redundant features must fit within n_features=2 21 | 22 | # Convert to DataFrame 23 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 24 | data['Target'] = y 25 | 26 | # Step 2: Split data 27 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 28 | 29 | # Step 3: Train Naive Bayes model 30 | model = GaussianNB() 31 | model.fit(X_train, y_train) 32 | 33 | # Step 4: Make predictions 34 | y_pred = model.predict(X_test) 35 | 36 | # Evaluate performance 37 | accuracy = accuracy_score(y_test, y_pred) 38 | print(f'Accuracy: {accuracy:.2f}') 39 | print('Classification Report:') 40 | print(classification_report(y_test, y_pred)) 41 | 42 | # Step 5: Visualize decision boundary 43 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 44 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 45 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) 46 | Z = model.predict(np.c_[xx.ravel(), yy.ravel()]) 47 | Z = Z.reshape(xx.shape) 48 | 49 | plt.figure(figsize=(10, 6)) 50 | plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm') 51 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', label='Training data') 52 | plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, marker='x', label='Testing data') 53 | plt.xlabel('Feature 1') 54 | plt.ylabel('Feature 2') 55 | plt.title('Naive Bayes: Decision Boundary') 56 | plt.legend() 57 | plt.grid(True) 58 | plt.savefig('naive_bayes.png') 59 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/01 Supervised Learning/02 Classification/05 K-Nearest Neighbors (KNN)/KNN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_classification 5 | from sklearn.neighbors import KNeighborsClassifier 6 | from sklearn.metrics import accuracy_score, classification_report 7 | from sklearn.model_selection import train_test_split 8 | 9 | # K-Nearest Neighbors (KNN) 10 | # This script demonstrates KNN classification. 11 | 12 | # Tasks: 13 | # 1. Generate synthetic classification data. 14 | # 2. Split data into training and testing sets. 15 | # 3. Train a KNN Classifier. 16 | # 4. Make predictions and evaluate performance (accuracy, classification report). 17 | # 5. Visualize decision boundary.
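# Note: KNN has essentially no training phase, so its main knob is n_neighbors; a quick
# cross-validated sweep is the usual way to pick it. A minimal sketch on data generated like
# the script below (_demo names are illustrative):
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X_demo, y_demo = make_classification(n_samples=100, n_features=2, n_informative=2,
                                     n_redundant=0, n_classes=2, n_clusters_per_class=1,
                                     random_state=42)
for k_demo in (1, 3, 5, 9, 15):
    acc_demo = cross_val_score(KNeighborsClassifier(n_neighbors=k_demo), X_demo, y_demo, cv=5).mean()
    print(f'n_neighbors={k_demo}: mean CV accuracy {acc_demo:.2f}')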
18 | 19 | # Step 1: Generate synthetic data 20 | X, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=42)  # informative + redundant features must fit within n_features=2 21 | 22 | # Convert to DataFrame 23 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 24 | data['Target'] = y 25 | 26 | # Step 2: Split data 27 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 28 | 29 | # Step 3: Train KNN model 30 | model = KNeighborsClassifier(n_neighbors=5) 31 | model.fit(X_train, y_train) 32 | 33 | # Step 4: Make predictions 34 | y_pred = model.predict(X_test) 35 | 36 | # Evaluate performance 37 | accuracy = accuracy_score(y_test, y_pred) 38 | print(f'Accuracy: {accuracy:.2f}') 39 | print('Classification Report:') 40 | print(classification_report(y_test, y_pred)) 41 | 42 | # Step 5: Visualize decision boundary 43 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 44 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 45 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) 46 | Z = model.predict(np.c_[xx.ravel(), yy.ravel()]) 47 | Z = Z.reshape(xx.shape) 48 | 49 | plt.figure(figsize=(10, 6)) 50 | plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm') 51 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', label='Training data') 52 | plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, marker='x', label='Testing data') 53 | plt.xlabel('Feature 1') 54 | plt.ylabel('Feature 2') 55 | plt.title('K-Nearest Neighbors: Decision Boundary') 56 | plt.legend() 57 | plt.grid(True) 58 | plt.savefig('knn.png') 59 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/01 Supervised Learning/01 Regression/02 Polynomial Regression/PolynomialRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.pipeline import make_pipeline 7 | from sklearn.metrics import mean_squared_error, r2_score 8 | from sklearn.model_selection import train_test_split 9 | 10 | # Polynomial Regression 11 | # This script demonstrates Polynomial Regression using synthetic non-linear data. 12 | 13 | # Tasks: 14 | # 1. Generate synthetic non-linear data (quadratic relationship). 15 | # 2. Split data into training and testing sets. 16 | # 3. Create and train a Polynomial Regression model (degree 2). 17 | # 4. Make predictions and evaluate performance (MSE, R²). 18 | # 5. Visualize the polynomial fit.
19 | 20 | # Step 1: Generate synthetic data 21 | np.random.seed(42) 22 | X = np.sort(6 * np.random.rand(100, 1) - 3, axis=0) # Values between -3 and 3 23 | y = 0.5 * X**2 + X + 2 + np.random.randn(100, 1) * 0.5 # Quadratic + noise 24 | 25 | # Convert to DataFrame 26 | data = pd.DataFrame({'X': X.flatten(), 'y': y.flatten()}) 27 | 28 | # Step 2: Split data 29 | X_train, X_test, y_train, y_test = train_test_split(X, y.flatten(), test_size=0.2, random_state=42) 30 | 31 | # Step 3: Create and train Polynomial Regression model 32 | degree = 2 33 | polyreg = make_pipeline(PolynomialFeatures(degree), LinearRegression()) 34 | polyreg.fit(X_train, y_train) 35 | 36 | # Step 4: Make predictions 37 | X_test_sorted = np.sort(X_test, axis=0) 38 | y_pred = polyreg.predict(X_test) 39 | y_pred_plot = polyreg.predict(X_test_sorted) 40 | 41 | # Evaluate performance 42 | mse = mean_squared_error(y_test, y_pred) 43 | r2 = r2_score(y_test, y_pred) 44 | 45 | print(f'Mean Squared Error: {mse:.2f}') 46 | print(f'R² Score: {r2:.2f}') 47 | 48 | # Step 5: Visualize results 49 | plt.figure(figsize=(10, 6)) 50 | plt.scatter(X_train, y_train, color='blue', label='Training data') 51 | plt.scatter(X_test, y_test, color='green', label='Testing data') 52 | plt.plot(X_test_sorted, y_pred_plot, color='red', label='Polynomial fit (degree=2)') 53 | plt.xlabel('X') 54 | plt.ylabel('y') 55 | plt.title('Polynomial Regression') 56 | plt.legend() 57 | plt.grid(True) 58 | plt.savefig('polynomial_regression.png') 59 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/04 Ensemble Methods/01 Bagging/02 Random Forest/RandomForest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_classification 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.metrics import accuracy_score, classification_report 7 | from sklearn.model_selection import train_test_split 8 | 9 | # Random Forest 10 | # This script demonstrates Random Forest classification. 11 | 12 | # Tasks: 13 | # 1. Generate synthetic classification data. 14 | # 2. Split data into training and testing sets. 15 | # 3. Train a Random Forest Classifier. 16 | # 4. Make predictions and evaluate performance (accuracy, classification report). 17 | # 5. Visualize decision boundary. 
18 | 19 | # Step 1: Generate synthetic data 20 | X, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=42)  # informative + redundant features must fit within n_features=2 21 | 22 | # Convert to DataFrame 23 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 24 | data['Target'] = y 25 | 26 | # Step 2: Split data 27 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 28 | 29 | # Step 3: Train Random Forest model 30 | model = RandomForestClassifier(n_estimators=100, random_state=42) 31 | model.fit(X_train, y_train) 32 | 33 | # Step 4: Make predictions 34 | y_pred = model.predict(X_test) 35 | 36 | # Evaluate performance 37 | accuracy = accuracy_score(y_test, y_pred) 38 | print(f'Accuracy: {accuracy:.2f}') 39 | print('Classification Report:') 40 | print(classification_report(y_test, y_pred)) 41 | 42 | # Step 5: Visualize decision boundary 43 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 44 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 45 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) 46 | Z = model.predict(np.c_[xx.ravel(), yy.ravel()]) 47 | Z = Z.reshape(xx.shape) 48 | 49 | plt.figure(figsize=(10, 6)) 50 | plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm') 51 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', label='Training data') 52 | plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, marker='x', label='Testing data') 53 | plt.xlabel('Feature 1') 54 | plt.ylabel('Feature 2') 55 | plt.title('Random Forest: Decision Boundary') 56 | plt.legend() 57 | plt.grid(True) 58 | plt.savefig('random_forest.png') 59 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/01 Supervised Learning/02 Classification/02 Decision Trees/DecisionTrees.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_classification 5 | from sklearn.tree import DecisionTreeClassifier 6 | from sklearn.metrics import accuracy_score, classification_report 7 | from sklearn.model_selection import train_test_split 8 | 9 | # Decision Trees 10 | # This script demonstrates Decision Tree classification. 11 | 12 | # Tasks: 13 | # 1. Generate synthetic classification data. 14 | # 2. Split data into training and testing sets. 15 | # 3. Train a Decision Tree Classifier. 16 | # 4. Make predictions and evaluate performance (accuracy, classification report). 17 | # 5. Visualize decision boundary.
18 | 19 | # Step 1: Generate synthetic data 20 | X, y = make_classification(n_samples=100, n_features=2, n_classes=2, n_clusters_per_class=1, random_state=42) 21 | 22 | # Convert to DataFrame 23 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 24 | data['Target'] = y 25 | 26 | # Step 2: Split data 27 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 28 | 29 | # Step 3: Train Decision Tree model 30 | model = DecisionTreeClassifier(max_depth=3, random_state=42) 31 | model.fit(X_train, y_train) 32 | 33 | # Step 4: Make predictions 34 | y_pred = model.predict(X_test) 35 | 36 | # Evaluate performance 37 | accuracy = accuracy_score(y_test, y_pred) 38 | print(f'Accuracy: {accuracy:.2f}') 39 | print('Classification Report:') 40 | print(classification_report(y_test, y_pred)) 41 | 42 | # Step 5: Visualize decision boundary 43 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 44 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 45 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) 46 | Z = model.predict(np.c_[xx.ravel(), yy.ravel()]) 47 | Z = Z.reshape(xx.shape) 48 | 49 | plt.figure(figsize=(10, 6)) 50 | plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm') 51 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', label='Training data') 52 | plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, marker='x', label='Testing data') 53 | plt.xlabel('Feature 1') 54 | plt.ylabel('Feature 2') 55 | plt.title('Decision Tree: Decision Boundary') 56 | plt.legend() 57 | plt.grid(True) 58 | plt.savefig('decision_tree.png') 59 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/01 Supervised Learning/01 Regression/01 Linear Regression/LinearRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.linear_model import LinearRegression 5 | from sklearn.metrics import mean_squared_error, r2_score 6 | from sklearn.model_selection import train_test_split 7 | 8 | # Linear Regression 9 | # This script demonstrates Linear Regression using a synthetic dataset to predict house prices based on size. 10 | 11 | # Tasks: 12 | # 1. Generate synthetic data for house sizes and prices. 13 | # 2. Split data into training and testing sets. 14 | # 3. Train a Linear Regression model. 15 | # 4. Make predictions and evaluate performance (MSE, R²). 16 | # 5. Visualize the regression line and predictions. 
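# Optional sketch, not part of the original tasks: ordinary least squares has a closed
# form, beta = (X^T X)^(-1) X^T y, and checking scikit-learn's coefficients against it
# is a classic sanity check. np.linalg.lstsq solves the same problem more stably.
def ols_by_hand(X, y):
    """Return (intercept, slope) from the least-squares solution of [1, x] @ beta = y."""
    X = np.asarray(X, dtype=float).reshape(-1, 1)
    design = np.hstack([np.ones_like(X), X])          # column of ones for the intercept
    beta, *_ = np.linalg.lstsq(design, np.asarray(y, dtype=float).ravel(), rcond=None)
    return beta[0], beta[1]
# Usage (after Step 3): compare ols_by_hand(X_train, y_train) with model.intercept_ and model.coef_[0]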
17 | 18 | # Step 1: Generate synthetic data 19 | np.random.seed(42) 20 | house_sizes = np.random.rand(100, 1) * 200 # Size in square feet (0-200) 21 | prices = 50 + 3 * house_sizes + np.random.randn(100, 1) * 10 # Price = 50 + 3*size + noise 22 | 23 | # Convert to DataFrame for clarity 24 | data = pd.DataFrame({'Size': house_sizes.flatten(), 'Price': prices.flatten()}) 25 | 26 | # Step 2: Split data 27 | X = data[['Size']] 28 | y = data['Price'] 29 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 30 | 31 | # Step 3: Train Linear Regression model 32 | model = LinearRegression() 33 | model.fit(X_train, y_train) 34 | 35 | # Step 4: Make predictions 36 | y_pred = model.predict(X_test) 37 | 38 | # Evaluate performance 39 | mse = mean_squared_error(y_test, y_pred) 40 | r2 = r2_score(y_test, y_pred) 41 | 42 | print(f'Mean Squared Error: {mse:.2f}') 43 | print(f'R² Score: {r2:.2f}') 44 | print(f'Coefficient: {model.coef_[0]:.2f}') 45 | print(f'Intercept: {model.intercept_:.2f}') 46 | 47 | # Step 5: Visualize results 48 | plt.figure(figsize=(10, 6)) 49 | plt.scatter(X_train, y_train, color='blue', label='Training data') 50 | plt.scatter(X_test, y_test, color='green', label='Testing data') 51 | plt.plot(X_test, y_pred, color='red', label='Regression line') 52 | plt.xlabel('House Size (sq ft)') 53 | plt.ylabel('Price ($1000)') 54 | plt.title('Linear Regression: House Size vs Price') 55 | plt.legend() 56 | plt.grid(True) 57 | plt.savefig('linear_regression.png') 58 | plt.close() 59 | 60 | # Note on the saved plot: 61 | # The plot shows the regression line fitting the data, with training and testing points. -------------------------------------------------------------------------------- /Machine Learning Foundations/01 Supervised Learning/02 Classification/01 Logistic Regression/LogisticRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_classification 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import accuracy_score, classification_report 7 | from sklearn.model_selection import train_test_split 8 | 9 | # Logistic Regression 10 | # This script demonstrates Logistic Regression for binary classification. 11 | 12 | # Tasks: 13 | # 1. Generate synthetic classification data. 14 | # 2. Split data into training and testing sets. 15 | # 3. Train a Logistic Regression model. 16 | # 4. Make predictions and evaluate performance (accuracy, classification report). 17 | # 5. Visualize decision boundary.
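# Optional sketch, not in the tasks above: logistic regression maps the linear score
# w.x + b to a probability with the sigmoid, so predict_proba can be reproduced by hand.
# Assumes a fitted binary LogisticRegression such as the one trained below.
def manual_positive_probability(fitted_model, X):
    """Return P(class=1) as sigmoid(decision_function), matching predict_proba(X)[:, 1]."""
    scores = fitted_model.decision_function(X)
    return 1.0 / (1.0 + np.exp(-scores))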
18 | 19 | # Step 1: Generate synthetic data 20 | X, y = make_classification(n_samples=100, n_features=2, n_classes=2, n_clusters_per_class=1, random_state=42) 21 | 22 | # Convert to DataFrame 23 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 24 | data['Target'] = y 25 | 26 | # Step 2: Split data 27 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 28 | 29 | # Step 3: Train Logistic Regression model 30 | model = LogisticRegression(random_state=42) 31 | model.fit(X_train, y_train) 32 | 33 | # Step 4: Make predictions 34 | y_pred = model.predict(X_test) 35 | 36 | # Evaluate performance 37 | accuracy = accuracy_score(y_test, y_pred) 38 | print(f'Accuracy: {accuracy:.2f}') 39 | print('Classification Report:') 40 | print(classification_report(y_test, y_pred)) 41 | 42 | # Step 5: Visualize decision boundary 43 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 44 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 45 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) 46 | Z = model.predict(np.c_[xx.ravel(), yy.ravel()]) 47 | Z = Z.reshape(xx.shape) 48 | 49 | plt.figure(figsize=(10, 6)) 50 | plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm') 51 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', label='Training data') 52 | plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, marker='x', label='Testing data') 53 | plt.xlabel('Feature 1') 54 | plt.ylabel('Feature 2') 55 | plt.title('Logistic Regression: Decision Boundary') 56 | plt.legend() 57 | plt.grid(True) 58 | plt.savefig('logistic_regression.png') 59 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/02 Feature Engineering/01 Feature Selection/FeatureSelection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.feature_selection import SelectKBest, f_classif 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.model_selection import train_test_split 9 | import seaborn as sns 10 | 11 | # Feature Selection 12 | # This script demonstrates feature selection using SelectKBest on the Iris dataset. 13 | 14 | # Tasks: 15 | # 1. Load the Iris dataset. 16 | # 2. Apply SelectKBest to select top features. 17 | # 3. Train a Logistic Regression model on selected features. 18 | # 4. Evaluate model performance (accuracy). 19 | # 5. Visualize feature importance scores. 
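# Optional sketch (an assumption beyond the tasks above): Step 2 below fits SelectKBest
# on the full dataset before splitting, which lets test rows influence the selection.
# Wrapping selector and classifier in one Pipeline keeps the selection inside the
# training data only.
def selection_pipeline(k=2, random_state=42):
    """Return an unfitted SelectKBest(f_classif, k) -> LogisticRegression pipeline."""
    from sklearn.pipeline import make_pipeline
    return make_pipeline(SelectKBest(score_func=f_classif, k=k),
                         LogisticRegression(random_state=random_state, max_iter=200))
# Usage: selection_pipeline().fit(X_train, y_train).score(X_test, y_test)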
20 | 21 | # Step 1: Load data 22 | iris = load_iris() 23 | X = iris.data 24 | y = iris.target 25 | data = pd.DataFrame(X, columns=iris.feature_names) 26 | 27 | # Step 2: Apply feature selection 28 | selector = SelectKBest(score_func=f_classif, k=2) 29 | X_selected = selector.fit_transform(X, y) 30 | selected_features = [iris.feature_names[i] for i in selector.get_support(indices=True)] 31 | feature_scores = selector.scores_ 32 | 33 | # Step 3: Train Logistic Regression 34 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 35 | X_train_selected, X_test_selected = train_test_split(X_selected, test_size=0.2, random_state=42) 36 | 37 | # Full features 38 | model_full = LogisticRegression(random_state=42) 39 | model_full.fit(X_train, y_train) 40 | y_pred_full = model_full.predict(X_test) 41 | acc_full = accuracy_score(y_test, y_pred_full) 42 | 43 | # Selected features 44 | model_selected = LogisticRegression(random_state=42) 45 | model_selected.fit(X_train_selected, y_train) 46 | y_pred_selected = model_selected.predict(X_test_selected) 47 | acc_selected = accuracy_score(y_test, y_pred_selected) 48 | 49 | print(f'Accuracy (Full Features): {acc_full:.2f}') 50 | print(f'Accuracy (Selected Features): {acc_selected:.2f}') 51 | print(f'Selected Features: {selected_features}') 52 | 53 | # Step 4: Visualize feature scores 54 | plt.figure(figsize=(10, 6)) 55 | sns.barplot(x=feature_scores, y=iris.feature_names, palette='viridis') 56 | plt.xlabel('Feature Score (f_classif)') 57 | plt.ylabel('Feature') 58 | plt.title('Feature Importance Scores') 59 | plt.grid(True) 60 | plt.savefig('feature_selection.png') 61 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/02 Feature Engineering/02 Polynomial Features/PolynomialFeatures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.metrics import mean_squared_error 7 | from sklearn.model_selection import train_test_split 8 | import seaborn as sns 9 | 10 | # Polynomial Features 11 | # This script demonstrates adding polynomial features to improve regression. 12 | 13 | # Tasks: 14 | # 1. Generate synthetic non-linear data. 15 | # 2. Apply PolynomialFeatures to create polynomial terms. 16 | # 3. Train Linear Regression models with and without polynomial features. 17 | # 4. Evaluate performance (MSE). 18 | # 5. Visualize regression fits. 
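# Optional sketch, not in the tasks above: for a single input x, degree=3 expands to
# [1, x, x^2, x^3]. Printing the generated names (available in recent scikit-learn
# releases via get_feature_names_out) makes the expansion used below explicit.
def show_polynomial_columns(degree=3):
    """Print the column names PolynomialFeatures(degree) creates for one feature 'x'."""
    expander = PolynomialFeatures(degree=degree)
    expander.fit(np.zeros((1, 1)))                 # a dummy row is enough to set the input width
    print(expander.get_feature_names_out(['x']))   # e.g. ['1' 'x' 'x^2' 'x^3']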
19 | 20 | # Step 1: Generate synthetic data 21 | np.random.seed(42) 22 | X = np.sort(5 * np.random.rand(100, 1), axis=0) 23 | y = np.sin(X).ravel() + np.random.randn(100) * 0.1 24 | 25 | # Step 2: Apply PolynomialFeatures 26 | poly = PolynomialFeatures(degree=3) 27 | X_poly = poly.fit_transform(X) 28 | 29 | # Step 3: Train Linear Regression models 30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 31 | X_train_poly, X_test_poly = train_test_split(X_poly, test_size=0.2, random_state=42) 32 | 33 | # Linear model 34 | model_linear = LinearRegression() 35 | model_linear.fit(X_train, y_train) 36 | y_pred_linear = model_linear.predict(X_test) 37 | mse_linear = mean_squared_error(y_test, y_pred_linear) 38 | 39 | # Polynomial model 40 | model_poly = LinearRegression() 41 | model_poly.fit(X_train_poly, y_train) 42 | y_pred_poly = model_poly.predict(X_test_poly) 43 | mse_poly = mean_squared_error(y_test, y_pred_poly) 44 | 45 | print(f'MSE (Linear): {mse_linear:.2f}') 46 | print(f'MSE (Polynomial): {mse_poly:.2f}') 47 | 48 | # Step 4: Visualize regression fits 49 | X_plot = np.linspace(X.min(), X.max(), 100).reshape(-1, 1) 50 | y_plot_linear = model_linear.predict(X_plot) 51 | y_plot_poly = model_poly.predict(poly.transform(X_plot)) 52 | 53 | plt.figure(figsize=(10, 6)) 54 | plt.scatter(X, y, color='blue', label='Data') 55 | plt.plot(X_plot, y_plot_linear, color='green', label='Linear Fit') 56 | plt.plot(X_plot, y_plot_poly, color='red', label='Polynomial Fit (degree=3)') 57 | plt.xlabel('X') 58 | plt.ylabel('y') 59 | plt.title('Polynomial Features: Linear vs Polynomial Regression') 60 | plt.legend() 61 | plt.grid(True) 62 | plt.savefig('polynomial_features.png') 63 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/04 Ensemble Methods/02 Boosting/01 AdaBoost/AdaBoost.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_classification 5 | from sklearn.ensemble import AdaBoostClassifier 6 | from sklearn.tree import DecisionTreeClassifier 7 | from sklearn.metrics import accuracy_score, classification_report 8 | from sklearn.model_selection import train_test_split 9 | import seaborn as sns 10 | 11 | # AdaBoost 12 | # This script demonstrates AdaBoost classification. 13 | 14 | # Tasks: 15 | # 1. Generate synthetic classification data. 16 | # 2. Split data into training and testing sets. 17 | # 3. Train an AdaBoost Classifier with Decision Trees. 18 | # 4. Make predictions and evaluate performance (accuracy, classification report). 19 | # 5. Visualize decision boundary. 
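# Optional sketch, not part of the tasks above: AdaBoost grows one weak learner per
# round, so staged_score shows how test accuracy evolves with the number of rounds.
# Version note: scikit-learn 1.2 renamed AdaBoostClassifier's base_estimator argument
# (used in Step 3 below) to estimator; the old name was removed in 1.4.
def staged_test_accuracy(fitted_adaboost, X_test, y_test):
    """Return the test accuracy after each boosting round of a fitted AdaBoostClassifier."""
    return list(fitted_adaboost.staged_score(X_test, y_test))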
20 | 21 | # Step 1: Generate synthetic data 22 | np.random.seed(42) 23 | X, y = make_classification(n_samples=300, n_features=2, n_classes=2, n_clusters_per_class=1, random_state=42) 24 | 25 | # Convert to DataFrame 26 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 27 | data['Target'] = y 28 | 29 | # Step 2: Split data 30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 31 | 32 | # Step 3: Train AdaBoost Classifier 33 | base_estimator = DecisionTreeClassifier(max_depth=1) # Weak learner 34 | adaboost = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=50, random_state=42) 35 | adaboost.fit(X_train, y_train) 36 | 37 | # Step 4: Make predictions and evaluate 38 | y_pred = adaboost.predict(X_test) 39 | accuracy = accuracy_score(y_test, y_pred) 40 | print(f'Accuracy: {accuracy:.2f}') 41 | print('Classification Report:') 42 | print(classification_report(y_test, y_pred)) 43 | 44 | # Step 5: Visualize decision boundary 45 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 46 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 47 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) 48 | Z = adaboost.predict(np.c_[xx.ravel(), yy.ravel()]) 49 | Z = Z.reshape(xx.shape) 50 | 51 | plt.figure(figsize=(10, 6)) 52 | plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm') 53 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', label='Training Data') 54 | plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, marker='x', label='Testing Data') 55 | plt.xlabel('Feature 1') 56 | plt.ylabel('Feature 2') 57 | plt.title('AdaBoost: Decision Boundary') 58 | plt.legend() 59 | plt.grid(True) 60 | plt.savefig('adaboost.png') 61 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/02 Feature Engineering/04 Binning/Binning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.preprocessing import KBinsDiscretizer 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import accuracy_score 7 | from sklearn.model_selection import train_test_split 8 | import seaborn as sns 9 | 10 | # Binning 11 | # This script demonstrates binning continuous features. 12 | 13 | # Tasks: 14 | # 1. Generate synthetic data with continuous features. 15 | # 2. Apply KBinsDiscretizer to bin a feature. 16 | # 3. Train a Logistic Regression model with binned features. 17 | # 4. Evaluate performance (accuracy). 18 | # 5. Visualize binned feature distribution. 
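# Optional sketch, not in the tasks above: KBinsDiscretizer (Step 2 below) picks edges
# automatically; pandas.cut is the usual choice when the edges should carry domain
# meaning. The age bands here are purely illustrative assumptions.
def age_bands(age_series):
    """Return a categorical column of labelled age bands for a pandas Series of ages."""
    return pd.cut(age_series, bins=[0, 30, 45, 60, 120],
                  labels=['<30', '30-44', '45-59', '60+'])
# Usage: data['Age_Band'] = age_bands(data['Age'])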
19 | 20 | # Step 1: Generate synthetic data 21 | np.random.seed(42) 22 | data = pd.DataFrame({ 23 | 'Age': np.random.randint(20, 80, 100), 24 | 'Income': np.random.randint(20000, 120000, 100), 25 | 'Target': np.random.choice([0, 1], 100) 26 | }) 27 | 28 | # Step 2: Apply binning 29 | binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform') 30 | data['Age_Binned'] = binner.fit_transform(data[['Age']]) 31 | 32 | # Step 3: Train Logistic Regression 33 | X = data[['Age', 'Income']] 34 | X_binned = data[['Age_Binned', 'Income']] 35 | y = data['Target'] 36 | 37 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 38 | X_train_binned, X_test_binned = train_test_split(X_binned, test_size=0.2, random_state=42) 39 | 40 | # Original features 41 | model_orig = LogisticRegression(random_state=42) 42 | model_orig.fit(X_train, y_train) 43 | y_pred_orig = model_orig.predict(X_test) 44 | acc_orig = accuracy_score(y_test, y_pred_orig) 45 | 46 | # Binned features 47 | model_binned = LogisticRegression(random_state=42) 48 | model_binned.fit(X_train_binned, y_train) 49 | y_pred_binned = model_binned.predict(X_test_binned) 50 | acc_binned = accuracy_score(y_test, y_pred_binned) 51 | 52 | print(f'Accuracy (Original): {acc_orig:.2f}') 53 | print(f'Accuracy (Binned): {acc_binned:.2f}') 54 | 55 | # Step 4: Visualize binned feature distribution 56 | plt.figure(figsize=(10, 6)) 57 | sns.histplot(data=data, x='Age', color='blue', alpha=0.5, label='Original Age', bins=20) 58 | sns.histplot(data=data, x='Age_Binned', color='red', alpha=0.5, label='Binned Age', bins=5) 59 | plt.xlabel('Age / Binned Age') 60 | plt.ylabel('Count') 61 | plt.title('Binning: Original vs Binned Age Distribution') 62 | plt.legend() 63 | plt.grid(True) 64 | plt.savefig('binning.png') 65 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/01 Supervised Learning/02 Classification/06 Support Vector Machines (SVM)/SVM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_classification 5 | from sklearn.svm import SVC 6 | from sklearn.metrics import accuracy_score, classification_report 7 | from sklearn.model_selection import train_test_split 8 | 9 | # Support Vector Machines (SVM) 10 | # This script demonstrates SVM classification with a linear kernel. 11 | 12 | # Tasks: 13 | # 1. Generate synthetic classification data. 14 | # 2. Split data into training and testing sets. 15 | # 3. Train an SVM Classifier. 16 | # 4. Make predictions and evaluate performance (accuracy, classification report). 17 | # 5. Visualize decision boundary and support vectors. 
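# Optional sketch, beyond the tasks above: for a linear kernel the decision boundary is
# w.x + b = 0 and the distance between the two supporting hyperplanes is 2 / ||w||,
# the margin the SVM maximizes. Assumes a fitted SVC(kernel='linear') like the one below.
def linear_svm_margin(fitted_linear_svc):
    """Return the margin width 2 / ||w|| of a fitted linear SVC."""
    w = fitted_linear_svc.coef_.ravel()
    return 2.0 / np.linalg.norm(w)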
18 | 19 | # Step 1: Generate synthetic data 20 | X, y = make_classification(n_samples=100, n_features=2, n_classes=2, n_clusters_per_class=1, random_state=42) 21 | 22 | # Convert to DataFrame 23 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 24 | data['Target'] = y 25 | 26 | # Step 2: Split data 27 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 28 | 29 | # Step 3: Train SVM model 30 | model = SVC(kernel='linear', random_state=42) 31 | model.fit(X_train, y_train) 32 | 33 | # Step 4: Make predictions 34 | y_pred = model.predict(X_test) 35 | 36 | # Evaluate performance 37 | accuracy = accuracy_score(y_test, y_pred) 38 | print(f'Accuracy: {accuracy:.2f}') 39 | print('Classification Report:') 40 | print(classification_report(y_test, y_pred)) 41 | 42 | # Step 5: Visualize decision boundary and support vectors 43 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 44 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 45 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) 46 | Z = model.predict(np.c_[xx.ravel(), yy.ravel()]) 47 | Z = Z.reshape(xx.shape) 48 | 49 | plt.figure(figsize=(10, 6)) 50 | plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm') 51 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', label='Training data') 52 | plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, marker='x', label='Testing data') 53 | # Highlight support vectors 54 | plt.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=100, facecolors='none', edgecolors='k', label='Support Vectors') 55 | plt.xlabel('Feature 1') 56 | plt.ylabel('Feature 2') 57 | plt.title('SVM: Decision Boundary with Support Vectors') 58 | plt.legend() 59 | plt.grid(True) 60 | plt.savefig('svm.png') 61 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/04 Ensemble Methods/01 Bagging/01 Bootstrap Aggregating/BootstrapAggregating.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_classification 5 | from sklearn.ensemble import BaggingClassifier 6 | from sklearn.tree import DecisionTreeClassifier 7 | from sklearn.metrics import accuracy_score, classification_report 8 | from sklearn.model_selection import train_test_split 9 | import seaborn as sns 10 | 11 | # Bootstrap Aggregating (Bagging) 12 | # This script demonstrates Bagging using Decision Trees as base estimators. 13 | 14 | # Tasks: 15 | # 1. Generate synthetic classification data. 16 | # 2. Split data into training and testing sets. 17 | # 3. Train a Bagging Classifier with Decision Trees. 18 | # 4. Make predictions and evaluate performance (accuracy, classification report). 19 | # 5. Visualize decision boundary. 
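# Optional sketch, not in the tasks above: bagging trains each base model on a sample
# drawn with replacement (roughly 63% of rows appear at least once; the rest are
# "out of bag"). The helper shows one such resample by hand. Version note: scikit-learn
# 1.2 renamed BaggingClassifier's base_estimator argument (used in Step 3 below) to
# estimator; the old name was removed in 1.4.
def bootstrap_sample(X, y, random_state=42):
    """Return one bootstrap resample (X_boot, y_boot) the same size as the input arrays."""
    rng = np.random.default_rng(random_state)
    idx = rng.integers(0, len(X), size=len(X))   # row indices drawn with replacement
    return X[idx], y[idx]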
20 | 21 | # Step 1: Generate synthetic data 22 | np.random.seed(42) 23 | X, y = make_classification(n_samples=300, n_features=2, n_classes=2, n_clusters_per_class=1, random_state=42) 24 | 25 | # Convert to DataFrame 26 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 27 | data['Target'] = y 28 | 29 | # Step 2: Split data 30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 31 | 32 | # Step 3: Train Bagging Classifier 33 | base_estimator = DecisionTreeClassifier(max_depth=3) 34 | bagging = BaggingClassifier(base_estimator=base_estimator, n_estimators=50, random_state=42) 35 | bagging.fit(X_train, y_train) 36 | 37 | # Step 4: Make predictions and evaluate 38 | y_pred = bagging.predict(X_test) 39 | accuracy = accuracy_score(y_test, y_pred) 40 | print(f'Accuracy: {accuracy:.2f}') 41 | print('Classification Report:') 42 | print(classification_report(y_test, y_pred)) 43 | 44 | # Step 5: Visualize decision boundary 45 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 46 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 47 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) 48 | Z = bagging.predict(np.c_[xx.ravel(), yy.ravel()]) 49 | Z = Z.reshape(xx.shape) 50 | 51 | plt.figure(figsize=(10, 6)) 52 | plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm') 53 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', label='Training Data') 54 | plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, marker='x', label='Testing Data') 55 | plt.xlabel('Feature 1') 56 | plt.ylabel('Feature 2') 57 | plt.title('Bagging Classifier: Decision Boundary') 58 | plt.legend() 59 | plt.grid(True) 60 | plt.savefig('bagging_classifier.png') 61 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/02 Feature Engineering/03 Interaction Terms/InteractionTerms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.metrics import mean_squared_error 7 | from sklearn.model_selection import train_test_split 8 | import seaborn as sns 9 | 10 | # Interaction Terms 11 | # This script demonstrates adding interaction terms to capture feature interactions. 12 | 13 | # Tasks: 14 | # 1. Generate synthetic data with interacting features. 15 | # 2. Apply PolynomialFeatures to include interaction terms. 16 | # 3. Train Linear Regression models with and without interaction terms. 17 | # 4. Evaluate performance (MSE). 18 | # 5. Visualize model predictions. 
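# Optional sketch, not in the tasks above: PolynomialFeatures (Step 2 below) generates
# the interaction automatically; adding the same product column by hand keeps the
# DataFrame self-documenting and is often enough when only one interaction matters.
def add_interaction_column(df, col_a='Feature_1', col_b='Feature_2'):
    """Return a copy of df with an extra column holding the elementwise product col_a * col_b."""
    out = df.copy()
    out[f'{col_a}_x_{col_b}'] = out[col_a] * out[col_b]
    return out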
19 | 20 | # Step 1: Generate synthetic data 21 | np.random.seed(42) 22 | X = np.random.rand(100, 2) 23 | y = 2 * X[:, 0] + 3 * X[:, 1] + 5 * X[:, 0] * X[:, 1] + np.random.randn(100) * 0.1 24 | 25 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 26 | data['Target'] = y 27 | 28 | # Step 2: Apply PolynomialFeatures for interaction terms 29 | poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False) 30 | X_interaction = poly.fit_transform(X) 31 | 32 | # Step 3: Train Linear Regression models 33 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 34 | X_train_inter, X_test_inter = train_test_split(X_interaction, test_size=0.2, random_state=42) 35 | 36 | # Linear model (no interactions) 37 | model_linear = LinearRegression() 38 | model_linear.fit(X_train, y_train) 39 | y_pred_linear = model_linear.predict(X_test) 40 | mse_linear = mean_squared_error(y_test, y_pred_linear) 41 | 42 | # Interaction model 43 | model_inter = LinearRegression() 44 | model_inter.fit(X_train_inter, y_train) 45 | y_pred_inter = model_inter.predict(X_test_inter) 46 | mse_inter = mean_squared_error(y_test, y_pred_inter) 47 | 48 | print(f'MSE (Linear): {mse_linear:.2f}') 49 | print(f'MSE (Interaction Terms): {mse_inter:.2f}') 50 | 51 | # Step 4: Visualize actual vs predicted 52 | plt.figure(figsize=(10, 6)) 53 | plt.scatter(y_test, y_pred_linear, color='green', label='Linear Predictions') 54 | plt.scatter(y_test, y_pred_inter, color='red', label='Interaction Predictions') 55 | plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'b--', label='Perfect Prediction') 56 | plt.xlabel('Actual Values') 57 | plt.ylabel('Predicted Values') 58 | plt.title('Interaction Terms: Actual vs Predicted') 59 | plt.legend() 60 | plt.grid(True) 61 | plt.savefig('interaction_terms.png') 62 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/05 Deployment/02 API Integration/APIIntegration.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.datasets import load_iris 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.model_selection import train_test_split 6 | import joblib 7 | from flask import Flask, request, jsonify 8 | import matplotlib.pyplot as plt 9 | import seaborn as sns 10 | 11 | # API Integration 12 | # This script demonstrates creating a Flask API for a trained model. 13 | 14 | # Tasks: 15 | # 1. Load the Iris dataset and train a Logistic Regression model. 16 | # 2. Save the model using joblib. 17 | # 3. Create a Flask API to serve predictions. 18 | # 4. Test the API with sample data. 19 | # 5. Visualize API predictions. 
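# Optional sketch (assumptions: the Flask app defined below is running via
# app.run(debug=True) on the default port 5000, and the third-party `requests` package
# is installed). It shows how a client might call the /predict route.
def query_prediction_api(features, url='http://localhost:5000/predict'):
    """POST a feature list such as [5.1, 3.5, 1.4, 0.2] and return the JSON reply."""
    import requests
    response = requests.post(url, json={'features': features}, timeout=5)
    response.raise_for_status()
    return response.json()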
20 | 21 | # Step 1: Load data and train model 22 | iris = load_iris() 23 | X = iris.data 24 | y = iris.target 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 26 | model = LogisticRegression(random_state=42) 27 | model.fit(X_train, y_train) 28 | 29 | # Step 2: Save the model 30 | joblib.dump(model, 'logistic_model_api.pkl') 31 | 32 | # Step 3: Create Flask API 33 | app = Flask(__name__) 34 | model = joblib.load('logistic_model_api.pkl') 35 | 36 | @app.route('/predict', methods=['POST']) 37 | def predict(): 38 | data = request.get_json(force=True) 39 | features = np.array(data['features']).reshape(1, -1) 40 | prediction = model.predict(features) 41 | return jsonify({'prediction': int(prediction[0]), 'class': iris.target_names[prediction[0]]}) 42 | 43 | # Note: The Flask app would be run with app.run(debug=True) in a real environment. 44 | # For demonstration, we'll simulate API testing. 45 | 46 | # Step 4: Test the API (simulated) 47 | sample_data = X_test[:5] 48 | predictions = model.predict(sample_data) 49 | class_names = [iris.target_names[pred] for pred in predictions] 50 | 51 | print('Sample Predictions:') 52 | for i, (features, pred, name) in enumerate(zip(sample_data, predictions, class_names)): 53 | print(f'Sample {i+1}: Features={features.tolist()}, Prediction={pred}, Class={name}') 54 | 55 | # Step 5: Visualize predictions 56 | plt.figure(figsize=(10, 6)) 57 | sns.scatterplot(x=X_test[:5, 0], y=X_test[:5, 1], hue=class_names, style=iris.target_names[y_test[:5]], s=100) 58 | plt.xlabel(iris.feature_names[0]) 59 | plt.ylabel(iris.feature_names[1]) 60 | plt.title('API Integration: Simulated API Predictions') 61 | plt.grid(True) 62 | plt.savefig('api_integration.png') 63 | plt.close() 64 | 65 | # To run the Flask API, use: app.run(debug=True) 66 | # Then send POST requests to http://localhost:5000/predict with JSON like {"features": [5.1, 3.5, 1.4, 0.2]} -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/04 Model Evaluation/02 Overfitting/Overfitting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.metrics import mean_squared_error 7 | from sklearn.model_selection import train_test_split 8 | 9 | # Overfitting 10 | # This script demonstrates overfitting using a high-degree polynomial regression. 11 | 12 | # Tasks: 13 | # 1. Generate synthetic non-linear data. 14 | # 2. Train a high-degree polynomial regression model. 15 | # 3. Compare with a simpler model. 16 | # 4. Evaluate training and testing errors. 17 | # 5. Visualize overfitting. 
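# Optional sketch, not in the tasks above: the signature of overfitting is a training
# error that keeps shrinking while the test error turns back up. This helper sweeps the
# polynomial degree and returns both curves; it assumes splits like those made below.
def error_curves_by_degree(X_train, X_test, y_train, y_test, max_degree=15):
    """Return {degree: (train_mse, test_mse)} for polynomial regressions of rising degree."""
    curves = {}
    for degree in range(1, max_degree + 1):
        poly = PolynomialFeatures(degree=degree)
        model = LinearRegression().fit(poly.fit_transform(X_train), y_train)
        train_mse = mean_squared_error(y_train, model.predict(poly.transform(X_train)))
        test_mse = mean_squared_error(y_test, model.predict(poly.transform(X_test)))
        curves[degree] = (train_mse, test_mse)
    return curves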
18 | 19 | # Step 1: Generate synthetic data 20 | np.random.seed(42) 21 | X = np.sort(5 * np.random.rand(100, 1), axis=0) 22 | y = np.sin(X).ravel() + np.random.randn(100) * 0.1 23 | 24 | # Step 2: Split data 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 26 | 27 | # Step 3: Train models 28 | # Simple model (degree 3) 29 | poly_simple = PolynomialFeatures(degree=3) 30 | X_train_simple = poly_simple.fit_transform(X_train) 31 | X_test_simple = poly_simple.transform(X_test) 32 | model_simple = LinearRegression() 33 | model_simple.fit(X_train_simple, y_train) 34 | y_pred_simple = model_simple.predict(X_test_simple) 35 | mse_simple = mean_squared_error(y_test, y_pred_simple) 36 | 37 | # Overfit model (degree 15) 38 | poly_overfit = PolynomialFeatures(degree=15) 39 | X_train_overfit = poly_overfit.fit_transform(X_train) 40 | X_test_overfit = poly_overfit.transform(X_test) 41 | model_overfit = LinearRegression() 42 | model_overfit.fit(X_train_overfit, y_train) 43 | y_pred_overfit = model_overfit.predict(X_test_overfit) 44 | mse_overfit = mean_squared_error(y_test, y_pred_overfit) 45 | 46 | print(f'MSE (Simple, degree=3): {mse_simple:.2f}') 47 | print(f'MSE (Overfit, degree=15): {mse_overfit:.2f}') 48 | 49 | # Step 4: Visualize overfitting 50 | X_plot = np.linspace(X.min(), X.max(), 100).reshape(-1, 1) 51 | y_plot_simple = model_simple.predict(poly_simple.transform(X_plot)) 52 | y_plot_overfit = model_overfit.predict(poly_overfit.transform(X_plot)) 53 | 54 | plt.figure(figsize=(10, 6)) 55 | plt.scatter(X_train, y_train, color='blue', label='Training Data') 56 | plt.scatter(X_test, y_test, color='green', label='Testing Data') 57 | plt.plot(X_plot, y_plot_simple, color='red', label='Simple Model (degree=3)') 58 | plt.plot(X_plot, y_plot_overfit, color='purple', label='Overfit Model (degree=15)') 59 | plt.xlabel('X') 60 | plt.ylabel('y') 61 | plt.title('Overfitting: Simple vs Overfit Model') 62 | plt.legend() 63 | plt.grid(True) 64 | plt.savefig('overfitting.png') 65 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/01 Supervised Learning/02 Classification/03 Random Forest/RandomForest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_classification 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.metrics import accuracy_score, classification_report 7 | from sklearn.model_selection import train_test_split 8 | import seaborn as sns 9 | 10 | # Random Forest 11 | # This script demonstrates Random Forest classification. 12 | 13 | # Tasks: 14 | # 1. Generate synthetic classification data. 15 | # 2. Split data into training and testing sets. 16 | # 3. Train a Random Forest Classifier. 17 | # 4. Make predictions and evaluate performance (accuracy, classification report). 18 | # 5. Visualize decision boundary and feature importance. 
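# Optional sketch, not in the tasks above: the impurity-based feature_importances_
# plotted in Step 5 below can favour features with many split points; permutation
# importance measured on held-out data is a common cross-check.
def permutation_importance_sketch(fitted_model, X_test, y_test, random_state=42):
    """Return the mean permutation importance of each feature on the test set."""
    from sklearn.inspection import permutation_importance
    result = permutation_importance(fitted_model, X_test, y_test,
                                    n_repeats=10, random_state=random_state)
    return result.importances_mean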
19 | 20 | # Step 1: Generate synthetic data 21 | np.random.seed(42) 22 | X, y = make_classification(n_samples=300, n_features=2, n_classes=2, n_clusters_per_class=1, random_state=42) 23 | 24 | # Convert to DataFrame 25 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 26 | data['Target'] = y 27 | 28 | # Step 2: Split data 29 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 30 | 31 | # Step 3: Train Random Forest Classifier 32 | rf = RandomForestClassifier(n_estimators=100, random_state=42) 33 | rf.fit(X_train, y_train) 34 | 35 | # Step 4: Make predictions and evaluate 36 | y_pred = rf.predict(X_test) 37 | accuracy = accuracy_score(y_test, y_pred) 38 | print(f'Accuracy: {accuracy:.2f}') 39 | print('Classification Report:') 40 | print(classification_report(y_test, y_pred)) 41 | 42 | # Step 5: Visualize decision boundary and feature importance 43 | plt.figure(figsize=(12, 5)) 44 | 45 | # Decision boundary 46 | plt.subplot(1, 2, 1) 47 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 48 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 49 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) 50 | Z = rf.predict(np.c_[xx.ravel(), yy.ravel()]) 51 | Z = Z.reshape(xx.shape) 52 | plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm') 53 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', label='Training Data') 54 | plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, marker='x', label='Testing Data') 55 | plt.xlabel('Feature 1') 56 | plt.ylabel('Feature 2') 57 | plt.title('Random Forest: Decision Boundary') 58 | plt.legend() 59 | 60 | # Feature importance 61 | plt.subplot(1, 2, 2) 62 | sns.barplot(x=rf.feature_importances_, y=['Feature_1', 'Feature_2'], palette='viridis') 63 | plt.xlabel('Feature Importance') 64 | plt.title('Random Forest: Feature Importance') 65 | plt.tight_layout() 66 | plt.savefig('random_forest.png') 67 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/04 Model Evaluation/03 Underfitting/Underfitting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.metrics import mean_squared_error 7 | from sklearn.model_selection import train_test_split 8 | 9 | # Underfitting 10 | # This script demonstrates underfitting using a low-degree polynomial regression. 11 | 12 | # Tasks: 13 | # 1. Generate synthetic non-linear data. 14 | # 2. Train a low-degree polynomial regression model. 15 | # 3. Compare with a better-fitting model. 16 | # 4. Evaluate training and testing errors. 17 | # 5. Visualize underfitting. 
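# Optional sketch (an assumption, not in the tasks above): an underfit, high-bias model
# shows high and nearly identical training and validation error even as more data is
# added; scikit-learn's learning_curve computes both curves directly.
def learning_curve_sketch(model, X, y, cv=5):
    """Return (train_sizes, mean_train_mse, mean_validation_mse) for a regression estimator."""
    from sklearn.model_selection import learning_curve
    sizes, train_scores, val_scores = learning_curve(
        model, X, y, cv=cv, scoring='neg_mean_squared_error',
        train_sizes=np.linspace(0.2, 1.0, 5))
    return sizes, -train_scores.mean(axis=1), -val_scores.mean(axis=1)
# Usage: e.g. learning_curve_sketch(LinearRegression(), poly_underfit.fit_transform(X), y)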
18 | 19 | # Step 1: Generate synthetic data 20 | np.random.seed(42) 21 | X = np.sort(5 * np.random.rand(100, 1), axis=0) 22 | y = np.sin(X).ravel() + np.random.randn(100) * 0.1 23 | 24 | # Step 2: Split data 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 26 | 27 | # Step 3: Train models 28 | # Underfit model (degree 1) 29 | poly_underfit = PolynomialFeatures(degree=1) 30 | X_train_underfit = poly_underfit.fit_transform(X_train) 31 | X_test_underfit = poly_underfit.transform(X_test) 32 | model_underfit = LinearRegression() 33 | model_underfit.fit(X_train_underfit, y_train) 34 | y_pred_underfit = model_underfit.predict(X_test_underfit) 35 | mse_underfit = mean_squared_error(y_test, y_pred_underfit) 36 | 37 | # Better model (degree 3) 38 | poly_better = PolynomialFeatures(degree=3) 39 | X_train_better = poly_better.fit_transform(X_train) 40 | X_test_better = poly_better.transform(X_test) 41 | model_better = LinearRegression() 42 | model_better.fit(X_train_better, y_train) 43 | y_pred_better = model_better.predict(X_test_better) 44 | mse_better = mean_squared_error(y_test, y_pred_better) 45 | 46 | print(f'MSE (Underfit, degree=1): {mse_underfit:.2f}') 47 | print(f'MSE (Better, degree=3): {mse_better:.2f}') 48 | 49 | # Step 4: Visualize underfitting 50 | X_plot = np.linspace(X.min(), X.max(), 100).reshape(-1, 1) 51 | y_plot_underfit = model_underfit.predict(poly_underfit.transform(X_plot)) 52 | y_plot_better = model_better.predict(poly_better.transform(X_plot)) 53 | 54 | plt.figure(figsize=(10, 6)) 55 | plt.scatter(X_train, y_train, color='blue', label='Training Data') 56 | plt.scatter(X_test, y_test, color='green', label='Testing Data') 57 | plt.plot(X_plot, y_plot_underfit, color='red', label='Underfit Model (degree=1)') 58 | plt.plot(X_plot, y_plot_better, color='purple', label='Better Model (degree=3)') 59 | plt.xlabel('X') 60 | plt.ylabel('y') 61 | plt.title('Underfitting: Underfit vs Better Model') 62 | plt.legend() 63 | plt.grid(True) 64 | plt.savefig('underfitting.png') 65 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/04 Ensemble Methods/02 Boosting/02 Gradient Boosting/GradientBoosting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_classification 5 | from sklearn.ensemble import GradientBoostingClassifier 6 | from sklearn.metrics import accuracy_score, classification_report 7 | from sklearn.model_selection import train_test_split 8 | import seaborn as sns 9 | 10 | # Gradient Boosting 11 | # This script demonstrates Gradient Boosting classification. 12 | 13 | # Tasks: 14 | # 1. Generate synthetic classification data. 15 | # 2. Split data into training and testing sets. 16 | # 3. Train a Gradient Boosting Classifier. 17 | # 4. Make predictions and evaluate performance (accuracy, classification report). 18 | # 5. Visualize decision boundary and feature importance. 
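# Optional sketch, not in the tasks above: GradientBoostingClassifier exposes
# staged_predict, so test accuracy can be tracked per boosting stage; if the curve
# flattens early, n_estimators can be cut (or early stopping enabled with
# n_iter_no_change when the model is constructed).
def staged_accuracy(fitted_gb, X_test, y_test):
    """Return the test accuracy after each stage of a fitted GradientBoostingClassifier."""
    return [accuracy_score(y_test, y_stage) for y_stage in fitted_gb.staged_predict(X_test)]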
19 | 20 | # Step 1: Generate synthetic data 21 | np.random.seed(42) 22 | X, y = make_classification(n_samples=300, n_features=2, n_classes=2, n_clusters_per_class=1, random_state=42) 23 | 24 | # Convert to DataFrame 25 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 26 | data['Target'] = y 27 | 28 | # Step 2: Split data 29 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 30 | 31 | # Step 3: Train Gradient Boosting Classifier 32 | gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42) 33 | gb.fit(X_train, y_train) 34 | 35 | # Step 4: Make predictions and evaluate 36 | y_pred = gb.predict(X_test) 37 | accuracy = accuracy_score(y_test, y_pred) 38 | print(f'Accuracy: {accuracy:.2f}') 39 | print('Classification Report:') 40 | print(classification_report(y_test, y_pred)) 41 | 42 | # Step 5: Visualize decision boundary and feature importance 43 | plt.figure(figsize=(12, 5)) 44 | 45 | # Decision boundary 46 | plt.subplot(1, 2, 1) 47 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 48 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 49 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) 50 | Z = gb.predict(np.c_[xx.ravel(), yy.ravel()]) 51 | Z = Z.reshape(xx.shape) 52 | plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm') 53 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', label='Training Data') 54 | plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, marker='x', label='Testing Data') 55 | plt.xlabel('Feature 1') 56 | plt.ylabel('Feature 2') 57 | plt.title('Gradient Boosting: Decision Boundary') 58 | plt.legend() 59 | 60 | # Feature importance 61 | plt.subplot(1, 2, 2) 62 | sns.barplot(x=gb.feature_importances_, y=['Feature_1', 'Feature_2'], palette='viridis') 63 | plt.xlabel('Feature Importance') 64 | plt.title('Gradient Boosting: Feature Importance') 65 | plt.tight_layout() 66 | plt.savefig('gradient_boosting.png') 67 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/01 Supervised Learning/03 Evaluation Metrics/02 Classification Metrics/ClassificationMetrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import make_classification 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc 7 | from sklearn.model_selection import train_test_split 8 | import seaborn as sns 9 | 10 | # Classification Evaluation Metrics 11 | # This script demonstrates Accuracy, Precision, Recall, F1 Score, Confusion Matrix, ROC Curve, and AUC. 12 | 13 | # Tasks: 14 | # 1. Generate synthetic classification data. 15 | # 2. Split data into training and testing sets. 16 | # 3. Train a Logistic Regression model. 17 | # 4. Calculate classification metrics. 18 | # 5. Visualize confusion matrix and ROC curve. 
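# Optional sketch, not in the tasks above: interviewers often ask for the definitions
# behind these helpers. With scikit-learn's binary confusion-matrix layout
# [[TN, FP], [FN, TP]]: precision = TP/(TP+FP), recall = TP/(TP+FN),
# and F1 is their harmonic mean.
def metrics_from_confusion_matrix(cm):
    """Return (precision, recall, f1) computed from a 2x2 confusion matrix."""
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1
# Usage (after Step 4): compare metrics_from_confusion_matrix(cm) with the sklearn scores printed below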
19 | 20 | # Step 1: Generate synthetic data 21 | X, y = make_classification(n_samples=100, n_features=2, n_classes=2, n_clusters_per_class=1, random_state=42) 22 | 23 | # Convert to DataFrame 24 | data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2']) 25 | data['Target'] = y 26 | 27 | # Step 2: Split data 28 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 29 | 30 | # Step 3: Train Logistic Regression model 31 | model = LogisticRegression(random_state=42) 32 | model.fit(X_train, y_train) 33 | 34 | # Step 4: Make predictions and calculate metrics 35 | y_pred = model.predict(X_test) 36 | y_proba = model.predict_proba(X_test)[:, 1] # Probability for positive class 37 | 38 | accuracy = accuracy_score(y_test, y_pred) 39 | precision = precision_score(y_test, y_pred) 40 | recall = recall_score(y_test, y_pred) 41 | f1 = f1_score(y_test, y_pred) 42 | cm = confusion_matrix(y_test, y_pred) 43 | fpr, tpr, _ = roc_curve(y_test, y_proba) 44 | roc_auc = auc(fpr, tpr) 45 | 46 | print(f'Accuracy: {accuracy:.2f}') 47 | print(f'Precision: {precision:.2f}') 48 | print(f'Recall: {recall:.2f}') 49 | print(f'F1 Score: {f1:.2f}') 50 | print('Confusion Matrix:') 51 | print(cm) 52 | print(f'AUC Score: {roc_auc:.2f}') 53 | 54 | # Step 5: Visualize confusion matrix and ROC curve 55 | plt.figure(figsize=(12, 5)) 56 | 57 | # Confusion Matrix 58 | plt.subplot(1, 2, 1) 59 | sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False) 60 | plt.xlabel('Predicted') 61 | plt.ylabel('Actual') 62 | plt.title('Confusion Matrix') 63 | 64 | # ROC Curve 65 | plt.subplot(1, 2, 2) 66 | plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {roc_auc:.2f})') 67 | plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random') 68 | plt.xlabel('False Positive Rate') 69 | plt.ylabel('True Positive Rate') 70 | plt.title('ROC Curve') 71 | plt.legend() 72 | plt.grid(True) 73 | 74 | plt.tight_layout() 75 | plt.savefig('classification_metrics.png') 76 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/01 Data Preprocessing/03 Standardization/Standardization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.model_selection import train_test_split 9 | import seaborn as sns 10 | 11 | # Standardization Demonstration 12 | # This script focuses on Standardization techniques using StandardScaler on the Iris dataset. 13 | 14 | # Tasks: 15 | # 1. Load and explore the Iris dataset. 16 | # 2. Apply StandardScaler for standardization (zero mean, unit variance). 17 | # 3. Train a Logistic Regression model on standardized data. 18 | # 4. Compare performance with raw data. 19 | # 5. Visualize the effect of standardization. 
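# Optional sketch, not in the tasks above: StandardScaler applies z = (x - mean) / std
# per column, with the statistics taken from the data it was fit on. Doing it by hand
# makes that explicit (and shows why the scaler should be fit on training data only).
def standardize_by_hand(X_fit, X_other):
    """Standardize X_other using the per-column mean/std of X_fit, as a fitted scaler would."""
    mean = X_fit.mean(axis=0)
    std = X_fit.std(axis=0)
    return (X_other - mean) / std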
20 | 21 | # Step 1: Load data 22 | iris = load_iris() 23 | X = iris.data 24 | y = iris.target 25 | data = pd.DataFrame(X, columns=iris.feature_names) 26 | 27 | # Explore raw data 28 | print("Raw Data Statistics:") 29 | print(data.describe()) 30 | 31 | # Step 2: Apply Standardization 32 | standard_scaler = StandardScaler() 33 | X_standardized = standard_scaler.fit_transform(X) 34 | 35 | # Step 3: Train Logistic Regression 36 | X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 37 | X_train_std, X_test_std = train_test_split(X_standardized, test_size=0.2, random_state=42) 38 | 39 | # Raw data model 40 | model_raw = LogisticRegression(random_state=42, max_iter=200) 41 | model_raw.fit(X_train_raw, y_train) 42 | y_pred_raw = model_raw.predict(X_test_raw) 43 | acc_raw = accuracy_score(y_test, y_pred_raw) 44 | 45 | # Standardized data model 46 | model_std = LogisticRegression(random_state=42, max_iter=200) 47 | model_std.fit(X_train_std, y_train) 48 | y_pred_std = model_std.predict(X_test_std) 49 | acc_std = accuracy_score(y_test, y_pred_std) 50 | 51 | # Step 4: Print results 52 | print(f'\nAccuracy (Raw Data): {acc_raw:.2f}') 53 | print(f'Accuracy (Standardized): {acc_std:.2f}') 54 | 55 | # Step 5: Visualize 56 | plt.figure(figsize=(10, 6)) 57 | 58 | # Raw data 59 | plt.subplot(2, 1, 1) 60 | sns.boxplot(data=data) 61 | plt.title('Raw Features') 62 | plt.xticks(rotation=45) 63 | 64 | # Standardized data 65 | plt.subplot(2, 1, 2) 66 | sns.boxplot(data=pd.DataFrame(X_standardized, columns=iris.feature_names)) 67 | plt.title('Standardized Features (StandardScaler)') 68 | plt.xticks(rotation=45) 69 | 70 | plt.tight_layout() 71 | plt.savefig('standardization_effect.png') 72 | plt.close() 73 | 74 | # Additional: Check mean and variance after standardization 75 | standardized_data = pd.DataFrame(X_standardized, columns=iris.feature_names) 76 | print("\nStandardized Data Statistics (Mean ~ 0, Std ~ 1):") 77 | print(standardized_data.describe()) 78 | 79 | print("\nStandardization complete. Check 'standardization_effect.png' for visualization.") -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/01 Data Preprocessing/01 Feature Scaling/FeatureScaling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.preprocessing import MinMaxScaler, StandardScaler 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.model_selection import train_test_split 9 | import seaborn as sns 10 | 11 | # Feature Scaling 12 | # This script demonstrates Normalization and Standardization on the Iris dataset. 13 | 14 | # Tasks: 15 | # 1. Load the Iris dataset. 16 | # 2. Apply Normalization (MinMaxScaler) and Standardization (StandardScaler). 17 | # 3. Train a Logistic Regression model on scaled data. 18 | # 4. Compare model performance (accuracy). 19 | # 5. Visualize feature distributions before and after scaling. 
20 | 21 | # Step 1: Load data 22 | iris = load_iris() 23 | X = iris.data 24 | y = iris.target 25 | data = pd.DataFrame(X, columns=iris.feature_names) 26 | 27 | # Step 2: Apply scaling 28 | # Normalization (MinMaxScaler) 29 | minmax_scaler = MinMaxScaler() 30 | X_normalized = minmax_scaler.fit_transform(X) 31 | 32 | # Standardization (StandardScaler) 33 | standard_scaler = StandardScaler() 34 | X_standardized = standard_scaler.fit_transform(X) 35 | 36 | # Step 3: Train Logistic Regression on each dataset 37 | X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 38 | X_train_norm, X_test_norm = train_test_split(X_normalized, test_size=0.2, random_state=42) 39 | X_train_std, X_test_std = train_test_split(X_standardized, test_size=0.2, random_state=42) 40 | 41 | # Raw data 42 | model_raw = LogisticRegression(random_state=42) 43 | model_raw.fit(X_train_raw, y_train) 44 | y_pred_raw = model_raw.predict(X_test_raw) 45 | acc_raw = accuracy_score(y_test, y_pred_raw) 46 | 47 | # Normalized data 48 | model_norm = LogisticRegression(random_state=42) 49 | model_norm.fit(X_train_norm, y_train) 50 | y_pred_norm = model_norm.predict(X_test_norm) 51 | acc_norm = accuracy_score(y_test, y_pred_norm) 52 | 53 | # Standardized data 54 | model_std = LogisticRegression(random_state=42) 55 | model_std.fit(X_train_std, y_train) 56 | y_pred_std = model_std.predict(X_test_std) 57 | acc_std = accuracy_score(y_test, y_pred_std) 58 | 59 | print(f'Accuracy (Raw): {acc_raw:.2f}') 60 | print(f'Accuracy (Normalized): {acc_norm:.2f}') 61 | print(f'Accuracy (Standardized): {acc_std:.2f}') 62 | 63 | # Step 4: Visualize feature distributions 64 | plt.figure(figsize=(12, 8)) 65 | 66 | # Raw data 67 | plt.subplot(3, 1, 1) 68 | sns.boxplot(data=data) 69 | plt.title('Raw Features') 70 | plt.xticks(rotation=45) 71 | 72 | # Normalized data 73 | plt.subplot(3, 1, 2) 74 | sns.boxplot(data=pd.DataFrame(X_normalized, columns=iris.feature_names)) 75 | plt.title('Normalized Features (MinMaxScaler)') 76 | plt.xticks(rotation=45) 77 | 78 | # Standardized data 79 | plt.subplot(3, 1, 3) 80 | sns.boxplot(data=pd.DataFrame(X_standardized, columns=iris.feature_names)) 81 | plt.title('Standardized Features (StandardScaler)') 82 | plt.xticks(rotation=45) 83 | 84 | plt.tight_layout() 85 | plt.savefig('feature_scaling.png') 86 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/01 Data Preprocessing/06 Encoding Categorical Variables/EncodingCategoricalVariables.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | from sklearn.datasets import load_iris 4 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import accuracy_score 7 | from sklearn.model_selection import train_test_split 8 | import seaborn as sns 9 | 10 | # Encoding Categorical Variables 11 | # This script demonstrates Label Encoding and One-Hot Encoding. 12 | 13 | # Tasks: 14 | # 1. Create a synthetic dataset with categorical variables. 15 | # 2. Apply Label Encoding and One-Hot Encoding. 16 | # 3. Train a Logistic Regression model on encoded data. 17 | # 4. Compare model performance (accuracy). 18 | # 5. Visualize the effect of encoding. 
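import numpy as np  # needed by the synthetic-data step below but missing from the imports above

# Optional sketch, not in the tasks above: pandas.get_dummies is the quickest way to
# one-hot encode during exploration; OneHotEncoder (used below) is preferred inside
# pipelines because it remembers the category layout for later transforms.
def one_hot_with_pandas(df, column='Category'):
    """Return df with `column` replaced by 0/1 indicator columns via pd.get_dummies."""
    return pd.get_dummies(df, columns=[column])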
19 | 20 | # Step 1: Create synthetic dataset 21 | np.random.seed(42) 22 | data = pd.DataFrame({ 23 | 'Age': np.random.randint(20, 60, 100), 24 | 'Income': np.random.randint(30000, 100000, 100), 25 | 'Category': np.random.choice(['Low', 'Medium', 'High'], 100), 26 | 'Target': np.random.choice([0, 1], 100) 27 | }) 28 | 29 | # Step 2: Apply encoding 30 | # Label Encoding 31 | label_encoder = LabelEncoder() 32 | data['Category_Label'] = label_encoder.fit_transform(data['Category']) 33 | 34 | # One-Hot Encoding 35 | one_hot_encoder = OneHotEncoder(sparse=False) # on scikit-learn >= 1.2 use sparse_output=False (sparse was removed in 1.4) 36 | category_ohe = one_hot_encoder.fit_transform(data[['Category']]) 37 | category_ohe_df = pd.DataFrame(category_ohe, columns=one_hot_encoder.get_feature_names_out(['Category'])) 38 | 39 | data_encoded = pd.concat([data[['Age', 'Income']], category_ohe_df, data['Target']], axis=1) 40 | 41 | # Step 3: Train Logistic Regression 42 | X_label = data[['Age', 'Income', 'Category_Label']] 43 | X_ohe = data_encoded.drop('Target', axis=1) 44 | y = data['Target'] 45 | 46 | X_train_label, X_test_label, y_train, y_test = train_test_split(X_label, y, test_size=0.2, random_state=42) 47 | X_train_ohe, X_test_ohe = train_test_split(X_ohe, test_size=0.2, random_state=42) 48 | 49 | # Label encoded model 50 | model_label = LogisticRegression(random_state=42) 51 | model_label.fit(X_train_label, y_train) 52 | y_pred_label = model_label.predict(X_test_label) 53 | acc_label = accuracy_score(y_test, y_pred_label) 54 | 55 | # One-Hot encoded model 56 | model_ohe = LogisticRegression(random_state=42) 57 | model_ohe.fit(X_train_ohe, y_train) 58 | y_pred_ohe = model_ohe.predict(X_test_ohe) 59 | acc_ohe = accuracy_score(y_test, y_pred_ohe) 60 | 61 | print(f'Accuracy (Label Encoded): {acc_label:.2f}') 62 | print(f'Accuracy (One-Hot Encoded): {acc_ohe:.2f}') 63 | 64 | # Step 4: Visualize data distribution 65 | plt.figure(figsize=(12, 5)) 66 | 67 | # Label encoded 68 | plt.subplot(1, 2, 1) 69 | sns.countplot(x='Category_Label', hue='Target', data=data) 70 | plt.title('Label Encoded Categories') 71 | plt.xlabel('Category (Encoded)') 72 | 73 | # One-Hot encoded (show distribution of original categories) 74 | plt.subplot(1, 2, 2) 75 | sns.countplot(x='Category', hue='Target', data=data) 76 | plt.title('Original Categories') 77 | plt.xlabel('Category') 78 | 79 | plt.tight_layout() 80 | plt.savefig('encoding_categorical_variables.png') 81 | plt.close() -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/01 Data Preprocessing/02 Normalization/Normalization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.preprocessing import MinMaxScaler 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.model_selection import train_test_split 9 | import seaborn as sns 10 | 11 | # Normalization Demonstration 12 | # This script focuses on Normalization techniques using MinMaxScaler on the Iris dataset. 13 | 14 | # Tasks: 15 | # 1. Load and explore the Iris dataset. 16 | # 2. Apply MinMaxScaler for normalization (scales data to [0, 1] or custom range). 17 | # 3. Train a Logistic Regression model on normalized data. 18 | # 4. Compare performance with raw data. 19 | # 5. Visualize the effect of normalization.
20 | 21 | # Step 1: Load data 22 | iris = load_iris() 23 | X = iris.data 24 | y = iris.target 25 | data = pd.DataFrame(X, columns=iris.feature_names) 26 | 27 | # Explore raw data 28 | print("Raw Data Statistics:") 29 | print(data.describe()) 30 | 31 | # Step 2: Apply Normalization 32 | # Default range [0, 1] 33 | minmax_scaler = MinMaxScaler() 34 | X_normalized = minmax_scaler.fit_transform(X) 35 | 36 | # Custom range example [-1, 1] 37 | minmax_scaler_custom = MinMaxScaler(feature_range=(-1, 1)) 38 | X_normalized_custom = minmax_scaler_custom.fit_transform(X) 39 | 40 | # Step 3: Train Logistic Regression 41 | X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 42 | X_train_norm, X_test_norm = train_test_split(X_normalized, test_size=0.2, random_state=42) 43 | 44 | # Raw data model 45 | model_raw = LogisticRegression(random_state=42, max_iter=200) 46 | model_raw.fit(X_train_raw, y_train) 47 | y_pred_raw = model_raw.predict(X_test_raw) 48 | acc_raw = accuracy_score(y_test, y_pred_raw) 49 | 50 | # Normalized data model 51 | model_norm = LogisticRegression(random_state=42, max_iter=200) 52 | model_norm.fit(X_train_norm, y_train) 53 | y_pred_norm = model_norm.predict(X_test_norm) 54 | acc_norm = accuracy_score(y_test, y_pred_norm) 55 | 56 | # Step 4: Print results 57 | print(f'\nAccuracy (Raw Data): {acc_raw:.2f}') 58 | print(f'Accuracy (Normalized [0,1]): {acc_norm:.2f}') 59 | 60 | # Step 5: Visualize 61 | plt.figure(figsize=(10, 6)) 62 | 63 | # Raw data 64 | plt.subplot(2, 1, 1) 65 | sns.boxplot(data=data) 66 | plt.title('Raw Features') 67 | plt.xticks(rotation=45) 68 | 69 | # Normalized data [0, 1] 70 | plt.subplot(2, 1, 2) 71 | sns.boxplot(data=pd.DataFrame(X_normalized, columns=iris.feature_names)) 72 | plt.title('Normalized Features (MinMaxScaler [0,1])') 73 | plt.xticks(rotation=45) 74 | 75 | plt.tight_layout() 76 | plt.savefig('normalization_effect.png') 77 | plt.close() 78 | 79 | # Additional: Show custom range effect 80 | plt.figure(figsize=(6, 4)) 81 | sns.boxplot(data=pd.DataFrame(X_normalized_custom, columns=iris.feature_names)) 82 | plt.title('Normalized Features (MinMaxScaler [-1,1])') 83 | plt.xticks(rotation=45) 84 | plt.tight_layout() 85 | plt.savefig('normalization_custom_range.png') 86 | plt.close() 87 | 88 | print("\nNormalization complete. Check 'normalization_effect.png' and 'normalization_custom_range.png' for visualizations.") -------------------------------------------------------------------------------- /Machine Learning Foundations/03 ML Pipelines/01 Data Preprocessing/04 Handling Missing Values/HandlingMissingValues.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.impute import SimpleImputer 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.model_selection import train_test_split 9 | import seaborn as sns 10 | 11 | # Handling Missing Values 12 | # This script demonstrates techniques to handle missing values in a dataset. 13 | 14 | # Tasks: 15 | # 1. Load the Iris dataset and introduce synthetic missing values. 16 | # 2. Apply mean imputation and median imputation. 17 | # 3. Train a Logistic Regression model on imputed data. 18 | # 4. Compare model performance (accuracy). 19 | # 5. Visualize data distribution before and after imputation. 
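# Optional sketch (an assumption, not in the tasks above): mean/median imputation fills
# every gap in a column with the same value and ignores relationships between columns.
# KNNImputer fills each missing cell from the most similar rows instead; whether it
# helps on this data would need to be measured like the strategies below.
def knn_impute(X_with_nan, n_neighbors=5):
    """Return a copy of X with NaNs filled by scikit-learn's KNNImputer."""
    from sklearn.impute import KNNImputer
    return KNNImputer(n_neighbors=n_neighbors).fit_transform(X_with_nan)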
20 | 21 | # Step 1: Load data and introduce missing values 22 | iris = load_iris() 23 | X = iris.data 24 | y = iris.target 25 | data = pd.DataFrame(X, columns=iris.feature_names) 26 | 27 | # Introduce 10% missing values randomly 28 | np.random.seed(42) 29 | mask = np.random.rand(*X.shape) < 0.1 30 | X_with_missing = X.copy() 31 | X_with_missing[mask] = np.nan 32 | data_missing = pd.DataFrame(X_with_missing, columns=iris.feature_names) 33 | 34 | # Step 2: Apply imputation 35 | # Mean imputation 36 | mean_imputer = SimpleImputer(strategy='mean') 37 | X_mean_imputed = mean_imputer.fit_transform(X_with_missing) 38 | 39 | # Median imputation 40 | median_imputer = SimpleImputer(strategy='median') 41 | X_median_imputed = median_imputer.fit_transform(X_with_missing) 42 | 43 | # Step 3: Train Logistic Regression on each dataset 44 | X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 45 | X_train_mean, X_test_mean = train_test_split(X_mean_imputed, test_size=0.2, random_state=42) 46 | X_train_median, X_test_median = train_test_split(X_median_imputed, test_size=0.2, random_state=42) 47 | 48 | # Raw data (no missing values) 49 | model_raw = LogisticRegression(random_state=42) 50 | model_raw.fit(X_train_raw, y_train) 51 | y_pred_raw = model_raw.predict(X_test_raw) 52 | acc_raw = accuracy_score(y_test, y_pred_raw) 53 | 54 | # Mean imputed data 55 | model_mean = LogisticRegression(random_state=42) 56 | model_mean.fit(X_train_mean, y_train) 57 | y_pred_mean = model_mean.predict(X_test_mean) 58 | acc_mean = accuracy_score(y_test, y_pred_mean) 59 | 60 | # Median imputed data 61 | model_median = LogisticRegression(random_state=42) 62 | model_median.fit(X_train_median, y_train) 63 | y_pred_median = model_median.predict(X_test_median) 64 | acc_median = accuracy_score(y_test, y_pred_median) 65 | 66 | print(f'Accuracy (Raw): {acc_raw:.2f}') 67 | print(f'Accuracy (Mean Imputed): {acc_mean:.2f}') 68 | print(f'Accuracy (Median Imputed): {acc_median:.2f}') 69 | 70 | # Step 4: Visualize data distribution 71 | plt.figure(figsize=(12, 8)) 72 | 73 | # Original data with missing values 74 | plt.subplot(3, 1, 1) 75 | sns.boxplot(data=data_missing) 76 | plt.title('Data with Missing Values') 77 | plt.xticks(rotation=45) 78 | 79 | # Mean imputed data 80 | plt.subplot(3, 1, 2) 81 | sns.boxplot(data=pd.DataFrame(X_mean_imputed, columns=iris.feature_names)) 82 | plt.title('Mean Imputed Data') 83 | plt.xticks(rotation=45) 84 | 85 | # Median imputed data 86 | plt.subplot(3, 1, 3) 87 | sns.boxplot(data=pd.DataFrame(X_median_imputed, columns=iris.feature_names)) 88 | plt.title('Median Imputed Data') 89 | plt.xticks(rotation=45) 90 | 91 | plt.tight_layout() 92 | plt.savefig('handling_missing_values.png') 93 | plt.close() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🤖 Machine Learning Interview Preparation 2 | 3 |
Your comprehensive guide to mastering Machine Learning for AI/ML interviews
9 | 10 | --- 11 | 12 | ## 📖 Introduction 13 | 14 | Welcome to my Machine Learning prep for AI/ML interviews! 🚀 This repository is your go-to guide for mastering ML, the heart of AI, with hands-on practice and interview-focused insights. From core concepts to advanced techniques, it’s designed to help you excel in technical interviews and AI projects with clarity and confidence. 15 | 16 | ## 🌟 What’s Inside? 17 | 18 | - **Algorithms Mastery**: Conquer regression, classification, clustering, and more to ace coding tests. 19 | - **Pipelines Expertise**: Master preprocessing, evaluation, and deployment workflows. 20 | - **Hands-on Practice**: Implement ML algorithms with detailed solutions to sharpen your edge. 21 | - **Interview Question Bank**: Dive into ML theory with clear, concise answers. 22 | - **Performance Optimization**: Learn tips for building efficient, interview-ready models. 23 | 24 | ## 🔍 Who Is This For? 25 | 26 | - Data Scientists prepping for technical interviews. 27 | - Machine Learning Engineers strengthening ML foundations. 28 | - AI Researchers enhancing algorithm skills. 29 | - Software Engineers transitioning to AI/ML roles. 30 | - Anyone mastering ML for data-driven applications. 31 | 32 | ## 🗺️ Comprehensive Learning Roadmap 33 | 34 | --- 35 | 36 | ### 🤖 Machine Learning Foundations 37 | 38 | #### 📈 Supervised Learning 39 | - Regression 40 | - Linear Regression 41 | - Polynomial Regression 42 | - Ridge Regression 43 | - Lasso Regression 44 | - Classification 45 | - Logistic Regression 46 | - Decision Trees 47 | - Random Forest 48 | - Naive Bayes 49 | - K-Nearest Neighbors (KNN) 50 | - Support Vector Machines (SVM) 51 | - Evaluation Metrics 52 | - Regression Metrics 53 | - Mean Squared Error 54 | - Mean Absolute Error 55 | - R² Score 56 | - Classification Metrics 57 | - Accuracy 58 | - Precision 59 | - Recall 60 | - F1 Score 61 | - Confusion Matrix 62 | - ROC Curve 63 | - AUC Score 64 | 65 | #### 📊 Unsupervised Learning 66 | - Clustering 67 | - K-Means Clustering 68 | - Hierarchical Clustering 69 | - DBSCAN 70 | - Mean Shift 71 | - Dimensionality Reduction 72 | - Principal Component Analysis (PCA) 73 | - t-Distributed Stochastic Neighbor Embedding (t-SNE) 74 | - Linear Discriminant Analysis (LDA) 75 | - Association Rules 76 | - Apriori Algorithm 77 | - FP-Growth 78 | 79 | #### 🛠️ ML Pipelines 80 | - Data Preprocessing 81 | - Feature Scaling 82 | - Normalization 83 | - Standardization 84 | - Handling Missing Values 85 | - Outlier Detection 86 | - Encoding Categorical Variables 87 | - Feature Engineering 88 | - Feature Selection 89 | - Polynomial Features 90 | - Interaction Terms 91 | - Binning 92 | - Model Selection 93 | - Train-Test Split 94 | - K-Fold Cross-Validation 95 | - Stratified K-Fold 96 | - Grid Search 97 | - Random Search 98 | - Model Evaluation 99 | - Bias-Variance Tradeoff 100 | - Overfitting 101 | - Underfitting 102 | - Deployment 103 | - Model Serialization 104 | - API Integration 105 | 106 | #### 🎯 Ensemble Methods 107 | - Bagging 108 | - Bootstrap Aggregating 109 | - Random Forest 110 | - Boosting 111 | - AdaBoost 112 | - Gradient Boosting 113 | 114 | --- 115 | 116 | ## 💡 Why Master Machine Learning for AI/ML? 117 | 118 | Machine Learning fuels AI innovation, and here’s why: 119 | 1. **Versatility**: Powers predictive modeling and data insights. 120 | 2. **Industry Demand**: A core skill for 6 LPA+ AI/ML roles. 121 | 3. **Real-World Impact**: Solves complex problems across domains. 122 | 4. 
**Evolving Field**: Continuous learning with cutting-edge tools. 123 | 5. **Community Support**: Backed by a thriving network of experts. 124 | 125 | This repo is my toolkit for mastering ML for technical interviews and AI/ML careers—let’s build that expertise together! 126 | 127 | ## 📆 Study Plan 128 | 129 | - **Week 1-2**: Supervised Learning Basics 130 | - **Week 3-4**: Unsupervised Learning and Evaluation 131 | - **Week 5-6**: ML Pipelines and Preprocessing 132 | - **Week 7-8**: Feature Engineering and Model Selection 133 | - **Week 9-10**: Ensemble Methods and Deployment 134 | - **Week 11-12**: Interview Practice and Optimization 135 | 136 | ## 🤝 Contributions 137 | 138 | Love to collaborate? Here’s how! 🌟 139 | 1. Fork the repository. 140 | 2. Create a feature branch (`git checkout -b feature/amazing-addition`). 141 | 3. Commit your changes (`git commit -m 'Add some amazing content'`). 142 | 4. Push to the branch (`git push origin feature/amazing-addition`). 143 | 5. Open a Pull Request. 144 | 145 | --- 146 | 147 | Happy Learning and Good Luck with Your Interviews! ✨
149 |