├── .gitignore
├── LICENSE
├── README.md
├── mlfromscratch
│   ├── __init__.py
│   ├── adaboost.py
│   ├── decision_tree.py
│   ├── kmeans.py
│   ├── knn.py
│   ├── lda.py
│   ├── linear_regression.py
│   ├── load_data.py
│   ├── logistic_regression.py
│   ├── naivebayes.py
│   ├── pca.py
│   ├── perceptron.py
│   ├── random_forest.py
│   ├── regression.py
│   └── svm.py
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | .idea/
3 | .spyproject/
4 | 
5 | __pycache__/
6 | 
7 | .env
8 | TODO
9 | 
10 | .DS_STORE
11 | 
12 | # -- Others -- #
13 | # Byte-compiled / optimized / DLL files
14 | __pycache__/
15 | *.py[cod]
16 | *$py.class
17 | 
18 | # C extensions
19 | *.so
20 | 
21 | # Distribution / packaging
22 | .Python
23 | build/
24 | develop-eggs/
25 | dist/
26 | downloads/
27 | eggs/
28 | .eggs/
29 | lib/
30 | lib64/
31 | parts/
32 | sdist/
33 | var/
34 | wheels/
35 | share/python-wheels/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 | MANIFEST
40 | 
41 | # PyInstaller
42 | # Usually these files are written by a python script from a template
43 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
44 | *.manifest
45 | *.spec
46 | 
47 | # Installer logs
48 | pip-log.txt
49 | pip-delete-this-directory.txt
50 | 
51 | # Unit test / coverage reports
52 | htmlcov/
53 | .tox/
54 | .nox/
55 | .coverage
56 | .coverage.*
57 | .cache
58 | nosetests.xml
59 | coverage.xml
60 | *.cover
61 | *.py,cover
62 | .hypothesis/
63 | .pytest_cache/
64 | cover/
65 | 
66 | # Translations
67 | *.mo
68 | *.pot
69 | 
70 | # Django stuff:
71 | *.log
72 | local_settings.py
73 | db.sqlite3
74 | db.sqlite3-journal
75 | 
76 | # Flask stuff:
77 | instance/
78 | .webassets-cache
79 | 
80 | # Scrapy stuff:
81 | .scrapy
82 | 
83 | # Sphinx documentation
84 | docs/_build/
85 | 
86 | # PyBuilder
87 | .pybuilder/
88 | target/
89 | 
90 | # Jupyter Notebook
91 | .ipynb_checkpoints
92 | 
93 | # IPython
94 | profile_default/
95 | ipython_config.py
96 | 
97 | # pyenv
98 | # For a library or package, you might want to ignore these files since the code is
99 | # intended to run in multiple environments; otherwise, check them in:
100 | # .python-version
101 | 
102 | # pipenv
103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
106 | # install all needed dependencies.
107 | #Pipfile.lock
108 | 
109 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
110 | __pypackages__/
111 | 
112 | # Celery stuff
113 | celerybeat-schedule
114 | celerybeat.pid
115 | 
116 | # SageMath parsed files
117 | *.sage.py
118 | 
119 | # Environments
120 | .env
121 | .venv
122 | env/
123 | venv/
124 | ENV/
125 | env.bak/
126 | venv.bak/
127 | 
128 | # Spyder project settings
129 | .spyderproject
130 | .spyproject
131 | 
132 | # Rope project settings
133 | .ropeproject
134 | 
135 | # mkdocs documentation
136 | /site
137 | 
138 | # mypy
139 | .mypy_cache/
140 | .dmypy.json
141 | dmypy.json
142 | 
143 | # Pyre type checker
144 | .pyre/
145 | 
146 | # pytype static type analyzer
147 | .pytype/
148 | 
149 | # Cython debug symbols
150 | cython_debug/
151 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Patrick Loeber
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ML algorithms from Scratch!
2 | 
3 | > Machine Learning algorithm implementations from scratch.
4 | 
5 | You can find tutorials with the math and code explanations on my channel: [here](https://www.youtube.com/playlist?list=PLqnslRFeH2Upcrywf-u2etjdxxkL8nl7E)
6 | 
7 | ## Algorithms Implemented
8 | 
9 | - KNN
10 | - Linear Regression
11 | - Logistic Regression
12 | - Naive Bayes
13 | - Perceptron
14 | - SVM
15 | - Decision Tree
16 | - Random Forest
17 | - Principal Component Analysis (PCA)
18 | - K-Means
19 | - AdaBoost
20 | - Linear Discriminant Analysis (LDA)
21 | 
22 | ## Installation and usage
23 | 
24 | This project has four dependencies:
25 | 
26 | - `numpy` for the math and for writing the algorithms
27 | - `scikit-learn` for data generation and testing
28 | - `matplotlib` for plotting
29 | - `pandas` for loading data
30 | 
31 | **NOTE**: Only `numpy` is used for the implementations themselves. The other
32 | packages are there for loading data, testing, and plotting, so we don't have
33 | to write those parts from scratch as well.
34 | 
35 | You can install these using the command below!
36 | 
37 | ```sh
38 | # Linux or MacOS
39 | pip3 install -r requirements.txt
40 | 
41 | # Windows
42 | pip install -r requirements.txt
43 | ```
44 | 
45 | You can run the files as follows:
46 | 
47 | ```sh
48 | python -m mlfromscratch.<filename>
49 | ```
50 | 
51 | with `<filename>` being the filename of the algorithm, without the extension.
52 | 
53 | For example, if I want to run the linear regression example, I would run
54 | `python -m mlfromscratch.linear_regression`
55 | 
56 | ## Watch the Playlist
57 | 
58 | [![Alt text](https://img.youtube.com/vi/ngLyX54e1LU/hqdefault.jpg)](https://www.youtube.com/watch?v=ngLyX54e1LU&list=PLqnslRFeH2Upcrywf-u2etjdxxkL8nl7E)
59 | 
--------------------------------------------------------------------------------
/mlfromscratch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patrickloeber/MLfromscratch/7f0f18ada1f75d1999a5206b5126459d51f73dce/mlfromscratch/__init__.py
--------------------------------------------------------------------------------
/mlfromscratch/adaboost.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | # Decision stump used as weak classifier
5 | class DecisionStump:
6 |     def __init__(self):
7 |         self.polarity = 1
8 |         self.feature_idx = None
9 |         self.threshold = None
10 |         self.alpha = None
11 | 
12 |     def predict(self, X):
13 |         n_samples = X.shape[0]
14 |         X_column = X[:, self.feature_idx]
15 |         predictions = np.ones(n_samples)
16 |         if self.polarity == 1:
17 |             predictions[X_column < self.threshold] = -1
18 |         else:
19 |             predictions[X_column > self.threshold] = -1
20 | 
21 |         return predictions
22 | 
23 | 
24 | class Adaboost:
25 |     def __init__(self, n_clf=5):
26 |         self.n_clf = n_clf
27 |         self.clfs = []
28 | 
29 |     def fit(self, X, y):
30 |         n_samples, n_features = X.shape
31 | 
32 |         # Initialize weights to 1/N
33 |         w = np.full(n_samples, (1 / n_samples))
34 | 
35 |         self.clfs = []
36 | 
37 |         # Iterate through classifiers
38 |         for _ in range(self.n_clf):
39 |             clf = DecisionStump()
40 |             min_error = float("inf")
41 | 
42 |             # greedy search to find best threshold and feature
43 |             for feature_i in range(n_features):
44 |                 X_column = X[:, feature_i]
45 |                 thresholds = np.unique(X_column)
46 | 
47 |                 for threshold in thresholds:
48 |                     # predict with polarity 1
49 |                     p = 1
50 |                     predictions = np.ones(n_samples)
51 |                     predictions[X_column < threshold] = -1
52 | 
53 |                     # Error = sum of weights of misclassified samples
54 |                     misclassified = w[y != predictions]
55 |                     error = sum(misclassified)
56 | 
57 |                     if error > 0.5:
58 |                         error = 1 - error
59 |                         p = -1
60 | 
61 |                     # store the best configuration
62 |                     if error < min_error:
63 |                         clf.polarity = p
64 |                         clf.threshold = threshold
65 |                         clf.feature_idx = feature_i
66 |                         min_error = error
67 | 
68 |             # calculate alpha
69 |             EPS = 1e-10
70 |             clf.alpha = 0.5 * np.log((1.0 - min_error + EPS) / (min_error + EPS))
71 | 
72 |             # calculate predictions and update weights
73 |             predictions = clf.predict(X)
74 | 
75 |             w *= np.exp(-clf.alpha * y * predictions)
76 |             # Normalize to one
77 |             w /= np.sum(w)
78 | 
79 |             # Save classifier
80 |             self.clfs.append(clf)
81 | 
82 |     def predict(self, X):
83 |         clf_preds = [clf.alpha * clf.predict(X) for clf in self.clfs]
84 |         y_pred = np.sum(clf_preds, axis=0)
85 |         y_pred = np.sign(y_pred)
86 | 
87 |         return y_pred
88 | 
89 | 
90 | # Testing
91 | if __name__ == "__main__":
92 |     # Imports
93 |     from sklearn import datasets
94 |     from sklearn.model_selection import train_test_split
95 | 
96 |     def accuracy(y_true, y_pred):
97 |         accuracy = np.sum(y_true == y_pred) / len(y_true)
98 |         return accuracy
99 | 
100 |     data = datasets.load_breast_cancer()
101 |     X, y = data.data, data.target
102 | 
103 | 
y[y == 0] = -1 104 | 105 | X_train, X_test, y_train, y_test = train_test_split( 106 | X, y, test_size=0.2, random_state=5 107 | ) 108 | 109 | # Adaboost classification with 5 weak classifiers 110 | clf = Adaboost(n_clf=5) 111 | clf.fit(X_train, y_train) 112 | y_pred = clf.predict(X_test) 113 | 114 | acc = accuracy(y_test, y_pred) 115 | print("Accuracy:", acc) 116 | -------------------------------------------------------------------------------- /mlfromscratch/decision_tree.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import numpy as np 4 | 5 | 6 | def entropy(y): 7 | hist = np.bincount(y) 8 | ps = hist / len(y) 9 | return -np.sum([p * np.log2(p) for p in ps if p > 0]) 10 | 11 | 12 | class Node: 13 | def __init__( 14 | self, feature=None, threshold=None, left=None, right=None, *, value=None 15 | ): 16 | self.feature = feature 17 | self.threshold = threshold 18 | self.left = left 19 | self.right = right 20 | self.value = value 21 | 22 | def is_leaf_node(self): 23 | return self.value is not None 24 | 25 | 26 | class DecisionTree: 27 | def __init__(self, min_samples_split=2, max_depth=100, n_feats=None): 28 | self.min_samples_split = min_samples_split 29 | self.max_depth = max_depth 30 | self.n_feats = n_feats 31 | self.root = None 32 | 33 | def fit(self, X, y): 34 | self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1]) 35 | self.root = self._grow_tree(X, y) 36 | 37 | def predict(self, X): 38 | return np.array([self._traverse_tree(x, self.root) for x in X]) 39 | 40 | def _grow_tree(self, X, y, depth=0): 41 | n_samples, n_features = X.shape 42 | n_labels = len(np.unique(y)) 43 | 44 | # stopping criteria 45 | if ( 46 | depth >= self.max_depth 47 | or n_labels == 1 48 | or n_samples < self.min_samples_split 49 | ): 50 | leaf_value = self._most_common_label(y) 51 | return Node(value=leaf_value) 52 | 53 | feat_idxs = np.random.choice(n_features, self.n_feats, replace=False) 54 | 55 | # greedily select the best split according to information gain 56 | best_feat, best_thresh = self._best_criteria(X, y, feat_idxs) 57 | 58 | # grow the children that result from the split 59 | left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh) 60 | left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1) 61 | right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1) 62 | return Node(best_feat, best_thresh, left, right) 63 | 64 | def _best_criteria(self, X, y, feat_idxs): 65 | best_gain = -1 66 | split_idx, split_thresh = None, None 67 | for feat_idx in feat_idxs: 68 | X_column = X[:, feat_idx] 69 | thresholds = np.unique(X_column) 70 | for threshold in thresholds: 71 | gain = self._information_gain(y, X_column, threshold) 72 | 73 | if gain > best_gain: 74 | best_gain = gain 75 | split_idx = feat_idx 76 | split_thresh = threshold 77 | 78 | return split_idx, split_thresh 79 | 80 | def _information_gain(self, y, X_column, split_thresh): 81 | # parent loss 82 | parent_entropy = entropy(y) 83 | 84 | # generate split 85 | left_idxs, right_idxs = self._split(X_column, split_thresh) 86 | 87 | if len(left_idxs) == 0 or len(right_idxs) == 0: 88 | return 0 89 | 90 | # compute the weighted avg. of the loss for the children 91 | n = len(y) 92 | n_l, n_r = len(left_idxs), len(right_idxs) 93 | e_l, e_r = entropy(y[left_idxs]), entropy(y[right_idxs]) 94 | child_entropy = (n_l / n) * e_l + (n_r / n) * e_r 95 | 96 | # information gain is difference in loss before vs. 
after split 97 | ig = parent_entropy - child_entropy 98 | return ig 99 | 100 | def _split(self, X_column, split_thresh): 101 | left_idxs = np.argwhere(X_column <= split_thresh).flatten() 102 | right_idxs = np.argwhere(X_column > split_thresh).flatten() 103 | return left_idxs, right_idxs 104 | 105 | def _traverse_tree(self, x, node): 106 | if node.is_leaf_node(): 107 | return node.value 108 | 109 | if x[node.feature] <= node.threshold: 110 | return self._traverse_tree(x, node.left) 111 | return self._traverse_tree(x, node.right) 112 | 113 | def _most_common_label(self, y): 114 | counter = Counter(y) 115 | most_common = counter.most_common(1)[0][0] 116 | return most_common 117 | 118 | 119 | if __name__ == "__main__": 120 | # Imports 121 | from sklearn import datasets 122 | from sklearn.model_selection import train_test_split 123 | 124 | def accuracy(y_true, y_pred): 125 | accuracy = np.sum(y_true == y_pred) / len(y_true) 126 | return accuracy 127 | 128 | data = datasets.load_breast_cancer() 129 | X, y = data.data, data.target 130 | 131 | X_train, X_test, y_train, y_test = train_test_split( 132 | X, y, test_size=0.2, random_state=1234 133 | ) 134 | 135 | clf = DecisionTree(max_depth=10) 136 | clf.fit(X_train, y_train) 137 | 138 | y_pred = clf.predict(X_test) 139 | acc = accuracy(y_test, y_pred) 140 | 141 | print("Accuracy:", acc) 142 | -------------------------------------------------------------------------------- /mlfromscratch/kmeans.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | np.random.seed(42) 5 | 6 | 7 | def euclidean_distance(x1, x2): 8 | return np.sqrt(np.sum((x1 - x2) ** 2)) 9 | 10 | 11 | class KMeans: 12 | def __init__(self, K=5, max_iters=100, plot_steps=False): 13 | self.K = K 14 | self.max_iters = max_iters 15 | self.plot_steps = plot_steps 16 | 17 | # list of sample indices for each cluster 18 | self.clusters = [[] for _ in range(self.K)] 19 | # the centers (mean feature vector) for each cluster 20 | self.centroids = [] 21 | 22 | def predict(self, X): 23 | self.X = X 24 | self.n_samples, self.n_features = X.shape 25 | 26 | # initialize 27 | random_sample_idxs = np.random.choice(self.n_samples, self.K, replace=False) 28 | self.centroids = [self.X[idx] for idx in random_sample_idxs] 29 | 30 | # Optimize clusters 31 | for _ in range(self.max_iters): 32 | # Assign samples to closest centroids (create clusters) 33 | self.clusters = self._create_clusters(self.centroids) 34 | 35 | if self.plot_steps: 36 | self.plot() 37 | 38 | # Calculate new centroids from the clusters 39 | centroids_old = self.centroids 40 | self.centroids = self._get_centroids(self.clusters) 41 | 42 | # check if clusters have changed 43 | if self._is_converged(centroids_old, self.centroids): 44 | break 45 | 46 | if self.plot_steps: 47 | self.plot() 48 | 49 | # Classify samples as the index of their clusters 50 | return self._get_cluster_labels(self.clusters) 51 | 52 | def _get_cluster_labels(self, clusters): 53 | # each sample will get the label of the cluster it was assigned to 54 | labels = np.empty(self.n_samples) 55 | 56 | for cluster_idx, cluster in enumerate(clusters): 57 | for sample_index in cluster: 58 | labels[sample_index] = cluster_idx 59 | return labels 60 | 61 | def _create_clusters(self, centroids): 62 | # Assign the samples to the closest centroids to create clusters 63 | clusters = [[] for _ in range(self.K)] 64 | for idx, sample in enumerate(self.X): 65 | centroid_idx = 
self._closest_centroid(sample, centroids) 66 | clusters[centroid_idx].append(idx) 67 | return clusters 68 | 69 | def _closest_centroid(self, sample, centroids): 70 | # distance of the current sample to each centroid 71 | distances = [euclidean_distance(sample, point) for point in centroids] 72 | closest_index = np.argmin(distances) 73 | return closest_index 74 | 75 | def _get_centroids(self, clusters): 76 | # assign mean value of clusters to centroids 77 | centroids = np.zeros((self.K, self.n_features)) 78 | for cluster_idx, cluster in enumerate(clusters): 79 | cluster_mean = np.mean(self.X[cluster], axis=0) 80 | centroids[cluster_idx] = cluster_mean 81 | return centroids 82 | 83 | def _is_converged(self, centroids_old, centroids): 84 | # distances between each old and new centroids, fol all centroids 85 | distances = [ 86 | euclidean_distance(centroids_old[i], centroids[i]) for i in range(self.K) 87 | ] 88 | return sum(distances) == 0 89 | 90 | def plot(self): 91 | fig, ax = plt.subplots(figsize=(12, 8)) 92 | 93 | for i, index in enumerate(self.clusters): 94 | point = self.X[index].T 95 | ax.scatter(*point) 96 | 97 | for point in self.centroids: 98 | ax.scatter(*point, marker="x", color="black", linewidth=2) 99 | 100 | plt.show() 101 | 102 | 103 | # Testing 104 | if __name__ == "__main__": 105 | from sklearn.datasets import make_blobs 106 | 107 | X, y = make_blobs( 108 | centers=3, n_samples=500, n_features=2, shuffle=True, random_state=40 109 | ) 110 | print(X.shape) 111 | 112 | clusters = len(np.unique(y)) 113 | print(clusters) 114 | 115 | k = KMeans(K=clusters, max_iters=150, plot_steps=True) 116 | y_pred = k.predict(X) 117 | 118 | k.plot() 119 | -------------------------------------------------------------------------------- /mlfromscratch/knn.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import numpy as np 4 | 5 | 6 | def euclidean_distance(x1, x2): 7 | return np.sqrt(np.sum((x1 - x2) ** 2)) 8 | 9 | 10 | class KNN: 11 | def __init__(self, k=3): 12 | self.k = k 13 | 14 | def fit(self, X, y): 15 | self.X_train = X 16 | self.y_train = y 17 | 18 | def predict(self, X): 19 | y_pred = [self._predict(x) for x in X] 20 | return np.array(y_pred) 21 | 22 | def _predict(self, x): 23 | # Compute distances between x and all examples in the training set 24 | distances = [euclidean_distance(x, x_train) for x_train in self.X_train] 25 | # Sort by distance and return indices of the first k neighbors 26 | k_idx = np.argsort(distances)[: self.k] 27 | # Extract the labels of the k nearest neighbor training samples 28 | k_neighbor_labels = [self.y_train[i] for i in k_idx] 29 | # return the most common class label 30 | most_common = Counter(k_neighbor_labels).most_common(1) 31 | return most_common[0][0] 32 | 33 | 34 | if __name__ == "__main__": 35 | # Imports 36 | from matplotlib.colors import ListedColormap 37 | from sklearn import datasets 38 | from sklearn.model_selection import train_test_split 39 | 40 | cmap = ListedColormap(["#FF0000", "#00FF00", "#0000FF"]) 41 | 42 | def accuracy(y_true, y_pred): 43 | accuracy = np.sum(y_true == y_pred) / len(y_true) 44 | return accuracy 45 | 46 | iris = datasets.load_iris() 47 | X, y = iris.data, iris.target 48 | 49 | X_train, X_test, y_train, y_test = train_test_split( 50 | X, y, test_size=0.2, random_state=1234 51 | ) 52 | 53 | k = 3 54 | clf = KNN(k=k) 55 | clf.fit(X_train, y_train) 56 | predictions = clf.predict(X_test) 57 | print("KNN classification accuracy", accuracy(y_test, 
predictions)) 58 | -------------------------------------------------------------------------------- /mlfromscratch/lda.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LDA: 5 | def __init__(self, n_components): 6 | self.n_components = n_components 7 | self.linear_discriminants = None 8 | 9 | def fit(self, X, y): 10 | n_features = X.shape[1] 11 | class_labels = np.unique(y) 12 | 13 | # Within class scatter matrix: 14 | # SW = sum((X_c - mean_X_c)^2 ) 15 | 16 | # Between class scatter: 17 | # SB = sum( n_c * (mean_X_c - mean_overall)^2 ) 18 | 19 | mean_overall = np.mean(X, axis=0) 20 | SW = np.zeros((n_features, n_features)) 21 | SB = np.zeros((n_features, n_features)) 22 | for c in class_labels: 23 | X_c = X[y == c] 24 | mean_c = np.mean(X_c, axis=0) 25 | # (4, n_c) * (n_c, 4) = (4,4) -> transpose 26 | SW += (X_c - mean_c).T.dot((X_c - mean_c)) 27 | 28 | # (4, 1) * (1, 4) = (4,4) -> reshape 29 | n_c = X_c.shape[0] 30 | mean_diff = (mean_c - mean_overall).reshape(n_features, 1) 31 | SB += n_c * (mean_diff).dot(mean_diff.T) 32 | 33 | # Determine SW^-1 * SB 34 | A = np.linalg.inv(SW).dot(SB) 35 | # Get eigenvalues and eigenvectors of SW^-1 * SB 36 | eigenvalues, eigenvectors = np.linalg.eig(A) 37 | # -> eigenvector v = [:,i] column vector, transpose for easier calculations 38 | # sort eigenvalues high to low 39 | eigenvectors = eigenvectors.T 40 | idxs = np.argsort(abs(eigenvalues))[::-1] 41 | eigenvalues = eigenvalues[idxs] 42 | eigenvectors = eigenvectors[idxs] 43 | # store first n eigenvectors 44 | self.linear_discriminants = eigenvectors[0 : self.n_components] 45 | 46 | def transform(self, X): 47 | # project data 48 | return np.dot(X, self.linear_discriminants.T) 49 | 50 | 51 | # Testing 52 | if __name__ == "__main__": 53 | # Imports 54 | import matplotlib.pyplot as plt 55 | from sklearn import datasets 56 | 57 | data = datasets.load_iris() 58 | X, y = data.data, data.target 59 | 60 | # Project the data onto the 2 primary linear discriminants 61 | lda = LDA(2) 62 | lda.fit(X, y) 63 | X_projected = lda.transform(X) 64 | 65 | print("Shape of X:", X.shape) 66 | print("Shape of transformed X:", X_projected.shape) 67 | 68 | x1, x2 = X_projected[:, 0], X_projected[:, 1] 69 | 70 | plt.scatter( 71 | x1, x2, c=y, edgecolor="none", alpha=0.8, cmap=plt.cm.get_cmap("viridis", 3) 72 | ) 73 | 74 | plt.xlabel("Linear Discriminant 1") 75 | plt.ylabel("Linear Discriminant 2") 76 | plt.colorbar() 77 | plt.show() 78 | -------------------------------------------------------------------------------- /mlfromscratch/linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def r2_score(y_true, y_pred): 5 | corr_matrix = np.corrcoef(y_true, y_pred) 6 | corr = corr_matrix[0, 1] 7 | return corr ** 2 8 | 9 | 10 | class LinearRegression: 11 | def __init__(self, learning_rate=0.001, n_iters=1000): 12 | self.lr = learning_rate 13 | self.n_iters = n_iters 14 | self.weights = None 15 | self.bias = None 16 | 17 | def fit(self, X, y): 18 | n_samples, n_features = X.shape 19 | 20 | # init parameters 21 | self.weights = np.zeros(n_features) 22 | self.bias = 0 23 | 24 | # gradient descent 25 | for _ in range(self.n_iters): 26 | y_predicted = np.dot(X, self.weights) + self.bias 27 | # compute gradients 28 | dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y)) 29 | db = (1 / n_samples) * np.sum(y_predicted - y) 30 | 31 | # update parameters 32 | self.weights -= self.lr * 
dw 33 | self.bias -= self.lr * db 34 | 35 | def predict(self, X): 36 | y_approximated = np.dot(X, self.weights) + self.bias 37 | return y_approximated 38 | 39 | 40 | # Testing 41 | if __name__ == "__main__": 42 | # Imports 43 | import matplotlib.pyplot as plt 44 | from sklearn.model_selection import train_test_split 45 | from sklearn import datasets 46 | 47 | def mean_squared_error(y_true, y_pred): 48 | return np.mean((y_true - y_pred) ** 2) 49 | 50 | X, y = datasets.make_regression( 51 | n_samples=100, n_features=1, noise=20, random_state=4 52 | ) 53 | 54 | X_train, X_test, y_train, y_test = train_test_split( 55 | X, y, test_size=0.2, random_state=1234 56 | ) 57 | 58 | regressor = LinearRegression(learning_rate=0.01, n_iters=1000) 59 | regressor.fit(X_train, y_train) 60 | predictions = regressor.predict(X_test) 61 | 62 | mse = mean_squared_error(y_test, predictions) 63 | print("MSE:", mse) 64 | 65 | accu = r2_score(y_test, predictions) 66 | print("Accuracy:", accu) 67 | 68 | y_pred_line = regressor.predict(X) 69 | cmap = plt.get_cmap("viridis") 70 | fig = plt.figure(figsize=(8, 6)) 71 | m1 = plt.scatter(X_train, y_train, color=cmap(0.9), s=10) 72 | m2 = plt.scatter(X_test, y_test, color=cmap(0.5), s=10) 73 | plt.plot(X, y_pred_line, color="black", linewidth=2, label="Prediction") 74 | plt.show() 75 | -------------------------------------------------------------------------------- /mlfromscratch/load_data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # Download data from https://archive.ics.uci.edu/ml/datasets/spambase 6 | FILE_NAME = "spambase.data" 7 | 8 | # 1) load with csv file 9 | with open(FILE_NAME, "r") as f: 10 | data = list(csv.reader(f, delimiter=",")) 11 | 12 | data = np.array(data, dtype=np.float32) 13 | print(data.shape) 14 | 15 | # 2) load with np.loadtxt() 16 | # skiprows=1 17 | data = np.loadtxt(FILE_NAME, delimiter=",", dtype=np.float32) 18 | print(data.shape, data.dtype) 19 | 20 | # 3) load with np.genfromtxt() 21 | # skip_header=0, missing_values="---", filling_values=0.0 22 | data = np.genfromtxt(FILE_NAME, delimiter=",", dtype=np.float32) 23 | print(data.shape) 24 | 25 | # split into X and y 26 | n_samples, n_features = data.shape 27 | n_features -= 1 28 | 29 | X = data[:, 0:n_features] 30 | y = data[:, n_features] 31 | 32 | print(X.shape, y.shape) 33 | print(X[0, 0:5]) 34 | # or if y is the first column 35 | # X = data[:, 1:n_features+1] 36 | # y = data[:, 0] 37 | 38 | # 4) load with pandas: read_csv() 39 | # na_values = ['---'] 40 | df = pd.read_csv(FILE_NAME, header=None, skiprows=0, dtype=np.float32) 41 | df = df.fillna(0.0) 42 | 43 | # dataframe to numpy 44 | data = df.to_numpy() 45 | print(data[4, 0:5]) 46 | 47 | # convert datatypes in numpy 48 | # data = np.asarray(data, dtype = np.float32) 49 | # print(data.dtype) 50 | -------------------------------------------------------------------------------- /mlfromscratch/logistic_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LogisticRegression: 5 | def __init__(self, learning_rate=0.001, n_iters=1000): 6 | self.lr = learning_rate 7 | self.n_iters = n_iters 8 | self.weights = None 9 | self.bias = None 10 | 11 | def fit(self, X, y): 12 | n_samples, n_features = X.shape 13 | 14 | # init parameters 15 | self.weights = np.zeros(n_features) 16 | self.bias = 0 17 | 18 | # gradient descent 19 | for _ in range(self.n_iters): 20 | 
# approximate y with linear combination of weights and x, plus bias 21 | linear_model = np.dot(X, self.weights) + self.bias 22 | # apply sigmoid function 23 | y_predicted = self._sigmoid(linear_model) 24 | 25 | # compute gradients 26 | dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y)) 27 | db = (1 / n_samples) * np.sum(y_predicted - y) 28 | # update parameters 29 | self.weights -= self.lr * dw 30 | self.bias -= self.lr * db 31 | 32 | def predict(self, X): 33 | linear_model = np.dot(X, self.weights) + self.bias 34 | y_predicted = self._sigmoid(linear_model) 35 | y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted] 36 | return np.array(y_predicted_cls) 37 | 38 | def _sigmoid(self, x): 39 | return 1 / (1 + np.exp(-x)) 40 | 41 | 42 | # Testing 43 | if __name__ == "__main__": 44 | # Imports 45 | from sklearn.model_selection import train_test_split 46 | from sklearn import datasets 47 | 48 | def accuracy(y_true, y_pred): 49 | accuracy = np.sum(y_true == y_pred) / len(y_true) 50 | return accuracy 51 | 52 | bc = datasets.load_breast_cancer() 53 | X, y = bc.data, bc.target 54 | 55 | X_train, X_test, y_train, y_test = train_test_split( 56 | X, y, test_size=0.2, random_state=1234 57 | ) 58 | 59 | regressor = LogisticRegression(learning_rate=0.0001, n_iters=1000) 60 | regressor.fit(X_train, y_train) 61 | predictions = regressor.predict(X_test) 62 | 63 | print("LR classification accuracy:", accuracy(y_test, predictions)) 64 | -------------------------------------------------------------------------------- /mlfromscratch/naivebayes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class NaiveBayes: 5 | def fit(self, X, y): 6 | n_samples, n_features = X.shape 7 | self._classes = np.unique(y) 8 | n_classes = len(self._classes) 9 | 10 | # calculate mean, var, and prior for each class 11 | self._mean = np.zeros((n_classes, n_features), dtype=np.float64) 12 | self._var = np.zeros((n_classes, n_features), dtype=np.float64) 13 | self._priors = np.zeros(n_classes, dtype=np.float64) 14 | 15 | for idx, c in enumerate(self._classes): 16 | X_c = X[y == c] 17 | self._mean[idx, :] = X_c.mean(axis=0) 18 | self._var[idx, :] = X_c.var(axis=0) 19 | self._priors[idx] = X_c.shape[0] / float(n_samples) 20 | 21 | def predict(self, X): 22 | y_pred = [self._predict(x) for x in X] 23 | return np.array(y_pred) 24 | 25 | def _predict(self, x): 26 | posteriors = [] 27 | 28 | # calculate posterior probability for each class 29 | for idx, c in enumerate(self._classes): 30 | prior = np.log(self._priors[idx]) 31 | posterior = np.sum(np.log(self._pdf(idx, x))) 32 | posterior = prior + posterior 33 | posteriors.append(posterior) 34 | 35 | # return class with highest posterior probability 36 | return self._classes[np.argmax(posteriors)] 37 | 38 | def _pdf(self, class_idx, x): 39 | mean = self._mean[class_idx] 40 | var = self._var[class_idx] 41 | numerator = np.exp(-((x - mean) ** 2) / (2 * var)) 42 | denominator = np.sqrt(2 * np.pi * var) 43 | return numerator / denominator 44 | 45 | 46 | # Testing 47 | if __name__ == "__main__": 48 | # Imports 49 | from sklearn.model_selection import train_test_split 50 | from sklearn import datasets 51 | 52 | def accuracy(y_true, y_pred): 53 | accuracy = np.sum(y_true == y_pred) / len(y_true) 54 | return accuracy 55 | 56 | X, y = datasets.make_classification( 57 | n_samples=1000, n_features=10, n_classes=2, random_state=123 58 | ) 59 | X_train, X_test, y_train, y_test = train_test_split( 60 | X, y, test_size=0.2, 
random_state=123 61 | ) 62 | 63 | nb = NaiveBayes() 64 | nb.fit(X_train, y_train) 65 | predictions = nb.predict(X_test) 66 | 67 | print("Naive Bayes classification accuracy", accuracy(y_test, predictions)) 68 | -------------------------------------------------------------------------------- /mlfromscratch/pca.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class PCA: 5 | def __init__(self, n_components): 6 | self.n_components = n_components 7 | self.components = None 8 | self.mean = None 9 | 10 | def fit(self, X): 11 | # Mean centering 12 | self.mean = np.mean(X, axis=0) 13 | X = X - self.mean 14 | 15 | # covariance, function needs samples as columns 16 | cov = np.cov(X.T) 17 | 18 | # eigenvalues, eigenvectors 19 | eigenvalues, eigenvectors = np.linalg.eig(cov) 20 | 21 | # -> eigenvector v = [:,i] column vector, transpose for easier calculations 22 | # sort eigenvectors 23 | eigenvectors = eigenvectors.T 24 | idxs = np.argsort(eigenvalues)[::-1] 25 | eigenvalues = eigenvalues[idxs] 26 | eigenvectors = eigenvectors[idxs] 27 | 28 | # store first n eigenvectors 29 | self.components = eigenvectors[0 : self.n_components] 30 | 31 | def transform(self, X): 32 | # project data 33 | X = X - self.mean 34 | return np.dot(X, self.components.T) 35 | 36 | 37 | # Testing 38 | if __name__ == "__main__": 39 | # Imports 40 | import matplotlib.pyplot as plt 41 | from sklearn import datasets 42 | 43 | # data = datasets.load_digits() 44 | data = datasets.load_iris() 45 | X = data.data 46 | y = data.target 47 | 48 | # Project the data onto the 2 primary principal components 49 | pca = PCA(2) 50 | pca.fit(X) 51 | X_projected = pca.transform(X) 52 | 53 | print("Shape of X:", X.shape) 54 | print("Shape of transformed X:", X_projected.shape) 55 | 56 | x1 = X_projected[:, 0] 57 | x2 = X_projected[:, 1] 58 | 59 | plt.scatter( 60 | x1, x2, c=y, edgecolor="none", alpha=0.8, cmap=plt.cm.get_cmap("viridis", 3) 61 | ) 62 | 63 | plt.xlabel("Principal Component 1") 64 | plt.ylabel("Principal Component 2") 65 | plt.colorbar() 66 | plt.show() 67 | -------------------------------------------------------------------------------- /mlfromscratch/perceptron.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Perceptron: 5 | def __init__(self, learning_rate=0.01, n_iters=1000): 6 | self.lr = learning_rate 7 | self.n_iters = n_iters 8 | self.activation_func = self._unit_step_func 9 | self.weights = None 10 | self.bias = None 11 | 12 | def fit(self, X, y): 13 | n_samples, n_features = X.shape 14 | 15 | # init parameters 16 | self.weights = np.zeros(n_features) 17 | self.bias = 0 18 | 19 | y_ = np.array([1 if i > 0 else 0 for i in y]) 20 | 21 | for _ in range(self.n_iters): 22 | 23 | for idx, x_i in enumerate(X): 24 | 25 | linear_output = np.dot(x_i, self.weights) + self.bias 26 | y_predicted = self.activation_func(linear_output) 27 | 28 | # Perceptron update rule 29 | update = self.lr * (y_[idx] - y_predicted) 30 | 31 | self.weights += update * x_i 32 | self.bias += update 33 | 34 | def predict(self, X): 35 | linear_output = np.dot(X, self.weights) + self.bias 36 | y_predicted = self.activation_func(linear_output) 37 | return y_predicted 38 | 39 | def _unit_step_func(self, x): 40 | return np.where(x >= 0, 1, 0) 41 | 42 | 43 | # Testing 44 | if __name__ == "__main__": 45 | # Imports 46 | import matplotlib.pyplot as plt 47 | from sklearn.model_selection import train_test_split 48 | from 
sklearn import datasets 49 | 50 | def accuracy(y_true, y_pred): 51 | accuracy = np.sum(y_true == y_pred) / len(y_true) 52 | return accuracy 53 | 54 | X, y = datasets.make_blobs( 55 | n_samples=150, n_features=2, centers=2, cluster_std=1.05, random_state=2 56 | ) 57 | X_train, X_test, y_train, y_test = train_test_split( 58 | X, y, test_size=0.2, random_state=123 59 | ) 60 | 61 | p = Perceptron(learning_rate=0.01, n_iters=1000) 62 | p.fit(X_train, y_train) 63 | predictions = p.predict(X_test) 64 | 65 | print("Perceptron classification accuracy", accuracy(y_test, predictions)) 66 | 67 | fig = plt.figure() 68 | ax = fig.add_subplot(1, 1, 1) 69 | plt.scatter(X_train[:, 0], X_train[:, 1], marker="o", c=y_train) 70 | 71 | x0_1 = np.amin(X_train[:, 0]) 72 | x0_2 = np.amax(X_train[:, 0]) 73 | 74 | x1_1 = (-p.weights[0] * x0_1 - p.bias) / p.weights[1] 75 | x1_2 = (-p.weights[0] * x0_2 - p.bias) / p.weights[1] 76 | 77 | ax.plot([x0_1, x0_2], [x1_1, x1_2], "k") 78 | 79 | ymin = np.amin(X_train[:, 1]) 80 | ymax = np.amax(X_train[:, 1]) 81 | ax.set_ylim([ymin - 3, ymax + 3]) 82 | 83 | plt.show() 84 | -------------------------------------------------------------------------------- /mlfromscratch/random_forest.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import numpy as np 4 | 5 | from .decision_tree import DecisionTree 6 | 7 | 8 | def bootstrap_sample(X, y): 9 | n_samples = X.shape[0] 10 | idxs = np.random.choice(n_samples, n_samples, replace=True) 11 | return X[idxs], y[idxs] 12 | 13 | 14 | def most_common_label(y): 15 | counter = Counter(y) 16 | most_common = counter.most_common(1)[0][0] 17 | return most_common 18 | 19 | 20 | class RandomForest: 21 | def __init__(self, n_trees=10, min_samples_split=2, max_depth=100, n_feats=None): 22 | self.n_trees = n_trees 23 | self.min_samples_split = min_samples_split 24 | self.max_depth = max_depth 25 | self.n_feats = n_feats 26 | self.trees = [] 27 | 28 | def fit(self, X, y): 29 | self.trees = [] 30 | for _ in range(self.n_trees): 31 | tree = DecisionTree( 32 | min_samples_split=self.min_samples_split, 33 | max_depth=self.max_depth, 34 | n_feats=self.n_feats, 35 | ) 36 | X_samp, y_samp = bootstrap_sample(X, y) 37 | tree.fit(X_samp, y_samp) 38 | self.trees.append(tree) 39 | 40 | def predict(self, X): 41 | tree_preds = np.array([tree.predict(X) for tree in self.trees]) 42 | tree_preds = np.swapaxes(tree_preds, 0, 1) 43 | y_pred = [most_common_label(tree_pred) for tree_pred in tree_preds] 44 | return np.array(y_pred) 45 | 46 | 47 | # Testing 48 | if __name__ == "__main__": 49 | # Imports 50 | from sklearn import datasets 51 | from sklearn.model_selection import train_test_split 52 | 53 | def accuracy(y_true, y_pred): 54 | accuracy = np.sum(y_true == y_pred) / len(y_true) 55 | return accuracy 56 | 57 | data = datasets.load_breast_cancer() 58 | X = data.data 59 | y = data.target 60 | 61 | X_train, X_test, y_train, y_test = train_test_split( 62 | X, y, test_size=0.2, random_state=1234 63 | ) 64 | 65 | clf = RandomForest(n_trees=3, max_depth=10) 66 | 67 | clf.fit(X_train, y_train) 68 | y_pred = clf.predict(X_test) 69 | acc = accuracy(y_test, y_pred) 70 | 71 | print("Accuracy:", acc) 72 | -------------------------------------------------------------------------------- /mlfromscratch/regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BaseRegression: 5 | def __init__(self, learning_rate: float = 0.001, 
n_iters: int = 1000): 6 | # Assign the variables 7 | self.learning_rate = learning_rate 8 | self.n_iters = n_iters 9 | 10 | # Weights and bias 11 | self.weights, self.bias = None, None 12 | 13 | def fit(self, X, y): 14 | n_samples, n_features = X.shape 15 | 16 | self.weights, self.bias = np.zeros(n_features), 0 17 | 18 | # Minimizing loss, and finding the correct Weights and biases using Gradient Descent 19 | for _ in range(self.n_iters): 20 | y_predicted = self._approximation(X, self.weights, self.bias) 21 | 22 | dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y)) 23 | db = (1 / n_samples) * np.sum(y_predicted - y) 24 | 25 | self.weights -= self.learning_rate * dw 26 | self.bias -= self.learning_rate * db 27 | 28 | def predict(self, X): 29 | return self._predict(X, self.weights, self.bias) 30 | 31 | def _predict(self, X, w, b): 32 | raise NotImplementedError 33 | 34 | def _approximation(self, X, w, b): 35 | raise NotImplementedError 36 | 37 | 38 | class LinearRegression(BaseRegression): 39 | def _approximation(self, X, w, b): 40 | return np.dot(X, w) + b 41 | 42 | def _predict(self, X, w, b): 43 | return np.dot(X, w) + b 44 | 45 | 46 | class LogisticRegression(BaseRegression): 47 | def _approximation(self, X, w, b): 48 | linear_model = np.dot(X, w) + b 49 | return self._sigmoid(linear_model) 50 | 51 | def _predict(self, X, w, b): 52 | linear_model = np.dot(X, w) + b 53 | y_predicted = self._sigmoid(linear_model) 54 | y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted] 55 | return np.array(y_predicted_cls) 56 | 57 | def _sigmoid(self, x): 58 | return 1 / (np.exp(-x) + 1) 59 | 60 | 61 | # Testing 62 | if __name__ == "__main__": 63 | # Imports 64 | from sklearn.model_selection import train_test_split 65 | from sklearn import datasets 66 | 67 | # Utils 68 | def r2_score(y_true, y_pred): 69 | corr_matrix = np.corrcoef(y_true, y_pred) 70 | corr = corr_matrix[0, 1] 71 | return corr ** 2 72 | 73 | def mean_squared_error(y_true, y_pred): 74 | return np.mean((y_true - y_pred) ** 2) 75 | 76 | def accuracy(y_true, y_pred): 77 | accuracy = np.sum(y_true == y_pred) / len(y_true) 78 | return accuracy 79 | 80 | # Linear Regression 81 | X, y = datasets.make_regression( 82 | n_samples=100, n_features=1, noise=20, random_state=4 83 | ) 84 | 85 | X_train, X_test, y_train, y_test = train_test_split( 86 | X, y, test_size=0.2, random_state=1234 87 | ) 88 | 89 | regressor = LinearRegression(learning_rate=0.01, n_iters=1000) 90 | regressor.fit(X_train, y_train) 91 | predictions = regressor.predict(X_test) 92 | 93 | accu = r2_score(y_test, predictions) 94 | print("Linear reg Accuracy:", accu) 95 | 96 | # Logistic reg 97 | bc = datasets.load_breast_cancer() 98 | X, y = bc.data, bc.target 99 | 100 | X_train, X_test, y_train, y_test = train_test_split( 101 | X, y, test_size=0.2, random_state=1234 102 | ) 103 | 104 | regressor = LogisticRegression(learning_rate=0.0001, n_iters=1000) 105 | regressor.fit(X_train, y_train) 106 | predictions = regressor.predict(X_test) 107 | 108 | print("Logistic reg classification accuracy:", accuracy(y_test, predictions)) 109 | -------------------------------------------------------------------------------- /mlfromscratch/svm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class SVM: 5 | def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000): 6 | self.lr = learning_rate 7 | self.lambda_param = lambda_param 8 | self.n_iters = n_iters 9 | self.w = None 10 | self.b = None 11 | 12 | def 
fit(self, X, y): 13 | n_samples, n_features = X.shape 14 | 15 | y_ = np.where(y <= 0, -1, 1) 16 | 17 | self.w = np.zeros(n_features) 18 | self.b = 0 19 | 20 | for _ in range(self.n_iters): 21 | for idx, x_i in enumerate(X): 22 | condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1 23 | if condition: 24 | self.w -= self.lr * (2 * self.lambda_param * self.w) 25 | else: 26 | self.w -= self.lr * ( 27 | 2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]) 28 | ) 29 | self.b -= self.lr * y_[idx] 30 | 31 | def predict(self, X): 32 | approx = np.dot(X, self.w) - self.b 33 | return np.sign(approx) 34 | 35 | 36 | # Testing 37 | if __name__ == "__main__": 38 | # Imports 39 | from sklearn import datasets 40 | import matplotlib.pyplot as plt 41 | 42 | X, y = datasets.make_blobs( 43 | n_samples=50, n_features=2, centers=2, cluster_std=1.05, random_state=40 44 | ) 45 | y = np.where(y == 0, -1, 1) 46 | 47 | clf = SVM() 48 | clf.fit(X, y) 49 | # predictions = clf.predict(X) 50 | 51 | print(clf.w, clf.b) 52 | 53 | def visualize_svm(): 54 | def get_hyperplane_value(x, w, b, offset): 55 | return (-w[0] * x + b + offset) / w[1] 56 | 57 | fig = plt.figure() 58 | ax = fig.add_subplot(1, 1, 1) 59 | plt.scatter(X[:, 0], X[:, 1], marker="o", c=y) 60 | 61 | x0_1 = np.amin(X[:, 0]) 62 | x0_2 = np.amax(X[:, 0]) 63 | 64 | x1_1 = get_hyperplane_value(x0_1, clf.w, clf.b, 0) 65 | x1_2 = get_hyperplane_value(x0_2, clf.w, clf.b, 0) 66 | 67 | x1_1_m = get_hyperplane_value(x0_1, clf.w, clf.b, -1) 68 | x1_2_m = get_hyperplane_value(x0_2, clf.w, clf.b, -1) 69 | 70 | x1_1_p = get_hyperplane_value(x0_1, clf.w, clf.b, 1) 71 | x1_2_p = get_hyperplane_value(x0_2, clf.w, clf.b, 1) 72 | 73 | ax.plot([x0_1, x0_2], [x1_1, x1_2], "y--") 74 | ax.plot([x0_1, x0_2], [x1_1_m, x1_2_m], "k") 75 | ax.plot([x0_1, x0_2], [x1_1_p, x1_2_p], "k") 76 | 77 | x1_min = np.amin(X[:, 1]) 78 | x1_max = np.amax(X[:, 1]) 79 | ax.set_ylim([x1_min - 3, x1_max + 3]) 80 | 81 | plt.show() 82 | 83 | visualize_svm() 84 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.0 2 | scikit-learn==0.24.2 3 | matplotlib==3.4.2 4 | pandas==1.2.4 5 | --------------------------------------------------------------------------------