├── .gitignore
├── LICENSE
├── README.md
├── mlfromscratch
│   ├── __init__.py
│   ├── adaboost.py
│   ├── decision_tree.py
│   ├── kmeans.py
│   ├── knn.py
│   ├── lda.py
│   ├── linear_regression.py
│   ├── load_data.py
│   ├── logistic_regression.py
│   ├── naivebayes.py
│   ├── pca.py
│   ├── perceptron.py
│   ├── random_forest.py
│   ├── regression.py
│   └── svm.py
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | .idea/
3 | .spyproject/
4 | 
5 | __pycache__/
6 | 
7 | .env
8 | TODO
9 | 
10 | .DS_STORE
11 | 
12 | # -- Others -- #
13 | # Byte-compiled / optimized / DLL files
14 | __pycache__/
15 | *.py[cod]
16 | *$py.class
17 | 
18 | # C extensions
19 | *.so
20 | 
21 | # Distribution / packaging
22 | .Python
23 | build/
24 | develop-eggs/
25 | dist/
26 | downloads/
27 | eggs/
28 | .eggs/
29 | lib/
30 | lib64/
31 | parts/
32 | sdist/
33 | var/
34 | wheels/
35 | share/python-wheels/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 | MANIFEST
40 | 
41 | # PyInstaller
42 | # Usually these files are written by a python script from a template
43 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
44 | *.manifest
45 | *.spec
46 | 
47 | # Installer logs
48 | pip-log.txt
49 | pip-delete-this-directory.txt
50 | 
51 | # Unit test / coverage reports
52 | htmlcov/
53 | .tox/
54 | .nox/
55 | .coverage
56 | .coverage.*
57 | .cache
58 | nosetests.xml
59 | coverage.xml
60 | *.cover
61 | *.py,cover
62 | .hypothesis/
63 | .pytest_cache/
64 | cover/
65 | 
66 | # Translations
67 | *.mo
68 | *.pot
69 | 
70 | # Django stuff:
71 | *.log
72 | local_settings.py
73 | db.sqlite3
74 | db.sqlite3-journal
75 | 
76 | # Flask stuff:
77 | instance/
78 | .webassets-cache
79 | 
80 | # Scrapy stuff:
81 | .scrapy
82 | 
83 | # Sphinx documentation
84 | docs/_build/
85 | 
86 | # PyBuilder
87 | .pybuilder/
88 | target/
89 | 
90 | # Jupyter Notebook
91 | .ipynb_checkpoints
92 | 
93 | # IPython
94 | profile_default/
95 | ipython_config.py
96 | 
97 | # pyenv
98 | # For a library or package, you might want to ignore these files since the code is
99 | # intended to run in multiple environments; otherwise, check them in:
100 | # .python-version
101 | 
102 | # pipenv
103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
106 | # install all needed dependencies.
107 | #Pipfile.lock
108 | 
109 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
110 | __pypackages__/
111 | 
112 | # Celery stuff
113 | celerybeat-schedule
114 | celerybeat.pid
115 | 
116 | # SageMath parsed files
117 | *.sage.py
118 | 
119 | # Environments
120 | .env
121 | .venv
122 | env/
123 | venv/
124 | ENV/
125 | env.bak/
126 | venv.bak/
127 | 
128 | # Spyder project settings
129 | .spyderproject
130 | .spyproject
131 | 
132 | # Rope project settings
133 | .ropeproject
134 | 
135 | # mkdocs documentation
136 | /site
137 | 
138 | # mypy
139 | .mypy_cache/
140 | .dmypy.json
141 | dmypy.json
142 | 
143 | # Pyre type checker
144 | .pyre/
145 | 
146 | # pytype static type analyzer
147 | .pytype/
148 | 
149 | # Cython debug symbols
150 | cython_debug/
151 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Patrick Loeber
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ML algorithms from Scratch!
2 | 
3 | > Machine Learning algorithm implementations from scratch.
4 | 
5 | You can find tutorials with the math and code explanations on my channel: [here](https://www.youtube.com/playlist?list=PLqnslRFeH2Upcrywf-u2etjdxxkL8nl7E)
6 | 
7 | ## Algorithms Implemented
8 | 
9 | - KNN
10 | - Linear Regression
11 | - Logistic Regression
12 | - Naive Bayes
13 | - Perceptron
14 | - SVM
15 | - Decision Tree
16 | - Random Forest
17 | - Principal Component Analysis (PCA)
18 | - K-Means
19 | - AdaBoost
20 | - Linear Discriminant Analysis (LDA)
21 | 
22 | ## Installation and usage
23 | 
24 | This project has four dependencies:
25 | 
26 | - `numpy` for the math and for writing the algorithms
27 | - `scikit-learn` for data generation and testing
28 | - `matplotlib` for plotting
29 | - `pandas` for loading data
30 | 
31 | **NOTE**: Only `numpy` is used for the implementations themselves. The other
32 | packages are there for loading data, testing, and plotting, so we don't have
33 | to write those parts from scratch as well.
34 | 
35 | You can install these using the command below!
36 | 
37 | ```sh
38 | # Linux or MacOS
39 | pip3 install -r requirements.txt
40 | 
41 | # Windows
42 | pip install -r requirements.txt
43 | ```
44 | 
45 | You can run the files as follows:
46 | 
47 | ```sh
48 | python -m mlfromscratch.<filename>
49 | ```
50 | 
51 | with `<filename>` being the filename of the algorithm, without the extension.
52 | 
53 | For example, if I want to run the linear regression example, I would run
54 | `python -m mlfromscratch.linear_regression`
55 | 
56 | ## Watch the Playlist
57 | 
58 | [![Alt text](https://img.youtube.com/vi/ngLyX54e1LU/hqdefault.jpg)](https://www.youtube.com/watch?v=ngLyX54e1LU&list=PLqnslRFeH2Upcrywf-u2etjdxxkL8nl7E)
59 | 
--------------------------------------------------------------------------------
/mlfromscratch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patrickloeber/MLfromscratch/7f0f18ada1f75d1999a5206b5126459d51f73dce/mlfromscratch/__init__.py
--------------------------------------------------------------------------------
/mlfromscratch/adaboost.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | # Decision stump used as weak classifier
5 | class DecisionStump:
6 |     def __init__(self):
7 |         self.polarity = 1
8 |         self.feature_idx = None
9 |         self.threshold = None
10 |         self.alpha = None
11 | 
12 |     def predict(self, X):
13 |         n_samples = X.shape[0]
14 |         X_column = X[:, self.feature_idx]
15 |         predictions = np.ones(n_samples)
16 |         if self.polarity == 1:
17 |             predictions[X_column < self.threshold] = -1
18 |         else:
19 |             predictions[X_column > self.threshold] = -1
20 | 
21 |         return predictions
22 | 
23 | 
24 | class Adaboost:
25 |     def __init__(self, n_clf=5):
26 |         self.n_clf = n_clf
27 |         self.clfs = []
28 | 
29 |     def fit(self, X, y):
30 |         n_samples, n_features = X.shape
31 | 
32 |         # Initialize weights to 1/N
33 |         w = np.full(n_samples, (1 / n_samples))
34 | 
35 |         self.clfs = []
36 | 
37 |         # Iterate through classifiers
38 |         for _ in range(self.n_clf):
39 |             clf = DecisionStump()
40 |             min_error = float("inf")
41 | 
42 |             # greedy search to find best threshold and feature
43 |             for feature_i in range(n_features):
44 |                 X_column = X[:, feature_i]
45 |                 thresholds = np.unique(X_column)
46 | 
47 |                 for threshold in thresholds:
48 |                     # predict with polarity 1
49 |                     p = 1
50 |                     predictions = np.ones(n_samples)
51 |                     predictions[X_column < threshold] = -1
52 | 
53 |                     # Error = sum of weights of misclassified samples
54 |                     misclassified = w[y != predictions]
55 |                     error = sum(misclassified)
56 | 
57 |                     if error > 0.5:
58 |                         error = 1 - error
59 |                         p = -1
60 | 
61 |                     # store the best configuration
62 |                     if error < min_error:
63 |                         clf.polarity = p
64 |                         clf.threshold = threshold
65 |                         clf.feature_idx = feature_i
66 |                         min_error = error
67 | 
68 |             # calculate alpha
69 |             EPS = 1e-10
70 |             clf.alpha = 0.5 * np.log((1.0 - min_error + EPS) / (min_error + EPS))
71 | 
72 |             # calculate predictions and update weights
73 |             predictions = clf.predict(X)
74 | 
75 |             w *= np.exp(-clf.alpha * y * predictions)
76 |             # Normalize to one
77 |             w /= np.sum(w)
78 | 
79 |             # Save classifier
80 |             self.clfs.append(clf)
81 | 
82 |     def predict(self, X):
83 |         clf_preds = [clf.alpha * clf.predict(X) for clf in self.clfs]
84 |         y_pred = np.sum(clf_preds, axis=0)
85 |         y_pred = np.sign(y_pred)
86 | 
87 |         return y_pred
88 | 
89 | 
90 | # Testing
91 | if __name__ == "__main__":
92 |     # Imports
93 |     from sklearn import datasets
94 |     from sklearn.model_selection import train_test_split
95 | 
96 |     def accuracy(y_true, y_pred):
97 |         accuracy = np.sum(y_true == y_pred) / len(y_true)
98 |         return accuracy
99 | 
100 |     data = datasets.load_breast_cancer()
101 |     X, y = data.data, data.target
102 | 
103 | 
y[y == 0] = -1 104 | 105 | X_train, X_test, y_train, y_test = train_test_split( 106 | X, y, test_size=0.2, random_state=5 107 | ) 108 | 109 | # Adaboost classification with 5 weak classifiers 110 | clf = Adaboost(n_clf=5) 111 | clf.fit(X_train, y_train) 112 | y_pred = clf.predict(X_test) 113 | 114 | acc = accuracy(y_test, y_pred) 115 | print("Accuracy:", acc) 116 | -------------------------------------------------------------------------------- /mlfromscratch/decision_tree.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import numpy as np 4 | 5 | 6 | def entropy(y): 7 | hist = np.bincount(y) 8 | ps = hist / len(y) 9 | return -np.sum([p * np.log2(p) for p in ps if p > 0]) 10 | 11 | 12 | class Node: 13 | def __init__( 14 | self, feature=None, threshold=None, left=None, right=None, *, value=None 15 | ): 16 | self.feature = feature 17 | self.threshold = threshold 18 | self.left = left 19 | self.right = right 20 | self.value = value 21 | 22 | def is_leaf_node(self): 23 | return self.value is not None 24 | 25 | 26 | class DecisionTree: 27 | def __init__(self, min_samples_split=2, max_depth=100, n_feats=None): 28 | self.min_samples_split = min_samples_split 29 | self.max_depth = max_depth 30 | self.n_feats = n_feats 31 | self.root = None 32 | 33 | def fit(self, X, y): 34 | self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1]) 35 | self.root = self._grow_tree(X, y) 36 | 37 | def predict(self, X): 38 | return np.array([self._traverse_tree(x, self.root) for x in X]) 39 | 40 | def _grow_tree(self, X, y, depth=0): 41 | n_samples, n_features = X.shape 42 | n_labels = len(np.unique(y)) 43 | 44 | # stopping criteria 45 | if ( 46 | depth >= self.max_depth 47 | or n_labels == 1 48 | or n_samples < self.min_samples_split 49 | ): 50 | leaf_value = self._most_common_label(y) 51 | return Node(value=leaf_value) 52 | 53 | feat_idxs = np.random.choice(n_features, self.n_feats, replace=False) 54 | 55 | # greedily select the best split according to information gain 56 | best_feat, best_thresh = self._best_criteria(X, y, feat_idxs) 57 | 58 | # grow the children that result from the split 59 | left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh) 60 | left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1) 61 | right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1) 62 | return Node(best_feat, best_thresh, left, right) 63 | 64 | def _best_criteria(self, X, y, feat_idxs): 65 | best_gain = -1 66 | split_idx, split_thresh = None, None 67 | for feat_idx in feat_idxs: 68 | X_column = X[:, feat_idx] 69 | thresholds = np.unique(X_column) 70 | for threshold in thresholds: 71 | gain = self._information_gain(y, X_column, threshold) 72 | 73 | if gain > best_gain: 74 | best_gain = gain 75 | split_idx = feat_idx 76 | split_thresh = threshold 77 | 78 | return split_idx, split_thresh 79 | 80 | def _information_gain(self, y, X_column, split_thresh): 81 | # parent loss 82 | parent_entropy = entropy(y) 83 | 84 | # generate split 85 | left_idxs, right_idxs = self._split(X_column, split_thresh) 86 | 87 | if len(left_idxs) == 0 or len(right_idxs) == 0: 88 | return 0 89 | 90 | # compute the weighted avg. of the loss for the children 91 | n = len(y) 92 | n_l, n_r = len(left_idxs), len(right_idxs) 93 | e_l, e_r = entropy(y[left_idxs]), entropy(y[right_idxs]) 94 | child_entropy = (n_l / n) * e_l + (n_r / n) * e_r 95 | 96 | # information gain is difference in loss before vs. 
after split 97 | ig = parent_entropy - child_entropy 98 | return ig 99 | 100 | def _split(self, X_column, split_thresh): 101 | left_idxs = np.argwhere(X_column <= split_thresh).flatten() 102 | right_idxs = np.argwhere(X_column > split_thresh).flatten() 103 | return left_idxs, right_idxs 104 | 105 | def _traverse_tree(self, x, node): 106 | if node.is_leaf_node(): 107 | return node.value 108 | 109 | if x[node.feature] <= node.threshold: 110 | return self._traverse_tree(x, node.left) 111 | return self._traverse_tree(x, node.right) 112 | 113 | def _most_common_label(self, y): 114 | counter = Counter(y) 115 | most_common = counter.most_common(1)[0][0] 116 | return most_common 117 | 118 | 119 | if __name__ == "__main__": 120 | # Imports 121 | from sklearn import datasets 122 | from sklearn.model_selection import train_test_split 123 | 124 | def accuracy(y_true, y_pred): 125 | accuracy = np.sum(y_true == y_pred) / len(y_true) 126 | return accuracy 127 | 128 | data = datasets.load_breast_cancer() 129 | X, y = data.data, data.target 130 | 131 | X_train, X_test, y_train, y_test = train_test_split( 132 | X, y, test_size=0.2, random_state=1234 133 | ) 134 | 135 | clf = DecisionTree(max_depth=10) 136 | clf.fit(X_train, y_train) 137 | 138 | y_pred = clf.predict(X_test) 139 | acc = accuracy(y_test, y_pred) 140 | 141 | print("Accuracy:", acc) 142 | -------------------------------------------------------------------------------- /mlfromscratch/kmeans.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | np.random.seed(42) 5 | 6 | 7 | def euclidean_distance(x1, x2): 8 | return np.sqrt(np.sum((x1 - x2) ** 2)) 9 | 10 | 11 | class KMeans: 12 | def __init__(self, K=5, max_iters=100, plot_steps=False): 13 | self.K = K 14 | self.max_iters = max_iters 15 | self.plot_steps = plot_steps 16 | 17 | # list of sample indices for each cluster 18 | self.clusters = [[] for _ in range(self.K)] 19 | # the centers (mean feature vector) for each cluster 20 | self.centroids = [] 21 | 22 | def predict(self, X): 23 | self.X = X 24 | self.n_samples, self.n_features = X.shape 25 | 26 | # initialize 27 | random_sample_idxs = np.random.choice(self.n_samples, self.K, replace=False) 28 | self.centroids = [self.X[idx] for idx in random_sample_idxs] 29 | 30 | # Optimize clusters 31 | for _ in range(self.max_iters): 32 | # Assign samples to closest centroids (create clusters) 33 | self.clusters = self._create_clusters(self.centroids) 34 | 35 | if self.plot_steps: 36 | self.plot() 37 | 38 | # Calculate new centroids from the clusters 39 | centroids_old = self.centroids 40 | self.centroids = self._get_centroids(self.clusters) 41 | 42 | # check if clusters have changed 43 | if self._is_converged(centroids_old, self.centroids): 44 | break 45 | 46 | if self.plot_steps: 47 | self.plot() 48 | 49 | # Classify samples as the index of their clusters 50 | return self._get_cluster_labels(self.clusters) 51 | 52 | def _get_cluster_labels(self, clusters): 53 | # each sample will get the label of the cluster it was assigned to 54 | labels = np.empty(self.n_samples) 55 | 56 | for cluster_idx, cluster in enumerate(clusters): 57 | for sample_index in cluster: 58 | labels[sample_index] = cluster_idx 59 | return labels 60 | 61 | def _create_clusters(self, centroids): 62 | # Assign the samples to the closest centroids to create clusters 63 | clusters = [[] for _ in range(self.K)] 64 | for idx, sample in enumerate(self.X): 65 | centroid_idx = 
self._closest_centroid(sample, centroids) 66 | clusters[centroid_idx].append(idx) 67 | return clusters 68 | 69 | def _closest_centroid(self, sample, centroids): 70 | # distance of the current sample to each centroid 71 | distances = [euclidean_distance(sample, point) for point in centroids] 72 | closest_index = np.argmin(distances) 73 | return closest_index 74 | 75 | def _get_centroids(self, clusters): 76 | # assign mean value of clusters to centroids 77 | centroids = np.zeros((self.K, self.n_features)) 78 | for cluster_idx, cluster in enumerate(clusters): 79 | cluster_mean = np.mean(self.X[cluster], axis=0) 80 | centroids[cluster_idx] = cluster_mean 81 | return centroids 82 | 83 | def _is_converged(self, centroids_old, centroids): 84 | # distances between each old and new centroids, fol all centroids 85 | distances = [ 86 | euclidean_distance(centroids_old[i], centroids[i]) for i in range(self.K) 87 | ] 88 | return sum(distances) == 0 89 | 90 | def plot(self): 91 | fig, ax = plt.subplots(figsize=(12, 8)) 92 | 93 | for i, index in enumerate(self.clusters): 94 | point = self.X[index].T 95 | ax.scatter(*point) 96 | 97 | for point in self.centroids: 98 | ax.scatter(*point, marker="x", color="black", linewidth=2) 99 | 100 | plt.show() 101 | 102 | 103 | # Testing 104 | if __name__ == "__main__": 105 | from sklearn.datasets import make_blobs 106 | 107 | X, y = make_blobs( 108 | centers=3, n_samples=500, n_features=2, shuffle=True, random_state=40 109 | ) 110 | print(X.shape) 111 | 112 | clusters = len(np.unique(y)) 113 | print(clusters) 114 | 115 | k = KMeans(K=clusters, max_iters=150, plot_steps=True) 116 | y_pred = k.predict(X) 117 | 118 | k.plot() 119 | -------------------------------------------------------------------------------- /mlfromscratch/knn.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import numpy as np 4 | 5 | 6 | def euclidean_distance(x1, x2): 7 | return np.sqrt(np.sum((x1 - x2) ** 2)) 8 | 9 | 10 | class KNN: 11 | def __init__(self, k=3): 12 | self.k = k 13 | 14 | def fit(self, X, y): 15 | self.X_train = X 16 | self.y_train = y 17 | 18 | def predict(self, X): 19 | y_pred = [self._predict(x) for x in X] 20 | return np.array(y_pred) 21 | 22 | def _predict(self, x): 23 | # Compute distances between x and all examples in the training set 24 | distances = [euclidean_distance(x, x_train) for x_train in self.X_train] 25 | # Sort by distance and return indices of the first k neighbors 26 | k_idx = np.argsort(distances)[: self.k] 27 | # Extract the labels of the k nearest neighbor training samples 28 | k_neighbor_labels = [self.y_train[i] for i in k_idx] 29 | # return the most common class label 30 | most_common = Counter(k_neighbor_labels).most_common(1) 31 | return most_common[0][0] 32 | 33 | 34 | if __name__ == "__main__": 35 | # Imports 36 | from matplotlib.colors import ListedColormap 37 | from sklearn import datasets 38 | from sklearn.model_selection import train_test_split 39 | 40 | cmap = ListedColormap(["#FF0000", "#00FF00", "#0000FF"]) 41 | 42 | def accuracy(y_true, y_pred): 43 | accuracy = np.sum(y_true == y_pred) / len(y_true) 44 | return accuracy 45 | 46 | iris = datasets.load_iris() 47 | X, y = iris.data, iris.target 48 | 49 | X_train, X_test, y_train, y_test = train_test_split( 50 | X, y, test_size=0.2, random_state=1234 51 | ) 52 | 53 | k = 3 54 | clf = KNN(k=k) 55 | clf.fit(X_train, y_train) 56 | predictions = clf.predict(X_test) 57 | print("KNN classification accuracy", accuracy(y_test, 
predictions)) 58 | -------------------------------------------------------------------------------- /mlfromscratch/lda.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LDA: 5 | def __init__(self, n_components): 6 | self.n_components = n_components 7 | self.linear_discriminants = None 8 | 9 | def fit(self, X, y): 10 | n_features = X.shape[1] 11 | class_labels = np.unique(y) 12 | 13 | # Within class scatter matrix: 14 | # SW = sum((X_c - mean_X_c)^2 ) 15 | 16 | # Between class scatter: 17 | # SB = sum( n_c * (mean_X_c - mean_overall)^2 ) 18 | 19 | mean_overall = np.mean(X, axis=0) 20 | SW = np.zeros((n_features, n_features)) 21 | SB = np.zeros((n_features, n_features)) 22 | for c in class_labels: 23 | X_c = X[y == c] 24 | mean_c = np.mean(X_c, axis=0) 25 | # (4, n_c) * (n_c, 4) = (4,4) -> transpose 26 | SW += (X_c - mean_c).T.dot((X_c - mean_c)) 27 | 28 | # (4, 1) * (1, 4) = (4,4) -> reshape 29 | n_c = X_c.shape[0] 30 | mean_diff = (mean_c - mean_overall).reshape(n_features, 1) 31 | SB += n_c * (mean_diff).dot(mean_diff.T) 32 | 33 | # Determine SW^-1 * SB 34 | A = np.linalg.inv(SW).dot(SB) 35 | # Get eigenvalues and eigenvectors of SW^-1 * SB 36 | eigenvalues, eigenvectors = np.linalg.eig(A) 37 | # -> eigenvector v = [:,i] column vector, transpose for easier calculations 38 | # sort eigenvalues high to low 39 | eigenvectors = eigenvectors.T 40 | idxs = np.argsort(abs(eigenvalues))[::-1] 41 | eigenvalues = eigenvalues[idxs] 42 | eigenvectors = eigenvectors[idxs] 43 | # store first n eigenvectors 44 | self.linear_discriminants = eigenvectors[0 : self.n_components] 45 | 46 | def transform(self, X): 47 | # project data 48 | return np.dot(X, self.linear_discriminants.T) 49 | 50 | 51 | # Testing 52 | if __name__ == "__main__": 53 | # Imports 54 | import matplotlib.pyplot as plt 55 | from sklearn import datasets 56 | 57 | data = datasets.load_iris() 58 | X, y = data.data, data.target 59 | 60 | # Project the data onto the 2 primary linear discriminants 61 | lda = LDA(2) 62 | lda.fit(X, y) 63 | X_projected = lda.transform(X) 64 | 65 | print("Shape of X:", X.shape) 66 | print("Shape of transformed X:", X_projected.shape) 67 | 68 | x1, x2 = X_projected[:, 0], X_projected[:, 1] 69 | 70 | plt.scatter( 71 | x1, x2, c=y, edgecolor="none", alpha=0.8, cmap=plt.cm.get_cmap("viridis", 3) 72 | ) 73 | 74 | plt.xlabel("Linear Discriminant 1") 75 | plt.ylabel("Linear Discriminant 2") 76 | plt.colorbar() 77 | plt.show() 78 | -------------------------------------------------------------------------------- /mlfromscratch/linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def r2_score(y_true, y_pred): 5 | corr_matrix = np.corrcoef(y_true, y_pred) 6 | corr = corr_matrix[0, 1] 7 | return corr ** 2 8 | 9 | 10 | class LinearRegression: 11 | def __init__(self, learning_rate=0.001, n_iters=1000): 12 | self.lr = learning_rate 13 | self.n_iters = n_iters 14 | self.weights = None 15 | self.bias = None 16 | 17 | def fit(self, X, y): 18 | n_samples, n_features = X.shape 19 | 20 | # init parameters 21 | self.weights = np.zeros(n_features) 22 | self.bias = 0 23 | 24 | # gradient descent 25 | for _ in range(self.n_iters): 26 | y_predicted = np.dot(X, self.weights) + self.bias 27 | # compute gradients 28 | dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y)) 29 | db = (1 / n_samples) * np.sum(y_predicted - y) 30 | 31 | # update parameters 32 | self.weights -= self.lr * 
dw 33 | self.bias -= self.lr * db 34 | 35 | def predict(self, X): 36 | y_approximated = np.dot(X, self.weights) + self.bias 37 | return y_approximated 38 | 39 | 40 | # Testing 41 | if __name__ == "__main__": 42 | # Imports 43 | import matplotlib.pyplot as plt 44 | from sklearn.model_selection import train_test_split 45 | from sklearn import datasets 46 | 47 | def mean_squared_error(y_true, y_pred): 48 | return np.mean((y_true - y_pred) ** 2) 49 | 50 | X, y = datasets.make_regression( 51 | n_samples=100, n_features=1, noise=20, random_state=4 52 | ) 53 | 54 | X_train, X_test, y_train, y_test = train_test_split( 55 | X, y, test_size=0.2, random_state=1234 56 | ) 57 | 58 | regressor = LinearRegression(learning_rate=0.01, n_iters=1000) 59 | regressor.fit(X_train, y_train) 60 | predictions = regressor.predict(X_test) 61 | 62 | mse = mean_squared_error(y_test, predictions) 63 | print("MSE:", mse) 64 | 65 | accu = r2_score(y_test, predictions) 66 | print("Accuracy:", accu) 67 | 68 | y_pred_line = regressor.predict(X) 69 | cmap = plt.get_cmap("viridis") 70 | fig = plt.figure(figsize=(8, 6)) 71 | m1 = plt.scatter(X_train, y_train, color=cmap(0.9), s=10) 72 | m2 = plt.scatter(X_test, y_test, color=cmap(0.5), s=10) 73 | plt.plot(X, y_pred_line, color="black", linewidth=2, label="Prediction") 74 | plt.show() 75 | -------------------------------------------------------------------------------- /mlfromscratch/load_data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # Download data from https://archive.ics.uci.edu/ml/datasets/spambase 6 | FILE_NAME = "spambase.data" 7 | 8 | # 1) load with csv file 9 | with open(FILE_NAME, "r") as f: 10 | data = list(csv.reader(f, delimiter=",")) 11 | 12 | data = np.array(data, dtype=np.float32) 13 | print(data.shape) 14 | 15 | # 2) load with np.loadtxt() 16 | # skiprows=1 17 | data = np.loadtxt(FILE_NAME, delimiter=",", dtype=np.float32) 18 | print(data.shape, data.dtype) 19 | 20 | # 3) load with np.genfromtxt() 21 | # skip_header=0, missing_values="---", filling_values=0.0 22 | data = np.genfromtxt(FILE_NAME, delimiter=",", dtype=np.float32) 23 | print(data.shape) 24 | 25 | # split into X and y 26 | n_samples, n_features = data.shape 27 | n_features -= 1 28 | 29 | X = data[:, 0:n_features] 30 | y = data[:, n_features] 31 | 32 | print(X.shape, y.shape) 33 | print(X[0, 0:5]) 34 | # or if y is the first column 35 | # X = data[:, 1:n_features+1] 36 | # y = data[:, 0] 37 | 38 | # 4) load with pandas: read_csv() 39 | # na_values = ['---'] 40 | df = pd.read_csv(FILE_NAME, header=None, skiprows=0, dtype=np.float32) 41 | df = df.fillna(0.0) 42 | 43 | # dataframe to numpy 44 | data = df.to_numpy() 45 | print(data[4, 0:5]) 46 | 47 | # convert datatypes in numpy 48 | # data = np.asarray(data, dtype = np.float32) 49 | # print(data.dtype) 50 | -------------------------------------------------------------------------------- /mlfromscratch/logistic_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LogisticRegression: 5 | def __init__(self, learning_rate=0.001, n_iters=1000): 6 | self.lr = learning_rate 7 | self.n_iters = n_iters 8 | self.weights = None 9 | self.bias = None 10 | 11 | def fit(self, X, y): 12 | n_samples, n_features = X.shape 13 | 14 | # init parameters 15 | self.weights = np.zeros(n_features) 16 | self.bias = 0 17 | 18 | # gradient descent 19 | for _ in range(self.n_iters): 20 | 
# approximate y with linear combination of weights and x, plus bias 21 | linear_model = np.dot(X, self.weights) + self.bias 22 | # apply sigmoid function 23 | y_predicted = self._sigmoid(linear_model) 24 | 25 | # compute gradients 26 | dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y)) 27 | db = (1 / n_samples) * np.sum(y_predicted - y) 28 | # update parameters 29 | self.weights -= self.lr * dw 30 | self.bias -= self.lr * db 31 | 32 | def predict(self, X): 33 | linear_model = np.dot(X, self.weights) + self.bias 34 | y_predicted = self._sigmoid(linear_model) 35 | y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted] 36 | return np.array(y_predicted_cls) 37 | 38 | def _sigmoid(self, x): 39 | return 1 / (1 + np.exp(-x)) 40 | 41 | 42 | # Testing 43 | if __name__ == "__main__": 44 | # Imports 45 | from sklearn.model_selection import train_test_split 46 | from sklearn import datasets 47 | 48 | def accuracy(y_true, y_pred): 49 | accuracy = np.sum(y_true == y_pred) / len(y_true) 50 | return accuracy 51 | 52 | bc = datasets.load_breast_cancer() 53 | X, y = bc.data, bc.target 54 | 55 | X_train, X_test, y_train, y_test = train_test_split( 56 | X, y, test_size=0.2, random_state=1234 57 | ) 58 | 59 | regressor = LogisticRegression(learning_rate=0.0001, n_iters=1000) 60 | regressor.fit(X_train, y_train) 61 | predictions = regressor.predict(X_test) 62 | 63 | print("LR classification accuracy:", accuracy(y_test, predictions)) 64 | -------------------------------------------------------------------------------- /mlfromscratch/naivebayes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class NaiveBayes: 5 | def fit(self, X, y): 6 | n_samples, n_features = X.shape 7 | self._classes = np.unique(y) 8 | n_classes = len(self._classes) 9 | 10 | # calculate mean, var, and prior for each class 11 | self._mean = np.zeros((n_classes, n_features), dtype=np.float64) 12 | self._var = np.zeros((n_classes, n_features), dtype=np.float64) 13 | self._priors = np.zeros(n_classes, dtype=np.float64) 14 | 15 | for idx, c in enumerate(self._classes): 16 | X_c = X[y == c] 17 | self._mean[idx, :] = X_c.mean(axis=0) 18 | self._var[idx, :] = X_c.var(axis=0) 19 | self._priors[idx] = X_c.shape[0] / float(n_samples) 20 | 21 | def predict(self, X): 22 | y_pred = [self._predict(x) for x in X] 23 | return np.array(y_pred) 24 | 25 | def _predict(self, x): 26 | posteriors = [] 27 | 28 | # calculate posterior probability for each class 29 | for idx, c in enumerate(self._classes): 30 | prior = np.log(self._priors[idx]) 31 | posterior = np.sum(np.log(self._pdf(idx, x))) 32 | posterior = prior + posterior 33 | posteriors.append(posterior) 34 | 35 | # return class with highest posterior probability 36 | return self._classes[np.argmax(posteriors)] 37 | 38 | def _pdf(self, class_idx, x): 39 | mean = self._mean[class_idx] 40 | var = self._var[class_idx] 41 | numerator = np.exp(-((x - mean) ** 2) / (2 * var)) 42 | denominator = np.sqrt(2 * np.pi * var) 43 | return numerator / denominator 44 | 45 | 46 | # Testing 47 | if __name__ == "__main__": 48 | # Imports 49 | from sklearn.model_selection import train_test_split 50 | from sklearn import datasets 51 | 52 | def accuracy(y_true, y_pred): 53 | accuracy = np.sum(y_true == y_pred) / len(y_true) 54 | return accuracy 55 | 56 | X, y = datasets.make_classification( 57 | n_samples=1000, n_features=10, n_classes=2, random_state=123 58 | ) 59 | X_train, X_test, y_train, y_test = train_test_split( 60 | X, y, test_size=0.2, 
random_state=123 61 | ) 62 | 63 | nb = NaiveBayes() 64 | nb.fit(X_train, y_train) 65 | predictions = nb.predict(X_test) 66 | 67 | print("Naive Bayes classification accuracy", accuracy(y_test, predictions)) 68 | -------------------------------------------------------------------------------- /mlfromscratch/pca.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class PCA: 5 | def __init__(self, n_components): 6 | self.n_components = n_components 7 | self.components = None 8 | self.mean = None 9 | 10 | def fit(self, X): 11 | # Mean centering 12 | self.mean = np.mean(X, axis=0) 13 | X = X - self.mean 14 | 15 | # covariance, function needs samples as columns 16 | cov = np.cov(X.T) 17 | 18 | # eigenvalues, eigenvectors 19 | eigenvalues, eigenvectors = np.linalg.eig(cov) 20 | 21 | # -> eigenvector v = [:,i] column vector, transpose for easier calculations 22 | # sort eigenvectors 23 | eigenvectors = eigenvectors.T 24 | idxs = np.argsort(eigenvalues)[::-1] 25 | eigenvalues = eigenvalues[idxs] 26 | eigenvectors = eigenvectors[idxs] 27 | 28 | # store first n eigenvectors 29 | self.components = eigenvectors[0 : self.n_components] 30 | 31 | def transform(self, X): 32 | # project data 33 | X = X - self.mean 34 | return np.dot(X, self.components.T) 35 | 36 | 37 | # Testing 38 | if __name__ == "__main__": 39 | # Imports 40 | import matplotlib.pyplot as plt 41 | from sklearn import datasets 42 | 43 | # data = datasets.load_digits() 44 | data = datasets.load_iris() 45 | X = data.data 46 | y = data.target 47 | 48 | # Project the data onto the 2 primary principal components 49 | pca = PCA(2) 50 | pca.fit(X) 51 | X_projected = pca.transform(X) 52 | 53 | print("Shape of X:", X.shape) 54 | print("Shape of transformed X:", X_projected.shape) 55 | 56 | x1 = X_projected[:, 0] 57 | x2 = X_projected[:, 1] 58 | 59 | plt.scatter( 60 | x1, x2, c=y, edgecolor="none", alpha=0.8, cmap=plt.cm.get_cmap("viridis", 3) 61 | ) 62 | 63 | plt.xlabel("Principal Component 1") 64 | plt.ylabel("Principal Component 2") 65 | plt.colorbar() 66 | plt.show() 67 | -------------------------------------------------------------------------------- /mlfromscratch/perceptron.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Perceptron: 5 | def __init__(self, learning_rate=0.01, n_iters=1000): 6 | self.lr = learning_rate 7 | self.n_iters = n_iters 8 | self.activation_func = self._unit_step_func 9 | self.weights = None 10 | self.bias = None 11 | 12 | def fit(self, X, y): 13 | n_samples, n_features = X.shape 14 | 15 | # init parameters 16 | self.weights = np.zeros(n_features) 17 | self.bias = 0 18 | 19 | y_ = np.array([1 if i > 0 else 0 for i in y]) 20 | 21 | for _ in range(self.n_iters): 22 | 23 | for idx, x_i in enumerate(X): 24 | 25 | linear_output = np.dot(x_i, self.weights) + self.bias 26 | y_predicted = self.activation_func(linear_output) 27 | 28 | # Perceptron update rule 29 | update = self.lr * (y_[idx] - y_predicted) 30 | 31 | self.weights += update * x_i 32 | self.bias += update 33 | 34 | def predict(self, X): 35 | linear_output = np.dot(X, self.weights) + self.bias 36 | y_predicted = self.activation_func(linear_output) 37 | return y_predicted 38 | 39 | def _unit_step_func(self, x): 40 | return np.where(x >= 0, 1, 0) 41 | 42 | 43 | # Testing 44 | if __name__ == "__main__": 45 | # Imports 46 | import matplotlib.pyplot as plt 47 | from sklearn.model_selection import train_test_split 48 | from 
sklearn import datasets 49 | 50 | def accuracy(y_true, y_pred): 51 | accuracy = np.sum(y_true == y_pred) / len(y_true) 52 | return accuracy 53 | 54 | X, y = datasets.make_blobs( 55 | n_samples=150, n_features=2, centers=2, cluster_std=1.05, random_state=2 56 | ) 57 | X_train, X_test, y_train, y_test = train_test_split( 58 | X, y, test_size=0.2, random_state=123 59 | ) 60 | 61 | p = Perceptron(learning_rate=0.01, n_iters=1000) 62 | p.fit(X_train, y_train) 63 | predictions = p.predict(X_test) 64 | 65 | print("Perceptron classification accuracy", accuracy(y_test, predictions)) 66 | 67 | fig = plt.figure() 68 | ax = fig.add_subplot(1, 1, 1) 69 | plt.scatter(X_train[:, 0], X_train[:, 1], marker="o", c=y_train) 70 | 71 | x0_1 = np.amin(X_train[:, 0]) 72 | x0_2 = np.amax(X_train[:, 0]) 73 | 74 | x1_1 = (-p.weights[0] * x0_1 - p.bias) / p.weights[1] 75 | x1_2 = (-p.weights[0] * x0_2 - p.bias) / p.weights[1] 76 | 77 | ax.plot([x0_1, x0_2], [x1_1, x1_2], "k") 78 | 79 | ymin = np.amin(X_train[:, 1]) 80 | ymax = np.amax(X_train[:, 1]) 81 | ax.set_ylim([ymin - 3, ymax + 3]) 82 | 83 | plt.show() 84 | -------------------------------------------------------------------------------- /mlfromscratch/random_forest.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import numpy as np 4 | 5 | from .decision_tree import DecisionTree 6 | 7 | 8 | def bootstrap_sample(X, y): 9 | n_samples = X.shape[0] 10 | idxs = np.random.choice(n_samples, n_samples, replace=True) 11 | return X[idxs], y[idxs] 12 | 13 | 14 | def most_common_label(y): 15 | counter = Counter(y) 16 | most_common = counter.most_common(1)[0][0] 17 | return most_common 18 | 19 | 20 | class RandomForest: 21 | def __init__(self, n_trees=10, min_samples_split=2, max_depth=100, n_feats=None): 22 | self.n_trees = n_trees 23 | self.min_samples_split = min_samples_split 24 | self.max_depth = max_depth 25 | self.n_feats = n_feats 26 | self.trees = [] 27 | 28 | def fit(self, X, y): 29 | self.trees = [] 30 | for _ in range(self.n_trees): 31 | tree = DecisionTree( 32 | min_samples_split=self.min_samples_split, 33 | max_depth=self.max_depth, 34 | n_feats=self.n_feats, 35 | ) 36 | X_samp, y_samp = bootstrap_sample(X, y) 37 | tree.fit(X_samp, y_samp) 38 | self.trees.append(tree) 39 | 40 | def predict(self, X): 41 | tree_preds = np.array([tree.predict(X) for tree in self.trees]) 42 | tree_preds = np.swapaxes(tree_preds, 0, 1) 43 | y_pred = [most_common_label(tree_pred) for tree_pred in tree_preds] 44 | return np.array(y_pred) 45 | 46 | 47 | # Testing 48 | if __name__ == "__main__": 49 | # Imports 50 | from sklearn import datasets 51 | from sklearn.model_selection import train_test_split 52 | 53 | def accuracy(y_true, y_pred): 54 | accuracy = np.sum(y_true == y_pred) / len(y_true) 55 | return accuracy 56 | 57 | data = datasets.load_breast_cancer() 58 | X = data.data 59 | y = data.target 60 | 61 | X_train, X_test, y_train, y_test = train_test_split( 62 | X, y, test_size=0.2, random_state=1234 63 | ) 64 | 65 | clf = RandomForest(n_trees=3, max_depth=10) 66 | 67 | clf.fit(X_train, y_train) 68 | y_pred = clf.predict(X_test) 69 | acc = accuracy(y_test, y_pred) 70 | 71 | print("Accuracy:", acc) 72 | -------------------------------------------------------------------------------- /mlfromscratch/regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BaseRegression: 5 | def __init__(self, learning_rate: float = 0.001, 
n_iters: int = 1000): 6 | # Assign the variables 7 | self.learning_rate = learning_rate 8 | self.n_iters = n_iters 9 | 10 | # Weights and bias 11 | self.weights, self.bias = None, None 12 | 13 | def fit(self, X, y): 14 | n_samples, n_features = X.shape 15 | 16 | self.weights, self.bias = np.zeros(n_features), 0 17 | 18 | # Minimizing loss, and finding the correct Weights and biases using Gradient Descent 19 | for _ in range(self.n_iters): 20 | y_predicted = self._approximation(X, self.weights, self.bias) 21 | 22 | dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y)) 23 | db = (1 / n_samples) * np.sum(y_predicted - y) 24 | 25 | self.weights -= self.learning_rate * dw 26 | self.bias -= self.learning_rate * db 27 | 28 | def predict(self, X): 29 | return self._predict(X, self.weights, self.bias) 30 | 31 | def _predict(self, X, w, b): 32 | raise NotImplementedError 33 | 34 | def _approximation(self, X, w, b): 35 | raise NotImplementedError 36 | 37 | 38 | class LinearRegression(BaseRegression): 39 | def _approximation(self, X, w, b): 40 | return np.dot(X, w) + b 41 | 42 | def _predict(self, X, w, b): 43 | return np.dot(X, w) + b 44 | 45 | 46 | class LogisticRegression(BaseRegression): 47 | def _approximation(self, X, w, b): 48 | linear_model = np.dot(X, w) + b 49 | return self._sigmoid(linear_model) 50 | 51 | def _predict(self, X, w, b): 52 | linear_model = np.dot(X, w) + b 53 | y_predicted = self._sigmoid(linear_model) 54 | y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted] 55 | return np.array(y_predicted_cls) 56 | 57 | def _sigmoid(self, x): 58 | return 1 / (np.exp(-x) + 1) 59 | 60 | 61 | # Testing 62 | if __name__ == "__main__": 63 | # Imports 64 | from sklearn.model_selection import train_test_split 65 | from sklearn import datasets 66 | 67 | # Utils 68 | def r2_score(y_true, y_pred): 69 | corr_matrix = np.corrcoef(y_true, y_pred) 70 | corr = corr_matrix[0, 1] 71 | return corr ** 2 72 | 73 | def mean_squared_error(y_true, y_pred): 74 | return np.mean((y_true - y_pred) ** 2) 75 | 76 | def accuracy(y_true, y_pred): 77 | accuracy = np.sum(y_true == y_pred) / len(y_true) 78 | return accuracy 79 | 80 | # Linear Regression 81 | X, y = datasets.make_regression( 82 | n_samples=100, n_features=1, noise=20, random_state=4 83 | ) 84 | 85 | X_train, X_test, y_train, y_test = train_test_split( 86 | X, y, test_size=0.2, random_state=1234 87 | ) 88 | 89 | regressor = LinearRegression(learning_rate=0.01, n_iters=1000) 90 | regressor.fit(X_train, y_train) 91 | predictions = regressor.predict(X_test) 92 | 93 | accu = r2_score(y_test, predictions) 94 | print("Linear reg Accuracy:", accu) 95 | 96 | # Logistic reg 97 | bc = datasets.load_breast_cancer() 98 | X, y = bc.data, bc.target 99 | 100 | X_train, X_test, y_train, y_test = train_test_split( 101 | X, y, test_size=0.2, random_state=1234 102 | ) 103 | 104 | regressor = LogisticRegression(learning_rate=0.0001, n_iters=1000) 105 | regressor.fit(X_train, y_train) 106 | predictions = regressor.predict(X_test) 107 | 108 | print("Logistic reg classification accuracy:", accuracy(y_test, predictions)) 109 | -------------------------------------------------------------------------------- /mlfromscratch/svm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class SVM: 5 | def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000): 6 | self.lr = learning_rate 7 | self.lambda_param = lambda_param 8 | self.n_iters = n_iters 9 | self.w = None 10 | self.b = None 11 | 12 | def 
fit(self, X, y): 13 | n_samples, n_features = X.shape 14 | 15 | y_ = np.where(y <= 0, -1, 1) 16 | 17 | self.w = np.zeros(n_features) 18 | self.b = 0 19 | 20 | for _ in range(self.n_iters): 21 | for idx, x_i in enumerate(X): 22 | condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1 23 | if condition: 24 | self.w -= self.lr * (2 * self.lambda_param * self.w) 25 | else: 26 | self.w -= self.lr * ( 27 | 2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]) 28 | ) 29 | self.b -= self.lr * y_[idx] 30 | 31 | def predict(self, X): 32 | approx = np.dot(X, self.w) - self.b 33 | return np.sign(approx) 34 | 35 | 36 | # Testing 37 | if __name__ == "__main__": 38 | # Imports 39 | from sklearn import datasets 40 | import matplotlib.pyplot as plt 41 | 42 | X, y = datasets.make_blobs( 43 | n_samples=50, n_features=2, centers=2, cluster_std=1.05, random_state=40 44 | ) 45 | y = np.where(y == 0, -1, 1) 46 | 47 | clf = SVM() 48 | clf.fit(X, y) 49 | # predictions = clf.predict(X) 50 | 51 | print(clf.w, clf.b) 52 | 53 | def visualize_svm(): 54 | def get_hyperplane_value(x, w, b, offset): 55 | return (-w[0] * x + b + offset) / w[1] 56 | 57 | fig = plt.figure() 58 | ax = fig.add_subplot(1, 1, 1) 59 | plt.scatter(X[:, 0], X[:, 1], marker="o", c=y) 60 | 61 | x0_1 = np.amin(X[:, 0]) 62 | x0_2 = np.amax(X[:, 0]) 63 | 64 | x1_1 = get_hyperplane_value(x0_1, clf.w, clf.b, 0) 65 | x1_2 = get_hyperplane_value(x0_2, clf.w, clf.b, 0) 66 | 67 | x1_1_m = get_hyperplane_value(x0_1, clf.w, clf.b, -1) 68 | x1_2_m = get_hyperplane_value(x0_2, clf.w, clf.b, -1) 69 | 70 | x1_1_p = get_hyperplane_value(x0_1, clf.w, clf.b, 1) 71 | x1_2_p = get_hyperplane_value(x0_2, clf.w, clf.b, 1) 72 | 73 | ax.plot([x0_1, x0_2], [x1_1, x1_2], "y--") 74 | ax.plot([x0_1, x0_2], [x1_1_m, x1_2_m], "k") 75 | ax.plot([x0_1, x0_2], [x1_1_p, x1_2_p], "k") 76 | 77 | x1_min = np.amin(X[:, 1]) 78 | x1_max = np.amax(X[:, 1]) 79 | ax.set_ylim([x1_min - 3, x1_max + 3]) 80 | 81 | plt.show() 82 | 83 | visualize_svm() 84 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.0 2 | scikit-learn==0.24.2 3 | matplotlib==3.4.2 4 | pandas==1.2.4 5 | --------------------------------------------------------------------------------