├── knn.py
├── README.md
├── logistic_regression.py
├── pca.py
├── random_forest.py
├── linear_regression.py
├── perceptron.py
├── k_means.py
└── decision_tree.py

/knn.py:
--------------------------------------------------------------------------------
import numpy as np
from collections import Counter

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        # Compute distances to all training points
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        # Get indices of the k nearest points
        k_indices = np.argsort(distances)[:self.k]
        # Fetch the labels of those points
        k_labels = [self.y_train[i] for i in k_indices]
        # Return the most common label
        return Counter(k_labels).most_common(1)[0][0]

# Testing
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    def accuracy(y_true, y_pred):
        return np.mean(y_true == y_pred)

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = KNN(k=3)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    print("KNN Classification Accuracy:", accuracy(y_test, preds))

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ScratchML: ML Algorithms from Scratch

This repo is a growing collection of from-scratch implementations of the most common conventional machine learning algorithms — built without relying on high-level libraries like scikit-learn.

Each algorithm is written with clarity and learning in mind, making it easier to understand what's happening under the hood.

## 🚀 What's Inside (so far)

✅ Linear Regression

✅ Logistic Regression

✅ K-Nearest Neighbors (KNN)

✅ Perceptron

✅ Decision Tree

✅ Random Forest

✅ Principal Component Analysis (PCA)

✅ K-Means Clustering

More coming soon...

## 📌 Why this repo?

If you're learning ML or mentoring others, you'll know how important it is to truly understand the core logic behind each algorithm. This repo is my way of reinforcing that understanding — and helping others do the same.

## How to Use

Clone the repo and run any file directly — no extra dependencies are required beyond the basics (numpy, matplotlib, and scikit-learn for the example datasets).

    git clone https://github.com/your-username/scratchml.git
    cd scratchml
    python linear_regression.py

## 🔄 Still a Work in Progress

I'll keep adding more algorithms and improving the existing ones — feel free to follow along, give feedback, or even contribute if you're into this kind of hands-on ML learning!

## 🙌 Stay connected

If you're learning ML from a non-tech background or transitioning into the field, feel free to connect with me on LinkedIn — always happy to help!
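## 🧪 Importing the classes directly

Each file also exposes a small class (`KNN`, `LinearRegression`, `KMeans`, ...) that you can import into your own script. A minimal sketch, assuming you run it from the repo root so the modules are importable:

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    from knn import KNN  # the same pattern works for the other modules

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    clf = KNN(k=5)
    clf.fit(X_train, y_train)
    print("accuracy:", np.mean(clf.predict(X_test) == y_test))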
--------------------------------------------------------------------------------
/logistic_regression.py:
--------------------------------------------------------------------------------
import numpy as np

class LogisticRegression:
    def __init__(self, learning_rate=0.001, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iters):
            linear = np.dot(X, self.weights) + self.bias
            y_pred = self._sigmoid(linear)

            # Gradients of the log-loss w.r.t. weights and bias
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        linear = np.dot(X, self.weights) + self.bias
        probs = self._sigmoid(linear)
        return np.array([1 if p > 0.5 else 0 for p in probs])

    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

# Testing it
if __name__ == "__main__":
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    def accuracy(y_true, y_pred):
        return np.mean(y_true == y_pred)

    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LogisticRegression(learning_rate=0.0001, n_iters=1000)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    print("Accuracy:", accuracy(y_test, preds))

--------------------------------------------------------------------------------
/pca.py:
--------------------------------------------------------------------------------
import numpy as np

class PCA:
    def __init__(self, n_components):
        self.n_components = n_components
        self.components = None
        self.mean = None

    def fit(self, X):
        # Center the data
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean

        # Compute covariance matrix
        cov_matrix = np.cov(X_centered.T)

        # Get eigenvalues and eigenvectors (eigh: the covariance matrix is
        # symmetric, so eigenvalues and eigenvectors are guaranteed real)
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # Sort eigenvectors by descending eigenvalues
        sorted_idx = np.argsort(eigenvalues)[::-1]
        eigenvectors = eigenvectors[:, sorted_idx]

        # Keep only the top n components
        self.components = eigenvectors[:, :self.n_components].T

    def transform(self, X):
        X_centered = X - self.mean
        return np.dot(X_centered, self.components.T)

# Testing it
if __name__ == "__main__":
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)

    pca = PCA(n_components=2)
    pca.fit(X)
    X_reduced = pca.transform(X)

    print("Original shape:", X.shape)
    print("Reduced shape:", X_reduced.shape)

    plt.scatter(
        X_reduced[:, 0],
        X_reduced[:, 1],
        c=y,
        cmap=plt.get_cmap("viridis", 3),
        alpha=0.8,
        edgecolors="none"
    )
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.colorbar()
    plt.show()

--------------------------------------------------------------------------------
/random_forest.py:
--------------------------------------------------------------------------------
import numpy as np
from collections import Counter
from decision_tree import DecisionTree

def bootstrap_sample(X, y):
    n_samples = X.shape[0]
    indices = np.random.choice(n_samples, n_samples, replace=True)
    return X[indices], y[indices]

def most_common_label(y):
    return Counter(y).most_common(1)[0][0]

class RandomForest:
    def __init__(self, n_trees=10, min_samples_split=2, max_depth=100, n_feats=None):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(
                min_samples_split=self.min_samples_split,
                max_depth=self.max_depth,
                n_feats=self.n_feats
            )
            X_sample, y_sample = bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        # Transpose to shape (n_samples, n_trees)
        tree_preds = tree_preds.T
        return np.array([most_common_label(preds) for preds in tree_preds])

# Testing it
if __name__ == "__main__":
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    def accuracy(y_true, y_pred):
        return np.mean(y_true == y_pred)

    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForest(n_trees=3, max_depth=10)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    print("Accuracy:", accuracy(y_test, preds))
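Note: a quick way to see the effect of bagging is to compare a single tree with forests of increasing size on the same split. This is only a sketch, not a file in the repo; it assumes you run it from the repo root so decision_tree.py and random_forest.py are importable, and the exact numbers will vary.

    import numpy as np
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    from decision_tree import DecisionTree
    from random_forest import RandomForest

    X, y = load_breast_cancer(return_X_y=True)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)

    # Baseline: one tree trained on the raw training set
    single = DecisionTree(max_depth=10)
    single.fit(X_tr, y_tr)
    print("single tree:", np.mean(single.predict(X_te) == y_te))

    # Forests built from bootstrap samples, majority-voted at predict time
    for n in (1, 5, 15):
        forest = RandomForest(n_trees=n, max_depth=10)
        forest.fit(X_tr, y_tr)
        print(f"{n:>2} trees:", np.mean(forest.predict(X_te) == y_te))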
--------------------------------------------------------------------------------
/linear_regression.py:
--------------------------------------------------------------------------------
import numpy as np


def r2_score(y_true, y_pred):
    # Coefficient of determination: 1 - SS_res / SS_tot
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - ss_res / ss_tot


class LinearRegression:
    def __init__(self, learning_rate=0.001, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # init parameters
        self.weights = np.zeros(n_features)
        self.bias = 0

        # gradient descent
        for _ in range(self.n_iters):
            y_predicted = np.dot(X, self.weights) + self.bias

            # compute gradients
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)

            # update parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        y_approximated = np.dot(X, self.weights) + self.bias
        return y_approximated


# Testing
if __name__ == "__main__":
    # Imports
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn import datasets

    def mean_squared_error(y_true, y_pred):
        return np.mean((y_true - y_pred) ** 2)

    X, y = datasets.make_regression(
        n_samples=100, n_features=1, noise=20, random_state=4
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1234
    )

    regressor = LinearRegression(learning_rate=0.01, n_iters=1000)
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    print("MSE:", mse)

    r2 = r2_score(y_test, predictions)
    print("R2 score:", r2)

    y_pred_line = regressor.predict(X)
    cmap = plt.get_cmap("viridis")
    fig = plt.figure(figsize=(8, 6))
    plt.scatter(X_train, y_train, color=cmap(0.9), s=10, label="Train")
    plt.scatter(X_test, y_test, color=cmap(0.5), s=10, label="Test")
    plt.plot(X, y_pred_line, color="black", linewidth=2, label="Prediction")
    plt.legend()
    plt.show()
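Note: because this model is plain least squares, the gradient-descent solution can be sanity-checked against the closed-form fit. A sketch, not part of the repo (run from the repo root); the two parameter sets should roughly agree.

    import numpy as np
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    from linear_regression import LinearRegression

    X, y = datasets.make_regression(n_samples=100, n_features=1, noise=20, random_state=4)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=1234)

    gd = LinearRegression(learning_rate=0.01, n_iters=1000)
    gd.fit(X_tr, y_tr)

    # Closed-form least squares: append a bias column and solve X_b @ w = y
    X_b = np.c_[np.ones(len(X_tr)), X_tr]
    w = np.linalg.lstsq(X_b, y_tr, rcond=None)[0]

    print("gradient descent bias, weights:", gd.bias, gd.weights)
    print("closed-form      bias, weights:", w[0], w[1:])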
--------------------------------------------------------------------------------
/perceptron.py:
--------------------------------------------------------------------------------
import numpy as np


class Perceptron:
    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.activation_func = self._unit_step_func
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # init parameters
        self.weights = np.zeros(n_features)
        self.bias = 0

        # make sure the targets are 0/1
        y_ = np.array([1 if i > 0 else 0 for i in y])

        for _ in range(self.n_iters):

            for idx, x_i in enumerate(X):

                linear_output = np.dot(x_i, self.weights) + self.bias
                y_predicted = self.activation_func(linear_output)

                # Perceptron update rule
                update = self.lr * (y_[idx] - y_predicted)

                self.weights += update * x_i
                self.bias += update

    def predict(self, X):
        linear_output = np.dot(X, self.weights) + self.bias
        y_predicted = self.activation_func(linear_output)
        return y_predicted

    def _unit_step_func(self, x):
        return np.where(x >= 0, 1, 0)


# Testing
if __name__ == "__main__":
    # Imports
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn import datasets

    def accuracy(y_true, y_pred):
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy

    X, y = datasets.make_blobs(
        n_samples=150, n_features=2, centers=2, cluster_std=1.05, random_state=2
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=123
    )

    p = Perceptron(learning_rate=0.01, n_iters=1000)
    p.fit(X_train, y_train)
    predictions = p.predict(X_test)

    print("Perceptron classification accuracy", accuracy(y_test, predictions))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    plt.scatter(X_train[:, 0], X_train[:, 1], marker="o", c=y_train)

    # Decision boundary: w0 * x0 + w1 * x1 + b = 0  =>  x1 = (-w0 * x0 - b) / w1
    x0_1 = np.amin(X_train[:, 0])
    x0_2 = np.amax(X_train[:, 0])

    x1_1 = (-p.weights[0] * x0_1 - p.bias) / p.weights[1]
    x1_2 = (-p.weights[0] * x0_2 - p.bias) / p.weights[1]

    ax.plot([x0_1, x0_2], [x1_1, x1_2], "k")

    ymin = np.amin(X_train[:, 1])
    ymax = np.amax(X_train[:, 1])
    ax.set_ylim([ymin - 3, ymax + 3])

    plt.show()
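Note: a tiny linearly separable problem such as the AND gate makes the update rule easy to trace by hand. A sketch, not part of the repo (run from the repo root):

    import numpy as np
    from perceptron import Perceptron

    # AND gate: only (1, 1) is positive, and the two classes are linearly separable
    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    y = np.array([0, 0, 0, 1])

    p = Perceptron(learning_rate=0.1, n_iters=20)
    p.fit(X, y)

    print(p.predict(X))       # expected: [0 0 0 1] once the updates converge
    print(p.weights, p.bias)  # one separating hyperplane found by the update rule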
--------------------------------------------------------------------------------
/k_means.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

class KMeans:
    def __init__(self, K=3, max_iters=100, plot_steps=False):
        self.K = K
        self.max_iters = max_iters
        self.plot_steps = plot_steps
        self.clusters = [[] for _ in range(self.K)]
        self.centroids = []

    def predict(self, X):
        self.X = X
        self.n_samples, self.n_features = X.shape

        # Initialize centroids
        random_idxs = np.random.choice(self.n_samples, self.K, replace=False)
        self.centroids = [X[idx] for idx in random_idxs]

        for _ in range(self.max_iters):
            # Assign clusters
            self.clusters = self._create_clusters(self.centroids)

            if self.plot_steps:
                self.plot()

            # Update centroids
            centroids_old = self.centroids
            self.centroids = self._calculate_centroids(self.clusters)

            # Check convergence
            if self._is_converged(centroids_old, self.centroids):
                break

        return self._get_cluster_labels(self.clusters)

    def _create_clusters(self, centroids):
        clusters = [[] for _ in range(self.K)]
        for idx, sample in enumerate(self.X):
            closest_idx = self._closest_centroid(sample, centroids)
            clusters[closest_idx].append(idx)
        return clusters

    def _closest_centroid(self, sample, centroids):
        distances = [euclidean_distance(sample, point) for point in centroids]
        return np.argmin(distances)

    def _calculate_centroids(self, clusters):
        centroids = np.zeros((self.K, self.n_features))
        for idx, cluster in enumerate(clusters):
            cluster_mean = np.mean(self.X[cluster], axis=0)
            centroids[idx] = cluster_mean
        return centroids

    def _is_converged(self, old_centroids, new_centroids):
        distances = [
            euclidean_distance(old_centroids[i], new_centroids[i])
            for i in range(self.K)
        ]
        return np.sum(distances) == 0

    def _get_cluster_labels(self, clusters):
        labels = np.empty(self.n_samples)
        for cluster_idx, sample_idxs in enumerate(clusters):
            labels[sample_idxs] = cluster_idx
        return labels

    def plot(self):
        fig, ax = plt.subplots(figsize=(10, 6))
        for i, cluster in enumerate(self.clusters):
            points = self.X[cluster].T
            ax.scatter(*points)
        for point in self.centroids:
            ax.scatter(*point, marker='x', color='black', linewidth=2)
        plt.show()

# Testing
if __name__ == "__main__":
    from sklearn.datasets import make_blobs

    X, y = make_blobs(n_samples=500, centers=3, n_features=2, random_state=40)
    kmeans = KMeans(K=3, max_iters=150, plot_steps=True)
    y_pred = kmeans.predict(X)
    kmeans.plot()
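Note: the class does not report an inertia ("elbow") score, but one can be computed from the returned labels and the fitted centroids. A rough sketch, not part of the repo (run from the repo root); for well-separated blobs the total should drop sharply up to the true number of clusters and flatten afterwards.

    import numpy as np
    from sklearn.datasets import make_blobs

    from k_means import KMeans

    X, _ = make_blobs(n_samples=500, centers=3, n_features=2, random_state=40)

    def within_cluster_ss(model, X, labels):
        # Sum of squared distances of each sample to its assigned centroid
        return sum(
            np.sum((X[labels == k] - model.centroids[k]) ** 2)
            for k in range(model.K)
        )

    for K in (2, 3, 4, 5):
        km = KMeans(K=K, max_iters=150, plot_steps=False)
        labels = km.predict(X)
        print(K, round(within_cluster_ss(km, X, labels), 1))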
--------------------------------------------------------------------------------
/decision_tree.py:
--------------------------------------------------------------------------------
import numpy as np
from collections import Counter

def entropy(y):
    hist = np.bincount(y)
    probs = hist / len(y)
    return -np.sum([p * np.log2(p) for p in probs if p > 0])

class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf(self):
        return self.value is not None

class DecisionTree:
    def __init__(self, min_samples_split=2, max_depth=100, n_feats=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.root = None

    def fit(self, X, y):
        self.n_feats = X.shape[1] if self.n_feats is None else min(self.n_feats, X.shape[1])
        self.root = self._build_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse(x, self.root) for x in X])

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        if (depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split):
            return Node(value=self._most_common_label(y))

        feat_idxs = np.random.choice(n_features, self.n_feats, replace=False)

        best_feat, best_thresh = self._best_split(X, y, feat_idxs)

        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)

        # Guard against a degenerate split (all samples end up on one side)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return Node(value=self._most_common_label(y))

        left = self._build_tree(X[left_idxs], y[left_idxs], depth + 1)
        right = self._build_tree(X[right_idxs], y[right_idxs], depth + 1)
        return Node(feature=best_feat, threshold=best_thresh, left=left, right=right)

    def _best_split(self, X, y, feat_idxs):
        best_gain = -1
        split_idx, split_thresh = None, None

        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)

            for thresh in thresholds:
                gain = self._information_gain(y, X_column, thresh)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = thresh

        return split_idx, split_thresh

    def _information_gain(self, y, X_column, split_thresh):
        parent_entropy = entropy(y)

        left_idxs, right_idxs = self._split(X_column, split_thresh)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = entropy(y[left_idxs]), entropy(y[right_idxs])
        weighted_avg_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        return parent_entropy - weighted_avg_entropy

    def _split(self, X_column, threshold):
        left_idxs = np.argwhere(X_column <= threshold).flatten()
        right_idxs = np.argwhere(X_column > threshold).flatten()
        return left_idxs, right_idxs

    def _traverse(self, x, node):
        if node.is_leaf():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse(x, node.left)
        return self._traverse(x, node.right)

    def _most_common_label(self, y):
        return Counter(y).most_common(1)[0][0]

# Testing it
if __name__ == "__main__":
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    def accuracy(y_true, y_pred):
        return np.mean(y_true == y_pred)

    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    tree = DecisionTree(max_depth=10)
    tree.fit(X_train, y_train)

    preds = tree.predict(X_test)
    print("Accuracy:", accuracy(y_test, preds))
--------------------------------------------------------------------------------
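Note: a small worked example of the entropy and information-gain math the tree uses when scoring splits. A sketch, not part of the repo (run from the repo root):

    import numpy as np
    from decision_tree import entropy

    # Four 0s and four 1s: a 50/50 split, so the entropy is exactly 1 bit
    y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    print(entropy(y))  # 1.0

    # Candidate split that sends [0 0 0 0 1] left and [1 1 1] right
    left, right = y[:5], y[5:]
    weighted = len(left) / len(y) * entropy(left) + len(right) / len(y) * entropy(right)
    print("information gain:", entropy(y) - weighted)  # about 0.55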