├── im └── README.md ├── scratch ├── __init__.py ├── nlp_advanced.py ├── machine_learning.py ├── simple_linear_regression.py ├── probability.py ├── visualization.py ├── k_nearest_neighbors.py ├── gradient_descent.py ├── linear_algebra.py └── naive_bayes.py ├── first-edition ├── code │ ├── charts.py │ ├── __init__.py │ ├── comma_delimited_stock_prices.txt │ ├── colon_delimited_stock_prices.txt │ ├── comma_delimited_stock_prices.csv │ ├── tab_delimited_stock_prices.txt │ ├── line_count.py │ ├── egrep.py │ ├── plot_state_borders.py │ ├── most_common_words.py │ ├── machine_learning.py │ ├── simple_linear_regression.py │ ├── linear_algebra.py │ ├── probability.py │ ├── naive_bayes.py │ ├── visualizing_data.py │ ├── statistics.py │ ├── logistic_regression.py │ ├── decision_trees.py │ ├── mapreduce.py │ ├── gradient_descent.py │ ├── hypothesis_and_inference.py │ ├── recommender_systems.py │ └── clustering.py ├── code-python3 │ ├── __init__.py │ ├── charts.py │ ├── comma_delimited_stock_prices.txt │ ├── colon_delimited_stock_prices.txt │ ├── comma_delimited_stock_prices.csv │ ├── tab_delimited_stock_prices.txt │ ├── line_count.py │ ├── egrep.py │ ├── plot_state_borders.py │ ├── most_common_words.py │ ├── machine_learning.py │ ├── simple_linear_regression.py │ ├── linear_algebra.py │ ├── probability.py │ ├── README.md │ ├── naive_bayes.py │ ├── visualizing_data.py │ ├── stats.py │ ├── decision_trees.py │ ├── logistic_regression.py │ ├── mapreduce.py │ ├── gradient_descent.py │ ├── hypothesis_and_inference.py │ ├── recommender_systems.py │ └── clustering.py └── README.md ├── .gitignore ├── comma_delimited_stock_prices.csv ├── INSTALL.md ├── requirements.txt ├── LICENSE └── README.md /im/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scratch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /first-edition/code/charts.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /first-edition/code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /first-edition/code-python3/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /first-edition/code-python3/charts.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.png 3 | 4 | -------------------------------------------------------------------------------- /first-edition/code/comma_delimited_stock_prices.txt: -------------------------------------------------------------------------------- 1 | AAPL,90.91 2 | FB,64.5 3 | MSFT,41.68 4 | -------------------------------------------------------------------------------- /first-edition/code-python3/comma_delimited_stock_prices.txt: -------------------------------------------------------------------------------- 1 | FB,64.5 2 | MSFT,41.68 3 | AAPL,90.91 4 | 
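These delimited price files are the sample inputs for the book's "Getting Data" examples. A minimal sketch (not from the book) of reading the comma-delimited file with the standard `csv` module; the path assumes you run it from the directory that contains the file:

```python
import csv

# Each row is symbol,closing_price with no header line.
with open("comma_delimited_stock_prices.txt") as f:
    for row in csv.reader(f):
        if not row:                            # skip any blank lines
            continue
        symbol, closing_price = row
        print(symbol, float(closing_price))    # e.g. AAPL 90.91
```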
-------------------------------------------------------------------------------- /first-edition/code/colon_delimited_stock_prices.txt: -------------------------------------------------------------------------------- 1 | date:symbol:closing_price 2 | 6/20/2014:AAPL:90.91 3 | 6/20/2014:MSFT:41.68 4 | 6/20/2014:FB:64.5 -------------------------------------------------------------------------------- /first-edition/code-python3/colon_delimited_stock_prices.txt: -------------------------------------------------------------------------------- 1 | date:symbol:closing_price 2 | 6/20/2014:AAPL:90.91 3 | 6/20/2014:MSFT:41.68 4 | 6/20/2014:FB:64.5 -------------------------------------------------------------------------------- /comma_delimited_stock_prices.csv: -------------------------------------------------------------------------------- 1 | AAPL,6/20/2014,90.91 2 | MSFT,6/20/2014,41.68 3 | FB,6/20/3014,64.5 4 | AAPL,6/19/2014,91.86 5 | MSFT,6/19/2014,n/a 6 | FB,6/19/2014,64.34 7 | -------------------------------------------------------------------------------- /first-edition/code/comma_delimited_stock_prices.csv: -------------------------------------------------------------------------------- 1 | 6/20/2014,AAPL,90.91 2 | 6/20/2014,MSFT,41.68 3 | 6/20/3014,FB,64.5 4 | 6/19/2014,AAPL,91.86 5 | 6/19/2014,MSFT,n/a 6 | 6/19/2014,FB,64.34 -------------------------------------------------------------------------------- /first-edition/code/tab_delimited_stock_prices.txt: -------------------------------------------------------------------------------- 1 | 6/20/2014 AAPL 90.91 2 | 6/20/2014 MSFT 41.68 3 | 6/20/2014 FB 64.5 4 | 6/19/2014 AAPL 91.86 5 | 6/19/2014 MSFT 41.51 6 | 6/19/2014 FB 64.34 -------------------------------------------------------------------------------- /first-edition/code-python3/comma_delimited_stock_prices.csv: -------------------------------------------------------------------------------- 1 | 6/20/2014,AAPL,90.91 2 | 6/20/2014,MSFT,41.68 3 | 6/20/3014,FB,64.5 4 | 6/19/2014,AAPL,91.86 5 | 6/19/2014,MSFT,n/a 6 | 6/19/2014,FB,64.34 -------------------------------------------------------------------------------- /first-edition/code-python3/tab_delimited_stock_prices.txt: -------------------------------------------------------------------------------- 1 | 6/20/2014 AAPL 90.91 2 | 6/20/2014 MSFT 41.68 3 | 6/20/2014 FB 64.5 4 | 6/19/2014 AAPL 91.86 5 | 6/19/2014 MSFT 41.51 6 | 6/19/2014 FB 64.34 -------------------------------------------------------------------------------- /first-edition/code/line_count.py: -------------------------------------------------------------------------------- 1 | # line_count.py 2 | import sys 3 | 4 | if __name__ == "__main__": 5 | 6 | count = 0 7 | for line in sys.stdin: 8 | count += 1 9 | 10 | # print goes to sys.stdout 11 | print count -------------------------------------------------------------------------------- /first-edition/code-python3/line_count.py: -------------------------------------------------------------------------------- 1 | # line_count.py 2 | import sys 3 | 4 | if __name__ == "__main__": 5 | 6 | count = 0 7 | for line in sys.stdin: 8 | count += 1 9 | 10 | # print goes to sys.stdout 11 | print(count) 12 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # How to Install Python 2 | 3 | If you don't already have Python, I strongly recommend you install the Anaconda version, 4 | which includes many of the libraries 
needed for data science. Get the Python 3 version, not the Python 2 version. 5 | 6 | https://www.anaconda.com/distribution/#download-section 7 | 8 | Follow the instructions indicated for your platform. 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # For a nicer terminal 2 | ipython 3 | 4 | # For plotting graphs 5 | matplotlib 6 | 7 | # For reading in images 8 | pillow 9 | 10 | # For making HTTP requests 11 | requests 12 | 13 | # For parsing HTML 14 | beautifulsoup4 15 | html5lib 16 | 17 | # For accessing Python 18 | twython 19 | 20 | # For generating progress bars 21 | tqdm 22 | 23 | # For downloading MNIST data 24 | mnist 25 | 26 | # For parsing dates 27 | python-dateutil 28 | -------------------------------------------------------------------------------- /first-edition/code/egrep.py: -------------------------------------------------------------------------------- 1 | # egrep.py 2 | import sys, re 3 | 4 | if __name__ == "__main__": 5 | 6 | # sys.argv is the list of command-line arguments 7 | # sys.argv[0] is the name of the program itself 8 | # sys.argv[1] will be the regex specfied at the command line 9 | regex = sys.argv[1] 10 | 11 | # for every line passed into the script 12 | for line in sys.stdin: 13 | # if it matches the regex, write it to stdout 14 | if re.search(regex, line): 15 | sys.stdout.write(line) -------------------------------------------------------------------------------- /first-edition/code-python3/egrep.py: -------------------------------------------------------------------------------- 1 | # egrep.py 2 | import sys, re 3 | 4 | if __name__ == "__main__": 5 | 6 | # sys.argv is the list of command-line arguments 7 | # sys.argv[0] is the name of the program itself 8 | # sys.argv[1] will be the regex specfied at the command line 9 | regex = sys.argv[1] 10 | 11 | # for every line passed into the script 12 | for line in sys.stdin: 13 | # if it matches the regex, write it to stdout 14 | if re.search(regex, line): 15 | sys.stdout.write(line) -------------------------------------------------------------------------------- /first-edition/code/plot_state_borders.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | segments = [] 4 | points = [] 5 | 6 | lat_long_regex = r""): 13 | for p1, p2 in zip(points, points[1:]): 14 | segments.append((p1, p2)) 15 | points = [] 16 | s = re.search(lat_long_regex, line) 17 | if s: 18 | lat, lon = s.groups() 19 | points.append((float(lon), float(lat))) 20 | 21 | def plot_state_borders(plt, color='0.8'): 22 | for (lon1, lat1), (lon2, lat2) in segments: 23 | plt.plot([lon1, lon2], [lat1, lat2], color=color) -------------------------------------------------------------------------------- /first-edition/code-python3/plot_state_borders.py: -------------------------------------------------------------------------------- 1 | import re 2 | import matplotlib.pyplot as plt 3 | 4 | segments = [] 5 | points = [] 6 | 7 | lat_long_regex = r""): 14 | for p1, p2 in zip(points, points[1:]): 15 | segments.append((p1, p2)) 16 | points = [] 17 | s = re.search(lat_long_regex, line) 18 | if s: 19 | lat, lon = s.groups() 20 | points.append((float(lon), float(lat))) 21 | 22 | def plot_state_borders(color='0.8'): 23 | for (lon1, lat1), (lon2, lat2) in segments: 24 | plt.plot([lon1, lon2], [lat1, lat2], color=color) 25 | 
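`egrep.py` and `line_count.py` above are meant to be chained at the shell, reading from stdin and writing to stdout, roughly `cat some_file.txt | python egrep.py "[0-9]" | python line_count.py`. Here is a minimal in-process sketch (toy data, not from the book) of the same filter-then-count pattern:

```python
import re

# Toy stand-in for the lines that would arrive on stdin.
lines = ["first line", "line 2", "another line", "line 44"]
regex = r"[0-9]"

# egrep.py step: keep only the lines matching the regex.
matching = [line for line in lines if re.search(regex, line)]

# line_count.py step: count whatever made it through.
count = sum(1 for _ in matching)

assert matching == ["line 2", "line 44"]
assert count == 2
```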
-------------------------------------------------------------------------------- /scratch/nlp_advanced.py: -------------------------------------------------------------------------------- 1 | from scratch.deep_learning import Optimizer, Layer 2 | 3 | class EmbeddingOptimizer(Optimizer): 4 | """ 5 | Optimized for the case where there are 6 | only embedding layers with single id updates. 7 | """ 8 | def __init__(self, learning_rate: float) -> None: 9 | self.lr = learning_rate 10 | 11 | def step(self, layer: Layer) -> None: 12 | for param, grad in zip(layer.params(), layer.grads()): 13 | # Find the first (only) row with nonzero values. 14 | for idx, row in enumerate(grad): 15 | if row[0] != 0: 16 | break 17 | 18 | # Then update just that row. 19 | for j in range(len(row)): 20 | param[idx][j] -= grad[idx][j] * self.lr 21 | -------------------------------------------------------------------------------- /first-edition/code-python3/most_common_words.py: -------------------------------------------------------------------------------- 1 | # most_common_words.py 2 | import sys 3 | from collections import Counter 4 | 5 | if __name__ == "__main__": 6 | 7 | # pass in number of words as first argument 8 | try: 9 | num_words = int(sys.argv[1]) 10 | except: 11 | print("usage: most_common_words.py num_words") 12 | sys.exit(1) # non-zero exit code indicates error 13 | 14 | counter = Counter(word.lower() 15 | for line in sys.stdin 16 | for word in line.strip().split() 17 | if word) 18 | 19 | for word, count in counter.most_common(num_words): 20 | sys.stdout.write(str(count)) 21 | sys.stdout.write("\t") 22 | sys.stdout.write(word) 23 | sys.stdout.write("\n") 24 | -------------------------------------------------------------------------------- /first-edition/code/most_common_words.py: -------------------------------------------------------------------------------- 1 | # most_common_words.py 2 | import sys 3 | from collections import Counter 4 | 5 | if __name__ == "__main__": 6 | 7 | # pass in number of words as first argument 8 | try: 9 | num_words = int(sys.argv[1]) 10 | except: 11 | print "usage: most_common_words.py num_words" 12 | sys.exit(1) # non-zero exit code indicates error 13 | 14 | counter = Counter(word.lower() 15 | for line in sys.stdin 16 | for word in line.strip().split() 17 | if word) 18 | 19 | for word, count in counter.most_common(num_words): 20 | sys.stdout.write(str(count)) 21 | sys.stdout.write("\t") 22 | sys.stdout.write(word) 23 | sys.stdout.write("\n") -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Joel Grus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
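`most_common_words.py` streams whitespace-separated words from stdin into a `Counter` and prints the top `num_words`. A minimal in-process sketch (made-up sample text) of the same counting step:

```python
from collections import Counter

text = "the quick brown fox jumps over the lazy dog and the cat"

counter = Counter(word.lower()
                  for word in text.strip().split()
                  if word)

assert counter.most_common(1) == [("the", 3)]
```

At the shell the script would sit at the end of a pipe, e.g. `cat some_file.txt | python most_common_words.py 10`.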
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /first-edition/code-python3/machine_learning.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import math, random 3 | 4 | # 5 | # data splitting 6 | # 7 | 8 | def split_data(data, prob): 9 | """split data into fractions [prob, 1 - prob]""" 10 | results = [], [] 11 | for row in data: 12 | results[0 if random.random() < prob else 1].append(row) 13 | return results 14 | 15 | def train_test_split(x, y, test_pct): 16 | data = list(zip(x, y)) # pair corresponding values 17 | train, test = split_data(data, 1 - test_pct) # split the dataset of pairs 18 | x_train, y_train = list(zip(*train)) # magical un-zip trick 19 | x_test, y_test = list(zip(*test)) 20 | return x_train, x_test, y_train, y_test 21 | 22 | # 23 | # correctness 24 | # 25 | 26 | def accuracy(tp, fp, fn, tn): 27 | correct = tp + tn 28 | total = tp + fp + fn + tn 29 | return correct / total 30 | 31 | def precision(tp, fp, fn, tn): 32 | return tp / (tp + fp) 33 | 34 | def recall(tp, fp, fn, tn): 35 | return tp / (tp + fn) 36 | 37 | def f1_score(tp, fp, fn, tn): 38 | p = precision(tp, fp, fn, tn) 39 | r = recall(tp, fp, fn, tn) 40 | 41 | return 2 * p * r / (p + r) 42 | 43 | if __name__ == "__main__": 44 | 45 | print("accuracy(70, 4930, 13930, 981070)", accuracy(70, 4930, 13930, 981070)) 46 | print("precision(70, 4930, 13930, 981070)", precision(70, 4930, 13930, 981070)) 47 | print("recall(70, 4930, 13930, 981070)", recall(70, 4930, 13930, 981070)) 48 | print("f1_score(70, 4930, 13930, 981070)", f1_score(70, 4930, 13930, 981070)) 49 | -------------------------------------------------------------------------------- /first-edition/code/machine_learning.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter 3 | import math, random 4 | 5 | # 6 | # data splitting 7 | # 8 | 9 | def split_data(data, prob): 10 | """split data into fractions [prob, 1 - prob]""" 11 | results = [], [] 12 | for row in data: 13 | results[0 if random.random() < prob else 1].append(row) 14 | return results 15 | 16 | def train_test_split(x, y, test_pct): 17 | data = zip(x, y) # pair corresponding values 18 | train, test = split_data(data, 1 - test_pct) # split the dataset of pairs 19 | x_train, y_train = zip(*train) # magical un-zip trick 20 | x_test, y_test = zip(*test) 21 | return x_train, x_test, y_train, y_test 22 | 23 | # 24 | # correctness 25 | # 26 | 27 | def accuracy(tp, fp, fn, tn): 28 | correct = tp + tn 29 | total = tp + fp + fn + tn 30 | return correct / total 31 | 32 | def precision(tp, fp, fn, tn): 33 | return tp / (tp + fp) 34 | 35 | def recall(tp, fp, fn, tn): 36 | return tp / (tp + fn) 37 | 38 | def f1_score(tp, fp, fn, tn): 39 | p = precision(tp, fp, fn, tn) 40 | r = recall(tp, fp, fn, tn) 41 | 42 | return 2 * p * r / (p + r) 43 | 44 | if __name__ == "__main__": 45 | 46 | print "accuracy(70, 4930, 13930, 981070)", accuracy(70, 4930, 13930, 981070) 47 | print "precision(70, 4930, 13930, 981070)", precision(70, 4930, 13930, 981070) 48 | print "recall(70, 4930, 13930, 981070)", recall(70, 4930, 13930, 981070) 49 | print 
"f1_score(70, 4930, 13930, 981070)", f1_score(70, 4930, 13930, 981070) 50 | 51 | -------------------------------------------------------------------------------- /scratch/machine_learning.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import TypeVar, List, Tuple 3 | X = TypeVar('X') # generic type to represent a data point 4 | 5 | def split_data(data: List[X], prob: float) -> Tuple[List[X], List[X]]: 6 | """Split data into fractions [prob, 1 - prob]""" 7 | data = data[:] # Make a shallow copy 8 | random.shuffle(data) # because shuffle modifies the list. 9 | cut = int(len(data) * prob) # Use prob to find a cutoff 10 | return data[:cut], data[cut:] # and split the shuffled list there. 11 | 12 | data = [n for n in range(1000)] 13 | train, test = split_data(data, 0.75) 14 | 15 | # The proportions should be correct 16 | assert len(train) == 750 17 | assert len(test) == 250 18 | 19 | # And the original data should be preserved (in some order) 20 | assert sorted(train + test) == data 21 | 22 | Y = TypeVar('Y') # generic type to represent output variables 23 | 24 | def train_test_split(xs: List[X], 25 | ys: List[Y], 26 | test_pct: float) -> Tuple[List[X], List[X], List[Y], List[Y]]: 27 | # Generate the indices and split them. 28 | idxs = [i for i in range(len(xs))] 29 | train_idxs, test_idxs = split_data(idxs, 1 - test_pct) 30 | 31 | return ([xs[i] for i in train_idxs], # x_train 32 | [xs[i] for i in test_idxs], # x_test 33 | [ys[i] for i in train_idxs], # y_train 34 | [ys[i] for i in test_idxs]) # y_test 35 | 36 | xs = [x for x in range(1000)] # xs are 1 ... 1000 37 | ys = [2 * x for x in xs] # each y_i is twice x_i 38 | x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.25) 39 | 40 | # Check that the proportions are correct 41 | assert len(x_train) == len(y_train) == 750 42 | assert len(x_test) == len(y_test) == 250 43 | 44 | # Check that the corresponding data points are paired correctly. 
45 | assert all(y == 2 * x for x, y in zip(x_train, y_train)) 46 | assert all(y == 2 * x for x, y in zip(x_test, y_test)) 47 | 48 | def accuracy(tp: int, fp: int, fn: int, tn: int) -> float: 49 | correct = tp + tn 50 | total = tp + fp + fn + tn 51 | return correct / total 52 | 53 | assert accuracy(70, 4930, 13930, 981070) == 0.98114 54 | 55 | def precision(tp: int, fp: int, fn: int, tn: int) -> float: 56 | return tp / (tp + fp) 57 | 58 | assert precision(70, 4930, 13930, 981070) == 0.014 59 | 60 | def recall(tp: int, fp: int, fn: int, tn: int) -> float: 61 | return tp / (tp + fn) 62 | 63 | assert recall(70, 4930, 13930, 981070) == 0.005 64 | 65 | def f1_score(tp: int, fp: int, fn: int, tn: int) -> float: 66 | p = precision(tp, fp, fn, tn) 67 | r = recall(tp, fp, fn, tn) 68 | 69 | return 2 * p * r / (p + r) 70 | 71 | -------------------------------------------------------------------------------- /scratch/simple_linear_regression.py: -------------------------------------------------------------------------------- 1 | def predict(alpha: float, beta: float, x_i: float) -> float: 2 | return beta * x_i + alpha 3 | 4 | def error(alpha: float, beta: float, x_i: float, y_i: float) -> float: 5 | """ 6 | The error from predicting beta * x_i + alpha 7 | when the actual value is y_i 8 | """ 9 | return predict(alpha, beta, x_i) - y_i 10 | 11 | from scratch.linear_algebra import Vector 12 | 13 | def sum_of_sqerrors(alpha: float, beta: float, x: Vector, y: Vector) -> float: 14 | return sum(error(alpha, beta, x_i, y_i) ** 2 15 | for x_i, y_i in zip(x, y)) 16 | 17 | from typing import Tuple 18 | from scratch.linear_algebra import Vector 19 | from scratch.statistics import correlation, standard_deviation, mean 20 | 21 | def least_squares_fit(x: Vector, y: Vector) -> Tuple[float, float]: 22 | """ 23 | Given two vectors x and y, 24 | find the least-squares values of alpha and beta 25 | """ 26 | beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x) 27 | alpha = mean(y) - beta * mean(x) 28 | return alpha, beta 29 | 30 | x = [i for i in range(-100, 110, 10)] 31 | y = [3 * i - 5 for i in x] 32 | 33 | # Should find that y = 3x - 5 34 | assert least_squares_fit(x, y) == (-5, 3) 35 | 36 | from scratch.statistics import num_friends_good, daily_minutes_good 37 | 38 | alpha, beta = least_squares_fit(num_friends_good, daily_minutes_good) 39 | assert 22.9 < alpha < 23.0 40 | assert 0.9 < beta < 0.905 41 | 42 | from scratch.statistics import de_mean 43 | 44 | def total_sum_of_squares(y: Vector) -> float: 45 | """the total squared variation of y_i's from their mean""" 46 | return sum(v ** 2 for v in de_mean(y)) 47 | 48 | def r_squared(alpha: float, beta: float, x: Vector, y: Vector) -> float: 49 | """ 50 | the fraction of variation in y captured by the model, which equals 51 | 1 - the fraction of variation in y not captured by the model 52 | """ 53 | return 1.0 - (sum_of_sqerrors(alpha, beta, x, y) / 54 | total_sum_of_squares(y)) 55 | 56 | rsq = r_squared(alpha, beta, num_friends_good, daily_minutes_good) 57 | assert 0.328 < rsq < 0.330 58 | 59 | def main(): 60 | import random 61 | import tqdm 62 | from scratch.gradient_descent import gradient_step 63 | 64 | num_epochs = 10000 65 | random.seed(0) 66 | 67 | guess = [random.random(), random.random()] # choose random value to start 68 | 69 | learning_rate = 0.00001 70 | 71 | with tqdm.trange(num_epochs) as t: 72 | for _ in t: 73 | alpha, beta = guess 74 | 75 | # Partial derivative of loss with respect to alpha 76 | grad_a = sum(2 * error(alpha, beta, x_i, 
y_i) 77 | for x_i, y_i in zip(num_friends_good, 78 | daily_minutes_good)) 79 | 80 | # Partial derivative of loss with respect to beta 81 | grad_b = sum(2 * error(alpha, beta, x_i, y_i) * x_i 82 | for x_i, y_i in zip(num_friends_good, 83 | daily_minutes_good)) 84 | 85 | # Compute loss to stick in the tqdm description 86 | loss = sum_of_sqerrors(alpha, beta, 87 | num_friends_good, daily_minutes_good) 88 | t.set_description(f"loss: {loss:.3f}") 89 | 90 | # Finally, update the guess 91 | guess = gradient_step(guess, [grad_a, grad_b], -learning_rate) 92 | 93 | # We should get pretty much the same results: 94 | alpha, beta = guess 95 | assert 22.9 < alpha < 23.0 96 | assert 0.9 < beta < 0.905 97 | 98 | if __name__ == "__main__": main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Data Science from Scratch 2 | ========================= 3 | 4 | Here's all the code and examples from the second edition of my book _Data Science from Scratch_. They require at least Python 3.6. 5 | 6 | (If you're looking for the code and examples from the first edition, that's in the `first-edition` folder.) 7 | 8 | If you want to use the code, you should be able to clone the repo and just do things like 9 | 10 | ``` 11 | In [1]: from scratch.linear_algebra import dot 12 | 13 | In [2]: dot([1, 2, 3], [4, 5, 6]) 14 | Out[2]: 32 15 | ``` 16 | 17 | and so on and so forth. 18 | 19 | Two notes: 20 | 21 | 1. In order to use the library like this, you need to be in the root directory (that is, the directory that contains the `scratch` folder). If you are in the `scratch` directory itself, the imports won't work. 22 | 23 | 2. It's possible that it will just work. It's also possible that you may need to add the root directory to your `PYTHONPATH`, if you are on Linux or OSX this is as simple as 24 | 25 | ``` 26 | export PYTHONPATH=/path/to/where/you/cloned/this/repo 27 | ``` 28 | 29 | (substituting in the real path, of course). 30 | 31 | If you are on Windows, it's [potentially more complicated](https://stackoverflow.com/questions/3701646/how-to-add-to-the-pythonpath-in-windows-so-it-finds-my-modules-packages). 32 | 33 | ## Table of Contents 34 | 35 | 1. Introduction 36 | 2. A Crash Course in Python 37 | 3. [Visualizing Data](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/visualization.py) 38 | 4. [Linear Algebra](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/linear_algebra.py) 39 | 5. [Statistics](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/statistics.py) 40 | 6. [Probability](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/probability.py) 41 | 7. [Hypothesis and Inference](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/inference.py) 42 | 8. [Gradient Descent](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/gradient_descent.py) 43 | 9. [Getting Data](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/getting_data.py) 44 | 10. [Working With Data](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/working_with_data.py) 45 | 11. [Machine Learning](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/machine_learning.py) 46 | 12. [k-Nearest Neighbors](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/nearest_neighbors.py) 47 | 13. 
[Naive Bayes](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/naive_bayes.py) 48 | 14. [Simple Linear Regression](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/simple_linear_regression.py) 49 | 15. [Multiple Regression](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/multiple_regression.py) 50 | 16. [Logistic Regression](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/logistic_regression.py) 51 | 17. [Decision Trees](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/decision_trees.py) 52 | 18. [Neural Networks](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/neural_networks.py) 53 | 19. [Deep Learning] 54 | 20. [Clustering](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/clustering.py) 55 | 21. [Natural Language Processing](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/nlp.py) 56 | 22. [Network Analysis](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/network_analysis.py) 57 | 23. [Recommender Systems](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/recommender_systems.py) 58 | 24. [Databases and SQL](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/databases.py) 59 | 25. [MapReduce](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/mapreduce.py) 60 | 26. Data Ethics 61 | 27. Go Forth And Do Data Science 62 | -------------------------------------------------------------------------------- /first-edition/README.md: -------------------------------------------------------------------------------- 1 | Data Science from Scratch 2 | ========================= 3 | 4 | Here's all the code and examples from the first edition of my book __[Data Science from Scratch](http://joelgrus.com/2015/04/26/data-science-from-scratch-first-principles-with-python/)__. The `code` directory contains Python 2.7 versions, and the `code-python3` direction contains the Python 3 equivalents. (I tested them in 3.5, but they should work in any 3.x.) 5 | 6 | 7 | Each can be imported as a module, for example (after you cd into the /code directory): 8 | 9 | ```python 10 | from linear_algebra import distance, vector_mean 11 | v = [1, 2, 3] 12 | w = [4, 5, 6] 13 | print distance(v, w) 14 | print vector_mean([v, w]) 15 | ``` 16 | 17 | Or can be run from the command line to get a demo of what it does (and to execute the examples from the book): 18 | 19 | ```bat 20 | python recommender_systems.py 21 | ``` 22 | 23 | Additionally, I've collected all the [links](https://github.com/joelgrus/data-science-from-scratch/blob/master/links.md) from the book. 24 | 25 | And, by popular demand, I made an index of functions defined in the book, by chapter and page number. 26 | The data is in a [spreadsheet](https://docs.google.com/spreadsheets/d/1mjGp94ehfxWOEaAFJsPiHqIeOioPH1vN1PdOE6v1az8/edit?usp=sharing), or I also made a toy (experimental) [searchable webapp](http://joelgrus.com/experiments/function-index/). 27 | 28 | ## Table of Contents 29 | 30 | 1. Introduction 31 | 2. A Crash Course in Python 32 | 3. [Visualizing Data](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/visualizing_data.py) 33 | 4. [Linear Algebra](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/linear_algebra.py) 34 | 5. 
[Statistics](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/statistics.py) 35 | 6. [Probability](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/probability.py) 36 | 7. [Hypothesis and Inference](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/hypothesis_and_inference.py) 37 | 8. [Gradient Descent](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/gradient_descent.py) 38 | 9. [Getting Data](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/getting_data.py) 39 | 10. [Working With Data](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/working_with_data.py) 40 | 11. [Machine Learning](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/machine_learning.py) 41 | 12. [k-Nearest Neighbors](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/nearest_neighbors.py) 42 | 13. [Naive Bayes](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/naive_bayes.py) 43 | 14. [Simple Linear Regression](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/simple_linear_regression.py) 44 | 15. [Multiple Regression](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/multiple_regression.py) 45 | 16. [Logistic Regression](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/logistic_regression.py) 46 | 17. [Decision Trees](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/decision_trees.py) 47 | 18. [Neural Networks](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/neural_networks.py) 48 | 19. [Clustering](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/clustering.py) 49 | 20. [Natural Language Processing](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/natural_language_processing.py) 50 | 21. [Network Analysis](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/network_analysis.py) 51 | 22. [Recommender Systems](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/recommender_systems.py) 52 | 23. [Databases and SQL](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/databases.py) 53 | 24. [MapReduce](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/mapreduce.py) 54 | 25. 
Go Forth And Do Data Science 55 | -------------------------------------------------------------------------------- /first-edition/code-python3/simple_linear_regression.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | from linear_algebra import vector_subtract 3 | from stats import mean, correlation, standard_deviation, de_mean 4 | from gradient_descent import minimize_stochastic 5 | import math, random 6 | 7 | def predict(alpha, beta, x_i): 8 | return beta * x_i + alpha 9 | 10 | def error(alpha, beta, x_i, y_i): 11 | return y_i - predict(alpha, beta, x_i) 12 | 13 | def sum_of_squared_errors(alpha, beta, x, y): 14 | return sum(error(alpha, beta, x_i, y_i) ** 2 15 | for x_i, y_i in zip(x, y)) 16 | 17 | def least_squares_fit(x,y): 18 | """given training values for x and y, 19 | find the least-squares values of alpha and beta""" 20 | beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x) 21 | alpha = mean(y) - beta * mean(x) 22 | return alpha, beta 23 | 24 | def total_sum_of_squares(y): 25 | """the total squared variation of y_i's from their mean""" 26 | return sum(v ** 2 for v in de_mean(y)) 27 | 28 | def r_squared(alpha, beta, x, y): 29 | """the fraction of variation in y captured by the model, which equals 30 | 1 - the fraction of variation in y not captured by the model""" 31 | 32 | return 1.0 - (sum_of_squared_errors(alpha, beta, x, y) / 33 | total_sum_of_squares(y)) 34 | 35 | def squared_error(x_i, y_i, theta): 36 | alpha, beta = theta 37 | return error(alpha, beta, x_i, y_i) ** 2 38 | 39 | def squared_error_gradient(x_i, y_i, theta): 40 | alpha, beta = theta 41 | return [-2 * error(alpha, beta, x_i, y_i), # alpha partial derivative 42 | -2 * error(alpha, beta, x_i, y_i) * x_i] # beta partial derivative 43 | 44 | if __name__ == "__main__": 45 | 46 | num_friends_good = [49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 47 | daily_minutes_good = 
[68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84] 48 | 49 | alpha, beta = least_squares_fit(num_friends_good, daily_minutes_good) 50 | print("alpha", alpha) 51 | print("beta", beta) 52 | 53 | print("r-squared", r_squared(alpha, beta, num_friends_good, daily_minutes_good)) 54 | 55 | print() 56 | 57 | print("gradient descent:") 58 | # choose random value to start 59 | random.seed(0) 60 | theta = [random.random(), random.random()] 61 | alpha, beta = minimize_stochastic(squared_error, 62 | squared_error_gradient, 63 | num_friends_good, 64 | daily_minutes_good, 65 | theta, 66 | 0.0001) 67 | print("alpha", alpha) 68 | print("beta", beta) 69 | -------------------------------------------------------------------------------- /first-edition/code-python3/linear_algebra.py: -------------------------------------------------------------------------------- 1 | # -*- coding: iso-8859-15 -*- 2 | 3 | import re, math, random # regexes, math functions, random numbers 4 | import matplotlib.pyplot as plt # pyplot 5 | from collections import defaultdict, Counter 6 | from functools import partial, reduce 7 | 8 | # 9 | # functions for working with vectors 10 | # 11 | 12 | def vector_add(v, w): 13 | """adds two vectors componentwise""" 14 | return [v_i + w_i for v_i, w_i in zip(v,w)] 15 | 16 | def vector_subtract(v, w): 17 | """subtracts two vectors componentwise""" 18 | return [v_i - w_i for v_i, w_i in zip(v,w)] 19 | 20 | def vector_sum(vectors): 21 | return reduce(vector_add, vectors) 22 | 23 | def scalar_multiply(c, v): 24 | return [c * v_i for v_i in v] 25 | 26 | def vector_mean(vectors): 27 | """compute the vector whose i-th element is the mean of the 28 | i-th elements of the input vectors""" 29 | n = len(vectors) 30 | return scalar_multiply(1/n, vector_sum(vectors)) 31 | 32 | def dot(v, w): 33 | """v_1 * w_1 + ... + v_n * w_n""" 34 | return sum(v_i * w_i for v_i, w_i in zip(v, w)) 35 | 36 | def sum_of_squares(v): 37 | """v_1 * v_1 + ... 
+ v_n * v_n""" 38 | return dot(v, v) 39 | 40 | def magnitude(v): 41 | return math.sqrt(sum_of_squares(v)) 42 | 43 | def squared_distance(v, w): 44 | return sum_of_squares(vector_subtract(v, w)) 45 | 46 | def distance(v, w): 47 | return math.sqrt(squared_distance(v, w)) 48 | 49 | # 50 | # functions for working with matrices 51 | # 52 | 53 | def shape(A): 54 | num_rows = len(A) 55 | num_cols = len(A[0]) if A else 0 56 | return num_rows, num_cols 57 | 58 | def get_row(A, i): 59 | return A[i] 60 | 61 | def get_column(A, j): 62 | return [A_i[j] for A_i in A] 63 | 64 | def make_matrix(num_rows, num_cols, entry_fn): 65 | """returns a num_rows x num_cols matrix 66 | whose (i,j)-th entry is entry_fn(i, j)""" 67 | return [[entry_fn(i, j) for j in range(num_cols)] 68 | for i in range(num_rows)] 69 | 70 | def is_diagonal(i, j): 71 | """1's on the 'diagonal', 0's everywhere else""" 72 | return 1 if i == j else 0 73 | 74 | identity_matrix = make_matrix(5, 5, is_diagonal) 75 | 76 | # user 0 1 2 3 4 5 6 7 8 9 77 | # 78 | friendships = [[0, 1, 1, 0, 0, 0, 0, 0, 0, 0], # user 0 79 | [1, 0, 1, 1, 0, 0, 0, 0, 0, 0], # user 1 80 | [1, 1, 0, 1, 0, 0, 0, 0, 0, 0], # user 2 81 | [0, 1, 1, 0, 1, 0, 0, 0, 0, 0], # user 3 82 | [0, 0, 0, 1, 0, 1, 0, 0, 0, 0], # user 4 83 | [0, 0, 0, 0, 1, 0, 1, 1, 0, 0], # user 5 84 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 6 85 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 7 86 | [0, 0, 0, 0, 0, 0, 1, 1, 0, 1], # user 8 87 | [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]] # user 9 88 | 89 | ##### 90 | # DELETE DOWN 91 | # 92 | 93 | 94 | def matrix_add(A, B): 95 | if shape(A) != shape(B): 96 | raise ArithmeticError("cannot add matrices with different shapes") 97 | 98 | num_rows, num_cols = shape(A) 99 | def entry_fn(i, j): return A[i][j] + B[i][j] 100 | 101 | return make_matrix(num_rows, num_cols, entry_fn) 102 | 103 | 104 | def make_graph_dot_product_as_vector_projection(plt): 105 | 106 | v = [2, 1] 107 | w = [math.sqrt(.25), math.sqrt(.75)] 108 | c = dot(v, w) 109 | vonw = scalar_multiply(c, w) 110 | o = [0,0] 111 | 112 | plt.arrow(0, 0, v[0], v[1], 113 | width=0.002, head_width=.1, length_includes_head=True) 114 | plt.annotate("v", v, xytext=[v[0] + 0.1, v[1]]) 115 | plt.arrow(0 ,0, w[0], w[1], 116 | width=0.002, head_width=.1, length_includes_head=True) 117 | plt.annotate("w", w, xytext=[w[0] - 0.1, w[1]]) 118 | plt.arrow(0, 0, vonw[0], vonw[1], length_includes_head=True) 119 | plt.annotate(u"(v•w)w", vonw, xytext=[vonw[0] - 0.1, vonw[1] + 0.1]) 120 | plt.arrow(v[0], v[1], vonw[0] - v[0], vonw[1] - v[1], 121 | linestyle='dotted', length_includes_head=True) 122 | plt.scatter(*zip(v,w,o),marker='.') 123 | plt.axis('equal') 124 | plt.show() 125 | -------------------------------------------------------------------------------- /first-edition/code/simple_linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter, defaultdict 3 | from linear_algebra import vector_subtract 4 | from statistics import mean, correlation, standard_deviation, de_mean 5 | from gradient_descent import minimize_stochastic 6 | import math, random 7 | 8 | def predict(alpha, beta, x_i): 9 | return beta * x_i + alpha 10 | 11 | def error(alpha, beta, x_i, y_i): 12 | return y_i - predict(alpha, beta, x_i) 13 | 14 | def sum_of_squared_errors(alpha, beta, x, y): 15 | return sum(error(alpha, beta, x_i, y_i) ** 2 16 | for x_i, y_i in zip(x, y)) 17 | 18 | def least_squares_fit(x,y): 19 | """given training values for x and y, 20 | 
find the least-squares values of alpha and beta""" 21 | beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x) 22 | alpha = mean(y) - beta * mean(x) 23 | return alpha, beta 24 | 25 | def total_sum_of_squares(y): 26 | """the total squared variation of y_i's from their mean""" 27 | return sum(v ** 2 for v in de_mean(y)) 28 | 29 | def r_squared(alpha, beta, x, y): 30 | """the fraction of variation in y captured by the model, which equals 31 | 1 - the fraction of variation in y not captured by the model""" 32 | 33 | return 1.0 - (sum_of_squared_errors(alpha, beta, x, y) / 34 | total_sum_of_squares(y)) 35 | 36 | def squared_error(x_i, y_i, theta): 37 | alpha, beta = theta 38 | return error(alpha, beta, x_i, y_i) ** 2 39 | 40 | def squared_error_gradient(x_i, y_i, theta): 41 | alpha, beta = theta 42 | return [-2 * error(alpha, beta, x_i, y_i), # alpha partial derivative 43 | -2 * error(alpha, beta, x_i, y_i) * x_i] # beta partial derivative 44 | 45 | if __name__ == "__main__": 46 | 47 | num_friends_good = [49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 48 | daily_minutes_good = [68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84] 49 | 50 | alpha, beta = least_squares_fit(num_friends_good, daily_minutes_good) 51 | print "alpha", alpha 52 | print "beta", beta 53 | 54 | print "r-squared", r_squared(alpha, beta, num_friends_good, daily_minutes_good) 55 | 56 | print 57 | 58 | print "gradient descent:" 59 | # choose random value to start 60 | random.seed(0) 61 | theta = [random.random(), random.random()] 62 | alpha, beta = minimize_stochastic(squared_error, 63 | squared_error_gradient, 64 | num_friends_good, 65 | daily_minutes_good, 66 | theta, 67 | 0.0001) 68 | print "alpha", alpha 69 | print "beta", beta -------------------------------------------------------------------------------- /first-edition/code/linear_algebra.py: -------------------------------------------------------------------------------- 1 | # -*- coding: iso-8859-15 -*- 2 | 3 | from __future__ import division # want 3 / 
2 == 1.5 4 | import re, math, random # regexes, math functions, random numbers 5 | import matplotlib.pyplot as plt # pyplot 6 | from collections import defaultdict, Counter 7 | from functools import partial 8 | 9 | # 10 | # functions for working with vectors 11 | # 12 | 13 | def vector_add(v, w): 14 | """adds two vectors componentwise""" 15 | return [v_i + w_i for v_i, w_i in zip(v,w)] 16 | 17 | def vector_subtract(v, w): 18 | """subtracts two vectors componentwise""" 19 | return [v_i - w_i for v_i, w_i in zip(v,w)] 20 | 21 | def vector_sum(vectors): 22 | return reduce(vector_add, vectors) 23 | 24 | def scalar_multiply(c, v): 25 | return [c * v_i for v_i in v] 26 | 27 | # this isn't right if you don't from __future__ import division 28 | def vector_mean(vectors): 29 | """compute the vector whose i-th element is the mean of the 30 | i-th elements of the input vectors""" 31 | n = len(vectors) 32 | return scalar_multiply(1/n, vector_sum(vectors)) 33 | 34 | def dot(v, w): 35 | """v_1 * w_1 + ... + v_n * w_n""" 36 | return sum(v_i * w_i for v_i, w_i in zip(v, w)) 37 | 38 | def sum_of_squares(v): 39 | """v_1 * v_1 + ... + v_n * v_n""" 40 | return dot(v, v) 41 | 42 | def magnitude(v): 43 | return math.sqrt(sum_of_squares(v)) 44 | 45 | def squared_distance(v, w): 46 | return sum_of_squares(vector_subtract(v, w)) 47 | 48 | def distance(v, w): 49 | return math.sqrt(squared_distance(v, w)) 50 | 51 | # 52 | # functions for working with matrices 53 | # 54 | 55 | def shape(A): 56 | num_rows = len(A) 57 | num_cols = len(A[0]) if A else 0 58 | return num_rows, num_cols 59 | 60 | def get_row(A, i): 61 | return A[i] 62 | 63 | def get_column(A, j): 64 | return [A_i[j] for A_i in A] 65 | 66 | def make_matrix(num_rows, num_cols, entry_fn): 67 | """returns a num_rows x num_cols matrix 68 | whose (i,j)-th entry is entry_fn(i, j)""" 69 | return [[entry_fn(i, j) for j in range(num_cols)] 70 | for i in range(num_rows)] 71 | 72 | def is_diagonal(i, j): 73 | """1's on the 'diagonal', 0's everywhere else""" 74 | return 1 if i == j else 0 75 | 76 | identity_matrix = make_matrix(5, 5, is_diagonal) 77 | 78 | # user 0 1 2 3 4 5 6 7 8 9 79 | # 80 | friendships = [[0, 1, 1, 0, 0, 0, 0, 0, 0, 0], # user 0 81 | [1, 0, 1, 1, 0, 0, 0, 0, 0, 0], # user 1 82 | [1, 1, 0, 1, 0, 0, 0, 0, 0, 0], # user 2 83 | [0, 1, 1, 0, 1, 0, 0, 0, 0, 0], # user 3 84 | [0, 0, 0, 1, 0, 1, 0, 0, 0, 0], # user 4 85 | [0, 0, 0, 0, 1, 0, 1, 1, 0, 0], # user 5 86 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 6 87 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 7 88 | [0, 0, 0, 0, 0, 0, 1, 1, 0, 1], # user 8 89 | [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]] # user 9 90 | 91 | ##### 92 | # DELETE DOWN 93 | # 94 | 95 | 96 | def matrix_add(A, B): 97 | if shape(A) != shape(B): 98 | raise ArithmeticError("cannot add matrices with different shapes") 99 | 100 | num_rows, num_cols = shape(A) 101 | def entry_fn(i, j): return A[i][j] + B[i][j] 102 | 103 | return make_matrix(num_rows, num_cols, entry_fn) 104 | 105 | 106 | def make_graph_dot_product_as_vector_projection(plt): 107 | 108 | v = [2, 1] 109 | w = [math.sqrt(.25), math.sqrt(.75)] 110 | c = dot(v, w) 111 | vonw = scalar_multiply(c, w) 112 | o = [0,0] 113 | 114 | plt.arrow(0, 0, v[0], v[1], 115 | width=0.002, head_width=.1, length_includes_head=True) 116 | plt.annotate("v", v, xytext=[v[0] + 0.1, v[1]]) 117 | plt.arrow(0 ,0, w[0], w[1], 118 | width=0.002, head_width=.1, length_includes_head=True) 119 | plt.annotate("w", w, xytext=[w[0] - 0.1, w[1]]) 120 | plt.arrow(0, 0, vonw[0], vonw[1], length_includes_head=True) 121 | 
plt.annotate(u"(v•w)w", vonw, xytext=[vonw[0] - 0.1, vonw[1] + 0.1]) 122 | plt.arrow(v[0], v[1], vonw[0] - v[0], vonw[1] - v[1], 123 | linestyle='dotted', length_includes_head=True) 124 | plt.scatter(*zip(v,w,o),marker='.') 125 | plt.axis('equal') 126 | plt.show() 127 | -------------------------------------------------------------------------------- /first-edition/code-python3/probability.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import math, random 3 | 4 | def random_kid(): 5 | return random.choice(["boy", "girl"]) 6 | 7 | def uniform_pdf(x): 8 | return 1 if x >= 0 and x < 1 else 0 9 | 10 | def uniform_cdf(x): 11 | "returns the probability that a uniform random variable is less than x" 12 | if x < 0: return 0 # uniform random is never less than 0 13 | elif x < 1: return x # e.g. P(X < 0.4) = 0.4 14 | else: return 1 # uniform random is always less than 1 15 | 16 | def normal_pdf(x, mu=0, sigma=1): 17 | sqrt_two_pi = math.sqrt(2 * math.pi) 18 | return (math.exp(-(x-mu) ** 2 / 2 / sigma ** 2) / (sqrt_two_pi * sigma)) 19 | 20 | def plot_normal_pdfs(plt): 21 | xs = [x / 10.0 for x in range(-50, 50)] 22 | plt.plot(xs,[normal_pdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1') 23 | plt.plot(xs,[normal_pdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2') 24 | plt.plot(xs,[normal_pdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5') 25 | plt.plot(xs,[normal_pdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1') 26 | plt.legend() 27 | plt.show() 28 | 29 | def normal_cdf(x, mu=0,sigma=1): 30 | return (1 + math.erf((x - mu) / math.sqrt(2) / sigma)) / 2 31 | 32 | def plot_normal_cdfs(plt): 33 | xs = [x / 10.0 for x in range(-50, 50)] 34 | plt.plot(xs,[normal_cdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1') 35 | plt.plot(xs,[normal_cdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2') 36 | plt.plot(xs,[normal_cdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5') 37 | plt.plot(xs,[normal_cdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1') 38 | plt.legend(loc=4) # bottom right 39 | plt.show() 40 | 41 | def inverse_normal_cdf(p, mu=0, sigma=1, tolerance=0.00001): 42 | """find approximate inverse using binary search""" 43 | 44 | # if not standard, compute standard and rescale 45 | if mu != 0 or sigma != 1: 46 | return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance) 47 | 48 | low_z, low_p = -10.0, 0 # normal_cdf(-10) is (very close to) 0 49 | hi_z, hi_p = 10.0, 1 # normal_cdf(10) is (very close to) 1 50 | while hi_z - low_z > tolerance: 51 | mid_z = (low_z + hi_z) / 2 # consider the midpoint 52 | mid_p = normal_cdf(mid_z) # and the cdf's value there 53 | if mid_p < p: 54 | # midpoint is still too low, search above it 55 | low_z, low_p = mid_z, mid_p 56 | elif mid_p > p: 57 | # midpoint is still too high, search below it 58 | hi_z, hi_p = mid_z, mid_p 59 | else: 60 | break 61 | 62 | return mid_z 63 | 64 | def bernoulli_trial(p): 65 | return 1 if random.random() < p else 0 66 | 67 | def binomial(p, n): 68 | return sum(bernoulli_trial(p) for _ in range(n)) 69 | 70 | def make_hist(p, n, num_points): 71 | 72 | data = [binomial(p, n) for _ in range(num_points)] 73 | 74 | # use a bar chart to show the actual binomial samples 75 | histogram = Counter(data) 76 | plt.bar([x - 0.4 for x in histogram.keys()], 77 | [v / num_points for v in histogram.values()], 78 | 0.8, 79 | color='0.75') 80 | 81 | mu = p * n 82 | sigma = math.sqrt(n * p * (1 - p)) 83 | 84 | # use a line chart to show the normal approximation 85 
| xs = range(min(data), max(data) + 1) 86 | ys = [normal_cdf(i + 0.5, mu, sigma) - normal_cdf(i - 0.5, mu, sigma) 87 | for i in xs] 88 | plt.plot(xs,ys) 89 | plt.show() 90 | 91 | 92 | 93 | if __name__ == "__main__": 94 | 95 | # 96 | # CONDITIONAL PROBABILITY 97 | # 98 | 99 | both_girls = 0 100 | older_girl = 0 101 | either_girl = 0 102 | 103 | random.seed(0) 104 | for _ in range(10000): 105 | younger = random_kid() 106 | older = random_kid() 107 | if older == "girl": 108 | older_girl += 1 109 | if older == "girl" and younger == "girl": 110 | both_girls += 1 111 | if older == "girl" or younger == "girl": 112 | either_girl += 1 113 | 114 | print("P(both | older):", both_girls / older_girl) # 0.514 ~ 1/2 115 | print("P(both | either): ", both_girls / either_girl) # 0.342 ~ 1/3 116 | -------------------------------------------------------------------------------- /first-edition/code/probability.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter 3 | import math, random 4 | 5 | def random_kid(): 6 | return random.choice(["boy", "girl"]) 7 | 8 | def uniform_pdf(x): 9 | return 1 if x >= 0 and x < 1 else 0 10 | 11 | def uniform_cdf(x): 12 | "returns the probability that a uniform random variable is less than x" 13 | if x < 0: return 0 # uniform random is never less than 0 14 | elif x < 1: return x # e.g. P(X < 0.4) = 0.4 15 | else: return 1 # uniform random is always less than 1 16 | 17 | def normal_pdf(x, mu=0, sigma=1): 18 | sqrt_two_pi = math.sqrt(2 * math.pi) 19 | return (math.exp(-(x-mu) ** 2 / 2 / sigma ** 2) / (sqrt_two_pi * sigma)) 20 | 21 | def plot_normal_pdfs(plt): 22 | xs = [x / 10.0 for x in range(-50, 50)] 23 | plt.plot(xs,[normal_pdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1') 24 | plt.plot(xs,[normal_pdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2') 25 | plt.plot(xs,[normal_pdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5') 26 | plt.plot(xs,[normal_pdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1') 27 | plt.legend() 28 | plt.show() 29 | 30 | def normal_cdf(x, mu=0,sigma=1): 31 | return (1 + math.erf((x - mu) / math.sqrt(2) / sigma)) / 2 32 | 33 | def plot_normal_cdfs(plt): 34 | xs = [x / 10.0 for x in range(-50, 50)] 35 | plt.plot(xs,[normal_cdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1') 36 | plt.plot(xs,[normal_cdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2') 37 | plt.plot(xs,[normal_cdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5') 38 | plt.plot(xs,[normal_cdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1') 39 | plt.legend(loc=4) # bottom right 40 | plt.show() 41 | 42 | def inverse_normal_cdf(p, mu=0, sigma=1, tolerance=0.00001): 43 | """find approximate inverse using binary search""" 44 | 45 | # if not standard, compute standard and rescale 46 | if mu != 0 or sigma != 1: 47 | return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance) 48 | 49 | low_z, low_p = -10.0, 0 # normal_cdf(-10) is (very close to) 0 50 | hi_z, hi_p = 10.0, 1 # normal_cdf(10) is (very close to) 1 51 | while hi_z - low_z > tolerance: 52 | mid_z = (low_z + hi_z) / 2 # consider the midpoint 53 | mid_p = normal_cdf(mid_z) # and the cdf's value there 54 | if mid_p < p: 55 | # midpoint is still too low, search above it 56 | low_z, low_p = mid_z, mid_p 57 | elif mid_p > p: 58 | # midpoint is still too high, search below it 59 | hi_z, hi_p = mid_z, mid_p 60 | else: 61 | break 62 | 63 | return mid_z 64 | 65 | def bernoulli_trial(p): 66 | return 1 if random.random() 
< p else 0 67 | 68 | def binomial(p, n): 69 | return sum(bernoulli_trial(p) for _ in range(n)) 70 | 71 | def make_hist(p, n, num_points): 72 | 73 | data = [binomial(p, n) for _ in range(num_points)] 74 | 75 | # use a bar chart to show the actual binomial samples 76 | histogram = Counter(data) 77 | plt.bar([x - 0.4 for x in histogram.keys()], 78 | [v / num_points for v in histogram.values()], 79 | 0.8, 80 | color='0.75') 81 | 82 | mu = p * n 83 | sigma = math.sqrt(n * p * (1 - p)) 84 | 85 | # use a line chart to show the normal approximation 86 | xs = range(min(data), max(data) + 1) 87 | ys = [normal_cdf(i + 0.5, mu, sigma) - normal_cdf(i - 0.5, mu, sigma) 88 | for i in xs] 89 | plt.plot(xs,ys) 90 | plt.show() 91 | 92 | 93 | 94 | if __name__ == "__main__": 95 | 96 | # 97 | # CONDITIONAL PROBABILITY 98 | # 99 | 100 | both_girls = 0 101 | older_girl = 0 102 | either_girl = 0 103 | 104 | random.seed(0) 105 | for _ in range(10000): 106 | younger = random_kid() 107 | older = random_kid() 108 | if older == "girl": 109 | older_girl += 1 110 | if older == "girl" and younger == "girl": 111 | both_girls += 1 112 | if older == "girl" or younger == "girl": 113 | either_girl += 1 114 | 115 | print "P(both | older):", both_girls / older_girl # 0.514 ~ 1/2 116 | print "P(both | either): ", both_girls / either_girl # 0.342 ~ 1/3 -------------------------------------------------------------------------------- /first-edition/code-python3/README.md: -------------------------------------------------------------------------------- 1 | # Updating the code from Python 2 to Python 3 2 | 3 | After many requests, here's the code from the book updated from Python 2 to Python 3. 4 | I have been telling people that there aren't too many changes required, but it turned 5 | out there were quite a few. Start-to-finish I'd say the porting took me about 4 hours, 6 | and I'm pretty familiar with the code. I think I got everything, let me know if you find something 7 | that doesn't work in Python 3. 8 | 9 | (For the most part my goal was to get everything to *work* in Python 3, I didn't spend any time on trying to make it *idiomatic* Python 3. Later.) 10 | 11 | Here's a fairly comprehensive list of the issues I ran into. 12 | 13 | ## `print` 14 | 15 | The first and most obvious difference is that in Python 3 `print` takes parentheses. 16 | This means that every 17 | 18 | ``` 19 | print "stuff", 1 20 | ``` 21 | 22 | had to be replaced with 23 | 24 | ``` 25 | print("stuff", 1) 26 | ``` 27 | 28 | This was mostly just tedious. I should have used 2to3. 29 | 30 | ## tuple unpacking 31 | 32 | PEP-3113 eliminates 33 | tuple unpacking in function parameters. In particular, that means that code like 34 | 35 | ``` 36 | lambda (a, b): b 37 | ``` 38 | 39 | has to be replaced with 40 | 41 | ``` 42 | lambda pair: pair[1] 43 | ``` 44 | 45 | This is unfortunate, as I tend to write a lot of code like 46 | 47 | ``` 48 | sorted(words_and_counts, key=lambda (word, count): count, reverse=True) 49 | ``` 50 | 51 | Probably I should have just created a `helpers.py` with a few functions like 52 | 53 | ``` 54 | def fst(pair): return pair[0] 55 | def snd(pair): return pair[1] 56 | ``` 57 | 58 | Maybe next time. 59 | 60 | ## laziness 61 | 62 | In Python 3, laziness is the order of the day. 
In particular, `dict`-like 63 | objects no longer have `.iteritems()` properties, so those all have to be replaced 64 | with `.items()` 65 | 66 | Similarly, `filter` now returns an iterator, so that code like 67 | 68 | ``` 69 | filter(is_even, my_list)[0] 70 | ``` 71 | 72 | doesn't work, and needs to be replaced with 73 | 74 | ``` 75 | list(filter(is_even, my_list))[0] 76 | ``` 77 | 78 | And likewise with `zip`, which in many instances needs to be replaced with `list(zip(...))`. (In particular, this uglies up my magic unzip trick.) 79 | 80 | At least when you try to index into an iterator you get an error. It's potentially worse if you iterate over it expecting `list` behavior. 81 | 82 | In the most subtle case this bit me at (in essence): 83 | 84 | ``` 85 | data = map(clean, data) 86 | x = [row[0] for row in data] 87 | y = [row[1] for row in data] 88 | ``` 89 | 90 | in this case the `map` makes `data` a generator, and once the `x` definition iterates 91 | over it, it's gone. The solution is 92 | 93 | ``` 94 | data = list(map(clean, data)) 95 | ``` 96 | 97 | Similarly, if you have a `dict` then its `.keys()` is lazy, so you have to wrap 98 | it in `list` as well. This is possibly my least favorite change in Python 3. 99 | 100 | A better solution is probably to replace most of these with list comprehensions. 101 | 102 | ## binary mode for CSVs 103 | 104 | In Python 2 it was best practice to open CSV files in binary mode to 105 | make sure you dealt properly with Windows line endings: 106 | 107 | ``` 108 | f = open("some.csv", "rb") 109 | ``` 110 | 111 | In Python 3 that doesn't work for various reasons having to do with raw bytes 112 | and string encodings. Instead you need to open them in text mode and 113 | specify the line ending types: 114 | 115 | ``` 116 | f = open("some.csv", 'r', encoding='utf8', newline='') 117 | ``` 118 | 119 | ## `reduce` 120 | 121 | Guido doesn't like `reduce`, so in Python 3 it's hidden in `functools`. So any code 122 | that uses it needs to add a 123 | 124 | ``` 125 | from functools import reduce 126 | ``` 127 | 128 | ## bad spam characters 129 | 130 | The Spam Assassin corpus files from the naive bayes chapter (are old and) 131 | contain some ugly characters that caused me problems until I tried opening the 132 | files with 133 | 134 | ``` 135 | encoding='ISO-8859-1' 136 | ``` 137 | 138 | # Bugs 139 | 140 | For some reason, my Python 3 topic model in `natural_language_processing` gives slightly different results from the Python 2 version. I suspect this means there is a bug in the port, but I haven't figured out what it is yet. Let me know if you find any more bugs, it's possible there's a lazy `zip` or `map` that I missed. 
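
## worked example: porting the tuple-unpacking sort

For reference, here is one way the tuple-unpacking sort from the section above can be written in Python 3. This is just a minimal sketch with made-up sample data, not code from the book:

```
from operator import itemgetter

# made-up sample data, just to have something to sort
words_and_counts = [("data", 10), ("science", 7), ("scratch", 3)]

# index into the pair explicitly instead of unpacking it in the lambda
by_count = sorted(words_and_counts, key=lambda pair: pair[1], reverse=True)

# or let the standard library do the indexing
by_count = sorted(words_and_counts, key=itemgetter(1), reverse=True)

print(by_count)   # [('data', 10), ('science', 7), ('scratch', 3)]
```

`itemgetter(1)` reads a bit closer to the original intent than `pair[1]`, but either works.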
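
## worked example: lazy `zip` and `map`

Similarly, a quick sketch (again with made-up data) of where the laziness described above bites, and the `list(...)` wrapping that fixes it:

```
pairs = [(1, 'a'), (2, 'b'), (3, 'c')]    # made-up sample data

# the unzip trick still works for direct unpacking...
numbers, letters = zip(*pairs)            # (1, 2, 3) and ('a', 'b', 'c')

# ...but indexing or reusing the result requires an explicit list
unzipped = list(zip(*pairs))              # [(1, 2, 3), ('a', 'b', 'c')]

# a map object is good for one pass only
data = map(str.upper, ["a", "b", "c"])
first = list(data)                        # ['A', 'B', 'C']
second = list(data)                       # [] -- already exhausted

# so materialize it once, up front
data = list(map(str.upper, ["a", "b", "c"]))
```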
141 | -------------------------------------------------------------------------------- /first-edition/code-python3/naive_bayes.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | from machine_learning import split_data 3 | import math, random, re, glob 4 | 5 | def tokenize(message): 6 | message = message.lower() # convert to lowercase 7 | all_words = re.findall("[a-z0-9']+", message) # extract the words 8 | return set(all_words) # remove duplicates 9 | 10 | 11 | def count_words(training_set): 12 | """training set consists of pairs (message, is_spam)""" 13 | counts = defaultdict(lambda: [0, 0]) 14 | for message, is_spam in training_set: 15 | for word in tokenize(message): 16 | counts[word][0 if is_spam else 1] += 1 17 | return counts 18 | 19 | def word_probabilities(counts, total_spams, total_non_spams, k=0.5): 20 | """turn the word_counts into a list of triplets 21 | w, p(w | spam) and p(w | ~spam)""" 22 | return [(w, 23 | (spam + k) / (total_spams + 2 * k), 24 | (non_spam + k) / (total_non_spams + 2 * k)) 25 | for w, (spam, non_spam) in counts.items()] 26 | 27 | def spam_probability(word_probs, message): 28 | message_words = tokenize(message) 29 | log_prob_if_spam = log_prob_if_not_spam = 0.0 30 | 31 | for word, prob_if_spam, prob_if_not_spam in word_probs: 32 | 33 | # for each word in the message, 34 | # add the log probability of seeing it 35 | if word in message_words: 36 | log_prob_if_spam += math.log(prob_if_spam) 37 | log_prob_if_not_spam += math.log(prob_if_not_spam) 38 | 39 | # for each word that's not in the message 40 | # add the log probability of _not_ seeing it 41 | else: 42 | log_prob_if_spam += math.log(1.0 - prob_if_spam) 43 | log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam) 44 | 45 | prob_if_spam = math.exp(log_prob_if_spam) 46 | prob_if_not_spam = math.exp(log_prob_if_not_spam) 47 | return prob_if_spam / (prob_if_spam + prob_if_not_spam) 48 | 49 | 50 | class NaiveBayesClassifier: 51 | 52 | def __init__(self, k=0.5): 53 | self.k = k 54 | self.word_probs = [] 55 | 56 | def train(self, training_set): 57 | 58 | # count spam and non-spam messages 59 | num_spams = len([is_spam 60 | for message, is_spam in training_set 61 | if is_spam]) 62 | num_non_spams = len(training_set) - num_spams 63 | 64 | # run training data through our "pipeline" 65 | word_counts = count_words(training_set) 66 | self.word_probs = word_probabilities(word_counts, 67 | num_spams, 68 | num_non_spams, 69 | self.k) 70 | 71 | def classify(self, message): 72 | return spam_probability(self.word_probs, message) 73 | 74 | 75 | def get_subject_data(path): 76 | 77 | data = [] 78 | 79 | # regex for stripping out the leading "Subject:" and any spaces after it 80 | subject_regex = re.compile(r"^Subject:\s+") 81 | 82 | # glob.glob returns every filename that matches the wildcarded path 83 | for fn in glob.glob(path): 84 | is_spam = "ham" not in fn 85 | 86 | with open(fn,'r',encoding='ISO-8859-1') as file: 87 | for line in file: 88 | if line.startswith("Subject:"): 89 | subject = subject_regex.sub("", line).strip() 90 | data.append((subject, is_spam)) 91 | 92 | return data 93 | 94 | def p_spam_given_word(word_prob): 95 | word, prob_if_spam, prob_if_not_spam = word_prob 96 | return prob_if_spam / (prob_if_spam + prob_if_not_spam) 97 | 98 | def train_and_test_model(path): 99 | 100 | data = get_subject_data(path) 101 | random.seed(0) # just so you get the same answers as me 102 | train_data, test_data = split_data(data, 0.75) 103 
| 104 | classifier = NaiveBayesClassifier() 105 | classifier.train(train_data) 106 | 107 | classified = [(subject, is_spam, classifier.classify(subject)) 108 | for subject, is_spam in test_data] 109 | 110 | counts = Counter((is_spam, spam_probability > 0.5) # (actual, predicted) 111 | for _, is_spam, spam_probability in classified) 112 | 113 | print(counts) 114 | 115 | classified.sort(key=lambda row: row[2]) 116 | spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:] 117 | hammiest_spams = list(filter(lambda row: row[1], classified))[:5] 118 | 119 | print("spammiest_hams", spammiest_hams) 120 | print("hammiest_spams", hammiest_spams) 121 | 122 | words = sorted(classifier.word_probs, key=p_spam_given_word) 123 | 124 | spammiest_words = words[-5:] 125 | hammiest_words = words[:5] 126 | 127 | print("spammiest_words", spammiest_words) 128 | print("hammiest_words", hammiest_words) 129 | 130 | 131 | if __name__ == "__main__": 132 | #train_and_test_model(r"c:\spam\*\*") 133 | train_and_test_model(r"/home/joel/src/spam/*/*") 134 | -------------------------------------------------------------------------------- /first-edition/code/naive_bayes.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter, defaultdict 3 | from machine_learning import split_data 4 | import math, random, re, glob 5 | 6 | def tokenize(message): 7 | message = message.lower() # convert to lowercase 8 | all_words = re.findall("[a-z0-9']+", message) # extract the words 9 | return set(all_words) # remove duplicates 10 | 11 | 12 | def count_words(training_set): 13 | """training set consists of pairs (message, is_spam)""" 14 | counts = defaultdict(lambda: [0, 0]) 15 | for message, is_spam in training_set: 16 | for word in tokenize(message): 17 | counts[word][0 if is_spam else 1] += 1 18 | return counts 19 | 20 | def word_probabilities(counts, total_spams, total_non_spams, k=0.5): 21 | """turn the word_counts into a list of triplets 22 | w, p(w | spam) and p(w | ~spam)""" 23 | return [(w, 24 | (spam + k) / (total_spams + 2 * k), 25 | (non_spam + k) / (total_non_spams + 2 * k)) 26 | for w, (spam, non_spam) in counts.iteritems()] 27 | 28 | def spam_probability(word_probs, message): 29 | message_words = tokenize(message) 30 | log_prob_if_spam = log_prob_if_not_spam = 0.0 31 | 32 | for word, prob_if_spam, prob_if_not_spam in word_probs: 33 | 34 | # for each word in the message, 35 | # add the log probability of seeing it 36 | if word in message_words: 37 | log_prob_if_spam += math.log(prob_if_spam) 38 | log_prob_if_not_spam += math.log(prob_if_not_spam) 39 | 40 | # for each word that's not in the message 41 | # add the log probability of _not_ seeing it 42 | else: 43 | log_prob_if_spam += math.log(1.0 - prob_if_spam) 44 | log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam) 45 | 46 | prob_if_spam = math.exp(log_prob_if_spam) 47 | prob_if_not_spam = math.exp(log_prob_if_not_spam) 48 | return prob_if_spam / (prob_if_spam + prob_if_not_spam) 49 | 50 | 51 | class NaiveBayesClassifier: 52 | 53 | def __init__(self, k=0.5): 54 | self.k = k 55 | self.word_probs = [] 56 | 57 | def train(self, training_set): 58 | 59 | # count spam and non-spam messages 60 | num_spams = len([is_spam 61 | for message, is_spam in training_set 62 | if is_spam]) 63 | num_non_spams = len(training_set) - num_spams 64 | 65 | # run training data through our "pipeline" 66 | word_counts = count_words(training_set) 67 | self.word_probs = 
word_probabilities(word_counts, 68 | num_spams, 69 | num_non_spams, 70 | self.k) 71 | 72 | def classify(self, message): 73 | return spam_probability(self.word_probs, message) 74 | 75 | 76 | def get_subject_data(path): 77 | 78 | data = [] 79 | 80 | # regex for stripping out the leading "Subject:" and any spaces after it 81 | subject_regex = re.compile(r"^Subject:\s+") 82 | 83 | # glob.glob returns every filename that matches the wildcarded path 84 | for fn in glob.glob(path): 85 | is_spam = "ham" not in fn 86 | 87 | with open(fn,'r') as file: 88 | for line in file: 89 | if line.startswith("Subject:"): 90 | subject = subject_regex.sub("", line).strip() 91 | data.append((subject, is_spam)) 92 | 93 | return data 94 | 95 | def p_spam_given_word(word_prob): 96 | word, prob_if_spam, prob_if_not_spam = word_prob 97 | return prob_if_spam / (prob_if_spam + prob_if_not_spam) 98 | 99 | def train_and_test_model(path): 100 | 101 | data = get_subject_data(path) 102 | random.seed(0) # just so you get the same answers as me 103 | train_data, test_data = split_data(data, 0.75) 104 | 105 | classifier = NaiveBayesClassifier() 106 | classifier.train(train_data) 107 | 108 | classified = [(subject, is_spam, classifier.classify(subject)) 109 | for subject, is_spam in test_data] 110 | 111 | counts = Counter((is_spam, spam_probability > 0.5) # (actual, predicted) 112 | for _, is_spam, spam_probability in classified) 113 | 114 | print counts 115 | 116 | classified.sort(key=lambda row: row[2]) 117 | spammiest_hams = filter(lambda row: not row[1], classified)[-5:] 118 | hammiest_spams = filter(lambda row: row[1], classified)[:5] 119 | 120 | print "spammiest_hams", spammiest_hams 121 | print "hammiest_spams", hammiest_spams 122 | 123 | words = sorted(classifier.word_probs, key=p_spam_given_word) 124 | 125 | spammiest_words = words[-5:] 126 | hammiest_words = words[:5] 127 | 128 | print "spammiest_words", spammiest_words 129 | print "hammiest_words", hammiest_words 130 | 131 | 132 | if __name__ == "__main__": 133 | train_and_test_model(r"c:\spam\*\*") -------------------------------------------------------------------------------- /scratch/probability.py: -------------------------------------------------------------------------------- 1 | def uniform_cdf(x: float) -> float: 2 | """Returns the probability that a uniform random variable is <= x""" 3 | if x < 0: return 0 # uniform random is never less than 0 4 | elif x < 1: return x # e.g. 
P(X <= 0.4) = 0.4 5 | else: return 1 # uniform random is always less than 1 6 | 7 | import math 8 | SQRT_TWO_PI = math.sqrt(2 * math.pi) 9 | 10 | def normal_pdf(x: float, mu: float = 0, sigma: float = 1) -> float: 11 | return (math.exp(-(x-mu) ** 2 / 2 / sigma ** 2) / (SQRT_TWO_PI * sigma)) 12 | 13 | import matplotlib.pyplot as plt 14 | xs = [x / 10.0 for x in range(-50, 50)] 15 | plt.plot(xs,[normal_pdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1') 16 | plt.plot(xs,[normal_pdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2') 17 | plt.plot(xs,[normal_pdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5') 18 | plt.plot(xs,[normal_pdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1') 19 | plt.legend() 20 | plt.title("Various Normal pdfs") 21 | # plt.show() 22 | 23 | 24 | # plt.savefig('im/various_normal_pdfs.png') 25 | plt.gca().clear() 26 | plt.close() 27 | plt.clf() 28 | 29 | def normal_cdf(x: float, mu: float = 0, sigma: float = 1) -> float: 30 | return (1 + math.erf((x - mu) / math.sqrt(2) / sigma)) / 2 31 | 32 | xs = [x / 10.0 for x in range(-50, 50)] 33 | plt.plot(xs,[normal_cdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1') 34 | plt.plot(xs,[normal_cdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2') 35 | plt.plot(xs,[normal_cdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5') 36 | plt.plot(xs,[normal_cdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1') 37 | plt.legend(loc=4) # bottom right 38 | plt.title("Various Normal cdfs") 39 | # plt.show() 40 | 41 | 42 | plt.close() 43 | plt.gca().clear() 44 | plt.clf() 45 | 46 | def inverse_normal_cdf(p: float, 47 | mu: float = 0, 48 | sigma: float = 1, 49 | tolerance: float = 0.00001) -> float: 50 | """Find approximate inverse using binary search""" 51 | 52 | # if not standard, compute standard and rescale 53 | if mu != 0 or sigma != 1: 54 | return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance) 55 | 56 | low_z = -10.0 # normal_cdf(-10) is (very close to) 0 57 | hi_z = 10.0 # normal_cdf(10) is (very close to) 1 58 | while hi_z - low_z > tolerance: 59 | mid_z = (low_z + hi_z) / 2 # Consider the midpoint 60 | mid_p = normal_cdf(mid_z) # and the cdf's value there 61 | if mid_p < p: 62 | low_z = mid_z # Midpoint too low, search above it 63 | else: 64 | hi_z = mid_z # Midpoint too high, search below it 65 | 66 | return mid_z 67 | 68 | 69 | import random 70 | 71 | def bernoulli_trial(p: float) -> int: 72 | """Returns 1 with probability p and 0 with probability 1-p""" 73 | return 1 if random.random() < p else 0 74 | 75 | def binomial(n: int, p: float) -> int: 76 | """Returns the sum of n bernoulli(p) trials""" 77 | return sum(bernoulli_trial(p) for _ in range(n)) 78 | 79 | from collections import Counter 80 | 81 | def binomial_histogram(p: float, n: int, num_points: int) -> None: 82 | """Picks points from a Binomial(n, p) and plots their histogram""" 83 | data = [binomial(n, p) for _ in range(num_points)] 84 | 85 | # use a bar chart to show the actual binomial samples 86 | histogram = Counter(data) 87 | plt.bar([x - 0.4 for x in histogram.keys()], 88 | [v / num_points for v in histogram.values()], 89 | 0.8, 90 | color='0.75') 91 | 92 | mu = p * n 93 | sigma = math.sqrt(n * p * (1 - p)) 94 | 95 | # use a line chart to show the normal approximation 96 | xs = range(min(data), max(data) + 1) 97 | ys = [normal_cdf(i + 0.5, mu, sigma) - normal_cdf(i - 0.5, mu, sigma) 98 | for i in xs] 99 | plt.plot(xs,ys) 100 | plt.title("Binomial Distribution vs. 
Normal Approximation") 101 | # plt.show() 102 | 103 | def main(): 104 | import enum, random 105 | 106 | # An Enum is a typed set of enumerated values. We can use them 107 | # to make our code more descriptive and readable. 108 | class Kid(enum.Enum): 109 | BOY = 0 110 | GIRL = 1 111 | 112 | def random_kid() -> Kid: 113 | return random.choice([Kid.BOY, Kid.GIRL]) 114 | 115 | both_girls = 0 116 | older_girl = 0 117 | either_girl = 0 118 | 119 | random.seed(0) 120 | 121 | for _ in range(10000): 122 | younger = random_kid() 123 | older = random_kid() 124 | if older == Kid.GIRL: 125 | older_girl += 1 126 | if older == Kid.GIRL and younger == Kid.GIRL: 127 | both_girls += 1 128 | if older == Kid.GIRL or younger == Kid.GIRL: 129 | either_girl += 1 130 | 131 | print("P(both | older):", both_girls / older_girl) # 0.514 ~ 1/2 132 | print("P(both | either): ", both_girls / either_girl) # 0.342 ~ 1/3 133 | 134 | 135 | 136 | assert 0.48 < both_girls / older_girl < 0.52 137 | assert 0.30 < both_girls / either_girl < 0.35 138 | 139 | def uniform_pdf(x: float) -> float: 140 | return 1 if 0 <= x < 1 else 0 141 | 142 | if __name__ == "__main__": main() 143 | -------------------------------------------------------------------------------- /scratch/visualization.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | 3 | years = [1950, 1960, 1970, 1980, 1990, 2000, 2010] 4 | gdp = [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14958.3] 5 | 6 | # create a line chart, years on x-axis, gdp on y-axis 7 | plt.plot(years, gdp, color='green', marker='o', linestyle='solid') 8 | 9 | # add a title 10 | plt.title("Nominal GDP") 11 | 12 | # add a label to the y-axis 13 | plt.ylabel("Billions of $") 14 | # plt.show() 15 | 16 | 17 | plt.savefig('im/viz_gdp.png') 18 | plt.gca().clear() 19 | 20 | movies = ["Annie Hall", "Ben-Hur", "Casablanca", "Gandhi", "West Side Story"] 21 | num_oscars = [5, 11, 3, 8, 10] 22 | 23 | # plot bars with left x-coordinates [0, 1, 2, 3, 4], heights [num_oscars] 24 | plt.bar(range(len(movies)), num_oscars) 25 | 26 | plt.title("My Favorite Movies") # add a title 27 | plt.ylabel("# of Academy Awards") # label the y-axis 28 | 29 | # label x-axis with movie names at bar centers 30 | plt.xticks(range(len(movies)), movies) 31 | 32 | # plt.show() 33 | 34 | 35 | plt.savefig('im/viz_movies.png') 36 | plt.gca().clear() 37 | 38 | from collections import Counter 39 | grades = [83, 95, 91, 87, 70, 0, 85, 82, 100, 67, 73, 77, 0] 40 | 41 | # Bucket grades by decile, but put 100 in with the 90s 42 | histogram = Counter(min(grade // 10 * 10, 90) for grade in grades) 43 | 44 | plt.bar([x + 5 for x in histogram.keys()], # Shift bars right by 5 45 | histogram.values(), # Give each bar its correct height 46 | 10, # Give each bar a width of 8 47 | edgecolor=(0, 0, 0)) # Black edges for each bar 48 | 49 | plt.axis([-5, 105, 0, 5]) # x-axis from -5 to 105, 50 | # y-axis from 0 to 5 51 | 52 | plt.xticks([10 * i for i in range(11)]) # x-axis labels at 0, 10, ..., 100 53 | plt.xlabel("Decile") 54 | plt.ylabel("# of Students") 55 | plt.title("Distribution of Exam 1 Grades") 56 | # plt.show() 57 | 58 | 59 | plt.savefig('im/viz_grades.png') 60 | plt.gca().clear() 61 | 62 | mentions = [500, 505] 63 | years = [2017, 2018] 64 | 65 | plt.bar(years, mentions, 0.8) 66 | plt.xticks(years) 67 | plt.ylabel("# of times I heard someone say 'data science'") 68 | 69 | # if you don't do this, matplotlib will label the x-axis 0, 1 70 | # and then add a +2.013e3 off 
in the corner (bad matplotlib!) 71 | plt.ticklabel_format(useOffset=False) 72 | 73 | # misleading y-axis only shows the part above 500 74 | plt.axis([2016.5, 2018.5, 499, 506]) 75 | plt.title("Look at the 'Huge' Increase!") 76 | # plt.show() 77 | 78 | 79 | plt.savefig('im/viz_misleading_y_axis.png') 80 | plt.gca().clear() 81 | 82 | 83 | plt.bar(years, mentions, 0.8) 84 | plt.xticks(years) 85 | plt.ylabel("# of times I heard someone say 'data science'") 86 | plt.ticklabel_format(useOffset=False) 87 | 88 | plt.axis([2016.5, 2018.5, 0, 550]) 89 | plt.title("Not So Huge Anymore") 90 | # plt.show() 91 | 92 | 93 | plt.savefig('im/viz_non_misleading_y_axis.png') 94 | plt.gca().clear() 95 | 96 | variance = [1, 2, 4, 8, 16, 32, 64, 128, 256] 97 | bias_squared = [256, 128, 64, 32, 16, 8, 4, 2, 1] 98 | total_error = [x + y for x, y in zip(variance, bias_squared)] 99 | xs = [i for i, _ in enumerate(variance)] 100 | 101 | # We can make multiple calls to plt.plot 102 | # to show multiple series on the same chart 103 | plt.plot(xs, variance, 'g-', label='variance') # green solid line 104 | plt.plot(xs, bias_squared, 'r-.', label='bias^2') # red dot-dashed line 105 | plt.plot(xs, total_error, 'b:', label='total error') # blue dotted line 106 | 107 | # Because we've assigned labels to each series, 108 | # we can get a legend for free (loc=9 means "top center") 109 | plt.legend(loc=9) 110 | plt.xlabel("model complexity") 111 | plt.xticks([]) 112 | plt.title("The Bias-Variance Tradeoff") 113 | # plt.show() 114 | 115 | 116 | plt.savefig('im/viz_line_chart.png') 117 | plt.gca().clear() 118 | 119 | friends = [ 70, 65, 72, 63, 71, 64, 60, 64, 67] 120 | minutes = [175, 170, 205, 120, 220, 130, 105, 145, 190] 121 | labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'] 122 | 123 | plt.scatter(friends, minutes) 124 | 125 | # label each point 126 | for label, friend_count, minute_count in zip(labels, friends, minutes): 127 | plt.annotate(label, 128 | xy=(friend_count, minute_count), # Put the label with its point 129 | xytext=(5, -5), # but slightly offset 130 | textcoords='offset points') 131 | 132 | plt.title("Daily Minutes vs. 
Number of Friends") 133 | plt.xlabel("# of friends") 134 | plt.ylabel("daily minutes spent on the site") 135 | # plt.show() 136 | 137 | 138 | plt.savefig('im/viz_scatterplot.png') 139 | plt.gca().clear() 140 | 141 | test_1_grades = [ 99, 90, 85, 97, 80] 142 | test_2_grades = [100, 85, 60, 90, 70] 143 | 144 | plt.scatter(test_1_grades, test_2_grades) 145 | plt.title("Axes Aren't Comparable") 146 | plt.xlabel("test 1 grade") 147 | plt.ylabel("test 2 grade") 148 | # plt.show() 149 | 150 | 151 | plt.savefig('im/viz_scatterplot_axes_not_comparable.png') 152 | plt.gca().clear() 153 | 154 | 155 | test_1_grades = [ 99, 90, 85, 97, 80] 156 | test_2_grades = [100, 85, 60, 90, 70] 157 | plt.scatter(test_1_grades, test_2_grades) 158 | plt.title("Axes Are Comparable") 159 | plt.axis("equal") 160 | plt.xlabel("test 1 grade") 161 | plt.ylabel("test 2 grade") 162 | plt.savefig('im/viz_scatterplot_axes_comparable.png') 163 | plt.gca().clear() 164 | 165 | -------------------------------------------------------------------------------- /scratch/k_nearest_neighbors.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from collections import Counter 3 | 4 | def raw_majority_vote(labels: List[str]) -> str: 5 | votes = Counter(labels) 6 | winner, _ = votes.most_common(1)[0] 7 | return winner 8 | 9 | assert raw_majority_vote(['a', 'b', 'c', 'b']) == 'b' 10 | 11 | def majority_vote(labels: List[str]) -> str: 12 | """Assumes that labels are ordered from nearest to farthest.""" 13 | vote_counts = Counter(labels) 14 | winner, winner_count = vote_counts.most_common(1)[0] 15 | num_winners = len([count 16 | for count in vote_counts.values() 17 | if count == winner_count]) 18 | 19 | if num_winners == 1: 20 | return winner # unique winner, so return it 21 | else: 22 | return majority_vote(labels[:-1]) # try again without the farthest 23 | 24 | # Tie, so look at first 4, then 'b' 25 | assert majority_vote(['a', 'b', 'c', 'b', 'a']) == 'b' 26 | 27 | from typing import NamedTuple 28 | from scratch.linear_algebra import Vector, distance 29 | 30 | class LabeledPoint(NamedTuple): 31 | point: Vector 32 | label: str 33 | 34 | def knn_classify(k: int, 35 | labeled_points: List[LabeledPoint], 36 | new_point: Vector) -> str: 37 | 38 | # Order the labeled points from nearest to farthest. 39 | by_distance = sorted(labeled_points, 40 | key=lambda lp: distance(lp.point, new_point)) 41 | 42 | # Find the labels for the k closest 43 | k_nearest_labels = [lp.label for lp in by_distance[:k]] 44 | 45 | # and let them vote. 46 | return majority_vote(k_nearest_labels) 47 | 48 | 49 | import random 50 | 51 | def random_point(dim: int) -> Vector: 52 | return [random.random() for _ in range(dim)] 53 | 54 | def random_distances(dim: int, num_pairs: int) -> List[float]: 55 | return [distance(random_point(dim), random_point(dim)) 56 | for _ in range(num_pairs)] 57 | 58 | def main(): 59 | from typing import Dict 60 | import csv 61 | from collections import defaultdict 62 | 63 | def parse_iris_row(row: List[str]) -> LabeledPoint: 64 | """ 65 | sepal_length, sepal_width, petal_length, petal_width, class 66 | """ 67 | measurements = [float(value) for value in row[:-1]] 68 | # class is e.g. 
"Iris-virginica"; we just want "virginica" 69 | label = row[-1].split("-")[-1] 70 | 71 | return LabeledPoint(measurements, label) 72 | 73 | with open('iris.data') as f: 74 | reader = csv.reader(f) 75 | iris_data = [parse_iris_row(row) for row in reader] 76 | 77 | # We'll also group just the points by species/label so we can plot them. 78 | points_by_species: Dict[str, List[Vector]] = defaultdict(list) 79 | for iris in iris_data: 80 | points_by_species[iris.label].append(iris.point) 81 | 82 | from matplotlib import pyplot as plt 83 | metrics = ['sepal length', 'sepal width', 'petal length', 'petal width'] 84 | pairs = [(i, j) for i in range(4) for j in range(4) if i < j] 85 | marks = ['+', '.', 'x'] # we have 3 classes, so 3 markers 86 | 87 | fig, ax = plt.subplots(2, 3) 88 | 89 | for row in range(2): 90 | for col in range(3): 91 | i, j = pairs[3 * row + col] 92 | ax[row][col].set_title(f"{metrics[i]} vs {metrics[j]}", fontsize=8) 93 | ax[row][col].set_xticks([]) 94 | ax[row][col].set_yticks([]) 95 | 96 | for mark, (species, points) in zip(marks, points_by_species.items()): 97 | xs = [point[i] for point in points] 98 | ys = [point[j] for point in points] 99 | ax[row][col].scatter(xs, ys, marker=mark, label=species) 100 | 101 | ax[-1][-1].legend(loc='lower right', prop={'size': 6}) 102 | # plt.show() 103 | 104 | 105 | 106 | plt.savefig('im/iris_scatter.png') 107 | plt.gca().clear() 108 | 109 | import random 110 | from scratch.machine_learning import split_data 111 | 112 | random.seed(12) 113 | iris_train, iris_test = split_data(iris_data, 0.70) 114 | assert len(iris_train) == 0.7 * 150 115 | assert len(iris_test) == 0.3 * 150 116 | 117 | from typing import Tuple 118 | 119 | # track how many times we see (predicted, actual) 120 | confusion_matrix: Dict[Tuple[str, str], int] = defaultdict(int) 121 | num_correct = 0 122 | 123 | for iris in iris_test: 124 | predicted = knn_classify(5, iris_train, iris.point) 125 | actual = iris.label 126 | 127 | if predicted == actual: 128 | num_correct += 1 129 | 130 | confusion_matrix[(predicted, actual)] += 1 131 | 132 | pct_correct = num_correct / len(iris_test) 133 | print(pct_correct, confusion_matrix) 134 | 135 | import tqdm 136 | dimensions = range(1, 101) 137 | 138 | avg_distances = [] 139 | min_distances = [] 140 | 141 | random.seed(0) 142 | for dim in tqdm.tqdm(dimensions, desc="Curse of Dimensionality"): 143 | distances = random_distances(dim, 10000) # 10,000 random pairs 144 | avg_distances.append(sum(distances) / 10000) # track the average 145 | min_distances.append(min(distances)) # track the minimum 146 | 147 | min_avg_ratio = [min_dist / avg_dist 148 | for min_dist, avg_dist in zip(min_distances, avg_distances)] 149 | 150 | if __name__ == "__main__": main() -------------------------------------------------------------------------------- /first-edition/code-python3/visualizing_data.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from collections import Counter 3 | 4 | def make_chart_simple_line_chart(): 5 | 6 | years = [1950, 1960, 1970, 1980, 1990, 2000, 2010] 7 | gdp = [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14958.3] 8 | 9 | # create a line chart, years on x-axis, gdp on y-axis 10 | plt.plot(years, gdp, color='green', marker='o', linestyle='solid') 11 | 12 | # add a title 13 | plt.title("Nominal GDP") 14 | 15 | # add a label to the y-axis 16 | plt.ylabel("Billions of $") 17 | plt.show() 18 | 19 | 20 | def make_chart_simple_bar_chart(): 21 | 22 | movies = 
["Annie Hall", "Ben-Hur", "Casablanca", "Gandhi", "West Side Story"] 23 | num_oscars = [5, 11, 3, 8, 10] 24 | 25 | # bars are by default width 0.8, so we'll add 0.1 to the left coordinates 26 | # so that each bar is centered 27 | xs = [i + 0.1 for i, _ in enumerate(movies)] 28 | 29 | # plot bars with left x-coordinates [xs], heights [num_oscars] 30 | plt.bar(xs, num_oscars) 31 | plt.ylabel("# of Academy Awards") 32 | plt.title("My Favorite Movies") 33 | 34 | # label x-axis with movie names at bar centers 35 | plt.xticks([i + 0.5 for i, _ in enumerate(movies)], movies) 36 | 37 | plt.show() 38 | 39 | def make_chart_histogram(): 40 | grades = [83,95,91,87,70,0,85,82,100,67,73,77,0] 41 | decile = lambda grade: grade // 10 * 10 42 | histogram = Counter(decile(grade) for grade in grades) 43 | 44 | plt.bar([x - 4 for x in histogram.keys()], # shift each bar to the left by 4 45 | histogram.values(), # give each bar its correct height 46 | 8) # give each bar a width of 8 47 | plt.axis([-5, 105, 0, 5]) # x-axis from -5 to 105, 48 | # y-axis from 0 to 5 49 | plt.xticks([10 * i for i in range(11)]) # x-axis labels at 0, 10, ..., 100 50 | plt.xlabel("Decile") 51 | plt.ylabel("# of Students") 52 | plt.title("Distribution of Exam 1 Grades") 53 | plt.show() 54 | 55 | def make_chart_misleading_y_axis(mislead=True): 56 | 57 | mentions = [500, 505] 58 | years = [2013, 2014] 59 | 60 | plt.bar([2012.6, 2013.6], mentions, 0.8) 61 | plt.xticks(years) 62 | plt.ylabel("# of times I heard someone say 'data science'") 63 | 64 | # if you don't do this, matplotlib will label the x-axis 0, 1 65 | # and then add a +2.013e3 off in the corner (bad matplotlib!) 66 | plt.ticklabel_format(useOffset=False) 67 | 68 | if mislead: 69 | # misleading y-axis only shows the part above 500 70 | plt.axis([2012.5,2014.5,499,506]) 71 | plt.title("Look at the 'Huge' Increase!") 72 | else: 73 | plt.axis([2012.5,2014.5,0,550]) 74 | plt.title("Not So Huge Anymore.") 75 | plt.show() 76 | 77 | def make_chart_several_line_charts(): 78 | 79 | variance = [1,2,4,8,16,32,64,128,256] 80 | bias_squared = [256,128,64,32,16,8,4,2,1] 81 | total_error = [x + y for x, y in zip(variance, bias_squared)] 82 | 83 | xs = range(len(variance)) 84 | 85 | # we can make multiple calls to plt.plot 86 | # to show multiple series on the same chart 87 | plt.plot(xs, variance, 'g-', label='variance') # green solid line 88 | plt.plot(xs, bias_squared, 'r-.', label='bias^2') # red dot-dashed line 89 | plt.plot(xs, total_error, 'b:', label='total error') # blue dotted line 90 | 91 | # because we've assigned labels to each series 92 | # we can get a legend for free 93 | # loc=9 means "top center" 94 | plt.legend(loc=9) 95 | plt.xlabel("model complexity") 96 | plt.title("The Bias-Variance Tradeoff") 97 | plt.show() 98 | 99 | def make_chart_scatter_plot(): 100 | 101 | friends = [ 70, 65, 72, 63, 71, 64, 60, 64, 67] 102 | minutes = [175, 170, 205, 120, 220, 130, 105, 145, 190] 103 | labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'] 104 | 105 | plt.scatter(friends, minutes) 106 | 107 | # label each point 108 | for label, friend_count, minute_count in zip(labels, friends, minutes): 109 | plt.annotate(label, 110 | xy=(friend_count, minute_count), # put the label with its point 111 | xytext=(5, -5), # but slightly offset 112 | textcoords='offset points') 113 | 114 | plt.title("Daily Minutes vs. 
Number of Friends") 115 | plt.xlabel("# of friends") 116 | plt.ylabel("daily minutes spent on the site") 117 | plt.show() 118 | 119 | def make_chart_scatterplot_axes(equal_axes=False): 120 | 121 | test_1_grades = [ 99, 90, 85, 97, 80] 122 | test_2_grades = [100, 85, 60, 90, 70] 123 | 124 | plt.scatter(test_1_grades, test_2_grades) 125 | plt.xlabel("test 1 grade") 126 | plt.ylabel("test 2 grade") 127 | 128 | if equal_axes: 129 | plt.title("Axes Are Comparable") 130 | plt.axis("equal") 131 | else: 132 | plt.title("Axes Aren't Comparable") 133 | 134 | plt.show() 135 | 136 | def make_chart_pie_chart(): 137 | 138 | plt.pie([0.95, 0.05], labels=["Uses pie charts", "Knows better"]) 139 | 140 | # make sure pie is a circle and not an oval 141 | plt.axis("equal") 142 | plt.show() 143 | 144 | 145 | if __name__ == "__main__": 146 | 147 | make_chart_simple_line_chart() 148 | 149 | make_chart_simple_bar_chart() 150 | 151 | make_chart_histogram() 152 | 153 | make_chart_misleading_y_axis(mislead=True) 154 | 155 | make_chart_misleading_y_axis(mislead=False) 156 | 157 | make_chart_several_line_charts() 158 | 159 | make_chart_scatterplot_axes(equal_axes=False) 160 | 161 | make_chart_scatterplot_axes(equal_axes=True) 162 | 163 | make_chart_pie_chart() 164 | -------------------------------------------------------------------------------- /first-edition/code/visualizing_data.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from collections import Counter 3 | 4 | def make_chart_simple_line_chart(plt): 5 | 6 | years = [1950, 1960, 1970, 1980, 1990, 2000, 2010] 7 | gdp = [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14958.3] 8 | 9 | # create a line chart, years on x-axis, gdp on y-axis 10 | plt.plot(years, gdp, color='green', marker='o', linestyle='solid') 11 | 12 | # add a title 13 | plt.title("Nominal GDP") 14 | 15 | # add a label to the y-axis 16 | plt.ylabel("Billions of $") 17 | plt.show() 18 | 19 | 20 | def make_chart_simple_bar_chart(plt): 21 | 22 | movies = ["Annie Hall", "Ben-Hur", "Casablanca", "Gandhi", "West Side Story"] 23 | num_oscars = [5, 11, 3, 8, 10] 24 | 25 | # bars are by default width 0.8, so we'll add 0.1 to the left coordinates 26 | # so that each bar is centered 27 | xs = [i + 0.1 for i, _ in enumerate(movies)] 28 | 29 | # plot bars with left x-coordinates [xs], heights [num_oscars] 30 | plt.bar(xs, num_oscars) 31 | plt.ylabel("# of Academy Awards") 32 | plt.title("My Favorite Movies") 33 | 34 | # label x-axis with movie names at bar centers 35 | plt.xticks([i + 0.5 for i, _ in enumerate(movies)], movies) 36 | 37 | plt.show() 38 | 39 | def make_chart_histogram(plt): 40 | grades = [83,95,91,87,70,0,85,82,100,67,73,77,0] 41 | decile = lambda grade: grade // 10 * 10 42 | histogram = Counter(decile(grade) for grade in grades) 43 | 44 | plt.bar([x - 4 for x in histogram.keys()], # shift each bar to the left by 4 45 | histogram.values(), # give each bar its correct height 46 | 8) # give each bar a width of 8 47 | plt.axis([-5, 105, 0, 5]) # x-axis from -5 to 105, 48 | # y-axis from 0 to 5 49 | plt.xticks([10 * i for i in range(11)]) # x-axis labels at 0, 10, ..., 100 50 | plt.xlabel("Decile") 51 | plt.ylabel("# of Students") 52 | plt.title("Distribution of Exam 1 Grades") 53 | plt.show() 54 | 55 | def make_chart_misleading_y_axis(plt, mislead=True): 56 | 57 | mentions = [500, 505] 58 | years = [2013, 2014] 59 | 60 | plt.bar([2012.6, 2013.6], mentions, 0.8) 61 | plt.xticks(years) 62 | plt.ylabel("# of times I heard 
someone say 'data science'") 63 | 64 | # if you don't do this, matplotlib will label the x-axis 0, 1 65 | # and then add a +2.013e3 off in the corner (bad matplotlib!) 66 | plt.ticklabel_format(useOffset=False) 67 | 68 | if mislead: 69 | # misleading y-axis only shows the part above 500 70 | plt.axis([2012.5,2014.5,499,506]) 71 | plt.title("Look at the 'Huge' Increase!") 72 | else: 73 | plt.axis([2012.5,2014.5,0,550]) 74 | plt.title("Not So Huge Anymore.") 75 | plt.show() 76 | 77 | def make_chart_several_line_charts(plt): 78 | 79 | variance = [1,2,4,8,16,32,64,128,256] 80 | bias_squared = [256,128,64,32,16,8,4,2,1] 81 | total_error = [x + y for x, y in zip(variance, bias_squared)] 82 | 83 | xs = range(len(variance)) 84 | 85 | # we can make multiple calls to plt.plot 86 | # to show multiple series on the same chart 87 | plt.plot(xs, variance, 'g-', label='variance') # green solid line 88 | plt.plot(xs, bias_squared, 'r-.', label='bias^2') # red dot-dashed line 89 | plt.plot(xs, total_error, 'b:', label='total error') # blue dotted line 90 | 91 | # because we've assigned labels to each series 92 | # we can get a legend for free 93 | # loc=9 means "top center" 94 | plt.legend(loc=9) 95 | plt.xlabel("model complexity") 96 | plt.title("The Bias-Variance Tradeoff") 97 | plt.show() 98 | 99 | def make_chart_scatter_plot(plt): 100 | 101 | friends = [ 70, 65, 72, 63, 71, 64, 60, 64, 67] 102 | minutes = [175, 170, 205, 120, 220, 130, 105, 145, 190] 103 | labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'] 104 | 105 | plt.scatter(friends, minutes) 106 | 107 | # label each point 108 | for label, friend_count, minute_count in zip(labels, friends, minutes): 109 | plt.annotate(label, 110 | xy=(friend_count, minute_count), # put the label with its point 111 | xytext=(5, -5), # but slightly offset 112 | textcoords='offset points') 113 | 114 | plt.title("Daily Minutes vs. 
Number of Friends") 115 | plt.xlabel("# of friends") 116 | plt.ylabel("daily minutes spent on the site") 117 | plt.show() 118 | 119 | def make_chart_scatterplot_axes(plt, equal_axes=False): 120 | 121 | test_1_grades = [ 99, 90, 85, 97, 80] 122 | test_2_grades = [100, 85, 60, 90, 70] 123 | 124 | plt.scatter(test_1_grades, test_2_grades) 125 | plt.xlabel("test 1 grade") 126 | plt.ylabel("test 2 grade") 127 | 128 | if equal_axes: 129 | plt.title("Axes Are Comparable") 130 | plt.axis("equal") 131 | else: 132 | plt.title("Axes Aren't Comparable") 133 | 134 | plt.show() 135 | 136 | def make_chart_pie_chart(plt): 137 | 138 | plt.pie([0.95, 0.05], labels=["Uses pie charts", "Knows better"]) 139 | 140 | # make sure pie is a circle and not an oval 141 | plt.axis("equal") 142 | plt.show() 143 | 144 | 145 | if __name__ == "__main__": 146 | 147 | make_chart_simple_line_chart(plt) 148 | 149 | make_chart_simple_bar_chart(plt) 150 | 151 | make_chart_histogram(plt) 152 | 153 | make_chart_misleading_y_axis(plt, mislead=True) 154 | 155 | make_chart_misleading_y_axis(plt, mislead=False) 156 | 157 | make_chart_several_line_charts(plt) 158 | 159 | make_chart_scatterplot_axes(plt, equal_axes=False) 160 | 161 | make_chart_scatterplot_axes(plt, equal_axes=True) 162 | 163 | make_chart_pie_chart(plt) 164 | -------------------------------------------------------------------------------- /scratch/gradient_descent.py: -------------------------------------------------------------------------------- 1 | from scratch.linear_algebra import Vector, dot 2 | 3 | def sum_of_squares(v: Vector) -> float: 4 | """Computes the sum of squared elements in v""" 5 | return dot(v, v) 6 | 7 | from typing import Callable 8 | 9 | def difference_quotient(f: Callable[[float], float], 10 | x: float, 11 | h: float) -> float: 12 | return (f(x + h) - f(x)) / h 13 | 14 | def square(x: float) -> float: 15 | return x * x 16 | 17 | def derivative(x: float) -> float: 18 | return 2 * x 19 | 20 | def estimate_gradient(f: Callable[[Vector], float], 21 | v: Vector, 22 | h: float = 0.0001): 23 | return [partial_difference_quotient(f, v, i, h) 24 | for i in range(len(v))] 25 | 26 | import random 27 | from scratch.linear_algebra import distance, add, scalar_multiply 28 | 29 | def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector: 30 | """Moves `step_size` in the `gradient` direction from `v`""" 31 | assert len(v) == len(gradient) 32 | step = scalar_multiply(step_size, gradient) 33 | return add(v, step) 34 | 35 | def sum_of_squares_gradient(v: Vector) -> Vector: 36 | return [2 * v_i for v_i in v] 37 | 38 | # x ranges from -50 to 49, y is always 20 * x + 5 39 | inputs = [(x, 20 * x + 5) for x in range(-50, 50)] 40 | 41 | def linear_gradient(x: float, y: float, theta: Vector) -> Vector: 42 | slope, intercept = theta 43 | predicted = slope * x + intercept # The prediction of the model. 44 | error = (predicted - y) # error is (predicted - actual) 45 | squared_error = error ** 2 # We'll minimize squared error 46 | grad = [2 * error * x, 2 * error] # using its gradient. 47 | return grad 48 | 49 | from typing import TypeVar, List, Iterator 50 | 51 | T = TypeVar('T') # this allows us to type "generic" functions 52 | 53 | def minibatches(dataset: List[T], 54 | batch_size: int, 55 | shuffle: bool = True) -> Iterator[List[T]]: 56 | """Generates `batch_size`-sized minibatches from the dataset""" 57 | # Start indexes 0, batch_size, 2 * batch_size, ... 
58 | batch_starts = [start for start in range(0, len(dataset), batch_size)] 59 | 60 | if shuffle: random.shuffle(batch_starts) # shuffle the batches 61 | 62 | for start in batch_starts: 63 | end = start + batch_size 64 | yield dataset[start:end] 65 | 66 | def main(): 67 | xs = range(-10, 11) 68 | actuals = [derivative(x) for x in xs] 69 | estimates = [difference_quotient(square, x, h=0.001) for x in xs] 70 | 71 | # plot to show they're basically the same 72 | import matplotlib.pyplot as plt 73 | plt.title("Actual Derivatives vs. Estimates") 74 | plt.plot(xs, actuals, 'rx', label='Actual') # red x 75 | plt.plot(xs, estimates, 'b+', label='Estimate') # blue + 76 | plt.legend(loc=9) 77 | # plt.show() 78 | 79 | 80 | plt.close() 81 | 82 | def partial_difference_quotient(f: Callable[[Vector], float], 83 | v: Vector, 84 | i: int, 85 | h: float) -> float: 86 | """Returns the i-th partial difference quotient of f at v""" 87 | w = [v_j + (h if j == i else 0) # add h to just the ith element of v 88 | for j, v_j in enumerate(v)] 89 | 90 | return (f(w) - f(v)) / h 91 | 92 | 93 | # "Using the Gradient" example 94 | 95 | # pick a random starting point 96 | v = [random.uniform(-10, 10) for i in range(3)] 97 | 98 | for epoch in range(1000): 99 | grad = sum_of_squares_gradient(v) # compute the gradient at v 100 | v = gradient_step(v, grad, -0.01) # take a negative gradient step 101 | print(epoch, v) 102 | 103 | assert distance(v, [0, 0, 0]) < 0.001 # v should be close to 0 104 | 105 | 106 | # First "Using Gradient Descent to Fit Models" example 107 | 108 | from scratch.linear_algebra import vector_mean 109 | 110 | # Start with random values for slope and intercept. 111 | theta = [random.uniform(-1, 1), random.uniform(-1, 1)] 112 | 113 | learning_rate = 0.001 114 | 115 | for epoch in range(5000): 116 | # Compute the mean of the gradients 117 | grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs]) 118 | # Take a step in that direction 119 | theta = gradient_step(theta, grad, -learning_rate) 120 | print(epoch, theta) 121 | 122 | slope, intercept = theta 123 | assert 19.9 < slope < 20.1, "slope should be about 20" 124 | assert 4.9 < intercept < 5.1, "intercept should be about 5" 125 | 126 | 127 | # Minibatch gradient descent example 128 | 129 | theta = [random.uniform(-1, 1), random.uniform(-1, 1)] 130 | 131 | for epoch in range(1000): 132 | for batch in minibatches(inputs, batch_size=20): 133 | grad = vector_mean([linear_gradient(x, y, theta) for x, y in batch]) 134 | theta = gradient_step(theta, grad, -learning_rate) 135 | print(epoch, theta) 136 | 137 | slope, intercept = theta 138 | assert 19.9 < slope < 20.1, "slope should be about 20" 139 | assert 4.9 < intercept < 5.1, "intercept should be about 5" 140 | 141 | 142 | # Stochastic gradient descent example 143 | 144 | theta = [random.uniform(-1, 1), random.uniform(-1, 1)] 145 | 146 | for epoch in range(100): 147 | for x, y in inputs: 148 | grad = linear_gradient(x, y, theta) 149 | theta = gradient_step(theta, grad, -learning_rate) 150 | print(epoch, theta) 151 | 152 | slope, intercept = theta 153 | assert 19.9 < slope < 20.1, "slope should be about 20" 154 | assert 4.9 < intercept < 5.1, "intercept should be about 5" 155 | 156 | if __name__ == "__main__": main() -------------------------------------------------------------------------------- /first-edition/code-python3/stats.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from linear_algebra import sum_of_squares, 
dot 3 | import math 4 | 5 | num_friends = [100,49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 6 | 7 | def make_friend_counts_histogram(plt): 8 | friend_counts = Counter(num_friends) 9 | xs = range(101) 10 | ys = [friend_counts[x] for x in xs] 11 | plt.bar(xs, ys) 12 | plt.axis([0,101,0,25]) 13 | plt.title("Histogram of Friend Counts") 14 | plt.xlabel("# of friends") 15 | plt.ylabel("# of people") 16 | plt.show() 17 | 18 | num_points = len(num_friends) # 204 19 | 20 | largest_value = max(num_friends) # 100 21 | smallest_value = min(num_friends) # 1 22 | 23 | sorted_values = sorted(num_friends) 24 | smallest_value = sorted_values[0] # 1 25 | second_smallest_value = sorted_values[1] # 1 26 | second_largest_value = sorted_values[-2] # 49 27 | 28 | # this isn't right if you don't from __future__ import division 29 | def mean(x): 30 | return sum(x) / len(x) 31 | 32 | def median(v): 33 | """finds the 'middle-most' value of v""" 34 | n = len(v) 35 | sorted_v = sorted(v) 36 | midpoint = n // 2 37 | 38 | if n % 2 == 1: 39 | # if odd, return the middle value 40 | return sorted_v[midpoint] 41 | else: 42 | # if even, return the average of the middle values 43 | lo = midpoint - 1 44 | hi = midpoint 45 | return (sorted_v[lo] + sorted_v[hi]) / 2 46 | 47 | def quantile(x, p): 48 | """returns the pth-percentile value in x""" 49 | p_index = int(p * len(x)) 50 | return sorted(x)[p_index] 51 | 52 | def mode(x): 53 | """returns a list, might be more than one mode""" 54 | counts = Counter(x) 55 | max_count = max(counts.values()) 56 | return [x_i for x_i, count in counts.items() 57 | if count == max_count] 58 | 59 | # "range" already means something in Python, so we'll use a different name 60 | def data_range(x): 61 | return max(x) - min(x) 62 | 63 | def de_mean(x): 64 | """translate x by subtracting its mean (so the result has mean 0)""" 65 | x_bar = mean(x) 66 | return [x_i - x_bar for x_i in x] 67 | 68 | def variance(x): 69 | """assumes x has at least two elements""" 70 | n = len(x) 71 | deviations = de_mean(x) 72 | return sum_of_squares(deviations) / (n - 1) 73 | 74 | def standard_deviation(x): 75 | return math.sqrt(variance(x)) 76 | 77 | def interquartile_range(x): 78 | return quantile(x, 0.75) - quantile(x, 0.25) 79 | 80 | #### 81 | # 82 | # CORRELATION 83 | # 84 | ##### 85 | 86 | daily_minutes = 
[1,68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84] 87 | 88 | def covariance(x, y): 89 | n = len(x) 90 | return dot(de_mean(x), de_mean(y)) / (n - 1) 91 | 92 | def correlation(x, y): 93 | stdev_x = standard_deviation(x) 94 | stdev_y = standard_deviation(y) 95 | if stdev_x > 0 and stdev_y > 0: 96 | return covariance(x, y) / stdev_x / stdev_y 97 | else: 98 | return 0 # if no variation, correlation is zero 99 | 100 | outlier = num_friends.index(100) # index of outlier 101 | 102 | num_friends_good = [x 103 | for i, x in enumerate(num_friends) 104 | if i != outlier] 105 | 106 | daily_minutes_good = [x 107 | for i, x in enumerate(daily_minutes) 108 | if i != outlier] 109 | 110 | 111 | 112 | if __name__ == "__main__": 113 | 114 | print("num_points", len(num_friends)) 115 | print("largest value", max(num_friends)) 116 | print("smallest value", min(num_friends)) 117 | print("second_smallest_value", sorted_values[1]) 118 | print("second_largest_value", sorted_values[-2] ) 119 | print("mean(num_friends)", mean(num_friends)) 120 | print("median(num_friends)", median(num_friends)) 121 | print("quantile(num_friends, 0.10)", quantile(num_friends, 0.10)) 122 | print("quantile(num_friends, 0.25)", quantile(num_friends, 0.25)) 123 | print("quantile(num_friends, 0.75)", quantile(num_friends, 0.75)) 124 | print("quantile(num_friends, 0.90)", quantile(num_friends, 0.90)) 125 | print("mode(num_friends)", mode(num_friends)) 126 | print("data_range(num_friends)", data_range(num_friends)) 127 | print("variance(num_friends)", variance(num_friends)) 128 | print("standard_deviation(num_friends)", standard_deviation(num_friends)) 129 | print("interquartile_range(num_friends)", interquartile_range(num_friends)) 130 | 131 | print("covariance(num_friends, daily_minutes)", covariance(num_friends, daily_minutes)) 132 | print("correlation(num_friends, daily_minutes)", correlation(num_friends, daily_minutes)) 133 | print("correlation(num_friends_good, daily_minutes_good)", correlation(num_friends_good, daily_minutes_good)) 134 | -------------------------------------------------------------------------------- /first-edition/code/statistics.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter 3 | from linear_algebra import sum_of_squares, dot 4 | import math 5 | 6 | num_friends = 
[100,49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 7 | 8 | def make_friend_counts_histogram(plt): 9 | friend_counts = Counter(num_friends) 10 | xs = range(101) 11 | ys = [friend_counts[x] for x in xs] 12 | plt.bar(xs, ys) 13 | plt.axis([0,101,0,25]) 14 | plt.title("Histogram of Friend Counts") 15 | plt.xlabel("# of friends") 16 | plt.ylabel("# of people") 17 | plt.show() 18 | 19 | num_points = len(num_friends) # 204 20 | 21 | largest_value = max(num_friends) # 100 22 | smallest_value = min(num_friends) # 1 23 | 24 | sorted_values = sorted(num_friends) 25 | smallest_value = sorted_values[0] # 1 26 | second_smallest_value = sorted_values[1] # 1 27 | second_largest_value = sorted_values[-2] # 49 28 | 29 | # this isn't right if you don't from __future__ import division 30 | def mean(x): 31 | return sum(x) / len(x) 32 | 33 | def median(v): 34 | """finds the 'middle-most' value of v""" 35 | n = len(v) 36 | sorted_v = sorted(v) 37 | midpoint = n // 2 38 | 39 | if n % 2 == 1: 40 | # if odd, return the middle value 41 | return sorted_v[midpoint] 42 | else: 43 | # if even, return the average of the middle values 44 | lo = midpoint - 1 45 | hi = midpoint 46 | return (sorted_v[lo] + sorted_v[hi]) / 2 47 | 48 | def quantile(x, p): 49 | """returns the pth-percentile value in x""" 50 | p_index = int(p * len(x)) 51 | return sorted(x)[p_index] 52 | 53 | def mode(x): 54 | """returns a list, might be more than one mode""" 55 | counts = Counter(x) 56 | max_count = max(counts.values()) 57 | return [x_i for x_i, count in counts.iteritems() 58 | if count == max_count] 59 | 60 | # "range" already means something in Python, so we'll use a different name 61 | def data_range(x): 62 | return max(x) - min(x) 63 | 64 | def de_mean(x): 65 | """translate x by subtracting its mean (so the result has mean 0)""" 66 | x_bar = mean(x) 67 | return [x_i - x_bar for x_i in x] 68 | 69 | def variance(x): 70 | """assumes x has at least two elements""" 71 | n = len(x) 72 | deviations = de_mean(x) 73 | return sum_of_squares(deviations) / (n - 1) 74 | 75 | def standard_deviation(x): 76 | return math.sqrt(variance(x)) 77 | 78 | def interquartile_range(x): 79 | return quantile(x, 0.75) - quantile(x, 0.25) 80 | 81 | #### 82 | # 83 | # CORRELATION 84 | # 85 | ##### 86 | 87 | daily_minutes = 
[1,68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84] 88 | 89 | def covariance(x, y): 90 | n = len(x) 91 | return dot(de_mean(x), de_mean(y)) / (n - 1) 92 | 93 | def correlation(x, y): 94 | stdev_x = standard_deviation(x) 95 | stdev_y = standard_deviation(y) 96 | if stdev_x > 0 and stdev_y > 0: 97 | return covariance(x, y) / stdev_x / stdev_y 98 | else: 99 | return 0 # if no variation, correlation is zero 100 | 101 | outlier = num_friends.index(100) # index of outlier 102 | 103 | num_friends_good = [x 104 | for i, x in enumerate(num_friends) 105 | if i != outlier] 106 | 107 | daily_minutes_good = [x 108 | for i, x in enumerate(daily_minutes) 109 | if i != outlier] 110 | 111 | 112 | 113 | if __name__ == "__main__": 114 | 115 | print "num_points", len(num_friends) 116 | print "largest value", max(num_friends) 117 | print "smallest value", min(num_friends) 118 | print "second_smallest_value", sorted_values[1] 119 | print "second_largest_value", sorted_values[-2] 120 | print "mean(num_friends)", mean(num_friends) 121 | print "median(num_friends)", median(num_friends) 122 | print "quantile(num_friends, 0.10)", quantile(num_friends, 0.10) 123 | print "quantile(num_friends, 0.25)", quantile(num_friends, 0.25) 124 | print "quantile(num_friends, 0.75)", quantile(num_friends, 0.75) 125 | print "quantile(num_friends, 0.90)", quantile(num_friends, 0.90) 126 | print "mode(num_friends)", mode(num_friends) 127 | print "data_range(num_friends)", data_range(num_friends) 128 | print "variance(num_friends)", variance(num_friends) 129 | print "standard_deviation(num_friends)", standard_deviation(num_friends) 130 | print "interquartile_range(num_friends)", interquartile_range(num_friends) 131 | 132 | print "covariance(num_friends, daily_minutes)", covariance(num_friends, daily_minutes) 133 | print "correlation(num_friends, daily_minutes)", correlation(num_friends, daily_minutes) 134 | print "correlation(num_friends_good, daily_minutes_good)", correlation(num_friends_good, daily_minutes_good) 135 | -------------------------------------------------------------------------------- /scratch/linear_algebra.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | Vector = List[float] 4 | 5 | height_weight_age = [70, # inches, 6 | 170, # pounds, 7 | 40 ] # years 8 | 9 | grades = [95, # exam1 10 | 80, # exam2 11 | 75, # exam3 12 | 62 ] # exam4 13 | 
14 | def add(v: Vector, w: Vector) -> Vector: 15 | """Adds corresponding elements""" 16 | assert len(v) == len(w), "vectors must be the same length" 17 | 18 | return [v_i + w_i for v_i, w_i in zip(v, w)] 19 | 20 | assert add([1, 2, 3], [4, 5, 6]) == [5, 7, 9] 21 | 22 | def subtract(v: Vector, w: Vector) -> Vector: 23 | """Subtracts corresponding elements""" 24 | assert len(v) == len(w), "vectors must be the same length" 25 | 26 | return [v_i - w_i for v_i, w_i in zip(v, w)] 27 | 28 | assert subtract([5, 7, 9], [4, 5, 6]) == [1, 2, 3] 29 | 30 | def vector_sum(vectors: List[Vector]) -> Vector: 31 | """Sums all corresponding elements""" 32 | # Check that vectors is not empty 33 | assert vectors, "no vectors provided!" 34 | 35 | # Check the vectors are all the same size 36 | num_elements = len(vectors[0]) 37 | assert all(len(v) == num_elements for v in vectors), "different sizes!" 38 | 39 | # the i-th element of the result is the sum of every vector[i] 40 | return [sum(vector[i] for vector in vectors) 41 | for i in range(num_elements)] 42 | 43 | assert vector_sum([[1, 2], [3, 4], [5, 6], [7, 8]]) == [16, 20] 44 | 45 | def scalar_multiply(c: float, v: Vector) -> Vector: 46 | """Multiplies every element by c""" 47 | return [c * v_i for v_i in v] 48 | 49 | assert scalar_multiply(2, [1, 2, 3]) == [2, 4, 6] 50 | 51 | def vector_mean(vectors: List[Vector]) -> Vector: 52 | """Computes the element-wise average""" 53 | n = len(vectors) 54 | return scalar_multiply(1/n, vector_sum(vectors)) 55 | 56 | assert vector_mean([[1, 2], [3, 4], [5, 6]]) == [3, 4] 57 | 58 | def dot(v: Vector, w: Vector) -> float: 59 | """Computes v_1 * w_1 + ... + v_n * w_n""" 60 | assert len(v) == len(w), "vectors must be same length" 61 | 62 | return sum(v_i * w_i for v_i, w_i in zip(v, w)) 63 | 64 | assert dot([1, 2, 3], [4, 5, 6]) == 32 # 1 * 4 + 2 * 5 + 3 * 6 65 | 66 | def sum_of_squares(v: Vector) -> float: 67 | """Returns v_1 * v_1 + ... + v_n * v_n""" 68 | return dot(v, v) 69 | 70 | assert sum_of_squares([1, 2, 3]) == 14 # 1 * 1 + 2 * 2 + 3 * 3 71 | 72 | import math 73 | 74 | def magnitude(v: Vector) -> float: 75 | """Returns the magnitude (or length) of v""" 76 | return math.sqrt(sum_of_squares(v)) # math.sqrt is square root function 77 | 78 | assert magnitude([3, 4]) == 5 79 | 80 | def squared_distance(v: Vector, w: Vector) -> float: 81 | """Computes (v_1 - w_1) ** 2 + ... 
+ (v_n - w_n) ** 2""" 82 | return sum_of_squares(subtract(v, w)) 83 | 84 | def distance(v: Vector, w: Vector) -> float: 85 | """Computes the distance between v and w""" 86 | return math.sqrt(squared_distance(v, w)) 87 | 88 | 89 | def distance(v: Vector, w: Vector) -> float: # type: ignore 90 | return magnitude(subtract(v, w)) 91 | 92 | # Another type alias 93 | Matrix = List[List[float]] 94 | 95 | A = [[1, 2, 3], # A has 2 rows and 3 columns 96 | [4, 5, 6]] 97 | 98 | B = [[1, 2], # B has 3 rows and 2 columns 99 | [3, 4], 100 | [5, 6]] 101 | 102 | from typing import Tuple 103 | 104 | def shape(A: Matrix) -> Tuple[int, int]: 105 | """Returns (# of rows of A, # of columns of A)""" 106 | num_rows = len(A) 107 | num_cols = len(A[0]) if A else 0 # number of elements in first row 108 | return num_rows, num_cols 109 | 110 | assert shape([[1, 2, 3], [4, 5, 6]]) == (2, 3) # 2 rows, 3 columns 111 | 112 | def get_row(A: Matrix, i: int) -> Vector: 113 | """Returns the i-th row of A (as a Vector)""" 114 | return A[i] # A[i] is already the ith row 115 | 116 | def get_column(A: Matrix, j: int) -> Vector: 117 | """Returns the j-th column of A (as a Vector)""" 118 | return [A_i[j] # jth element of row A_i 119 | for A_i in A] # for each row A_i 120 | 121 | from typing import Callable 122 | 123 | def make_matrix(num_rows: int, 124 | num_cols: int, 125 | entry_fn: Callable[[int, int], float]) -> Matrix: 126 | """ 127 | Returns a num_rows x num_cols matrix 128 | whose (i,j)-th entry is entry_fn(i, j) 129 | """ 130 | return [[entry_fn(i, j) # given i, create a list 131 | for j in range(num_cols)] # [entry_fn(i, 0), ... ] 132 | for i in range(num_rows)] # create one list for each i 133 | 134 | def identity_matrix(n: int) -> Matrix: 135 | """Returns the n x n identity matrix""" 136 | return make_matrix(n, n, lambda i, j: 1 if i == j else 0) 137 | 138 | assert identity_matrix(5) == [[1, 0, 0, 0, 0], 139 | [0, 1, 0, 0, 0], 140 | [0, 0, 1, 0, 0], 141 | [0, 0, 0, 1, 0], 142 | [0, 0, 0, 0, 1]] 143 | 144 | data = [[70, 170, 40], 145 | [65, 120, 26], 146 | [77, 250, 19], 147 | # .... 
148 | ] 149 | 150 | friendships = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4), 151 | (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)] 152 | 153 | # user 0 1 2 3 4 5 6 7 8 9 154 | # 155 | friend_matrix = [[0, 1, 1, 0, 0, 0, 0, 0, 0, 0], # user 0 156 | [1, 0, 1, 1, 0, 0, 0, 0, 0, 0], # user 1 157 | [1, 1, 0, 1, 0, 0, 0, 0, 0, 0], # user 2 158 | [0, 1, 1, 0, 1, 0, 0, 0, 0, 0], # user 3 159 | [0, 0, 0, 1, 0, 1, 0, 0, 0, 0], # user 4 160 | [0, 0, 0, 0, 1, 0, 1, 1, 0, 0], # user 5 161 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 6 162 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 7 163 | [0, 0, 0, 0, 0, 0, 1, 1, 0, 1], # user 8 164 | [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]] # user 9 165 | 166 | assert friend_matrix[0][2] == 1, "0 and 2 are friends" 167 | assert friend_matrix[0][8] == 0, "0 and 8 are not friends" 168 | 169 | # only need to look at one row 170 | friends_of_five = [i 171 | for i, is_friend in enumerate(friend_matrix[5]) 172 | if is_friend] 173 | 174 | -------------------------------------------------------------------------------- /first-edition/code-python3/decision_trees.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | from functools import partial 3 | import math, random 4 | 5 | def entropy(class_probabilities): 6 | """given a list of class probabilities, compute the entropy""" 7 | return sum(-p * math.log(p, 2) for p in class_probabilities if p) 8 | 9 | def class_probabilities(labels): 10 | total_count = len(labels) 11 | return [count / total_count 12 | for count in Counter(labels).values()] 13 | 14 | def data_entropy(labeled_data): 15 | labels = [label for _, label in labeled_data] 16 | probabilities = class_probabilities(labels) 17 | return entropy(probabilities) 18 | 19 | def partition_entropy(subsets): 20 | """find the entropy from this partition of data into subsets""" 21 | total_count = sum(len(subset) for subset in subsets) 22 | 23 | return sum( data_entropy(subset) * len(subset) / total_count 24 | for subset in subsets ) 25 | 26 | def group_by(items, key_fn): 27 | """returns a defaultdict(list), where each input item 28 | is in the list whose key is key_fn(item)""" 29 | groups = defaultdict(list) 30 | for item in items: 31 | key = key_fn(item) 32 | groups[key].append(item) 33 | return groups 34 | 35 | def partition_by(inputs, attribute): 36 | """returns a dict of inputs partitioned by the attribute 37 | each input is a pair (attribute_dict, label)""" 38 | return group_by(inputs, lambda x: x[0][attribute]) 39 | 40 | def partition_entropy_by(inputs,attribute): 41 | """computes the entropy corresponding to the given partition""" 42 | partitions = partition_by(inputs, attribute) 43 | return partition_entropy(partitions.values()) 44 | 45 | def classify(tree, input): 46 | """classify the input using the given decision tree""" 47 | 48 | # if this is a leaf node, return its value 49 | if tree in [True, False]: 50 | return tree 51 | 52 | # otherwise find the correct subtree 53 | attribute, subtree_dict = tree 54 | 55 | subtree_key = input.get(attribute) # None if input is missing attribute 56 | 57 | if subtree_key not in subtree_dict: # if no subtree for key, 58 | subtree_key = None # we'll use the None subtree 59 | 60 | subtree = subtree_dict[subtree_key] # choose the appropriate subtree 61 | return classify(subtree, input) # and use it to classify the input 62 | 63 | def build_tree_id3(inputs, split_candidates=None): 64 | 65 | # if this is our first pass, 66 | # all keys of the first input are split 
candidates 67 | if split_candidates is None: 68 | split_candidates = inputs[0][0].keys() 69 | 70 | # count Trues and Falses in the inputs 71 | num_inputs = len(inputs) 72 | num_trues = len([label for item, label in inputs if label]) 73 | num_falses = num_inputs - num_trues 74 | 75 | if num_trues == 0: # if only Falses are left 76 | return False # return a "False" leaf 77 | 78 | if num_falses == 0: # if only Trues are left 79 | return True # return a "True" leaf 80 | 81 | if not split_candidates: # if no split candidates left 82 | return num_trues >= num_falses # return the majority leaf 83 | 84 | # otherwise, split on the best attribute 85 | best_attribute = min(split_candidates, 86 | key=partial(partition_entropy_by, inputs)) 87 | 88 | partitions = partition_by(inputs, best_attribute) 89 | new_candidates = [a for a in split_candidates 90 | if a != best_attribute] 91 | 92 | # recursively build the subtrees 93 | subtrees = { attribute : build_tree_id3(subset, new_candidates) 94 | for attribute, subset in partitions.items() } 95 | 96 | subtrees[None] = num_trues > num_falses # default case 97 | 98 | return (best_attribute, subtrees) 99 | 100 | def forest_classify(trees, input): 101 | votes = [classify(tree, input) for tree in trees] 102 | vote_counts = Counter(votes) 103 | return vote_counts.most_common(1)[0][0] 104 | 105 | 106 | if __name__ == "__main__": 107 | 108 | inputs = [ 109 | ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'}, False), 110 | ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'}, False), 111 | ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'}, True), 112 | ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'}, True), 113 | ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'}, True), 114 | ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'}, False), 115 | ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'}, True), 116 | ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False), 117 | ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'}, True), 118 | ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True), 119 | ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True), 120 | ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'}, True), 121 | ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'}, True), 122 | ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False) 123 | ] 124 | 125 | for key in ['level','lang','tweets','phd']: 126 | print(key, partition_entropy_by(inputs, key)) 127 | print() 128 | 129 | senior_inputs = [(input, label) 130 | for input, label in inputs if input["level"] == "Senior"] 131 | 132 | for key in ['lang', 'tweets', 'phd']: 133 | print(key, partition_entropy_by(senior_inputs, key)) 134 | print() 135 | 136 | print("building the tree") 137 | tree = build_tree_id3(inputs) 138 | print(tree) 139 | 140 | print("Junior / Java / tweets / no phd", classify(tree, 141 | { "level" : "Junior", 142 | "lang" : "Java", 143 | "tweets" : "yes", 144 | "phd" : "no"} )) 145 | 146 | print("Junior / Java / tweets / phd", classify(tree, 147 | { "level" : "Junior", 148 | "lang" : "Java", 149 | "tweets" : "yes", 150 | "phd" : "yes"} )) 151 | 152 | print("Intern", classify(tree, { "level" : "Intern" } )) 153 | print("Senior", classify(tree, { "level" : "Senior" } )) 154 | -------------------------------------------------------------------------------- /first-edition/code-python3/logistic_regression.py: 
-------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from functools import partial, reduce 3 | from linear_algebra import dot, vector_add 4 | from gradient_descent import maximize_stochastic, maximize_batch 5 | from working_with_data import rescale 6 | from machine_learning import train_test_split 7 | from multiple_regression import estimate_beta, predict 8 | import math, random 9 | 10 | def logistic(x): 11 | return 1.0 / (1 + math.exp(-x)) 12 | 13 | def logistic_prime(x): 14 | return logistic(x) * (1 - logistic(x)) 15 | 16 | def logistic_log_likelihood_i(x_i, y_i, beta): 17 | if y_i == 1: 18 | return math.log(logistic(dot(x_i, beta))) 19 | else: 20 | return math.log(1 - logistic(dot(x_i, beta))) 21 | 22 | def logistic_log_likelihood(x, y, beta): 23 | return sum(logistic_log_likelihood_i(x_i, y_i, beta) 24 | for x_i, y_i in zip(x, y)) 25 | 26 | def logistic_log_partial_ij(x_i, y_i, beta, j): 27 | """here i is the index of the data point, 28 | j the index of the derivative""" 29 | 30 | return (y_i - logistic(dot(x_i, beta))) * x_i[j] 31 | 32 | def logistic_log_gradient_i(x_i, y_i, beta): 33 | """the gradient of the log likelihood 34 | corresponding to the i-th data point""" 35 | 36 | return [logistic_log_partial_ij(x_i, y_i, beta, j) 37 | for j, _ in enumerate(beta)] 38 | 39 | def logistic_log_gradient(x, y, beta): 40 | return reduce(vector_add, 41 | [logistic_log_gradient_i(x_i, y_i, beta) 42 | for x_i, y_i in zip(x,y)]) 43 | 44 | if __name__ == "__main__": 45 | 46 | data = [(0.7,48000,1),(1.9,48000,0),(2.5,60000,1),(4.2,63000,0),(6,76000,0),(6.5,69000,0),(7.5,76000,0),(8.1,88000,0),(8.7,83000,1),(10,83000,1),(0.8,43000,0),(1.8,60000,0),(10,79000,1),(6.1,76000,0),(1.4,50000,0),(9.1,92000,0),(5.8,75000,0),(5.2,69000,0),(1,56000,0),(6,67000,0),(4.9,74000,0),(6.4,63000,1),(6.2,82000,0),(3.3,58000,0),(9.3,90000,1),(5.5,57000,1),(9.1,102000,0),(2.4,54000,0),(8.2,65000,1),(5.3,82000,0),(9.8,107000,0),(1.8,64000,0),(0.6,46000,1),(0.8,48000,0),(8.6,84000,1),(0.6,45000,0),(0.5,30000,1),(7.3,89000,0),(2.5,48000,1),(5.6,76000,0),(7.4,77000,0),(2.7,56000,0),(0.7,48000,0),(1.2,42000,0),(0.2,32000,1),(4.7,56000,1),(2.8,44000,1),(7.6,78000,0),(1.1,63000,0),(8,79000,1),(2.7,56000,0),(6,52000,1),(4.6,56000,0),(2.5,51000,0),(5.7,71000,0),(2.9,65000,0),(1.1,33000,1),(3,62000,0),(4,71000,0),(2.4,61000,0),(7.5,75000,0),(9.7,81000,1),(3.2,62000,0),(7.9,88000,0),(4.7,44000,1),(2.5,55000,0),(1.6,41000,0),(6.7,64000,1),(6.9,66000,1),(7.9,78000,1),(8.1,102000,0),(5.3,48000,1),(8.5,66000,1),(0.2,56000,0),(6,69000,0),(7.5,77000,0),(8,86000,0),(4.4,68000,0),(4.9,75000,0),(1.5,60000,0),(2.2,50000,0),(3.4,49000,1),(4.2,70000,0),(7.7,98000,0),(8.2,85000,0),(5.4,88000,0),(0.1,46000,0),(1.5,37000,0),(6.3,86000,0),(3.7,57000,0),(8.4,85000,0),(2,42000,0),(5.8,69000,1),(2.7,64000,0),(3.1,63000,0),(1.9,48000,0),(10,72000,1),(0.2,45000,0),(8.6,95000,0),(1.5,64000,0),(9.8,95000,0),(5.3,65000,0),(7.5,80000,0),(9.9,91000,0),(9.7,50000,1),(2.8,68000,0),(3.6,58000,0),(3.9,74000,0),(4.4,76000,0),(2.5,49000,0),(7.2,81000,0),(5.2,60000,1),(2.4,62000,0),(8.9,94000,0),(2.4,63000,0),(6.8,69000,1),(6.5,77000,0),(7,86000,0),(9.4,94000,0),(7.8,72000,1),(0.2,53000,0),(10,97000,0),(5.5,65000,0),(7.7,71000,1),(8.1,66000,1),(9.8,91000,0),(8,84000,0),(2.7,55000,0),(2.8,62000,0),(9.4,79000,0),(2.5,57000,0),(7.4,70000,1),(2.1,47000,0),(5.3,62000,1),(6.3,79000,0),(6.8,58000,1),(5.7,80000,0),(2.2,61000,0),(4.8,62000,0),(3.7,64000,0),(4.1,85000,0),(2.3,51000,0),(3.5,58000,0),(0.9,43000
,0),(0.9,54000,0),(4.5,74000,0),(6.5,55000,1),(4.1,41000,1),(7.1,73000,0),(1.1,66000,0),(9.1,81000,1),(8,69000,1),(7.3,72000,1),(3.3,50000,0),(3.9,58000,0),(2.6,49000,0),(1.6,78000,0),(0.7,56000,0),(2.1,36000,1),(7.5,90000,0),(4.8,59000,1),(8.9,95000,0),(6.2,72000,0),(6.3,63000,0),(9.1,100000,0),(7.3,61000,1),(5.6,74000,0),(0.5,66000,0),(1.1,59000,0),(5.1,61000,0),(6.2,70000,0),(6.6,56000,1),(6.3,76000,0),(6.5,78000,0),(5.1,59000,0),(9.5,74000,1),(4.5,64000,0),(2,54000,0),(1,52000,0),(4,69000,0),(6.5,76000,0),(3,60000,0),(4.5,63000,0),(7.8,70000,0),(3.9,60000,1),(0.8,51000,0),(4.2,78000,0),(1.1,54000,0),(6.2,60000,0),(2.9,59000,0),(2.1,52000,0),(8.2,87000,0),(4.8,73000,0),(2.2,42000,1),(9.1,98000,0),(6.5,84000,0),(6.9,73000,0),(5.1,72000,0),(9.1,69000,1),(9.8,79000,1),] 47 | data = list(map(list, data)) # change tuples to lists 48 | 49 | x = [[1] + row[:2] for row in data] # each element is [1, experience, salary] 50 | y = [row[2] for row in data] # each element is paid_account 51 | 52 | print("linear regression:") 53 | 54 | rescaled_x = rescale(x) 55 | beta = estimate_beta(rescaled_x, y) 56 | print(beta) 57 | 58 | print("logistic regression:") 59 | 60 | random.seed(0) 61 | x_train, x_test, y_train, y_test = train_test_split(rescaled_x, y, 0.33) 62 | 63 | # want to maximize log likelihood on the training data 64 | fn = partial(logistic_log_likelihood, x_train, y_train) 65 | gradient_fn = partial(logistic_log_gradient, x_train, y_train) 66 | 67 | # pick a random starting point 68 | beta_0 = [1, 1, 1] 69 | 70 | # and maximize using gradient descent 71 | beta_hat = maximize_batch(fn, gradient_fn, beta_0) 72 | 73 | print("beta_batch", beta_hat) 74 | 75 | beta_0 = [1, 1, 1] 76 | beta_hat = maximize_stochastic(logistic_log_likelihood_i, 77 | logistic_log_gradient_i, 78 | x_train, y_train, beta_0) 79 | 80 | print("beta stochastic", beta_hat) 81 | 82 | true_positives = false_positives = true_negatives = false_negatives = 0 83 | 84 | for x_i, y_i in zip(x_test, y_test): 85 | predict = logistic(dot(beta_hat, x_i)) 86 | 87 | if y_i == 1 and predict >= 0.5: # TP: paid and we predict paid 88 | true_positives += 1 89 | elif y_i == 1: # FN: paid and we predict unpaid 90 | false_negatives += 1 91 | elif predict >= 0.5: # FP: unpaid and we predict paid 92 | false_positives += 1 93 | else: # TN: unpaid and we predict unpaid 94 | true_negatives += 1 95 | 96 | precision = true_positives / (true_positives + false_positives) 97 | recall = true_positives / (true_positives + false_negatives) 98 | 99 | print("precision", precision) 100 | print("recall", recall) 101 | -------------------------------------------------------------------------------- /first-edition/code-python3/mapreduce.py: -------------------------------------------------------------------------------- 1 | import math, random, re, datetime 2 | from collections import defaultdict, Counter 3 | from functools import partial 4 | from naive_bayes import tokenize 5 | 6 | def word_count_old(documents): 7 | """word count not using MapReduce""" 8 | return Counter(word 9 | for document in documents 10 | for word in tokenize(document)) 11 | 12 | def wc_mapper(document): 13 | """for each word in the document, emit (word,1)""" 14 | for word in tokenize(document): 15 | yield (word, 1) 16 | 17 | def wc_reducer(word, counts): 18 | """sum up the counts for a word""" 19 | yield (word, sum(counts)) 20 | 21 | def word_count(documents): 22 | """count the words in the input documents using MapReduce""" 23 | 24 | # place to store grouped values 25 | collector = 
defaultdict(list) 26 | 27 | for document in documents: 28 | for word, count in wc_mapper(document): 29 | collector[word].append(count) 30 | 31 | return [output 32 | for word, counts in collector.items() 33 | for output in wc_reducer(word, counts)] 34 | 35 | def map_reduce(inputs, mapper, reducer): 36 | """runs MapReduce on the inputs using mapper and reducer""" 37 | collector = defaultdict(list) 38 | 39 | for input in inputs: 40 | for key, value in mapper(input): 41 | collector[key].append(value) 42 | 43 | return [output 44 | for key, values in collector.items() 45 | for output in reducer(key,values)] 46 | 47 | def reduce_with(aggregation_fn, key, values): 48 | """reduces a key-values pair by applying aggregation_fn to the values""" 49 | yield (key, aggregation_fn(values)) 50 | 51 | def values_reducer(aggregation_fn): 52 | """turns a function (values -> output) into a reducer""" 53 | return partial(reduce_with, aggregation_fn) 54 | 55 | sum_reducer = values_reducer(sum) 56 | max_reducer = values_reducer(max) 57 | min_reducer = values_reducer(min) 58 | count_distinct_reducer = values_reducer(lambda values: len(set(values))) 59 | 60 | # 61 | # Analyzing Status Updates 62 | # 63 | 64 | status_updates = [ 65 | {"id": 1, 66 | "username" : "joelgrus", 67 | "text" : "Is anyone interested in a data science book?", 68 | "created_at" : datetime.datetime(2013, 12, 21, 11, 47, 0), 69 | "liked_by" : ["data_guy", "data_gal", "bill"] }, 70 | # add your own 71 | ] 72 | 73 | def data_science_day_mapper(status_update): 74 | """yields (day_of_week, 1) if status_update contains "data science" """ 75 | if "data science" in status_update["text"].lower(): 76 | day_of_week = status_update["created_at"].weekday() 77 | yield (day_of_week, 1) 78 | 79 | data_science_days = map_reduce(status_updates, 80 | data_science_day_mapper, 81 | sum_reducer) 82 | 83 | def words_per_user_mapper(status_update): 84 | user = status_update["username"] 85 | for word in tokenize(status_update["text"]): 86 | yield (user, (word, 1)) 87 | 88 | def most_popular_word_reducer(user, words_and_counts): 89 | """given a sequence of (word, count) pairs, 90 | return the word with the highest total count""" 91 | 92 | word_counts = Counter() 93 | for word, count in words_and_counts: 94 | word_counts[word] += count 95 | 96 | word, count = word_counts.most_common(1)[0] 97 | 98 | yield (user, (word, count)) 99 | 100 | user_words = map_reduce(status_updates, 101 | words_per_user_mapper, 102 | most_popular_word_reducer) 103 | 104 | def liker_mapper(status_update): 105 | user = status_update["username"] 106 | for liker in status_update["liked_by"]: 107 | yield (user, liker) 108 | 109 | distinct_likers_per_user = map_reduce(status_updates, 110 | liker_mapper, 111 | count_distinct_reducer) 112 | 113 | 114 | # 115 | # matrix multiplication 116 | # 117 | 118 | def matrix_multiply_mapper(m, element): 119 | """m is the common dimension (columns of A, rows of B) 120 | element is a tuple (matrix_name, i, j, value)""" 121 | matrix, i, j, value = element 122 | 123 | if matrix == "A": 124 | for column in range(m): 125 | # A_ij is the jth entry in the sum for each C_i_column 126 | yield((i, column), (j, value)) 127 | else: 128 | for row in range(m): 129 | # B_ij is the ith entry in the sum for each C_row_j 130 | yield((row, j), (i, value)) 131 | 132 | def matrix_multiply_reducer(m, key, indexed_values): 133 | results_by_index = defaultdict(list) 134 | for index, value in indexed_values: 135 | results_by_index[index].append(value) 136 | 137 | # sum up all the products 
of the positions with two results 138 | sum_product = sum(results[0] * results[1] 139 | for results in results_by_index.values() 140 | if len(results) == 2) 141 | 142 | if sum_product != 0.0: 143 | yield (key, sum_product) 144 | 145 | if __name__ == "__main__": 146 | 147 | documents = ["data science", "big data", "science fiction"] 148 | 149 | wc_mapper_results = [result 150 | for document in documents 151 | for result in wc_mapper(document)] 152 | 153 | print("wc_mapper results") 154 | print(wc_mapper_results) 155 | print() 156 | 157 | print("word count results") 158 | print(word_count(documents)) 159 | print() 160 | 161 | print("word count using map_reduce function") 162 | print(map_reduce(documents, wc_mapper, wc_reducer)) 163 | print() 164 | 165 | print("data science days") 166 | print(data_science_days) 167 | print() 168 | 169 | print("user words") 170 | print(user_words) 171 | print() 172 | 173 | print("distinct likers") 174 | print(distinct_likers_per_user) 175 | print() 176 | 177 | # matrix multiplication 178 | 179 | entries = [("A", 0, 0, 3), ("A", 0, 1, 2), 180 | ("B", 0, 0, 4), ("B", 0, 1, -1), ("B", 1, 0, 10)] 181 | mapper = partial(matrix_multiply_mapper, 3) 182 | reducer = partial(matrix_multiply_reducer, 3) 183 | 184 | print("map-reduce matrix multiplication") 185 | print("entries:", entries) 186 | print("result:", map_reduce(entries, mapper, reducer)) 187 | -------------------------------------------------------------------------------- /first-edition/code/logistic_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter 3 | from functools import partial 4 | from linear_algebra import dot, vector_add 5 | from gradient_descent import maximize_stochastic, maximize_batch 6 | from working_with_data import rescale 7 | from machine_learning import train_test_split 8 | from multiple_regression import estimate_beta, predict 9 | import math, random 10 | 11 | def logistic(x): 12 | return 1.0 / (1 + math.exp(-x)) 13 | 14 | def logistic_prime(x): 15 | return logistic(x) * (1 - logistic(x)) 16 | 17 | def logistic_log_likelihood_i(x_i, y_i, beta): 18 | if y_i == 1: 19 | return math.log(logistic(dot(x_i, beta))) 20 | else: 21 | return math.log(1 - logistic(dot(x_i, beta))) 22 | 23 | def logistic_log_likelihood(x, y, beta): 24 | return sum(logistic_log_likelihood_i(x_i, y_i, beta) 25 | for x_i, y_i in zip(x, y)) 26 | 27 | def logistic_log_partial_ij(x_i, y_i, beta, j): 28 | """here i is the index of the data point, 29 | j the index of the derivative""" 30 | 31 | return (y_i - logistic(dot(x_i, beta))) * x_i[j] 32 | 33 | def logistic_log_gradient_i(x_i, y_i, beta): 34 | """the gradient of the log likelihood 35 | corresponding to the i-th data point""" 36 | 37 | return [logistic_log_partial_ij(x_i, y_i, beta, j) 38 | for j, _ in enumerate(beta)] 39 | 40 | def logistic_log_gradient(x, y, beta): 41 | return reduce(vector_add, 42 | [logistic_log_gradient_i(x_i, y_i, beta) 43 | for x_i, y_i in zip(x,y)]) 44 | 45 | if __name__ == "__main__": 46 | 47 | data = 
[(0.7,48000,1),(1.9,48000,0),(2.5,60000,1),(4.2,63000,0),(6,76000,0),(6.5,69000,0),(7.5,76000,0),(8.1,88000,0),(8.7,83000,1),(10,83000,1),(0.8,43000,0),(1.8,60000,0),(10,79000,1),(6.1,76000,0),(1.4,50000,0),(9.1,92000,0),(5.8,75000,0),(5.2,69000,0),(1,56000,0),(6,67000,0),(4.9,74000,0),(6.4,63000,1),(6.2,82000,0),(3.3,58000,0),(9.3,90000,1),(5.5,57000,1),(9.1,102000,0),(2.4,54000,0),(8.2,65000,1),(5.3,82000,0),(9.8,107000,0),(1.8,64000,0),(0.6,46000,1),(0.8,48000,0),(8.6,84000,1),(0.6,45000,0),(0.5,30000,1),(7.3,89000,0),(2.5,48000,1),(5.6,76000,0),(7.4,77000,0),(2.7,56000,0),(0.7,48000,0),(1.2,42000,0),(0.2,32000,1),(4.7,56000,1),(2.8,44000,1),(7.6,78000,0),(1.1,63000,0),(8,79000,1),(2.7,56000,0),(6,52000,1),(4.6,56000,0),(2.5,51000,0),(5.7,71000,0),(2.9,65000,0),(1.1,33000,1),(3,62000,0),(4,71000,0),(2.4,61000,0),(7.5,75000,0),(9.7,81000,1),(3.2,62000,0),(7.9,88000,0),(4.7,44000,1),(2.5,55000,0),(1.6,41000,0),(6.7,64000,1),(6.9,66000,1),(7.9,78000,1),(8.1,102000,0),(5.3,48000,1),(8.5,66000,1),(0.2,56000,0),(6,69000,0),(7.5,77000,0),(8,86000,0),(4.4,68000,0),(4.9,75000,0),(1.5,60000,0),(2.2,50000,0),(3.4,49000,1),(4.2,70000,0),(7.7,98000,0),(8.2,85000,0),(5.4,88000,0),(0.1,46000,0),(1.5,37000,0),(6.3,86000,0),(3.7,57000,0),(8.4,85000,0),(2,42000,0),(5.8,69000,1),(2.7,64000,0),(3.1,63000,0),(1.9,48000,0),(10,72000,1),(0.2,45000,0),(8.6,95000,0),(1.5,64000,0),(9.8,95000,0),(5.3,65000,0),(7.5,80000,0),(9.9,91000,0),(9.7,50000,1),(2.8,68000,0),(3.6,58000,0),(3.9,74000,0),(4.4,76000,0),(2.5,49000,0),(7.2,81000,0),(5.2,60000,1),(2.4,62000,0),(8.9,94000,0),(2.4,63000,0),(6.8,69000,1),(6.5,77000,0),(7,86000,0),(9.4,94000,0),(7.8,72000,1),(0.2,53000,0),(10,97000,0),(5.5,65000,0),(7.7,71000,1),(8.1,66000,1),(9.8,91000,0),(8,84000,0),(2.7,55000,0),(2.8,62000,0),(9.4,79000,0),(2.5,57000,0),(7.4,70000,1),(2.1,47000,0),(5.3,62000,1),(6.3,79000,0),(6.8,58000,1),(5.7,80000,0),(2.2,61000,0),(4.8,62000,0),(3.7,64000,0),(4.1,85000,0),(2.3,51000,0),(3.5,58000,0),(0.9,43000,0),(0.9,54000,0),(4.5,74000,0),(6.5,55000,1),(4.1,41000,1),(7.1,73000,0),(1.1,66000,0),(9.1,81000,1),(8,69000,1),(7.3,72000,1),(3.3,50000,0),(3.9,58000,0),(2.6,49000,0),(1.6,78000,0),(0.7,56000,0),(2.1,36000,1),(7.5,90000,0),(4.8,59000,1),(8.9,95000,0),(6.2,72000,0),(6.3,63000,0),(9.1,100000,0),(7.3,61000,1),(5.6,74000,0),(0.5,66000,0),(1.1,59000,0),(5.1,61000,0),(6.2,70000,0),(6.6,56000,1),(6.3,76000,0),(6.5,78000,0),(5.1,59000,0),(9.5,74000,1),(4.5,64000,0),(2,54000,0),(1,52000,0),(4,69000,0),(6.5,76000,0),(3,60000,0),(4.5,63000,0),(7.8,70000,0),(3.9,60000,1),(0.8,51000,0),(4.2,78000,0),(1.1,54000,0),(6.2,60000,0),(2.9,59000,0),(2.1,52000,0),(8.2,87000,0),(4.8,73000,0),(2.2,42000,1),(9.1,98000,0),(6.5,84000,0),(6.9,73000,0),(5.1,72000,0),(9.1,69000,1),(9.8,79000,1),] 48 | data = map(list, data) # change tuples to lists 49 | 50 | x = [[1] + row[:2] for row in data] # each element is [1, experience, salary] 51 | y = [row[2] for row in data] # each element is paid_account 52 | 53 | print "linear regression:" 54 | 55 | rescaled_x = rescale(x) 56 | beta = estimate_beta(rescaled_x, y) 57 | print beta 58 | 59 | print "logistic regression:" 60 | 61 | random.seed(0) 62 | x_train, x_test, y_train, y_test = train_test_split(rescaled_x, y, 0.33) 63 | 64 | # want to maximize log likelihood on the training data 65 | fn = partial(logistic_log_likelihood, x_train, y_train) 66 | gradient_fn = partial(logistic_log_gradient, x_train, y_train) 67 | 68 | # pick a random starting point 69 | beta_0 = [1, 1, 1] 70 | 71 | # and maximize using gradient descent 72 
| beta_hat = maximize_batch(fn, gradient_fn, beta_0) 73 | 74 | print "beta_batch", beta_hat 75 | 76 | beta_0 = [1, 1, 1] 77 | beta_hat = maximize_stochastic(logistic_log_likelihood_i, 78 | logistic_log_gradient_i, 79 | x_train, y_train, beta_0) 80 | 81 | print "beta stochastic", beta_hat 82 | 83 | true_positives = false_positives = true_negatives = false_negatives = 0 84 | 85 | for x_i, y_i in zip(x_test, y_test): 86 | predict = logistic(dot(beta_hat, x_i)) 87 | 88 | if y_i == 1 and predict >= 0.5: # TP: paid and we predict paid 89 | true_positives += 1 90 | elif y_i == 1: # FN: paid and we predict unpaid 91 | false_negatives += 1 92 | elif predict >= 0.5: # FP: unpaid and we predict paid 93 | false_positives += 1 94 | else: # TN: unpaid and we predict unpaid 95 | true_negatives += 1 96 | 97 | precision = true_positives / (true_positives + false_positives) 98 | recall = true_positives / (true_positives + false_negatives) 99 | 100 | print "precision", precision 101 | print "recall", recall -------------------------------------------------------------------------------- /first-edition/code/decision_trees.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter, defaultdict 3 | from functools import partial 4 | import math, random 5 | 6 | def entropy(class_probabilities): 7 | """given a list of class probabilities, compute the entropy""" 8 | return sum(-p * math.log(p, 2) for p in class_probabilities if p) 9 | 10 | def class_probabilities(labels): 11 | total_count = len(labels) 12 | return [count / total_count 13 | for count in Counter(labels).values()] 14 | 15 | def data_entropy(labeled_data): 16 | labels = [label for _, label in labeled_data] 17 | probabilities = class_probabilities(labels) 18 | return entropy(probabilities) 19 | 20 | def partition_entropy(subsets): 21 | """find the entropy from this partition of data into subsets""" 22 | total_count = sum(len(subset) for subset in subsets) 23 | 24 | return sum( data_entropy(subset) * len(subset) / total_count 25 | for subset in subsets ) 26 | 27 | def group_by(items, key_fn): 28 | """returns a defaultdict(list), where each input item 29 | is in the list whose key is key_fn(item)""" 30 | groups = defaultdict(list) 31 | for item in items: 32 | key = key_fn(item) 33 | groups[key].append(item) 34 | return groups 35 | 36 | def partition_by(inputs, attribute): 37 | """returns a dict of inputs partitioned by the attribute 38 | each input is a pair (attribute_dict, label)""" 39 | return group_by(inputs, lambda x: x[0][attribute]) 40 | 41 | def partition_entropy_by(inputs,attribute): 42 | """computes the entropy corresponding to the given partition""" 43 | partitions = partition_by(inputs, attribute) 44 | return partition_entropy(partitions.values()) 45 | 46 | def classify(tree, input): 47 | """classify the input using the given decision tree""" 48 | 49 | # if this is a leaf node, return its value 50 | if tree in [True, False]: 51 | return tree 52 | 53 | # otherwise find the correct subtree 54 | attribute, subtree_dict = tree 55 | 56 | subtree_key = input.get(attribute) # None if input is missing attribute 57 | 58 | if subtree_key not in subtree_dict: # if no subtree for key, 59 | subtree_key = None # we'll use the None subtree 60 | 61 | subtree = subtree_dict[subtree_key] # choose the appropriate subtree 62 | return classify(subtree, input) # and use it to classify the input 63 | 64 | def build_tree_id3(inputs, split_candidates=None): 65 | 66 | # if 
this is our first pass, 67 | # all keys of the first input are split candidates 68 | if split_candidates is None: 69 | split_candidates = inputs[0][0].keys() 70 | 71 | # count Trues and Falses in the inputs 72 | num_inputs = len(inputs) 73 | num_trues = len([label for item, label in inputs if label]) 74 | num_falses = num_inputs - num_trues 75 | 76 | if num_trues == 0: # if only Falses are left 77 | return False # return a "False" leaf 78 | 79 | if num_falses == 0: # if only Trues are left 80 | return True # return a "True" leaf 81 | 82 | if not split_candidates: # if no split candidates left 83 | return num_trues >= num_falses # return the majority leaf 84 | 85 | # otherwise, split on the best attribute 86 | best_attribute = min(split_candidates, 87 | key=partial(partition_entropy_by, inputs)) 88 | 89 | partitions = partition_by(inputs, best_attribute) 90 | new_candidates = [a for a in split_candidates 91 | if a != best_attribute] 92 | 93 | # recursively build the subtrees 94 | subtrees = { attribute : build_tree_id3(subset, new_candidates) 95 | for attribute, subset in partitions.iteritems() } 96 | 97 | subtrees[None] = num_trues > num_falses # default case 98 | 99 | return (best_attribute, subtrees) 100 | 101 | def forest_classify(trees, input): 102 | votes = [classify(tree, input) for tree in trees] 103 | vote_counts = Counter(votes) 104 | return vote_counts.most_common(1)[0][0] 105 | 106 | 107 | if __name__ == "__main__": 108 | 109 | inputs = [ 110 | ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'}, False), 111 | ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'}, False), 112 | ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'}, True), 113 | ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'}, True), 114 | ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'}, True), 115 | ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'}, False), 116 | ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'}, True), 117 | ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False), 118 | ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'}, True), 119 | ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True), 120 | ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True), 121 | ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'}, True), 122 | ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'}, True), 123 | ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False) 124 | ] 125 | 126 | for key in ['level','lang','tweets','phd']: 127 | print key, partition_entropy_by(inputs, key) 128 | print 129 | 130 | senior_inputs = [(input, label) 131 | for input, label in inputs if input["level"] == "Senior"] 132 | 133 | for key in ['lang', 'tweets', 'phd']: 134 | print key, partition_entropy_by(senior_inputs, key) 135 | print 136 | 137 | print "building the tree" 138 | tree = build_tree_id3(inputs) 139 | print tree 140 | 141 | print "Junior / Java / tweets / no phd", classify(tree, 142 | { "level" : "Junior", 143 | "lang" : "Java", 144 | "tweets" : "yes", 145 | "phd" : "no"} ) 146 | 147 | print "Junior / Java / tweets / phd", classify(tree, 148 | { "level" : "Junior", 149 | "lang" : "Java", 150 | "tweets" : "yes", 151 | "phd" : "yes"} ) 152 | 153 | print "Intern", classify(tree, { "level" : "Intern" } ) 154 | print "Senior", classify(tree, { "level" : "Senior" } ) 155 | 156 | -------------------------------------------------------------------------------- /first-edition/code/mapreduce.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import math, random, re, datetime 3 | from collections import defaultdict, Counter 4 | from functools import partial 5 | from naive_bayes import tokenize 6 | 7 | def word_count_old(documents): 8 | """word count not using MapReduce""" 9 | return Counter(word 10 | for document in documents 11 | for word in tokenize(document)) 12 | 13 | def wc_mapper(document): 14 | """for each word in the document, emit (word,1)""" 15 | for word in tokenize(document): 16 | yield (word, 1) 17 | 18 | def wc_reducer(word, counts): 19 | """sum up the counts for a word""" 20 | yield (word, sum(counts)) 21 | 22 | def word_count(documents): 23 | """count the words in the input documents using MapReduce""" 24 | 25 | # place to store grouped values 26 | collector = defaultdict(list) 27 | 28 | for document in documents: 29 | for word, count in wc_mapper(document): 30 | collector[word].append(count) 31 | 32 | return [output 33 | for word, counts in collector.iteritems() 34 | for output in wc_reducer(word, counts)] 35 | 36 | def map_reduce(inputs, mapper, reducer): 37 | """runs MapReduce on the inputs using mapper and reducer""" 38 | collector = defaultdict(list) 39 | 40 | for input in inputs: 41 | for key, value in mapper(input): 42 | collector[key].append(value) 43 | 44 | return [output 45 | for key, values in collector.iteritems() 46 | for output in reducer(key,values)] 47 | 48 | def reduce_with(aggregation_fn, key, values): 49 | """reduces a key-values pair by applying aggregation_fn to the values""" 50 | yield (key, aggregation_fn(values)) 51 | 52 | def values_reducer(aggregation_fn): 53 | """turns a function (values -> output) into a reducer""" 54 | return partial(reduce_with, aggregation_fn) 55 | 56 | sum_reducer = values_reducer(sum) 57 | max_reducer = values_reducer(max) 58 | min_reducer = values_reducer(min) 59 | count_distinct_reducer = values_reducer(lambda values: len(set(values))) 60 | 61 | # 62 | # Analyzing Status Updates 63 | # 64 | 65 | status_updates = [ 66 | {"id": 1, 67 | "username" : "joelgrus", 68 | "text" : "Is anyone interested in a data science book?", 69 | "created_at" : datetime.datetime(2013, 12, 21, 11, 47, 0), 70 | "liked_by" : ["data_guy", "data_gal", "bill"] }, 71 | # add your own 72 | ] 73 | 74 | def data_science_day_mapper(status_update): 75 | """yields (day_of_week, 1) if status_update contains "data science" """ 76 | if "data science" in status_update["text"].lower(): 77 | day_of_week = status_update["created_at"].weekday() 78 | yield (day_of_week, 1) 79 | 80 | data_science_days = map_reduce(status_updates, 81 | data_science_day_mapper, 82 | sum_reducer) 83 | 84 | def words_per_user_mapper(status_update): 85 | user = status_update["username"] 86 | for word in tokenize(status_update["text"]): 87 | yield (user, (word, 1)) 88 | 89 | def most_popular_word_reducer(user, words_and_counts): 90 | """given a sequence of (word, count) pairs, 91 | return the word with the highest total count""" 92 | 93 | word_counts = Counter() 94 | for word, count in words_and_counts: 95 | word_counts[word] += count 96 | 97 | word, count = word_counts.most_common(1)[0] 98 | 99 | yield (user, (word, count)) 100 | 101 | user_words = map_reduce(status_updates, 102 | words_per_user_mapper, 103 | most_popular_word_reducer) 104 | 105 | def liker_mapper(status_update): 106 | user = status_update["username"] 107 | for liker in status_update["liked_by"]: 108 | yield (user, liker) 109 | 110 | 
distinct_likers_per_user = map_reduce(status_updates, 111 | liker_mapper, 112 | count_distinct_reducer) 113 | 114 | 115 | # 116 | # matrix multiplication 117 | # 118 | 119 | def matrix_multiply_mapper(m, element): 120 | """m is the common dimension (columns of A, rows of B) 121 | element is a tuple (matrix_name, i, j, value)""" 122 | matrix, i, j, value = element 123 | 124 | if matrix == "A": 125 | for column in range(m): 126 | # A_ij is the jth entry in the sum for each C_i_column 127 | yield((i, column), (j, value)) 128 | else: 129 | for row in range(m): 130 | # B_ij is the ith entry in the sum for each C_row_j 131 | yield((row, j), (i, value)) 132 | 133 | def matrix_multiply_reducer(m, key, indexed_values): 134 | results_by_index = defaultdict(list) 135 | for index, value in indexed_values: 136 | results_by_index[index].append(value) 137 | 138 | # sum up all the products of the positions with two results 139 | sum_product = sum(results[0] * results[1] 140 | for results in results_by_index.values() 141 | if len(results) == 2) 142 | 143 | if sum_product != 0.0: 144 | yield (key, sum_product) 145 | 146 | if __name__ == "__main__": 147 | 148 | documents = ["data science", "big data", "science fiction"] 149 | 150 | wc_mapper_results = [result 151 | for document in documents 152 | for result in wc_mapper(document)] 153 | 154 | print "wc_mapper results" 155 | print wc_mapper_results 156 | print 157 | 158 | print "word count results" 159 | print word_count(documents) 160 | print 161 | 162 | print "word count using map_reduce function" 163 | print map_reduce(documents, wc_mapper, wc_reducer) 164 | print 165 | 166 | print "data science days" 167 | print data_science_days 168 | print 169 | 170 | print "user words" 171 | print user_words 172 | print 173 | 174 | print "distinct likers" 175 | print distinct_likers_per_user 176 | print 177 | 178 | # matrix multiplication 179 | 180 | entries = [("A", 0, 0, 3), ("A", 0, 1, 2), 181 | ("B", 0, 0, 4), ("B", 0, 1, -1), ("B", 1, 0, 10)] 182 | mapper = partial(matrix_multiply_mapper, 3) 183 | reducer = partial(matrix_multiply_reducer, 3) 184 | 185 | print "map-reduce matrix multiplication" 186 | print "entries:", entries 187 | print "result:", map_reduce(entries, mapper, reducer) 188 | 189 | -------------------------------------------------------------------------------- /first-edition/code-python3/gradient_descent.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from linear_algebra import distance, vector_subtract, scalar_multiply 3 | from functools import reduce 4 | import math, random 5 | 6 | def sum_of_squares(v): 7 | """computes the sum of squared elements in v""" 8 | return sum(v_i ** 2 for v_i in v) 9 | 10 | def difference_quotient(f, x, h): 11 | return (f(x + h) - f(x)) / h 12 | 13 | def plot_estimated_derivative(): 14 | 15 | def square(x): 16 | return x * x 17 | 18 | def derivative(x): 19 | return 2 * x 20 | 21 | derivative_estimate = lambda x: difference_quotient(square, x, h=0.00001) 22 | 23 | # plot to show they're basically the same 24 | import matplotlib.pyplot as plt 25 | x = range(-10,10) 26 | plt.plot(x, map(derivative, x), 'rx') # red x 27 | plt.plot(x, map(derivative_estimate, x), 'b+') # blue + 28 | plt.show() # purple *, hopefully 29 | 30 | def partial_difference_quotient(f, v, i, h): 31 | 32 | # add h to just the i-th element of v 33 | w = [v_j + (h if j == i else 0) 34 | for j, v_j in enumerate(v)] 35 | 36 | return (f(w) - f(v)) / h 37 | 38 | def 
estimate_gradient(f, v, h=0.00001): 39 | return [partial_difference_quotient(f, v, i, h) 40 | for i, _ in enumerate(v)] 41 | 42 | def step(v, direction, step_size): 43 | """move step_size in the direction from v""" 44 | return [v_i + step_size * direction_i 45 | for v_i, direction_i in zip(v, direction)] 46 | 47 | def sum_of_squares_gradient(v): 48 | return [2 * v_i for v_i in v] 49 | 50 | def safe(f): 51 | """define a new function that wraps f and return it""" 52 | def safe_f(*args, **kwargs): 53 | try: 54 | return f(*args, **kwargs) 55 | except: 56 | return float('inf') # this means "infinity" in Python 57 | return safe_f 58 | 59 | 60 | # 61 | # 62 | # minimize / maximize batch 63 | # 64 | # 65 | 66 | def minimize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001): 67 | """use gradient descent to find theta that minimizes target function""" 68 | 69 | step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 70 | 71 | theta = theta_0 # set theta to initial value 72 | target_fn = safe(target_fn) # safe version of target_fn 73 | value = target_fn(theta) # value we're minimizing 74 | 75 | while True: 76 | gradient = gradient_fn(theta) 77 | next_thetas = [step(theta, gradient, -step_size) 78 | for step_size in step_sizes] 79 | 80 | # choose the one that minimizes the error function 81 | next_theta = min(next_thetas, key=target_fn) 82 | next_value = target_fn(next_theta) 83 | 84 | # stop if we're "converging" 85 | if abs(value - next_value) < tolerance: 86 | return theta 87 | else: 88 | theta, value = next_theta, next_value 89 | 90 | def negate(f): 91 | """return a function that for any input x returns -f(x)""" 92 | return lambda *args, **kwargs: -f(*args, **kwargs) 93 | 94 | def negate_all(f): 95 | """the same when f returns a list of numbers""" 96 | return lambda *args, **kwargs: [-y for y in f(*args, **kwargs)] 97 | 98 | def maximize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001): 99 | return minimize_batch(negate(target_fn), 100 | negate_all(gradient_fn), 101 | theta_0, 102 | tolerance) 103 | 104 | # 105 | # minimize / maximize stochastic 106 | # 107 | 108 | def in_random_order(data): 109 | """generator that returns the elements of data in random order""" 110 | indexes = [i for i, _ in enumerate(data)] # create a list of indexes 111 | random.shuffle(indexes) # shuffle them 112 | for i in indexes: # return the data in that order 113 | yield data[i] 114 | 115 | def minimize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01): 116 | 117 | data = list(zip(x, y)) 118 | theta = theta_0 # initial guess 119 | alpha = alpha_0 # initial step size 120 | min_theta, min_value = None, float("inf") # the minimum so far 121 | iterations_with_no_improvement = 0 122 | 123 | # if we ever go 100 iterations with no improvement, stop 124 | while iterations_with_no_improvement < 100: 125 | value = sum( target_fn(x_i, y_i, theta) for x_i, y_i in data ) 126 | 127 | if value < min_value: 128 | # if we've found a new minimum, remember it 129 | # and go back to the original step size 130 | min_theta, min_value = theta, value 131 | iterations_with_no_improvement = 0 132 | alpha = alpha_0 133 | else: 134 | # otherwise we're not improving, so try shrinking the step size 135 | iterations_with_no_improvement += 1 136 | alpha *= 0.9 137 | 138 | # and take a gradient step for each of the data points 139 | for x_i, y_i in in_random_order(data): 140 | gradient_i = gradient_fn(x_i, y_i, theta) 141 | theta = vector_subtract(theta, scalar_multiply(alpha, gradient_i)) 142 | 143 | return 
min_theta 144 | 145 | def maximize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01): 146 | return minimize_stochastic(negate(target_fn), 147 | negate_all(gradient_fn), 148 | x, y, theta_0, alpha_0) 149 | 150 | if __name__ == "__main__": 151 | 152 | print("using the gradient") 153 | 154 | v = [random.randint(-10,10) for i in range(3)] 155 | 156 | tolerance = 0.0000001 157 | 158 | while True: 159 | #print v, sum_of_squares(v) 160 | gradient = sum_of_squares_gradient(v) # compute the gradient at v 161 | next_v = step(v, gradient, -0.01) # take a negative gradient step 162 | if distance(next_v, v) < tolerance: # stop if we're converging 163 | break 164 | v = next_v # continue if we're not 165 | 166 | print("minimum v", v) 167 | print("minimum value", sum_of_squares(v)) 168 | print() 169 | 170 | 171 | print("using minimize_batch") 172 | 173 | v = [random.randint(-10,10) for i in range(3)] 174 | 175 | v = minimize_batch(sum_of_squares, sum_of_squares_gradient, v) 176 | 177 | print("minimum v", v) 178 | print("minimum value", sum_of_squares(v)) 179 | -------------------------------------------------------------------------------- /first-edition/code/gradient_descent.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter 3 | from linear_algebra import distance, vector_subtract, scalar_multiply 4 | import math, random 5 | 6 | def sum_of_squares(v): 7 | """computes the sum of squared elements in v""" 8 | return sum(v_i ** 2 for v_i in v) 9 | 10 | def difference_quotient(f, x, h): 11 | return (f(x + h) - f(x)) / h 12 | 13 | def plot_estimated_derivative(): 14 | 15 | def square(x): 16 | return x * x 17 | 18 | def derivative(x): 19 | return 2 * x 20 | 21 | derivative_estimate = lambda x: difference_quotient(square, x, h=0.00001) 22 | 23 | # plot to show they're basically the same 24 | import matplotlib.pyplot as plt 25 | x = range(-10,10) 26 | plt.plot(x, map(derivative, x), 'rx') # red x 27 | plt.plot(x, map(derivative_estimate, x), 'b+') # blue + 28 | plt.show() # purple *, hopefully 29 | 30 | def partial_difference_quotient(f, v, i, h): 31 | 32 | # add h to just the i-th element of v 33 | w = [v_j + (h if j == i else 0) 34 | for j, v_j in enumerate(v)] 35 | 36 | return (f(w) - f(v)) / h 37 | 38 | def estimate_gradient(f, v, h=0.00001): 39 | return [partial_difference_quotient(f, v, i, h) 40 | for i, _ in enumerate(v)] 41 | 42 | def step(v, direction, step_size): 43 | """move step_size in the direction from v""" 44 | return [v_i + step_size * direction_i 45 | for v_i, direction_i in zip(v, direction)] 46 | 47 | def sum_of_squares_gradient(v): 48 | return [2 * v_i for v_i in v] 49 | 50 | def safe(f): 51 | """define a new function that wraps f and return it""" 52 | def safe_f(*args, **kwargs): 53 | try: 54 | return f(*args, **kwargs) 55 | except: 56 | return float('inf') # this means "infinity" in Python 57 | return safe_f 58 | 59 | 60 | # 61 | # 62 | # minimize / maximize batch 63 | # 64 | # 65 | 66 | def minimize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001): 67 | """use gradient descent to find theta that minimizes target function""" 68 | 69 | step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 70 | 71 | theta = theta_0 # set theta to initial value 72 | target_fn = safe(target_fn) # safe version of target_fn 73 | value = target_fn(theta) # value we're minimizing 74 | 75 | while True: 76 | gradient = gradient_fn(theta) 77 | next_thetas = [step(theta, gradient, 
-step_size) 78 | for step_size in step_sizes] 79 | 80 | # choose the one that minimizes the error function 81 | next_theta = min(next_thetas, key=target_fn) 82 | next_value = target_fn(next_theta) 83 | 84 | # stop if we're "converging" 85 | if abs(value - next_value) < tolerance: 86 | return theta 87 | else: 88 | theta, value = next_theta, next_value 89 | 90 | def negate(f): 91 | """return a function that for any input x returns -f(x)""" 92 | return lambda *args, **kwargs: -f(*args, **kwargs) 93 | 94 | def negate_all(f): 95 | """the same when f returns a list of numbers""" 96 | return lambda *args, **kwargs: [-y for y in f(*args, **kwargs)] 97 | 98 | def maximize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001): 99 | return minimize_batch(negate(target_fn), 100 | negate_all(gradient_fn), 101 | theta_0, 102 | tolerance) 103 | 104 | # 105 | # minimize / maximize stochastic 106 | # 107 | 108 | def in_random_order(data): 109 | """generator that returns the elements of data in random order""" 110 | indexes = [i for i, _ in enumerate(data)] # create a list of indexes 111 | random.shuffle(indexes) # shuffle them 112 | for i in indexes: # return the data in that order 113 | yield data[i] 114 | 115 | def minimize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01): 116 | 117 | data = zip(x, y) 118 | theta = theta_0 # initial guess 119 | alpha = alpha_0 # initial step size 120 | min_theta, min_value = None, float("inf") # the minimum so far 121 | iterations_with_no_improvement = 0 122 | 123 | # if we ever go 100 iterations with no improvement, stop 124 | while iterations_with_no_improvement < 100: 125 | value = sum( target_fn(x_i, y_i, theta) for x_i, y_i in data ) 126 | 127 | if value < min_value: 128 | # if we've found a new minimum, remember it 129 | # and go back to the original step size 130 | min_theta, min_value = theta, value 131 | iterations_with_no_improvement = 0 132 | alpha = alpha_0 133 | else: 134 | # otherwise we're not improving, so try shrinking the step size 135 | iterations_with_no_improvement += 1 136 | alpha *= 0.9 137 | 138 | # and take a gradient step for each of the data points 139 | for x_i, y_i in in_random_order(data): 140 | gradient_i = gradient_fn(x_i, y_i, theta) 141 | theta = vector_subtract(theta, scalar_multiply(alpha, gradient_i)) 142 | 143 | return min_theta 144 | 145 | def maximize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01): 146 | return minimize_stochastic(negate(target_fn), 147 | negate_all(gradient_fn), 148 | x, y, theta_0, alpha_0) 149 | 150 | if __name__ == "__main__": 151 | 152 | print "using the gradient" 153 | 154 | v = [random.randint(-10,10) for i in range(3)] 155 | 156 | tolerance = 0.0000001 157 | 158 | while True: 159 | #print v, sum_of_squares(v) 160 | gradient = sum_of_squares_gradient(v) # compute the gradient at v 161 | next_v = step(v, gradient, -0.01) # take a negative gradient step 162 | if distance(next_v, v) < tolerance: # stop if we're converging 163 | break 164 | v = next_v # continue if we're not 165 | 166 | print "minimum v", v 167 | print "minimum value", sum_of_squares(v) 168 | print 169 | 170 | 171 | print "using minimize_batch" 172 | 173 | v = [random.randint(-10,10) for i in range(3)] 174 | 175 | v = minimize_batch(sum_of_squares, sum_of_squares_gradient, v) 176 | 177 | print "minimum v", v 178 | print "minimum value", sum_of_squares(v) 179 | -------------------------------------------------------------------------------- /scratch/naive_bayes.py: 
-------------------------------------------------------------------------------- 1 | from typing import Set 2 | import re 3 | 4 | def tokenize(text: str) -> Set[str]: 5 | text = text.lower() # Convert to lowercase, 6 | all_words = re.findall("[a-z0-9']+", text) # extract the words, and 7 | return set(all_words) # remove duplicates. 8 | 9 | assert tokenize("Data Science is science") == {"data", "science", "is"} 10 | 11 | from typing import NamedTuple 12 | 13 | class Message(NamedTuple): 14 | text: str 15 | is_spam: bool 16 | 17 | from typing import List, Tuple, Dict, Iterable 18 | import math 19 | from collections import defaultdict 20 | 21 | class NaiveBayesClassifier: 22 | def __init__(self, k: float = 0.5) -> None: 23 | self.k = k # smoothing factor 24 | 25 | self.tokens: Set[str] = set() 26 | self.token_spam_counts: Dict[str, int] = defaultdict(int) 27 | self.token_ham_counts: Dict[str, int] = defaultdict(int) 28 | self.spam_messages = self.ham_messages = 0 29 | 30 | def train(self, messages: Iterable[Message]) -> None: 31 | for message in messages: 32 | # Increment message counts 33 | if message.is_spam: 34 | self.spam_messages += 1 35 | else: 36 | self.ham_messages += 1 37 | 38 | # Increment word counts 39 | for token in tokenize(message.text): 40 | self.tokens.add(token) 41 | if message.is_spam: 42 | self.token_spam_counts[token] += 1 43 | else: 44 | self.token_ham_counts[token] += 1 45 | 46 | def _probabilities(self, token: str) -> Tuple[float, float]: 47 | """returns P(token | spam) and P(token | not spam)""" 48 | spam = self.token_spam_counts[token] 49 | ham = self.token_ham_counts[token] 50 | 51 | p_token_spam = (spam + self.k) / (self.spam_messages + 2 * self.k) 52 | p_token_ham = (ham + self.k) / (self.ham_messages + 2 * self.k) 53 | 54 | return p_token_spam, p_token_ham 55 | 56 | def predict(self, text: str) -> float: 57 | text_tokens = tokenize(text) 58 | log_prob_if_spam = log_prob_if_ham = 0.0 59 | 60 | # Iterate through each word in our vocabulary. 
61 | for token in self.tokens: 62 | prob_if_spam, prob_if_ham = self._probabilities(token) 63 | 64 | # If *token* appears in the message, 65 | # add the log probability of seeing it; 66 | if token in text_tokens: 67 | log_prob_if_spam += math.log(prob_if_spam) 68 | log_prob_if_ham += math.log(prob_if_ham) 69 | 70 | # otherwise add the log probability of _not_ seeing it 71 | # which is log(1 - probability of seeing it) 72 | else: 73 | log_prob_if_spam += math.log(1.0 - prob_if_spam) 74 | log_prob_if_ham += math.log(1.0 - prob_if_ham) 75 | 76 | prob_if_spam = math.exp(log_prob_if_spam) 77 | prob_if_ham = math.exp(log_prob_if_ham) 78 | return prob_if_spam / (prob_if_spam + prob_if_ham) 79 | 80 | messages = [Message("spam rules", is_spam=True), 81 | Message("ham rules", is_spam=False), 82 | Message("hello ham", is_spam=False)] 83 | 84 | model = NaiveBayesClassifier(k=0.5) 85 | model.train(messages) 86 | 87 | assert model.tokens == {"spam", "ham", "rules", "hello"} 88 | assert model.spam_messages == 1 89 | assert model.ham_messages == 2 90 | assert model.token_spam_counts == {"spam": 1, "rules": 1} 91 | assert model.token_ham_counts == {"ham": 2, "rules": 1, "hello": 1} 92 | 93 | text = "hello spam" 94 | 95 | probs_if_spam = [ 96 | (1 + 0.5) / (1 + 2 * 0.5), # "spam" (present) 97 | 1 - (0 + 0.5) / (1 + 2 * 0.5), # "ham" (not present) 98 | 1 - (1 + 0.5) / (1 + 2 * 0.5), # "rules" (not present) 99 | (0 + 0.5) / (1 + 2 * 0.5) # "hello" (present) 100 | ] 101 | 102 | probs_if_ham = [ 103 | (0 + 0.5) / (2 + 2 * 0.5), # "spam" (present) 104 | 1 - (2 + 0.5) / (2 + 2 * 0.5), # "ham" (not present) 105 | 1 - (1 + 0.5) / (2 + 2 * 0.5), # "rules" (not present) 106 | (1 + 0.5) / (2 + 2 * 0.5), # "hello" (present) 107 | ] 108 | 109 | p_if_spam = math.exp(sum(math.log(p) for p in probs_if_spam)) 110 | p_if_ham = math.exp(sum(math.log(p) for p in probs_if_ham)) 111 | 112 | # Should be about 0.83 113 | assert model.predict(text) == p_if_spam / (p_if_spam + p_if_ham) 114 | 115 | def drop_final_s(word): 116 | return re.sub("s$", "", word) 117 | 118 | def main(): 119 | import glob, re 120 | 121 | # modify the path to wherever you've put the files 122 | path = 'spam_data/*/*' 123 | 124 | data: List[Message] = [] 125 | 126 | # glob.glob returns every filename that matches the wildcarded path 127 | for filename in glob.glob(path): 128 | is_spam = "ham" not in filename 129 | 130 | # There are some garbage characters in the emails, the errors='ignore' 131 | # skips them instead of raising an exception. 
132 | with open(filename, errors='ignore') as email_file: 133 | for line in email_file: 134 | if line.startswith("Subject:"): 135 | subject = line.lstrip("Subject: ") 136 | data.append(Message(subject, is_spam)) 137 | break # done with this file 138 | 139 | import random 140 | from scratch.machine_learning import split_data 141 | 142 | random.seed(0) # just so you get the same answers as me 143 | train_messages, test_messages = split_data(data, 0.75) 144 | 145 | model = NaiveBayesClassifier() 146 | model.train(train_messages) 147 | 148 | from collections import Counter 149 | 150 | predictions = [(message, model.predict(message.text)) 151 | for message in test_messages] 152 | 153 | # Assume that spam_probability > 0.5 corresponds to spam prediction 154 | # and count the combinations of (actual is_spam, predicted is_spam) 155 | confusion_matrix = Counter((message.is_spam, spam_probability > 0.5) 156 | for message, spam_probability in predictions) 157 | 158 | print(confusion_matrix) 159 | 160 | def p_spam_given_token(token: str, model: NaiveBayesClassifier) -> float: 161 | # We probably shouldn't call private methods, but it's for a good cause. 162 | prob_if_spam, prob_if_ham = model._probabilities(token) 163 | 164 | return prob_if_spam / (prob_if_spam + prob_if_ham) 165 | 166 | words = sorted(model.tokens, key=lambda t: p_spam_given_token(t, model)) 167 | 168 | print("spammiest_words", words[-10:]) 169 | print("hammiest_words", words[:10]) 170 | 171 | if __name__ == "__main__": main() -------------------------------------------------------------------------------- /first-edition/code-python3/hypothesis_and_inference.py: -------------------------------------------------------------------------------- 1 | from probability import normal_cdf, inverse_normal_cdf 2 | import math, random 3 | 4 | def normal_approximation_to_binomial(n, p): 5 | """finds mu and sigma corresponding to a Binomial(n, p)""" 6 | mu = p * n 7 | sigma = math.sqrt(p * (1 - p) * n) 8 | return mu, sigma 9 | 10 | ##### 11 | # 12 | # probabilities a normal lies in an interval 13 | # 14 | ###### 15 | 16 | # the normal cdf _is_ the probability the variable is below a threshold 17 | normal_probability_below = normal_cdf 18 | 19 | # it's above the threshold if it's not below the threshold 20 | def normal_probability_above(lo, mu=0, sigma=1): 21 | return 1 - normal_cdf(lo, mu, sigma) 22 | 23 | # it's between if it's less than hi, but not less than lo 24 | def normal_probability_between(lo, hi, mu=0, sigma=1): 25 | return normal_cdf(hi, mu, sigma) - normal_cdf(lo, mu, sigma) 26 | 27 | # it's outside if it's not between 28 | def normal_probability_outside(lo, hi, mu=0, sigma=1): 29 | return 1 - normal_probability_between(lo, hi, mu, sigma) 30 | 31 | ###### 32 | # 33 | # normal bounds 34 | # 35 | ###### 36 | 37 | 38 | def normal_upper_bound(probability, mu=0, sigma=1): 39 | """returns the z for which P(Z <= z) = probability""" 40 | return inverse_normal_cdf(probability, mu, sigma) 41 | 42 | def normal_lower_bound(probability, mu=0, sigma=1): 43 | """returns the z for which P(Z >= z) = probability""" 44 | return inverse_normal_cdf(1 - probability, mu, sigma) 45 | 46 | def normal_two_sided_bounds(probability, mu=0, sigma=1): 47 | """returns the symmetric (about the mean) bounds 48 | that contain the specified probability""" 49 | tail_probability = (1 - probability) / 2 50 | 51 | # upper bound should have tail_probability above it 52 | upper_bound = normal_lower_bound(tail_probability, mu, sigma) 53 | 54 | # lower bound should have 
tail_probability below it 55 | lower_bound = normal_upper_bound(tail_probability, mu, sigma) 56 | 57 | return lower_bound, upper_bound 58 | 59 | def two_sided_p_value(x, mu=0, sigma=1): 60 | if x >= mu: 61 | # if x is greater than the mean, the tail is above x 62 | return 2 * normal_probability_above(x, mu, sigma) 63 | else: 64 | # if x is less than the mean, the tail is below x 65 | return 2 * normal_probability_below(x, mu, sigma) 66 | 67 | def count_extreme_values(): 68 | extreme_value_count = 0 69 | for _ in range(100000): 70 | num_heads = sum(1 if random.random() < 0.5 else 0 # count # of heads 71 | for _ in range(1000)) # in 1000 flips 72 | if num_heads >= 530 or num_heads <= 470: # and count how often 73 | extreme_value_count += 1 # the # is 'extreme' 74 | 75 | return extreme_value_count / 100000 76 | 77 | upper_p_value = normal_probability_above 78 | lower_p_value = normal_probability_below 79 | 80 | ## 81 | # 82 | # P-hacking 83 | # 84 | ## 85 | 86 | def run_experiment(): 87 | """flip a fair coin 1000 times, True = heads, False = tails""" 88 | return [random.random() < 0.5 for _ in range(1000)] 89 | 90 | def reject_fairness(experiment): 91 | """using the 5% significance levels""" 92 | num_heads = len([flip for flip in experiment if flip]) 93 | return num_heads < 469 or num_heads > 531 94 | 95 | 96 | ## 97 | # 98 | # running an A/B test 99 | # 100 | ## 101 | 102 | def estimated_parameters(N, n): 103 | p = n / N 104 | sigma = math.sqrt(p * (1 - p) / N) 105 | return p, sigma 106 | 107 | def a_b_test_statistic(N_A, n_A, N_B, n_B): 108 | p_A, sigma_A = estimated_parameters(N_A, n_A) 109 | p_B, sigma_B = estimated_parameters(N_B, n_B) 110 | return (p_B - p_A) / math.sqrt(sigma_A ** 2 + sigma_B ** 2) 111 | 112 | ## 113 | # 114 | # Bayesian Inference 115 | # 116 | ## 117 | 118 | def B(alpha, beta): 119 | """a normalizing constant so that the total probability is 1""" 120 | return math.gamma(alpha) * math.gamma(beta) / math.gamma(alpha + beta) 121 | 122 | def beta_pdf(x, alpha, beta): 123 | if x < 0 or x > 1: # no weight outside of [0, 1] 124 | return 0 125 | return x ** (alpha - 1) * (1 - x) ** (beta - 1) / B(alpha, beta) 126 | 127 | 128 | if __name__ == "__main__": 129 | 130 | mu_0, sigma_0 = normal_approximation_to_binomial(1000, 0.5) 131 | print("mu_0", mu_0) 132 | print("sigma_0", sigma_0) 133 | print("normal_two_sided_bounds(0.95, mu_0, sigma_0)", normal_two_sided_bounds(0.95, mu_0, sigma_0)) 134 | print() 135 | print("power of a test") 136 | 137 | print("95% bounds based on assumption p is 0.5") 138 | 139 | lo, hi = normal_two_sided_bounds(0.95, mu_0, sigma_0) 140 | print("lo", lo) 141 | print("hi", hi) 142 | 143 | print("actual mu and sigma based on p = 0.55") 144 | mu_1, sigma_1 = normal_approximation_to_binomial(1000, 0.55) 145 | print("mu_1", mu_1) 146 | print("sigma_1", sigma_1) 147 | 148 | # a type 2 error means we fail to reject the null hypothesis 149 | # which will happen when X is still in our original interval 150 | type_2_probability = normal_probability_between(lo, hi, mu_1, sigma_1) 151 | power = 1 - type_2_probability # 0.887 152 | 153 | print("type 2 probability", type_2_probability) 154 | print("power", power) 155 | print 156 | 157 | print("one-sided test") 158 | hi = normal_upper_bound(0.95, mu_0, sigma_0) 159 | print("hi", hi) # is 526 (< 531, since we need more probability in the upper tail) 160 | type_2_probability = normal_probability_below(hi, mu_1, sigma_1) 161 | power = 1 - type_2_probability # = 0.936 162 | print("type 2 probability", type_2_probability) 
163 | print("power", power) 164 | print() 165 | 166 | print("two_sided_p_value(529.5, mu_0, sigma_0)", two_sided_p_value(529.5, mu_0, sigma_0)) 167 | 168 | print("two_sided_p_value(531.5, mu_0, sigma_0)", two_sided_p_value(531.5, mu_0, sigma_0)) 169 | 170 | print("upper_p_value(525, mu_0, sigma_0)", upper_p_value(525, mu_0, sigma_0)) 171 | print("upper_p_value(527, mu_0, sigma_0)", upper_p_value(527, mu_0, sigma_0)) 172 | print() 173 | 174 | print("P-hacking") 175 | 176 | random.seed(0) 177 | experiments = [run_experiment() for _ in range(1000)] 178 | num_rejections = len([experiment 179 | for experiment in experiments 180 | if reject_fairness(experiment)]) 181 | 182 | print(num_rejections, "rejections out of 1000") 183 | print() 184 | 185 | print("A/B testing") 186 | z = a_b_test_statistic(1000, 200, 1000, 180) 187 | print("a_b_test_statistic(1000, 200, 1000, 180)", z) 188 | print("p-value", two_sided_p_value(z)) 189 | z = a_b_test_statistic(1000, 200, 1000, 150) 190 | print("a_b_test_statistic(1000, 200, 1000, 150)", z) 191 | print("p-value", two_sided_p_value(z)) 192 | -------------------------------------------------------------------------------- /first-edition/code-python3/recommender_systems.py: -------------------------------------------------------------------------------- 1 | import math, random 2 | from collections import defaultdict, Counter 3 | from linear_algebra import dot 4 | 5 | users_interests = [ 6 | ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"], 7 | ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"], 8 | ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"], 9 | ["R", "Python", "statistics", "regression", "probability"], 10 | ["machine learning", "regression", "decision trees", "libsvm"], 11 | ["Python", "R", "Java", "C++", "Haskell", "programming languages"], 12 | ["statistics", "probability", "mathematics", "theory"], 13 | ["machine learning", "scikit-learn", "Mahout", "neural networks"], 14 | ["neural networks", "deep learning", "Big Data", "artificial intelligence"], 15 | ["Hadoop", "Java", "MapReduce", "Big Data"], 16 | ["statistics", "R", "statsmodels"], 17 | ["C++", "deep learning", "artificial intelligence", "probability"], 18 | ["pandas", "R", "Python"], 19 | ["databases", "HBase", "Postgres", "MySQL", "MongoDB"], 20 | ["libsvm", "regression", "support vector machines"] 21 | ] 22 | 23 | popular_interests = Counter(interest 24 | for user_interests in users_interests 25 | for interest in user_interests).most_common() 26 | 27 | def most_popular_new_interests(user_interests, max_results=5): 28 | suggestions = [(interest, frequency) 29 | for interest, frequency in popular_interests 30 | if interest not in user_interests] 31 | return suggestions[:max_results] 32 | 33 | # 34 | # user-based filtering 35 | # 36 | 37 | def cosine_similarity(v, w): 38 | return dot(v, w) / math.sqrt(dot(v, v) * dot(w, w)) 39 | 40 | unique_interests = sorted(list({ interest 41 | for user_interests in users_interests 42 | for interest in user_interests })) 43 | 44 | def make_user_interest_vector(user_interests): 45 | """given a list of interests, produce a vector whose i-th element is 1 46 | if unique_interests[i] is in the list, 0 otherwise""" 47 | return [1 if interest in user_interests else 0 48 | for interest in unique_interests] 49 | 50 | user_interest_matrix = list(map(make_user_interest_vector, users_interests)) 51 | 52 | user_similarities = [[cosine_similarity(interest_vector_i, interest_vector_j) 53 | for interest_vector_j in 
user_interest_matrix] 54 | for interest_vector_i in user_interest_matrix] 55 | 56 | def most_similar_users_to(user_id): 57 | pairs = [(other_user_id, similarity) # find other 58 | for other_user_id, similarity in # users with 59 | enumerate(user_similarities[user_id]) # nonzero 60 | if user_id != other_user_id and similarity > 0] # similarity 61 | 62 | return sorted(pairs, # sort them 63 | key=lambda pair: pair[1], # most similar 64 | reverse=True) # first 65 | 66 | 67 | def user_based_suggestions(user_id, include_current_interests=False): 68 | # sum up the similarities 69 | suggestions = defaultdict(float) 70 | for other_user_id, similarity in most_similar_users_to(user_id): 71 | for interest in users_interests[other_user_id]: 72 | suggestions[interest] += similarity 73 | 74 | # convert them to a sorted list 75 | suggestions = sorted(suggestions.items(), 76 | key=lambda pair: pair[1], 77 | reverse=True) 78 | 79 | # and (maybe) exclude already-interests 80 | if include_current_interests: 81 | return suggestions 82 | else: 83 | return [(suggestion, weight) 84 | for suggestion, weight in suggestions 85 | if suggestion not in users_interests[user_id]] 86 | 87 | # 88 | # Item-Based Collaborative Filtering 89 | # 90 | 91 | interest_user_matrix = [[user_interest_vector[j] 92 | for user_interest_vector in user_interest_matrix] 93 | for j, _ in enumerate(unique_interests)] 94 | 95 | interest_similarities = [[cosine_similarity(user_vector_i, user_vector_j) 96 | for user_vector_j in interest_user_matrix] 97 | for user_vector_i in interest_user_matrix] 98 | 99 | def most_similar_interests_to(interest_id): 100 | similarities = interest_similarities[interest_id] 101 | pairs = [(unique_interests[other_interest_id], similarity) 102 | for other_interest_id, similarity in enumerate(similarities) 103 | if interest_id != other_interest_id and similarity > 0] 104 | return sorted(pairs, 105 | key=lambda pair: pair[1], 106 | reverse=True) 107 | 108 | def item_based_suggestions(user_id, include_current_interests=False): 109 | suggestions = defaultdict(float) 110 | user_interest_vector = user_interest_matrix[user_id] 111 | for interest_id, is_interested in enumerate(user_interest_vector): 112 | if is_interested == 1: 113 | similar_interests = most_similar_interests_to(interest_id) 114 | for interest, similarity in similar_interests: 115 | suggestions[interest] += similarity 116 | 117 | suggestions = sorted(suggestions.items(), 118 | key=lambda pair: pair[1], 119 | reverse=True) 120 | 121 | if include_current_interests: 122 | return suggestions 123 | else: 124 | return [(suggestion, weight) 125 | for suggestion, weight in suggestions 126 | if suggestion not in users_interests[user_id]] 127 | 128 | 129 | if __name__ == "__main__": 130 | 131 | print("Popular Interests") 132 | print(popular_interests) 133 | print() 134 | 135 | print("Most Popular New Interests") 136 | print("already like:", ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"]) 137 | print(most_popular_new_interests(["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"])) 138 | print() 139 | print("already like:", ["R", "Python", "statistics", "regression", "probability"]) 140 | print(most_popular_new_interests(["R", "Python", "statistics", "regression", "probability"])) 141 | print() 142 | 143 | print("User based similarity") 144 | print("most similar to 0") 145 | print(most_similar_users_to(0)) 146 | 147 | print("Suggestions for 0") 148 | print(user_based_suggestions(0)) 149 | print() 150 | 151 | print("Item based similarity") 152 | print("most 
similar to 'Big Data'") 153 | print(most_similar_interests_to(0)) 154 | print() 155 | 156 | print("suggestions for user 0") 157 | print(item_based_suggestions(0)) 158 | -------------------------------------------------------------------------------- /first-edition/code/hypothesis_and_inference.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from probability import normal_cdf, inverse_normal_cdf 3 | import math, random 4 | 5 | def normal_approximation_to_binomial(n, p): 6 | """finds mu and sigma corresponding to a Binomial(n, p)""" 7 | mu = p * n 8 | sigma = math.sqrt(p * (1 - p) * n) 9 | return mu, sigma 10 | 11 | ##### 12 | # 13 | # probabilities a normal lies in an interval 14 | # 15 | ###### 16 | 17 | # the normal cdf _is_ the probability the variable is below a threshold 18 | normal_probability_below = normal_cdf 19 | 20 | # it's above the threshold if it's not below the threshold 21 | def normal_probability_above(lo, mu=0, sigma=1): 22 | return 1 - normal_cdf(lo, mu, sigma) 23 | 24 | # it's between if it's less than hi, but not less than lo 25 | def normal_probability_between(lo, hi, mu=0, sigma=1): 26 | return normal_cdf(hi, mu, sigma) - normal_cdf(lo, mu, sigma) 27 | 28 | # it's outside if it's not between 29 | def normal_probability_outside(lo, hi, mu=0, sigma=1): 30 | return 1 - normal_probability_between(lo, hi, mu, sigma) 31 | 32 | ###### 33 | # 34 | # normal bounds 35 | # 36 | ###### 37 | 38 | 39 | def normal_upper_bound(probability, mu=0, sigma=1): 40 | """returns the z for which P(Z <= z) = probability""" 41 | return inverse_normal_cdf(probability, mu, sigma) 42 | 43 | def normal_lower_bound(probability, mu=0, sigma=1): 44 | """returns the z for which P(Z >= z) = probability""" 45 | return inverse_normal_cdf(1 - probability, mu, sigma) 46 | 47 | def normal_two_sided_bounds(probability, mu=0, sigma=1): 48 | """returns the symmetric (about the mean) bounds 49 | that contain the specified probability""" 50 | tail_probability = (1 - probability) / 2 51 | 52 | # upper bound should have tail_probability above it 53 | upper_bound = normal_lower_bound(tail_probability, mu, sigma) 54 | 55 | # lower bound should have tail_probability below it 56 | lower_bound = normal_upper_bound(tail_probability, mu, sigma) 57 | 58 | return lower_bound, upper_bound 59 | 60 | def two_sided_p_value(x, mu=0, sigma=1): 61 | if x >= mu: 62 | # if x is greater than the mean, the tail is above x 63 | return 2 * normal_probability_above(x, mu, sigma) 64 | else: 65 | # if x is less than the mean, the tail is below x 66 | return 2 * normal_probability_below(x, mu, sigma) 67 | 68 | def count_extreme_values(): 69 | extreme_value_count = 0 70 | for _ in range(100000): 71 | num_heads = sum(1 if random.random() < 0.5 else 0 # count # of heads 72 | for _ in range(1000)) # in 1000 flips 73 | if num_heads >= 530 or num_heads <= 470: # and count how often 74 | extreme_value_count += 1 # the # is 'extreme' 75 | 76 | return extreme_value_count / 100000 77 | 78 | upper_p_value = normal_probability_above 79 | lower_p_value = normal_probability_below 80 | 81 | ## 82 | # 83 | # P-hacking 84 | # 85 | ## 86 | 87 | def run_experiment(): 88 | """flip a fair coin 1000 times, True = heads, False = tails""" 89 | return [random.random() < 0.5 for _ in range(1000)] 90 | 91 | def reject_fairness(experiment): 92 | """using the 5% significance levels""" 93 | num_heads = len([flip for flip in experiment if flip]) 94 | return num_heads < 469 or num_heads > 531 95 
| 96 | 97 | ## 98 | # 99 | # running an A/B test 100 | # 101 | ## 102 | 103 | def estimated_parameters(N, n): 104 | p = n / N 105 | sigma = math.sqrt(p * (1 - p) / N) 106 | return p, sigma 107 | 108 | def a_b_test_statistic(N_A, n_A, N_B, n_B): 109 | p_A, sigma_A = estimated_parameters(N_A, n_A) 110 | p_B, sigma_B = estimated_parameters(N_B, n_B) 111 | return (p_B - p_A) / math.sqrt(sigma_A ** 2 + sigma_B ** 2) 112 | 113 | ## 114 | # 115 | # Bayesian Inference 116 | # 117 | ## 118 | 119 | def B(alpha, beta): 120 | """a normalizing constant so that the total probability is 1""" 121 | return math.gamma(alpha) * math.gamma(beta) / math.gamma(alpha + beta) 122 | 123 | def beta_pdf(x, alpha, beta): 124 | if x < 0 or x > 1: # no weight outside of [0, 1] 125 | return 0 126 | return x ** (alpha - 1) * (1 - x) ** (beta - 1) / B(alpha, beta) 127 | 128 | 129 | if __name__ == "__main__": 130 | 131 | mu_0, sigma_0 = normal_approximation_to_binomial(1000, 0.5) 132 | print "mu_0", mu_0 133 | print "sigma_0", sigma_0 134 | print "normal_two_sided_bounds(0.95, mu_0, sigma_0)", normal_two_sided_bounds(0.95, mu_0, sigma_0) 135 | print 136 | print "power of a test" 137 | 138 | print "95% bounds based on assumption p is 0.5" 139 | 140 | lo, hi = normal_two_sided_bounds(0.95, mu_0, sigma_0) 141 | print "lo", lo 142 | print "hi", hi 143 | 144 | print "actual mu and sigma based on p = 0.55" 145 | mu_1, sigma_1 = normal_approximation_to_binomial(1000, 0.55) 146 | print "mu_1", mu_1 147 | print "sigma_1", sigma_1 148 | 149 | # a type 2 error means we fail to reject the null hypothesis 150 | # which will happen when X is still in our original interval 151 | type_2_probability = normal_probability_between(lo, hi, mu_1, sigma_1) 152 | power = 1 - type_2_probability # 0.887 153 | 154 | print "type 2 probability", type_2_probability 155 | print "power", power 156 | print 157 | 158 | print "one-sided test" 159 | hi = normal_upper_bound(0.95, mu_0, sigma_0) 160 | print "hi", hi # is 526 (< 531, since we need more probability in the upper tail) 161 | type_2_probability = normal_probability_below(hi, mu_1, sigma_1) 162 | power = 1 - type_2_probability # = 0.936 163 | print "type 2 probability", type_2_probability 164 | print "power", power 165 | print 166 | 167 | print "two_sided_p_value(529.5, mu_0, sigma_0)", two_sided_p_value(529.5, mu_0, sigma_0) 168 | 169 | print "two_sided_p_value(531.5, mu_0, sigma_0)", two_sided_p_value(531.5, mu_0, sigma_0) 170 | 171 | print "upper_p_value(525, mu_0, sigma_0)", upper_p_value(525, mu_0, sigma_0) 172 | print "upper_p_value(527, mu_0, sigma_0)", upper_p_value(527, mu_0, sigma_0) 173 | print 174 | 175 | print "P-hacking" 176 | 177 | random.seed(0) 178 | experiments = [run_experiment() for _ in range(1000)] 179 | num_rejections = len([experiment 180 | for experiment in experiments 181 | if reject_fairness(experiment)]) 182 | 183 | print num_rejections, "rejections out of 1000" 184 | print 185 | 186 | print "A/B testing" 187 | z = a_b_test_statistic(1000, 200, 1000, 180) 188 | print "a_b_test_statistic(1000, 200, 1000, 180)", z 189 | print "p-value", two_sided_p_value(z) 190 | z = a_b_test_statistic(1000, 200, 1000, 150) 191 | print "a_b_test_statistic(1000, 200, 1000, 150)", z 192 | print "p-value", two_sided_p_value(z) 193 | -------------------------------------------------------------------------------- /first-edition/code/recommender_systems.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import math, random 
3 | from collections import defaultdict, Counter 4 | from linear_algebra import dot 5 | 6 | users_interests = [ 7 | ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"], 8 | ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"], 9 | ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"], 10 | ["R", "Python", "statistics", "regression", "probability"], 11 | ["machine learning", "regression", "decision trees", "libsvm"], 12 | ["Python", "R", "Java", "C++", "Haskell", "programming languages"], 13 | ["statistics", "probability", "mathematics", "theory"], 14 | ["machine learning", "scikit-learn", "Mahout", "neural networks"], 15 | ["neural networks", "deep learning", "Big Data", "artificial intelligence"], 16 | ["Hadoop", "Java", "MapReduce", "Big Data"], 17 | ["statistics", "R", "statsmodels"], 18 | ["C++", "deep learning", "artificial intelligence", "probability"], 19 | ["pandas", "R", "Python"], 20 | ["databases", "HBase", "Postgres", "MySQL", "MongoDB"], 21 | ["libsvm", "regression", "support vector machines"] 22 | ] 23 | 24 | popular_interests = Counter(interest 25 | for user_interests in users_interests 26 | for interest in user_interests).most_common() 27 | 28 | def most_popular_new_interests(user_interests, max_results=5): 29 | suggestions = [(interest, frequency) 30 | for interest, frequency in popular_interests 31 | if interest not in user_interests] 32 | return suggestions[:max_results] 33 | 34 | # 35 | # user-based filtering 36 | # 37 | 38 | def cosine_similarity(v, w): 39 | return dot(v, w) / math.sqrt(dot(v, v) * dot(w, w)) 40 | 41 | unique_interests = sorted(list({ interest 42 | for user_interests in users_interests 43 | for interest in user_interests })) 44 | 45 | def make_user_interest_vector(user_interests): 46 | """given a list of interests, produce a vector whose i-th element is 1 47 | if unique_interests[i] is in the list, 0 otherwise""" 48 | return [1 if interest in user_interests else 0 49 | for interest in unique_interests] 50 | 51 | user_interest_matrix = map(make_user_interest_vector, users_interests) 52 | 53 | user_similarities = [[cosine_similarity(interest_vector_i, interest_vector_j) 54 | for interest_vector_j in user_interest_matrix] 55 | for interest_vector_i in user_interest_matrix] 56 | 57 | def most_similar_users_to(user_id): 58 | pairs = [(other_user_id, similarity) # find other 59 | for other_user_id, similarity in # users with 60 | enumerate(user_similarities[user_id]) # nonzero 61 | if user_id != other_user_id and similarity > 0] # similarity 62 | 63 | return sorted(pairs, # sort them 64 | key=lambda (_, similarity): similarity, # most similar 65 | reverse=True) # first 66 | 67 | 68 | def user_based_suggestions(user_id, include_current_interests=False): 69 | # sum up the similarities 70 | suggestions = defaultdict(float) 71 | for other_user_id, similarity in most_similar_users_to(user_id): 72 | for interest in users_interests[other_user_id]: 73 | suggestions[interest] += similarity 74 | 75 | # convert them to a sorted list 76 | suggestions = sorted(suggestions.items(), 77 | key=lambda (_, weight): weight, 78 | reverse=True) 79 | 80 | # and (maybe) exclude already-interests 81 | if include_current_interests: 82 | return suggestions 83 | else: 84 | return [(suggestion, weight) 85 | for suggestion, weight in suggestions 86 | if suggestion not in users_interests[user_id]] 87 | 88 | # 89 | # Item-Based Collaborative Filtering 90 | # 91 | 92 | interest_user_matrix = [[user_interest_vector[j] 93 | for user_interest_vector in 
user_interest_matrix] 94 | for j, _ in enumerate(unique_interests)] 95 | 96 | interest_similarities = [[cosine_similarity(user_vector_i, user_vector_j) 97 | for user_vector_j in interest_user_matrix] 98 | for user_vector_i in interest_user_matrix] 99 | 100 | def most_similar_interests_to(interest_id): 101 | similarities = interest_similarities[interest_id] 102 | pairs = [(unique_interests[other_interest_id], similarity) 103 | for other_interest_id, similarity in enumerate(similarities) 104 | if interest_id != other_interest_id and similarity > 0] 105 | return sorted(pairs, 106 | key=lambda (_, similarity): similarity, 107 | reverse=True) 108 | 109 | def item_based_suggestions(user_id, include_current_interests=False): 110 | suggestions = defaultdict(float) 111 | user_interest_vector = user_interest_matrix[user_id] 112 | for interest_id, is_interested in enumerate(user_interest_vector): 113 | if is_interested == 1: 114 | similar_interests = most_similar_interests_to(interest_id) 115 | for interest, similarity in similar_interests: 116 | suggestions[interest] += similarity 117 | 118 | suggestions = sorted(suggestions.items(), 119 | key=lambda (_, similarity): similarity, 120 | reverse=True) 121 | 122 | if include_current_interests: 123 | return suggestions 124 | else: 125 | return [(suggestion, weight) 126 | for suggestion, weight in suggestions 127 | if suggestion not in users_interests[user_id]] 128 | 129 | 130 | if __name__ == "__main__": 131 | 132 | print "Popular Interests" 133 | print popular_interests 134 | print 135 | 136 | print "Most Popular New Interests" 137 | print "already like:", ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"] 138 | print most_popular_new_interests(["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"]) 139 | print 140 | print "already like:", ["R", "Python", "statistics", "regression", "probability"] 141 | print most_popular_new_interests(["R", "Python", "statistics", "regression", "probability"]) 142 | print 143 | 144 | print "User based similarity" 145 | print "most similar to 0" 146 | print most_similar_users_to(0) 147 | 148 | print "Suggestions for 0" 149 | print user_based_suggestions(0) 150 | print 151 | 152 | print "Item based similarity" 153 | print "most similar to 'Big Data'" 154 | print most_similar_interests_to(0) 155 | print 156 | 157 | print "suggestions for user 0" 158 | print item_based_suggestions(0) 159 | 160 | -------------------------------------------------------------------------------- /first-edition/code-python3/clustering.py: -------------------------------------------------------------------------------- 1 | from linear_algebra import squared_distance, vector_mean, distance 2 | import math, random 3 | import matplotlib.image as mpimg 4 | import matplotlib.pyplot as plt 5 | 6 | class KMeans: 7 | """performs k-means clustering""" 8 | 9 | def __init__(self, k): 10 | self.k = k # number of clusters 11 | self.means = None # means of clusters 12 | 13 | def classify(self, input): 14 | """return the index of the cluster closest to the input""" 15 | return min(range(self.k), 16 | key=lambda i: squared_distance(input, self.means[i])) 17 | 18 | def train(self, inputs): 19 | 20 | self.means = random.sample(inputs, self.k) 21 | assignments = None 22 | 23 | while True: 24 | # Find new assignments 25 | new_assignments = list(map(self.classify, inputs)) 26 | 27 | # If no assignments have changed, we're done. 
28 |             if assignments == new_assignments:
29 |                 return
30 |
31 |             # Otherwise keep the new assignments,
32 |             assignments = new_assignments
33 |
34 |             for i in range(self.k):
35 |                 i_points = [p for p, a in zip(inputs, assignments) if a == i]
36 |                 # avoid divide-by-zero if i_points is empty
37 |                 if i_points:
38 |                     self.means[i] = vector_mean(i_points)
39 |
40 | def squared_clustering_errors(inputs, k):
41 |     """finds the total squared error from k-means clustering the inputs"""
42 |     clusterer = KMeans(k)
43 |     clusterer.train(inputs)
44 |     means = clusterer.means
45 |     assignments = list(map(clusterer.classify, inputs))
46 |
47 |     return sum(squared_distance(input,means[cluster])
48 |                for input, cluster in zip(inputs, assignments))
49 |
50 | def plot_squared_clustering_errors():
51 |
52 |     ks = range(1, len(inputs) + 1)
53 |     errors = [squared_clustering_errors(inputs, k) for k in ks]
54 |
55 |     plt.plot(ks, errors)
56 |     plt.xticks(ks)
57 |     plt.xlabel("k")
58 |     plt.ylabel("total squared error")
59 |     plt.show()
60 |
61 | #
62 | # using clustering to recolor an image
63 | #
64 |
65 | def recolor_image(input_file, k=5):
66 |
67 |     img = mpimg.imread(input_file)
68 |     pixels = [pixel for row in img for pixel in row]
69 |     clusterer = KMeans(k)
70 |     clusterer.train(pixels) # this might take a while
71 |
72 |     def recolor(pixel):
73 |         cluster = clusterer.classify(pixel) # index of the closest cluster
74 |         return clusterer.means[cluster] # mean of the closest cluster
75 |
76 |     new_img = [[recolor(pixel) for pixel in row]
77 |                for row in img]
78 |
79 |     plt.imshow(new_img)
80 |     plt.axis('off')
81 |     plt.show()
82 |
83 | #
84 | # hierarchical clustering
85 | #
86 |
87 | def is_leaf(cluster):
88 |     """a cluster is a leaf if it has length 1"""
89 |     return len(cluster) == 1
90 |
91 | def get_children(cluster):
92 |     """returns the two children of this cluster if it's a merged cluster;
93 |     raises an exception if this is a leaf cluster"""
94 |     if is_leaf(cluster):
95 |         raise TypeError("a leaf cluster has no children")
96 |     else:
97 |         return cluster[1]
98 |
99 | def get_values(cluster):
100 |     """returns the value in this cluster (if it's a leaf cluster)
101 |     or all the values in the leaf clusters below it (if it's not)"""
102 |     if is_leaf(cluster):
103 |         return cluster # is already a 1-tuple containing value
104 |     else:
105 |         return [value
106 |                 for child in get_children(cluster)
107 |                 for value in get_values(child)]
108 |
109 | def cluster_distance(cluster1, cluster2, distance_agg=min):
110 |     """finds the aggregate distance between elements of cluster1
111 |     and elements of cluster2"""
112 |     return distance_agg([distance(input1, input2)
113 |                          for input1 in get_values(cluster1)
114 |                          for input2 in get_values(cluster2)])
115 |
116 | def get_merge_order(cluster):
117 |     if is_leaf(cluster):
118 |         return float('inf')
119 |     else:
120 |         return cluster[0] # merge_order is first element of 2-tuple
121 |
122 | def bottom_up_cluster(inputs, distance_agg=min):
123 |     # start with every input a leaf cluster / 1-tuple
124 |     clusters = [(input,) for input in inputs]
125 |
126 |     # as long as we have more than one cluster left...
127 | while len(clusters) > 1: 128 | # find the two closest clusters 129 | c1, c2 = min([(cluster1, cluster2) 130 | for i, cluster1 in enumerate(clusters) 131 | for cluster2 in clusters[:i]], 132 | key=lambda p: cluster_distance(p[0], p[1], distance_agg)) 133 | 134 | # remove them from the list of clusters 135 | clusters = [c for c in clusters if c != c1 and c != c2] 136 | 137 | # merge them, using merge_order = # of clusters left 138 | merged_cluster = (len(clusters), [c1, c2]) 139 | 140 | # and add their merge 141 | clusters.append(merged_cluster) 142 | 143 | # when there's only one cluster left, return it 144 | return clusters[0] 145 | 146 | def generate_clusters(base_cluster, num_clusters): 147 | # start with a list with just the base cluster 148 | clusters = [base_cluster] 149 | 150 | # as long as we don't have enough clusters yet... 151 | while len(clusters) < num_clusters: 152 | # choose the last-merged of our clusters 153 | next_cluster = min(clusters, key=get_merge_order) 154 | # remove it from the list 155 | clusters = [c for c in clusters if c != next_cluster] 156 | # and add its children to the list (i.e., unmerge it) 157 | clusters.extend(get_children(next_cluster)) 158 | 159 | # once we have enough clusters... 160 | return clusters 161 | 162 | if __name__ == "__main__": 163 | 164 | inputs = [[-14,-5],[13,13],[20,23],[-19,-11],[-9,-16],[21,27],[-49,15],[26,13],[-46,5],[-34,-1],[11,15],[-49,0],[-22,-16],[19,28],[-12,-8],[-13,-19],[-41,8],[-11,-6],[-25,-9],[-18,-3]] 165 | 166 | random.seed(0) # so you get the same results as me 167 | clusterer = KMeans(3) 168 | clusterer.train(inputs) 169 | print("3-means:") 170 | print(clusterer.means) 171 | print() 172 | 173 | random.seed(0) 174 | clusterer = KMeans(2) 175 | clusterer.train(inputs) 176 | print("2-means:") 177 | print(clusterer.means) 178 | print() 179 | 180 | print("errors as a function of k") 181 | 182 | for k in range(1, len(inputs) + 1): 183 | print(k, squared_clustering_errors(inputs, k)) 184 | print() 185 | 186 | 187 | print("bottom up hierarchical clustering") 188 | 189 | base_cluster = bottom_up_cluster(inputs) 190 | print(base_cluster) 191 | 192 | print() 193 | print("three clusters, min:") 194 | for cluster in generate_clusters(base_cluster, 3): 195 | print(get_values(cluster)) 196 | 197 | print() 198 | print("three clusters, max:") 199 | base_cluster = bottom_up_cluster(inputs, max) 200 | for cluster in generate_clusters(base_cluster, 3): 201 | print(get_values(cluster)) 202 | -------------------------------------------------------------------------------- /first-edition/code/clustering.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from linear_algebra import squared_distance, vector_mean, distance 3 | import math, random 4 | import matplotlib.image as mpimg 5 | import matplotlib.pyplot as plt 6 | 7 | class KMeans: 8 | """performs k-means clustering""" 9 | 10 | def __init__(self, k): 11 | self.k = k # number of clusters 12 | self.means = None # means of clusters 13 | 14 | def classify(self, input): 15 | """return the index of the cluster closest to the input""" 16 | return min(range(self.k), 17 | key=lambda i: squared_distance(input, self.means[i])) 18 | 19 | def train(self, inputs): 20 | 21 | self.means = random.sample(inputs, self.k) 22 | assignments = None 23 | 24 | while True: 25 | # Find new assignments 26 | new_assignments = map(self.classify, inputs) 27 | 28 | # If no assignments have changed, we're done. 
29 |             if assignments == new_assignments:
30 |                 return
31 |
32 |             # Otherwise keep the new assignments,
33 |             assignments = new_assignments
34 |
35 |             for i in range(self.k):
36 |                 i_points = [p for p, a in zip(inputs, assignments) if a == i]
37 |                 # avoid divide-by-zero if i_points is empty
38 |                 if i_points:
39 |                     self.means[i] = vector_mean(i_points)
40 |
41 | def squared_clustering_errors(inputs, k):
42 |     """finds the total squared error from k-means clustering the inputs"""
43 |     clusterer = KMeans(k)
44 |     clusterer.train(inputs)
45 |     means = clusterer.means
46 |     assignments = map(clusterer.classify, inputs)
47 |
48 |     return sum(squared_distance(input,means[cluster])
49 |                for input, cluster in zip(inputs, assignments))
50 |
51 | def plot_squared_clustering_errors(plt):
52 |
53 |     ks = range(1, len(inputs) + 1)
54 |     errors = [squared_clustering_errors(inputs, k) for k in ks]
55 |
56 |     plt.plot(ks, errors)
57 |     plt.xticks(ks)
58 |     plt.xlabel("k")
59 |     plt.ylabel("total squared error")
60 |     plt.show()
61 |
62 | #
63 | # using clustering to recolor an image
64 | #
65 |
66 | def recolor_image(input_file, k=5):
67 |
68 |     img = mpimg.imread(input_file)
69 |     pixels = [pixel for row in img for pixel in row]
70 |     clusterer = KMeans(k)
71 |     clusterer.train(pixels) # this might take a while
72 |
73 |     def recolor(pixel):
74 |         cluster = clusterer.classify(pixel) # index of the closest cluster
75 |         return clusterer.means[cluster] # mean of the closest cluster
76 |
77 |     new_img = [[recolor(pixel) for pixel in row]
78 |                for row in img]
79 |
80 |     plt.imshow(new_img)
81 |     plt.axis('off')
82 |     plt.show()
83 |
84 | #
85 | # hierarchical clustering
86 | #
87 |
88 | def is_leaf(cluster):
89 |     """a cluster is a leaf if it has length 1"""
90 |     return len(cluster) == 1
91 |
92 | def get_children(cluster):
93 |     """returns the two children of this cluster if it's a merged cluster;
94 |     raises an exception if this is a leaf cluster"""
95 |     if is_leaf(cluster):
96 |         raise TypeError("a leaf cluster has no children")
97 |     else:
98 |         return cluster[1]
99 |
100 | def get_values(cluster):
101 |     """returns the value in this cluster (if it's a leaf cluster)
102 |     or all the values in the leaf clusters below it (if it's not)"""
103 |     if is_leaf(cluster):
104 |         return cluster # is already a 1-tuple containing value
105 |     else:
106 |         return [value
107 |                 for child in get_children(cluster)
108 |                 for value in get_values(child)]
109 |
110 | def cluster_distance(cluster1, cluster2, distance_agg=min):
111 |     """finds the aggregate distance between elements of cluster1
112 |     and elements of cluster2"""
113 |     return distance_agg([distance(input1, input2)
114 |                          for input1 in get_values(cluster1)
115 |                          for input2 in get_values(cluster2)])
116 |
117 | def get_merge_order(cluster):
118 |     if is_leaf(cluster):
119 |         return float('inf')
120 |     else:
121 |         return cluster[0] # merge_order is first element of 2-tuple
122 |
123 | def bottom_up_cluster(inputs, distance_agg=min):
124 |     # start with every input a leaf cluster / 1-tuple
125 |     clusters = [(input,) for input in inputs]
126 |
127 |     # as long as we have more than one cluster left...
128 | while len(clusters) > 1: 129 | # find the two closest clusters 130 | c1, c2 = min([(cluster1, cluster2) 131 | for i, cluster1 in enumerate(clusters) 132 | for cluster2 in clusters[:i]], 133 | key=lambda (x, y): cluster_distance(x, y, distance_agg)) 134 | 135 | # remove them from the list of clusters 136 | clusters = [c for c in clusters if c != c1 and c != c2] 137 | 138 | # merge them, using merge_order = # of clusters left 139 | merged_cluster = (len(clusters), [c1, c2]) 140 | 141 | # and add their merge 142 | clusters.append(merged_cluster) 143 | 144 | # when there's only one cluster left, return it 145 | return clusters[0] 146 | 147 | def generate_clusters(base_cluster, num_clusters): 148 | # start with a list with just the base cluster 149 | clusters = [base_cluster] 150 | 151 | # as long as we don't have enough clusters yet... 152 | while len(clusters) < num_clusters: 153 | # choose the last-merged of our clusters 154 | next_cluster = min(clusters, key=get_merge_order) 155 | # remove it from the list 156 | clusters = [c for c in clusters if c != next_cluster] 157 | # and add its children to the list (i.e., unmerge it) 158 | clusters.extend(get_children(next_cluster)) 159 | 160 | # once we have enough clusters... 161 | return clusters 162 | 163 | if __name__ == "__main__": 164 | 165 | inputs = [[-14,-5],[13,13],[20,23],[-19,-11],[-9,-16],[21,27],[-49,15],[26,13],[-46,5],[-34,-1],[11,15],[-49,0],[-22,-16],[19,28],[-12,-8],[-13,-19],[-41,8],[-11,-6],[-25,-9],[-18,-3]] 166 | 167 | random.seed(0) # so you get the same results as me 168 | clusterer = KMeans(3) 169 | clusterer.train(inputs) 170 | print "3-means:" 171 | print clusterer.means 172 | print 173 | 174 | random.seed(0) 175 | clusterer = KMeans(2) 176 | clusterer.train(inputs) 177 | print "2-means:" 178 | print clusterer.means 179 | print 180 | 181 | print "errors as a function of k" 182 | 183 | for k in range(1, len(inputs) + 1): 184 | print k, squared_clustering_errors(inputs, k) 185 | print 186 | 187 | 188 | print "bottom up hierarchical clustering" 189 | 190 | base_cluster = bottom_up_cluster(inputs) 191 | print base_cluster 192 | 193 | print 194 | print "three clusters, min:" 195 | for cluster in generate_clusters(base_cluster, 3): 196 | print get_values(cluster) 197 | 198 | print 199 | print "three clusters, max:" 200 | base_cluster = bottom_up_cluster(inputs, max) 201 | for cluster in generate_clusters(base_cluster, 3): 202 | print get_values(cluster) 203 | --------------------------------------------------------------------------------
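Neither edition's hypothesis_and_inference.py exercises its Bayesian-inference helpers (B and beta_pdf) in the __main__ block. The sketch below is illustrative only and not a file from the repository: it shows one way beta_pdf might be used, updating a uniform Beta prior with hypothetical coin-flip counts and inspecting the posterior density. It assumes it is run from inside first-edition/code-python3 with the repository's requirements installed, so that hypothesis_and_inference and its probability dependency import cleanly; the flip counts and the grid of p values are made up for the example.

# illustrative usage sketch (not part of the repository)
from hypothesis_and_inference import beta_pdf

# start from a uniform prior, Beta(1, 1)
prior_alpha, prior_beta = 1, 1

# hypothetical data: 65 heads and 35 tails from a possibly biased coin
heads, tails = 65, 35

# the Beta prior is conjugate to the binomial likelihood,
# so the posterior is Beta(prior_alpha + heads, prior_beta + tails)
posterior_alpha = prior_alpha + heads
posterior_beta = prior_beta + tails

# evaluate the posterior density at a few candidate values of p
for p in [0.4, 0.5, 0.6, 0.65, 0.7, 0.8]:
    print(p, beta_pdf(p, posterior_alpha, posterior_beta))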