├── im └── README.md ├── scratch ├── __init__.py ├── nlp_advanced.py ├── machine_learning.py ├── simple_linear_regression.py ├── probability.py ├── visualization.py ├── k_nearest_neighbors.py ├── gradient_descent.py ├── linear_algebra.py └── naive_bayes.py ├── first-edition ├── code │ ├── charts.py │ ├── __init__.py │ ├── comma_delimited_stock_prices.txt │ ├── colon_delimited_stock_prices.txt │ ├── comma_delimited_stock_prices.csv │ ├── tab_delimited_stock_prices.txt │ ├── line_count.py │ ├── egrep.py │ ├── plot_state_borders.py │ ├── most_common_words.py │ ├── machine_learning.py │ ├── simple_linear_regression.py │ ├── linear_algebra.py │ ├── probability.py │ ├── naive_bayes.py │ ├── visualizing_data.py │ ├── statistics.py │ ├── logistic_regression.py │ ├── decision_trees.py │ ├── mapreduce.py │ ├── gradient_descent.py │ ├── hypothesis_and_inference.py │ ├── recommender_systems.py │ └── clustering.py ├── code-python3 │ ├── __init__.py │ ├── charts.py │ ├── comma_delimited_stock_prices.txt │ ├── colon_delimited_stock_prices.txt │ ├── comma_delimited_stock_prices.csv │ ├── tab_delimited_stock_prices.txt │ ├── line_count.py │ ├── egrep.py │ ├── plot_state_borders.py │ ├── most_common_words.py │ ├── machine_learning.py │ ├── simple_linear_regression.py │ ├── linear_algebra.py │ ├── probability.py │ ├── README.md │ ├── naive_bayes.py │ ├── visualizing_data.py │ ├── stats.py │ ├── decision_trees.py │ ├── logistic_regression.py │ ├── mapreduce.py │ ├── gradient_descent.py │ ├── hypothesis_and_inference.py │ ├── recommender_systems.py │ └── clustering.py └── README.md ├── .gitignore ├── comma_delimited_stock_prices.csv ├── INSTALL.md ├── requirements.txt ├── LICENSE └── README.md /im/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scratch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /first-edition/code/charts.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /first-edition/code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /first-edition/code-python3/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /first-edition/code-python3/charts.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.png 3 | 4 | -------------------------------------------------------------------------------- /first-edition/code/comma_delimited_stock_prices.txt: -------------------------------------------------------------------------------- 1 | AAPL,90.91 2 | FB,64.5 3 | MSFT,41.68 4 | -------------------------------------------------------------------------------- /first-edition/code-python3/comma_delimited_stock_prices.txt: -------------------------------------------------------------------------------- 1 | FB,64.5 2 | MSFT,41.68 3 | AAPL,90.91 4 | 
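These delimited price files are the sample inputs for the book's "Getting Data" examples. A minimal sketch (not from the book) of reading the comma-delimited file with the standard `csv` module; the path assumes you run it from the directory that contains the file:

```python
import csv

# Each row is symbol,closing_price with no header line.
with open("comma_delimited_stock_prices.txt") as f:
    for row in csv.reader(f):
        if not row:                            # skip any blank lines
            continue
        symbol, closing_price = row
        print(symbol, float(closing_price))    # e.g. AAPL 90.91
```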
-------------------------------------------------------------------------------- /first-edition/code/colon_delimited_stock_prices.txt: -------------------------------------------------------------------------------- 1 | date:symbol:closing_price 2 | 6/20/2014:AAPL:90.91 3 | 6/20/2014:MSFT:41.68 4 | 6/20/2014:FB:64.5 -------------------------------------------------------------------------------- /first-edition/code-python3/colon_delimited_stock_prices.txt: -------------------------------------------------------------------------------- 1 | date:symbol:closing_price 2 | 6/20/2014:AAPL:90.91 3 | 6/20/2014:MSFT:41.68 4 | 6/20/2014:FB:64.5 -------------------------------------------------------------------------------- /comma_delimited_stock_prices.csv: -------------------------------------------------------------------------------- 1 | AAPL,6/20/2014,90.91 2 | MSFT,6/20/2014,41.68 3 | FB,6/20/3014,64.5 4 | AAPL,6/19/2014,91.86 5 | MSFT,6/19/2014,n/a 6 | FB,6/19/2014,64.34 7 | -------------------------------------------------------------------------------- /first-edition/code/comma_delimited_stock_prices.csv: -------------------------------------------------------------------------------- 1 | 6/20/2014,AAPL,90.91 2 | 6/20/2014,MSFT,41.68 3 | 6/20/3014,FB,64.5 4 | 6/19/2014,AAPL,91.86 5 | 6/19/2014,MSFT,n/a 6 | 6/19/2014,FB,64.34 -------------------------------------------------------------------------------- /first-edition/code/tab_delimited_stock_prices.txt: -------------------------------------------------------------------------------- 1 | 6/20/2014 AAPL 90.91 2 | 6/20/2014 MSFT 41.68 3 | 6/20/2014 FB 64.5 4 | 6/19/2014 AAPL 91.86 5 | 6/19/2014 MSFT 41.51 6 | 6/19/2014 FB 64.34 -------------------------------------------------------------------------------- /first-edition/code-python3/comma_delimited_stock_prices.csv: -------------------------------------------------------------------------------- 1 | 6/20/2014,AAPL,90.91 2 | 6/20/2014,MSFT,41.68 3 | 6/20/3014,FB,64.5 4 | 6/19/2014,AAPL,91.86 5 | 6/19/2014,MSFT,n/a 6 | 6/19/2014,FB,64.34 -------------------------------------------------------------------------------- /first-edition/code-python3/tab_delimited_stock_prices.txt: -------------------------------------------------------------------------------- 1 | 6/20/2014 AAPL 90.91 2 | 6/20/2014 MSFT 41.68 3 | 6/20/2014 FB 64.5 4 | 6/19/2014 AAPL 91.86 5 | 6/19/2014 MSFT 41.51 6 | 6/19/2014 FB 64.34 -------------------------------------------------------------------------------- /first-edition/code/line_count.py: -------------------------------------------------------------------------------- 1 | # line_count.py 2 | import sys 3 | 4 | if __name__ == "__main__": 5 | 6 | count = 0 7 | for line in sys.stdin: 8 | count += 1 9 | 10 | # print goes to sys.stdout 11 | print count -------------------------------------------------------------------------------- /first-edition/code-python3/line_count.py: -------------------------------------------------------------------------------- 1 | # line_count.py 2 | import sys 3 | 4 | if __name__ == "__main__": 5 | 6 | count = 0 7 | for line in sys.stdin: 8 | count += 1 9 | 10 | # print goes to sys.stdout 11 | print(count) 12 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # How to Install Python 2 | 3 | If you don't already have Python, I strongly recommend you install the Anaconda version, 4 | which includes many of the libraries 
needed for data science. Get the Python 3 version, not the Python 2 version. 5 | 6 | https://www.anaconda.com/distribution/#download-section 7 | 8 | Follow the instructions indicated for your platform. 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # For a nicer terminal 2 | ipython 3 | 4 | # For plotting graphs 5 | matplotlib 6 | 7 | # For reading in images 8 | pillow 9 | 10 | # For making HTTP requests 11 | requests 12 | 13 | # For parsing HTML 14 | beautifulsoup4 15 | html5lib 16 | 17 | # For accessing Python 18 | twython 19 | 20 | # For generating progress bars 21 | tqdm 22 | 23 | # For downloading MNIST data 24 | mnist 25 | 26 | # For parsing dates 27 | python-dateutil 28 | -------------------------------------------------------------------------------- /first-edition/code/egrep.py: -------------------------------------------------------------------------------- 1 | # egrep.py 2 | import sys, re 3 | 4 | if __name__ == "__main__": 5 | 6 | # sys.argv is the list of command-line arguments 7 | # sys.argv[0] is the name of the program itself 8 | # sys.argv[1] will be the regex specfied at the command line 9 | regex = sys.argv[1] 10 | 11 | # for every line passed into the script 12 | for line in sys.stdin: 13 | # if it matches the regex, write it to stdout 14 | if re.search(regex, line): 15 | sys.stdout.write(line) -------------------------------------------------------------------------------- /first-edition/code-python3/egrep.py: -------------------------------------------------------------------------------- 1 | # egrep.py 2 | import sys, re 3 | 4 | if __name__ == "__main__": 5 | 6 | # sys.argv is the list of command-line arguments 7 | # sys.argv[0] is the name of the program itself 8 | # sys.argv[1] will be the regex specfied at the command line 9 | regex = sys.argv[1] 10 | 11 | # for every line passed into the script 12 | for line in sys.stdin: 13 | # if it matches the regex, write it to stdout 14 | if re.search(regex, line): 15 | sys.stdout.write(line) -------------------------------------------------------------------------------- /first-edition/code/plot_state_borders.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | segments = [] 4 | points = [] 5 | 6 | lat_long_regex = r""): 13 | for p1, p2 in zip(points, points[1:]): 14 | segments.append((p1, p2)) 15 | points = [] 16 | s = re.search(lat_long_regex, line) 17 | if s: 18 | lat, lon = s.groups() 19 | points.append((float(lon), float(lat))) 20 | 21 | def plot_state_borders(plt, color='0.8'): 22 | for (lon1, lat1), (lon2, lat2) in segments: 23 | plt.plot([lon1, lon2], [lat1, lat2], color=color) -------------------------------------------------------------------------------- /first-edition/code-python3/plot_state_borders.py: -------------------------------------------------------------------------------- 1 | import re 2 | import matplotlib.pyplot as plt 3 | 4 | segments = [] 5 | points = [] 6 | 7 | lat_long_regex = r""): 14 | for p1, p2 in zip(points, points[1:]): 15 | segments.append((p1, p2)) 16 | points = [] 17 | s = re.search(lat_long_regex, line) 18 | if s: 19 | lat, lon = s.groups() 20 | points.append((float(lon), float(lat))) 21 | 22 | def plot_state_borders(color='0.8'): 23 | for (lon1, lat1), (lon2, lat2) in segments: 24 | plt.plot([lon1, lon2], [lat1, lat2], color=color) 25 | 
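`egrep.py` and `line_count.py` above are meant to be chained at the shell, reading from stdin and writing to stdout, roughly `cat some_file.txt | python egrep.py "[0-9]" | python line_count.py`. Here is a minimal in-process sketch (toy data, not from the book) of the same filter-then-count pattern:

```python
import re

# Toy stand-in for the lines that would arrive on stdin.
lines = ["first line", "line 2", "another line", "line 44"]
regex = r"[0-9]"

# egrep.py step: keep only the lines matching the regex.
matching = [line for line in lines if re.search(regex, line)]

# line_count.py step: count whatever made it through.
count = sum(1 for _ in matching)

assert matching == ["line 2", "line 44"]
assert count == 2
```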
-------------------------------------------------------------------------------- /scratch/nlp_advanced.py: -------------------------------------------------------------------------------- 1 | from scratch.deep_learning import Optimizer, Layer 2 | 3 | class EmbeddingOptimizer(Optimizer): 4 | """ 5 | Optimized for the case where there are 6 | only embedding layers with single id updates. 7 | """ 8 | def __init__(self, learning_rate: float) -> None: 9 | self.lr = learning_rate 10 | 11 | def step(self, layer: Layer) -> None: 12 | for param, grad in zip(layer.params(), layer.grads()): 13 | # Find the first (only) row with nonzero values. 14 | for idx, row in enumerate(grad): 15 | if row[0] != 0: 16 | break 17 | 18 | # Then update just that row. 19 | for j in range(len(row)): 20 | param[idx][j] -= grad[idx][j] * self.lr 21 | -------------------------------------------------------------------------------- /first-edition/code-python3/most_common_words.py: -------------------------------------------------------------------------------- 1 | # most_common_words.py 2 | import sys 3 | from collections import Counter 4 | 5 | if __name__ == "__main__": 6 | 7 | # pass in number of words as first argument 8 | try: 9 | num_words = int(sys.argv[1]) 10 | except: 11 | print("usage: most_common_words.py num_words") 12 | sys.exit(1) # non-zero exit code indicates error 13 | 14 | counter = Counter(word.lower() 15 | for line in sys.stdin 16 | for word in line.strip().split() 17 | if word) 18 | 19 | for word, count in counter.most_common(num_words): 20 | sys.stdout.write(str(count)) 21 | sys.stdout.write("\t") 22 | sys.stdout.write(word) 23 | sys.stdout.write("\n") 24 | -------------------------------------------------------------------------------- /first-edition/code/most_common_words.py: -------------------------------------------------------------------------------- 1 | # most_common_words.py 2 | import sys 3 | from collections import Counter 4 | 5 | if __name__ == "__main__": 6 | 7 | # pass in number of words as first argument 8 | try: 9 | num_words = int(sys.argv[1]) 10 | except: 11 | print "usage: most_common_words.py num_words" 12 | sys.exit(1) # non-zero exit code indicates error 13 | 14 | counter = Counter(word.lower() 15 | for line in sys.stdin 16 | for word in line.strip().split() 17 | if word) 18 | 19 | for word, count in counter.most_common(num_words): 20 | sys.stdout.write(str(count)) 21 | sys.stdout.write("\t") 22 | sys.stdout.write(word) 23 | sys.stdout.write("\n") -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Joel Grus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
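`most_common_words.py` streams whitespace-separated words from stdin into a `Counter` and prints the top `num_words`. A minimal in-process sketch (made-up sample text) of the same counting step:

```python
from collections import Counter

text = "the quick brown fox jumps over the lazy dog and the cat"

counter = Counter(word.lower()
                  for word in text.strip().split()
                  if word)

assert counter.most_common(1) == [("the", 3)]
```

At the shell the script would sit at the end of a pipe, e.g. `cat some_file.txt | python most_common_words.py 10`.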
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /first-edition/code-python3/machine_learning.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import math, random 3 | 4 | # 5 | # data splitting 6 | # 7 | 8 | def split_data(data, prob): 9 | """split data into fractions [prob, 1 - prob]""" 10 | results = [], [] 11 | for row in data: 12 | results[0 if random.random() < prob else 1].append(row) 13 | return results 14 | 15 | def train_test_split(x, y, test_pct): 16 | data = list(zip(x, y)) # pair corresponding values 17 | train, test = split_data(data, 1 - test_pct) # split the dataset of pairs 18 | x_train, y_train = list(zip(*train)) # magical un-zip trick 19 | x_test, y_test = list(zip(*test)) 20 | return x_train, x_test, y_train, y_test 21 | 22 | # 23 | # correctness 24 | # 25 | 26 | def accuracy(tp, fp, fn, tn): 27 | correct = tp + tn 28 | total = tp + fp + fn + tn 29 | return correct / total 30 | 31 | def precision(tp, fp, fn, tn): 32 | return tp / (tp + fp) 33 | 34 | def recall(tp, fp, fn, tn): 35 | return tp / (tp + fn) 36 | 37 | def f1_score(tp, fp, fn, tn): 38 | p = precision(tp, fp, fn, tn) 39 | r = recall(tp, fp, fn, tn) 40 | 41 | return 2 * p * r / (p + r) 42 | 43 | if __name__ == "__main__": 44 | 45 | print("accuracy(70, 4930, 13930, 981070)", accuracy(70, 4930, 13930, 981070)) 46 | print("precision(70, 4930, 13930, 981070)", precision(70, 4930, 13930, 981070)) 47 | print("recall(70, 4930, 13930, 981070)", recall(70, 4930, 13930, 981070)) 48 | print("f1_score(70, 4930, 13930, 981070)", f1_score(70, 4930, 13930, 981070)) 49 | -------------------------------------------------------------------------------- /first-edition/code/machine_learning.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter 3 | import math, random 4 | 5 | # 6 | # data splitting 7 | # 8 | 9 | def split_data(data, prob): 10 | """split data into fractions [prob, 1 - prob]""" 11 | results = [], [] 12 | for row in data: 13 | results[0 if random.random() < prob else 1].append(row) 14 | return results 15 | 16 | def train_test_split(x, y, test_pct): 17 | data = zip(x, y) # pair corresponding values 18 | train, test = split_data(data, 1 - test_pct) # split the dataset of pairs 19 | x_train, y_train = zip(*train) # magical un-zip trick 20 | x_test, y_test = zip(*test) 21 | return x_train, x_test, y_train, y_test 22 | 23 | # 24 | # correctness 25 | # 26 | 27 | def accuracy(tp, fp, fn, tn): 28 | correct = tp + tn 29 | total = tp + fp + fn + tn 30 | return correct / total 31 | 32 | def precision(tp, fp, fn, tn): 33 | return tp / (tp + fp) 34 | 35 | def recall(tp, fp, fn, tn): 36 | return tp / (tp + fn) 37 | 38 | def f1_score(tp, fp, fn, tn): 39 | p = precision(tp, fp, fn, tn) 40 | r = recall(tp, fp, fn, tn) 41 | 42 | return 2 * p * r / (p + r) 43 | 44 | if __name__ == "__main__": 45 | 46 | print "accuracy(70, 4930, 13930, 981070)", accuracy(70, 4930, 13930, 981070) 47 | print "precision(70, 4930, 13930, 981070)", precision(70, 4930, 13930, 981070) 48 | print "recall(70, 4930, 13930, 981070)", recall(70, 4930, 13930, 981070) 49 | print 
"f1_score(70, 4930, 13930, 981070)", f1_score(70, 4930, 13930, 981070) 50 | 51 | -------------------------------------------------------------------------------- /scratch/machine_learning.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import TypeVar, List, Tuple 3 | X = TypeVar('X') # generic type to represent a data point 4 | 5 | def split_data(data: List[X], prob: float) -> Tuple[List[X], List[X]]: 6 | """Split data into fractions [prob, 1 - prob]""" 7 | data = data[:] # Make a shallow copy 8 | random.shuffle(data) # because shuffle modifies the list. 9 | cut = int(len(data) * prob) # Use prob to find a cutoff 10 | return data[:cut], data[cut:] # and split the shuffled list there. 11 | 12 | data = [n for n in range(1000)] 13 | train, test = split_data(data, 0.75) 14 | 15 | # The proportions should be correct 16 | assert len(train) == 750 17 | assert len(test) == 250 18 | 19 | # And the original data should be preserved (in some order) 20 | assert sorted(train + test) == data 21 | 22 | Y = TypeVar('Y') # generic type to represent output variables 23 | 24 | def train_test_split(xs: List[X], 25 | ys: List[Y], 26 | test_pct: float) -> Tuple[List[X], List[X], List[Y], List[Y]]: 27 | # Generate the indices and split them. 28 | idxs = [i for i in range(len(xs))] 29 | train_idxs, test_idxs = split_data(idxs, 1 - test_pct) 30 | 31 | return ([xs[i] for i in train_idxs], # x_train 32 | [xs[i] for i in test_idxs], # x_test 33 | [ys[i] for i in train_idxs], # y_train 34 | [ys[i] for i in test_idxs]) # y_test 35 | 36 | xs = [x for x in range(1000)] # xs are 1 ... 1000 37 | ys = [2 * x for x in xs] # each y_i is twice x_i 38 | x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.25) 39 | 40 | # Check that the proportions are correct 41 | assert len(x_train) == len(y_train) == 750 42 | assert len(x_test) == len(y_test) == 250 43 | 44 | # Check that the corresponding data points are paired correctly. 
45 | assert all(y == 2 * x for x, y in zip(x_train, y_train)) 46 | assert all(y == 2 * x for x, y in zip(x_test, y_test)) 47 | 48 | def accuracy(tp: int, fp: int, fn: int, tn: int) -> float: 49 | correct = tp + tn 50 | total = tp + fp + fn + tn 51 | return correct / total 52 | 53 | assert accuracy(70, 4930, 13930, 981070) == 0.98114 54 | 55 | def precision(tp: int, fp: int, fn: int, tn: int) -> float: 56 | return tp / (tp + fp) 57 | 58 | assert precision(70, 4930, 13930, 981070) == 0.014 59 | 60 | def recall(tp: int, fp: int, fn: int, tn: int) -> float: 61 | return tp / (tp + fn) 62 | 63 | assert recall(70, 4930, 13930, 981070) == 0.005 64 | 65 | def f1_score(tp: int, fp: int, fn: int, tn: int) -> float: 66 | p = precision(tp, fp, fn, tn) 67 | r = recall(tp, fp, fn, tn) 68 | 69 | return 2 * p * r / (p + r) 70 | 71 | -------------------------------------------------------------------------------- /scratch/simple_linear_regression.py: -------------------------------------------------------------------------------- 1 | def predict(alpha: float, beta: float, x_i: float) -> float: 2 | return beta * x_i + alpha 3 | 4 | def error(alpha: float, beta: float, x_i: float, y_i: float) -> float: 5 | """ 6 | The error from predicting beta * x_i + alpha 7 | when the actual value is y_i 8 | """ 9 | return predict(alpha, beta, x_i) - y_i 10 | 11 | from scratch.linear_algebra import Vector 12 | 13 | def sum_of_sqerrors(alpha: float, beta: float, x: Vector, y: Vector) -> float: 14 | return sum(error(alpha, beta, x_i, y_i) ** 2 15 | for x_i, y_i in zip(x, y)) 16 | 17 | from typing import Tuple 18 | from scratch.linear_algebra import Vector 19 | from scratch.statistics import correlation, standard_deviation, mean 20 | 21 | def least_squares_fit(x: Vector, y: Vector) -> Tuple[float, float]: 22 | """ 23 | Given two vectors x and y, 24 | find the least-squares values of alpha and beta 25 | """ 26 | beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x) 27 | alpha = mean(y) - beta * mean(x) 28 | return alpha, beta 29 | 30 | x = [i for i in range(-100, 110, 10)] 31 | y = [3 * i - 5 for i in x] 32 | 33 | # Should find that y = 3x - 5 34 | assert least_squares_fit(x, y) == (-5, 3) 35 | 36 | from scratch.statistics import num_friends_good, daily_minutes_good 37 | 38 | alpha, beta = least_squares_fit(num_friends_good, daily_minutes_good) 39 | assert 22.9 < alpha < 23.0 40 | assert 0.9 < beta < 0.905 41 | 42 | from scratch.statistics import de_mean 43 | 44 | def total_sum_of_squares(y: Vector) -> float: 45 | """the total squared variation of y_i's from their mean""" 46 | return sum(v ** 2 for v in de_mean(y)) 47 | 48 | def r_squared(alpha: float, beta: float, x: Vector, y: Vector) -> float: 49 | """ 50 | the fraction of variation in y captured by the model, which equals 51 | 1 - the fraction of variation in y not captured by the model 52 | """ 53 | return 1.0 - (sum_of_sqerrors(alpha, beta, x, y) / 54 | total_sum_of_squares(y)) 55 | 56 | rsq = r_squared(alpha, beta, num_friends_good, daily_minutes_good) 57 | assert 0.328 < rsq < 0.330 58 | 59 | def main(): 60 | import random 61 | import tqdm 62 | from scratch.gradient_descent import gradient_step 63 | 64 | num_epochs = 10000 65 | random.seed(0) 66 | 67 | guess = [random.random(), random.random()] # choose random value to start 68 | 69 | learning_rate = 0.00001 70 | 71 | with tqdm.trange(num_epochs) as t: 72 | for _ in t: 73 | alpha, beta = guess 74 | 75 | # Partial derivative of loss with respect to alpha 76 | grad_a = sum(2 * error(alpha, beta, x_i, 
y_i) 77 | for x_i, y_i in zip(num_friends_good, 78 | daily_minutes_good)) 79 | 80 | # Partial derivative of loss with respect to beta 81 | grad_b = sum(2 * error(alpha, beta, x_i, y_i) * x_i 82 | for x_i, y_i in zip(num_friends_good, 83 | daily_minutes_good)) 84 | 85 | # Compute loss to stick in the tqdm description 86 | loss = sum_of_sqerrors(alpha, beta, 87 | num_friends_good, daily_minutes_good) 88 | t.set_description(f"loss: {loss:.3f}") 89 | 90 | # Finally, update the guess 91 | guess = gradient_step(guess, [grad_a, grad_b], -learning_rate) 92 | 93 | # We should get pretty much the same results: 94 | alpha, beta = guess 95 | assert 22.9 < alpha < 23.0 96 | assert 0.9 < beta < 0.905 97 | 98 | if __name__ == "__main__": main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Data Science from Scratch 2 | ========================= 3 | 4 | Here's all the code and examples from the second edition of my book _Data Science from Scratch_. They require at least Python 3.6. 5 | 6 | (If you're looking for the code and examples from the first edition, that's in the `first-edition` folder.) 7 | 8 | If you want to use the code, you should be able to clone the repo and just do things like 9 | 10 | ``` 11 | In [1]: from scratch.linear_algebra import dot 12 | 13 | In [2]: dot([1, 2, 3], [4, 5, 6]) 14 | Out[2]: 32 15 | ``` 16 | 17 | and so on and so forth. 18 | 19 | Two notes: 20 | 21 | 1. In order to use the library like this, you need to be in the root directory (that is, the directory that contains the `scratch` folder). If you are in the `scratch` directory itself, the imports won't work. 22 | 23 | 2. It's possible that it will just work. It's also possible that you may need to add the root directory to your `PYTHONPATH`, if you are on Linux or OSX this is as simple as 24 | 25 | ``` 26 | export PYTHONPATH=/path/to/where/you/cloned/this/repo 27 | ``` 28 | 29 | (substituting in the real path, of course). 30 | 31 | If you are on Windows, it's [potentially more complicated](https://stackoverflow.com/questions/3701646/how-to-add-to-the-pythonpath-in-windows-so-it-finds-my-modules-packages). 32 | 33 | ## Table of Contents 34 | 35 | 1. Introduction 36 | 2. A Crash Course in Python 37 | 3. [Visualizing Data](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/visualization.py) 38 | 4. [Linear Algebra](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/linear_algebra.py) 39 | 5. [Statistics](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/statistics.py) 40 | 6. [Probability](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/probability.py) 41 | 7. [Hypothesis and Inference](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/inference.py) 42 | 8. [Gradient Descent](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/gradient_descent.py) 43 | 9. [Getting Data](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/getting_data.py) 44 | 10. [Working With Data](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/working_with_data.py) 45 | 11. [Machine Learning](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/machine_learning.py) 46 | 12. [k-Nearest Neighbors](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/nearest_neighbors.py) 47 | 13. 
[Naive Bayes](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/naive_bayes.py) 48 | 14. [Simple Linear Regression](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/simple_linear_regression.py) 49 | 15. [Multiple Regression](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/multiple_regression.py) 50 | 16. [Logistic Regression](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/logistic_regression.py) 51 | 17. [Decision Trees](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/decision_trees.py) 52 | 18. [Neural Networks](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/neural_networks.py) 53 | 19. [Deep Learning] 54 | 20. [Clustering](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/clustering.py) 55 | 21. [Natural Language Processing](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/nlp.py) 56 | 22. [Network Analysis](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/network_analysis.py) 57 | 23. [Recommender Systems](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/recommender_systems.py) 58 | 24. [Databases and SQL](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/databases.py) 59 | 25. [MapReduce](https://github.com/joelgrus/data-science-from-scratch/blob/master/scratch/mapreduce.py) 60 | 26. Data Ethics 61 | 27. Go Forth And Do Data Science 62 | -------------------------------------------------------------------------------- /first-edition/README.md: -------------------------------------------------------------------------------- 1 | Data Science from Scratch 2 | ========================= 3 | 4 | Here's all the code and examples from the first edition of my book __[Data Science from Scratch](http://joelgrus.com/2015/04/26/data-science-from-scratch-first-principles-with-python/)__. The `code` directory contains Python 2.7 versions, and the `code-python3` direction contains the Python 3 equivalents. (I tested them in 3.5, but they should work in any 3.x.) 5 | 6 | 7 | Each can be imported as a module, for example (after you cd into the /code directory): 8 | 9 | ```python 10 | from linear_algebra import distance, vector_mean 11 | v = [1, 2, 3] 12 | w = [4, 5, 6] 13 | print distance(v, w) 14 | print vector_mean([v, w]) 15 | ``` 16 | 17 | Or can be run from the command line to get a demo of what it does (and to execute the examples from the book): 18 | 19 | ```bat 20 | python recommender_systems.py 21 | ``` 22 | 23 | Additionally, I've collected all the [links](https://github.com/joelgrus/data-science-from-scratch/blob/master/links.md) from the book. 24 | 25 | And, by popular demand, I made an index of functions defined in the book, by chapter and page number. 26 | The data is in a [spreadsheet](https://docs.google.com/spreadsheets/d/1mjGp94ehfxWOEaAFJsPiHqIeOioPH1vN1PdOE6v1az8/edit?usp=sharing), or I also made a toy (experimental) [searchable webapp](http://joelgrus.com/experiments/function-index/). 27 | 28 | ## Table of Contents 29 | 30 | 1. Introduction 31 | 2. A Crash Course in Python 32 | 3. [Visualizing Data](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/visualizing_data.py) 33 | 4. [Linear Algebra](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/linear_algebra.py) 34 | 5. 
[Statistics](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/statistics.py) 35 | 6. [Probability](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/probability.py) 36 | 7. [Hypothesis and Inference](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/hypothesis_and_inference.py) 37 | 8. [Gradient Descent](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/gradient_descent.py) 38 | 9. [Getting Data](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/getting_data.py) 39 | 10. [Working With Data](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/working_with_data.py) 40 | 11. [Machine Learning](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/machine_learning.py) 41 | 12. [k-Nearest Neighbors](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/nearest_neighbors.py) 42 | 13. [Naive Bayes](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/naive_bayes.py) 43 | 14. [Simple Linear Regression](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/simple_linear_regression.py) 44 | 15. [Multiple Regression](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/multiple_regression.py) 45 | 16. [Logistic Regression](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/logistic_regression.py) 46 | 17. [Decision Trees](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/decision_trees.py) 47 | 18. [Neural Networks](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/neural_networks.py) 48 | 19. [Clustering](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/clustering.py) 49 | 20. [Natural Language Processing](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/natural_language_processing.py) 50 | 21. [Network Analysis](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/network_analysis.py) 51 | 22. [Recommender Systems](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/recommender_systems.py) 52 | 23. [Databases and SQL](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/databases.py) 53 | 24. [MapReduce](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/mapreduce.py) 54 | 25. 
Go Forth And Do Data Science 55 | -------------------------------------------------------------------------------- /first-edition/code-python3/simple_linear_regression.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | from linear_algebra import vector_subtract 3 | from stats import mean, correlation, standard_deviation, de_mean 4 | from gradient_descent import minimize_stochastic 5 | import math, random 6 | 7 | def predict(alpha, beta, x_i): 8 | return beta * x_i + alpha 9 | 10 | def error(alpha, beta, x_i, y_i): 11 | return y_i - predict(alpha, beta, x_i) 12 | 13 | def sum_of_squared_errors(alpha, beta, x, y): 14 | return sum(error(alpha, beta, x_i, y_i) ** 2 15 | for x_i, y_i in zip(x, y)) 16 | 17 | def least_squares_fit(x,y): 18 | """given training values for x and y, 19 | find the least-squares values of alpha and beta""" 20 | beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x) 21 | alpha = mean(y) - beta * mean(x) 22 | return alpha, beta 23 | 24 | def total_sum_of_squares(y): 25 | """the total squared variation of y_i's from their mean""" 26 | return sum(v ** 2 for v in de_mean(y)) 27 | 28 | def r_squared(alpha, beta, x, y): 29 | """the fraction of variation in y captured by the model, which equals 30 | 1 - the fraction of variation in y not captured by the model""" 31 | 32 | return 1.0 - (sum_of_squared_errors(alpha, beta, x, y) / 33 | total_sum_of_squares(y)) 34 | 35 | def squared_error(x_i, y_i, theta): 36 | alpha, beta = theta 37 | return error(alpha, beta, x_i, y_i) ** 2 38 | 39 | def squared_error_gradient(x_i, y_i, theta): 40 | alpha, beta = theta 41 | return [-2 * error(alpha, beta, x_i, y_i), # alpha partial derivative 42 | -2 * error(alpha, beta, x_i, y_i) * x_i] # beta partial derivative 43 | 44 | if __name__ == "__main__": 45 | 46 | num_friends_good = [49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 47 | daily_minutes_good = 
[68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84] 48 | 49 | alpha, beta = least_squares_fit(num_friends_good, daily_minutes_good) 50 | print("alpha", alpha) 51 | print("beta", beta) 52 | 53 | print("r-squared", r_squared(alpha, beta, num_friends_good, daily_minutes_good)) 54 | 55 | print() 56 | 57 | print("gradient descent:") 58 | # choose random value to start 59 | random.seed(0) 60 | theta = [random.random(), random.random()] 61 | alpha, beta = minimize_stochastic(squared_error, 62 | squared_error_gradient, 63 | num_friends_good, 64 | daily_minutes_good, 65 | theta, 66 | 0.0001) 67 | print("alpha", alpha) 68 | print("beta", beta) 69 | -------------------------------------------------------------------------------- /first-edition/code-python3/linear_algebra.py: -------------------------------------------------------------------------------- 1 | # -*- coding: iso-8859-15 -*- 2 | 3 | import re, math, random # regexes, math functions, random numbers 4 | import matplotlib.pyplot as plt # pyplot 5 | from collections import defaultdict, Counter 6 | from functools import partial, reduce 7 | 8 | # 9 | # functions for working with vectors 10 | # 11 | 12 | def vector_add(v, w): 13 | """adds two vectors componentwise""" 14 | return [v_i + w_i for v_i, w_i in zip(v,w)] 15 | 16 | def vector_subtract(v, w): 17 | """subtracts two vectors componentwise""" 18 | return [v_i - w_i for v_i, w_i in zip(v,w)] 19 | 20 | def vector_sum(vectors): 21 | return reduce(vector_add, vectors) 22 | 23 | def scalar_multiply(c, v): 24 | return [c * v_i for v_i in v] 25 | 26 | def vector_mean(vectors): 27 | """compute the vector whose i-th element is the mean of the 28 | i-th elements of the input vectors""" 29 | n = len(vectors) 30 | return scalar_multiply(1/n, vector_sum(vectors)) 31 | 32 | def dot(v, w): 33 | """v_1 * w_1 + ... + v_n * w_n""" 34 | return sum(v_i * w_i for v_i, w_i in zip(v, w)) 35 | 36 | def sum_of_squares(v): 37 | """v_1 * v_1 + ... 
+ v_n * v_n""" 38 | return dot(v, v) 39 | 40 | def magnitude(v): 41 | return math.sqrt(sum_of_squares(v)) 42 | 43 | def squared_distance(v, w): 44 | return sum_of_squares(vector_subtract(v, w)) 45 | 46 | def distance(v, w): 47 | return math.sqrt(squared_distance(v, w)) 48 | 49 | # 50 | # functions for working with matrices 51 | # 52 | 53 | def shape(A): 54 | num_rows = len(A) 55 | num_cols = len(A[0]) if A else 0 56 | return num_rows, num_cols 57 | 58 | def get_row(A, i): 59 | return A[i] 60 | 61 | def get_column(A, j): 62 | return [A_i[j] for A_i in A] 63 | 64 | def make_matrix(num_rows, num_cols, entry_fn): 65 | """returns a num_rows x num_cols matrix 66 | whose (i,j)-th entry is entry_fn(i, j)""" 67 | return [[entry_fn(i, j) for j in range(num_cols)] 68 | for i in range(num_rows)] 69 | 70 | def is_diagonal(i, j): 71 | """1's on the 'diagonal', 0's everywhere else""" 72 | return 1 if i == j else 0 73 | 74 | identity_matrix = make_matrix(5, 5, is_diagonal) 75 | 76 | # user 0 1 2 3 4 5 6 7 8 9 77 | # 78 | friendships = [[0, 1, 1, 0, 0, 0, 0, 0, 0, 0], # user 0 79 | [1, 0, 1, 1, 0, 0, 0, 0, 0, 0], # user 1 80 | [1, 1, 0, 1, 0, 0, 0, 0, 0, 0], # user 2 81 | [0, 1, 1, 0, 1, 0, 0, 0, 0, 0], # user 3 82 | [0, 0, 0, 1, 0, 1, 0, 0, 0, 0], # user 4 83 | [0, 0, 0, 0, 1, 0, 1, 1, 0, 0], # user 5 84 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 6 85 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 7 86 | [0, 0, 0, 0, 0, 0, 1, 1, 0, 1], # user 8 87 | [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]] # user 9 88 | 89 | ##### 90 | # DELETE DOWN 91 | # 92 | 93 | 94 | def matrix_add(A, B): 95 | if shape(A) != shape(B): 96 | raise ArithmeticError("cannot add matrices with different shapes") 97 | 98 | num_rows, num_cols = shape(A) 99 | def entry_fn(i, j): return A[i][j] + B[i][j] 100 | 101 | return make_matrix(num_rows, num_cols, entry_fn) 102 | 103 | 104 | def make_graph_dot_product_as_vector_projection(plt): 105 | 106 | v = [2, 1] 107 | w = [math.sqrt(.25), math.sqrt(.75)] 108 | c = dot(v, w) 109 | vonw = scalar_multiply(c, w) 110 | o = [0,0] 111 | 112 | plt.arrow(0, 0, v[0], v[1], 113 | width=0.002, head_width=.1, length_includes_head=True) 114 | plt.annotate("v", v, xytext=[v[0] + 0.1, v[1]]) 115 | plt.arrow(0 ,0, w[0], w[1], 116 | width=0.002, head_width=.1, length_includes_head=True) 117 | plt.annotate("w", w, xytext=[w[0] - 0.1, w[1]]) 118 | plt.arrow(0, 0, vonw[0], vonw[1], length_includes_head=True) 119 | plt.annotate(u"(v•w)w", vonw, xytext=[vonw[0] - 0.1, vonw[1] + 0.1]) 120 | plt.arrow(v[0], v[1], vonw[0] - v[0], vonw[1] - v[1], 121 | linestyle='dotted', length_includes_head=True) 122 | plt.scatter(*zip(v,w,o),marker='.') 123 | plt.axis('equal') 124 | plt.show() 125 | -------------------------------------------------------------------------------- /first-edition/code/simple_linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter, defaultdict 3 | from linear_algebra import vector_subtract 4 | from statistics import mean, correlation, standard_deviation, de_mean 5 | from gradient_descent import minimize_stochastic 6 | import math, random 7 | 8 | def predict(alpha, beta, x_i): 9 | return beta * x_i + alpha 10 | 11 | def error(alpha, beta, x_i, y_i): 12 | return y_i - predict(alpha, beta, x_i) 13 | 14 | def sum_of_squared_errors(alpha, beta, x, y): 15 | return sum(error(alpha, beta, x_i, y_i) ** 2 16 | for x_i, y_i in zip(x, y)) 17 | 18 | def least_squares_fit(x,y): 19 | """given training values for x and y, 20 | 
find the least-squares values of alpha and beta""" 21 | beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x) 22 | alpha = mean(y) - beta * mean(x) 23 | return alpha, beta 24 | 25 | def total_sum_of_squares(y): 26 | """the total squared variation of y_i's from their mean""" 27 | return sum(v ** 2 for v in de_mean(y)) 28 | 29 | def r_squared(alpha, beta, x, y): 30 | """the fraction of variation in y captured by the model, which equals 31 | 1 - the fraction of variation in y not captured by the model""" 32 | 33 | return 1.0 - (sum_of_squared_errors(alpha, beta, x, y) / 34 | total_sum_of_squares(y)) 35 | 36 | def squared_error(x_i, y_i, theta): 37 | alpha, beta = theta 38 | return error(alpha, beta, x_i, y_i) ** 2 39 | 40 | def squared_error_gradient(x_i, y_i, theta): 41 | alpha, beta = theta 42 | return [-2 * error(alpha, beta, x_i, y_i), # alpha partial derivative 43 | -2 * error(alpha, beta, x_i, y_i) * x_i] # beta partial derivative 44 | 45 | if __name__ == "__main__": 46 | 47 | num_friends_good = [49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 48 | daily_minutes_good = [68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84] 49 | 50 | alpha, beta = least_squares_fit(num_friends_good, daily_minutes_good) 51 | print "alpha", alpha 52 | print "beta", beta 53 | 54 | print "r-squared", r_squared(alpha, beta, num_friends_good, daily_minutes_good) 55 | 56 | print 57 | 58 | print "gradient descent:" 59 | # choose random value to start 60 | random.seed(0) 61 | theta = [random.random(), random.random()] 62 | alpha, beta = minimize_stochastic(squared_error, 63 | squared_error_gradient, 64 | num_friends_good, 65 | daily_minutes_good, 66 | theta, 67 | 0.0001) 68 | print "alpha", alpha 69 | print "beta", beta -------------------------------------------------------------------------------- /first-edition/code/linear_algebra.py: -------------------------------------------------------------------------------- 1 | # -*- coding: iso-8859-15 -*- 2 | 3 | from __future__ import division # want 3 / 
2 == 1.5 4 | import re, math, random # regexes, math functions, random numbers 5 | import matplotlib.pyplot as plt # pyplot 6 | from collections import defaultdict, Counter 7 | from functools import partial 8 | 9 | # 10 | # functions for working with vectors 11 | # 12 | 13 | def vector_add(v, w): 14 | """adds two vectors componentwise""" 15 | return [v_i + w_i for v_i, w_i in zip(v,w)] 16 | 17 | def vector_subtract(v, w): 18 | """subtracts two vectors componentwise""" 19 | return [v_i - w_i for v_i, w_i in zip(v,w)] 20 | 21 | def vector_sum(vectors): 22 | return reduce(vector_add, vectors) 23 | 24 | def scalar_multiply(c, v): 25 | return [c * v_i for v_i in v] 26 | 27 | # this isn't right if you don't from __future__ import division 28 | def vector_mean(vectors): 29 | """compute the vector whose i-th element is the mean of the 30 | i-th elements of the input vectors""" 31 | n = len(vectors) 32 | return scalar_multiply(1/n, vector_sum(vectors)) 33 | 34 | def dot(v, w): 35 | """v_1 * w_1 + ... + v_n * w_n""" 36 | return sum(v_i * w_i for v_i, w_i in zip(v, w)) 37 | 38 | def sum_of_squares(v): 39 | """v_1 * v_1 + ... + v_n * v_n""" 40 | return dot(v, v) 41 | 42 | def magnitude(v): 43 | return math.sqrt(sum_of_squares(v)) 44 | 45 | def squared_distance(v, w): 46 | return sum_of_squares(vector_subtract(v, w)) 47 | 48 | def distance(v, w): 49 | return math.sqrt(squared_distance(v, w)) 50 | 51 | # 52 | # functions for working with matrices 53 | # 54 | 55 | def shape(A): 56 | num_rows = len(A) 57 | num_cols = len(A[0]) if A else 0 58 | return num_rows, num_cols 59 | 60 | def get_row(A, i): 61 | return A[i] 62 | 63 | def get_column(A, j): 64 | return [A_i[j] for A_i in A] 65 | 66 | def make_matrix(num_rows, num_cols, entry_fn): 67 | """returns a num_rows x num_cols matrix 68 | whose (i,j)-th entry is entry_fn(i, j)""" 69 | return [[entry_fn(i, j) for j in range(num_cols)] 70 | for i in range(num_rows)] 71 | 72 | def is_diagonal(i, j): 73 | """1's on the 'diagonal', 0's everywhere else""" 74 | return 1 if i == j else 0 75 | 76 | identity_matrix = make_matrix(5, 5, is_diagonal) 77 | 78 | # user 0 1 2 3 4 5 6 7 8 9 79 | # 80 | friendships = [[0, 1, 1, 0, 0, 0, 0, 0, 0, 0], # user 0 81 | [1, 0, 1, 1, 0, 0, 0, 0, 0, 0], # user 1 82 | [1, 1, 0, 1, 0, 0, 0, 0, 0, 0], # user 2 83 | [0, 1, 1, 0, 1, 0, 0, 0, 0, 0], # user 3 84 | [0, 0, 0, 1, 0, 1, 0, 0, 0, 0], # user 4 85 | [0, 0, 0, 0, 1, 0, 1, 1, 0, 0], # user 5 86 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 6 87 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 7 88 | [0, 0, 0, 0, 0, 0, 1, 1, 0, 1], # user 8 89 | [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]] # user 9 90 | 91 | ##### 92 | # DELETE DOWN 93 | # 94 | 95 | 96 | def matrix_add(A, B): 97 | if shape(A) != shape(B): 98 | raise ArithmeticError("cannot add matrices with different shapes") 99 | 100 | num_rows, num_cols = shape(A) 101 | def entry_fn(i, j): return A[i][j] + B[i][j] 102 | 103 | return make_matrix(num_rows, num_cols, entry_fn) 104 | 105 | 106 | def make_graph_dot_product_as_vector_projection(plt): 107 | 108 | v = [2, 1] 109 | w = [math.sqrt(.25), math.sqrt(.75)] 110 | c = dot(v, w) 111 | vonw = scalar_multiply(c, w) 112 | o = [0,0] 113 | 114 | plt.arrow(0, 0, v[0], v[1], 115 | width=0.002, head_width=.1, length_includes_head=True) 116 | plt.annotate("v", v, xytext=[v[0] + 0.1, v[1]]) 117 | plt.arrow(0 ,0, w[0], w[1], 118 | width=0.002, head_width=.1, length_includes_head=True) 119 | plt.annotate("w", w, xytext=[w[0] - 0.1, w[1]]) 120 | plt.arrow(0, 0, vonw[0], vonw[1], length_includes_head=True) 121 | 
plt.annotate(u"(v•w)w", vonw, xytext=[vonw[0] - 0.1, vonw[1] + 0.1]) 122 | plt.arrow(v[0], v[1], vonw[0] - v[0], vonw[1] - v[1], 123 | linestyle='dotted', length_includes_head=True) 124 | plt.scatter(*zip(v,w,o),marker='.') 125 | plt.axis('equal') 126 | plt.show() 127 | -------------------------------------------------------------------------------- /first-edition/code-python3/probability.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import math, random 3 | 4 | def random_kid(): 5 | return random.choice(["boy", "girl"]) 6 | 7 | def uniform_pdf(x): 8 | return 1 if x >= 0 and x < 1 else 0 9 | 10 | def uniform_cdf(x): 11 | "returns the probability that a uniform random variable is less than x" 12 | if x < 0: return 0 # uniform random is never less than 0 13 | elif x < 1: return x # e.g. P(X < 0.4) = 0.4 14 | else: return 1 # uniform random is always less than 1 15 | 16 | def normal_pdf(x, mu=0, sigma=1): 17 | sqrt_two_pi = math.sqrt(2 * math.pi) 18 | return (math.exp(-(x-mu) ** 2 / 2 / sigma ** 2) / (sqrt_two_pi * sigma)) 19 | 20 | def plot_normal_pdfs(plt): 21 | xs = [x / 10.0 for x in range(-50, 50)] 22 | plt.plot(xs,[normal_pdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1') 23 | plt.plot(xs,[normal_pdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2') 24 | plt.plot(xs,[normal_pdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5') 25 | plt.plot(xs,[normal_pdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1') 26 | plt.legend() 27 | plt.show() 28 | 29 | def normal_cdf(x, mu=0,sigma=1): 30 | return (1 + math.erf((x - mu) / math.sqrt(2) / sigma)) / 2 31 | 32 | def plot_normal_cdfs(plt): 33 | xs = [x / 10.0 for x in range(-50, 50)] 34 | plt.plot(xs,[normal_cdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1') 35 | plt.plot(xs,[normal_cdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2') 36 | plt.plot(xs,[normal_cdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5') 37 | plt.plot(xs,[normal_cdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1') 38 | plt.legend(loc=4) # bottom right 39 | plt.show() 40 | 41 | def inverse_normal_cdf(p, mu=0, sigma=1, tolerance=0.00001): 42 | """find approximate inverse using binary search""" 43 | 44 | # if not standard, compute standard and rescale 45 | if mu != 0 or sigma != 1: 46 | return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance) 47 | 48 | low_z, low_p = -10.0, 0 # normal_cdf(-10) is (very close to) 0 49 | hi_z, hi_p = 10.0, 1 # normal_cdf(10) is (very close to) 1 50 | while hi_z - low_z > tolerance: 51 | mid_z = (low_z + hi_z) / 2 # consider the midpoint 52 | mid_p = normal_cdf(mid_z) # and the cdf's value there 53 | if mid_p < p: 54 | # midpoint is still too low, search above it 55 | low_z, low_p = mid_z, mid_p 56 | elif mid_p > p: 57 | # midpoint is still too high, search below it 58 | hi_z, hi_p = mid_z, mid_p 59 | else: 60 | break 61 | 62 | return mid_z 63 | 64 | def bernoulli_trial(p): 65 | return 1 if random.random() < p else 0 66 | 67 | def binomial(p, n): 68 | return sum(bernoulli_trial(p) for _ in range(n)) 69 | 70 | def make_hist(p, n, num_points): 71 | 72 | data = [binomial(p, n) for _ in range(num_points)] 73 | 74 | # use a bar chart to show the actual binomial samples 75 | histogram = Counter(data) 76 | plt.bar([x - 0.4 for x in histogram.keys()], 77 | [v / num_points for v in histogram.values()], 78 | 0.8, 79 | color='0.75') 80 | 81 | mu = p * n 82 | sigma = math.sqrt(n * p * (1 - p)) 83 | 84 | # use a line chart to show the normal approximation 85 
| xs = range(min(data), max(data) + 1) 86 | ys = [normal_cdf(i + 0.5, mu, sigma) - normal_cdf(i - 0.5, mu, sigma) 87 | for i in xs] 88 | plt.plot(xs,ys) 89 | plt.show() 90 | 91 | 92 | 93 | if __name__ == "__main__": 94 | 95 | # 96 | # CONDITIONAL PROBABILITY 97 | # 98 | 99 | both_girls = 0 100 | older_girl = 0 101 | either_girl = 0 102 | 103 | random.seed(0) 104 | for _ in range(10000): 105 | younger = random_kid() 106 | older = random_kid() 107 | if older == "girl": 108 | older_girl += 1 109 | if older == "girl" and younger == "girl": 110 | both_girls += 1 111 | if older == "girl" or younger == "girl": 112 | either_girl += 1 113 | 114 | print("P(both | older):", both_girls / older_girl) # 0.514 ~ 1/2 115 | print("P(both | either): ", both_girls / either_girl) # 0.342 ~ 1/3 116 | -------------------------------------------------------------------------------- /first-edition/code/probability.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter 3 | import math, random 4 | 5 | def random_kid(): 6 | return random.choice(["boy", "girl"]) 7 | 8 | def uniform_pdf(x): 9 | return 1 if x >= 0 and x < 1 else 0 10 | 11 | def uniform_cdf(x): 12 | "returns the probability that a uniform random variable is less than x" 13 | if x < 0: return 0 # uniform random is never less than 0 14 | elif x < 1: return x # e.g. P(X < 0.4) = 0.4 15 | else: return 1 # uniform random is always less than 1 16 | 17 | def normal_pdf(x, mu=0, sigma=1): 18 | sqrt_two_pi = math.sqrt(2 * math.pi) 19 | return (math.exp(-(x-mu) ** 2 / 2 / sigma ** 2) / (sqrt_two_pi * sigma)) 20 | 21 | def plot_normal_pdfs(plt): 22 | xs = [x / 10.0 for x in range(-50, 50)] 23 | plt.plot(xs,[normal_pdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1') 24 | plt.plot(xs,[normal_pdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2') 25 | plt.plot(xs,[normal_pdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5') 26 | plt.plot(xs,[normal_pdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1') 27 | plt.legend() 28 | plt.show() 29 | 30 | def normal_cdf(x, mu=0,sigma=1): 31 | return (1 + math.erf((x - mu) / math.sqrt(2) / sigma)) / 2 32 | 33 | def plot_normal_cdfs(plt): 34 | xs = [x / 10.0 for x in range(-50, 50)] 35 | plt.plot(xs,[normal_cdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1') 36 | plt.plot(xs,[normal_cdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2') 37 | plt.plot(xs,[normal_cdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5') 38 | plt.plot(xs,[normal_cdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1') 39 | plt.legend(loc=4) # bottom right 40 | plt.show() 41 | 42 | def inverse_normal_cdf(p, mu=0, sigma=1, tolerance=0.00001): 43 | """find approximate inverse using binary search""" 44 | 45 | # if not standard, compute standard and rescale 46 | if mu != 0 or sigma != 1: 47 | return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance) 48 | 49 | low_z, low_p = -10.0, 0 # normal_cdf(-10) is (very close to) 0 50 | hi_z, hi_p = 10.0, 1 # normal_cdf(10) is (very close to) 1 51 | while hi_z - low_z > tolerance: 52 | mid_z = (low_z + hi_z) / 2 # consider the midpoint 53 | mid_p = normal_cdf(mid_z) # and the cdf's value there 54 | if mid_p < p: 55 | # midpoint is still too low, search above it 56 | low_z, low_p = mid_z, mid_p 57 | elif mid_p > p: 58 | # midpoint is still too high, search below it 59 | hi_z, hi_p = mid_z, mid_p 60 | else: 61 | break 62 | 63 | return mid_z 64 | 65 | def bernoulli_trial(p): 66 | return 1 if random.random() 
< p else 0 67 | 68 | def binomial(p, n): 69 | return sum(bernoulli_trial(p) for _ in range(n)) 70 | 71 | def make_hist(p, n, num_points): 72 | 73 | data = [binomial(p, n) for _ in range(num_points)] 74 | 75 | # use a bar chart to show the actual binomial samples 76 | histogram = Counter(data) 77 | plt.bar([x - 0.4 for x in histogram.keys()], 78 | [v / num_points for v in histogram.values()], 79 | 0.8, 80 | color='0.75') 81 | 82 | mu = p * n 83 | sigma = math.sqrt(n * p * (1 - p)) 84 | 85 | # use a line chart to show the normal approximation 86 | xs = range(min(data), max(data) + 1) 87 | ys = [normal_cdf(i + 0.5, mu, sigma) - normal_cdf(i - 0.5, mu, sigma) 88 | for i in xs] 89 | plt.plot(xs,ys) 90 | plt.show() 91 | 92 | 93 | 94 | if __name__ == "__main__": 95 | 96 | # 97 | # CONDITIONAL PROBABILITY 98 | # 99 | 100 | both_girls = 0 101 | older_girl = 0 102 | either_girl = 0 103 | 104 | random.seed(0) 105 | for _ in range(10000): 106 | younger = random_kid() 107 | older = random_kid() 108 | if older == "girl": 109 | older_girl += 1 110 | if older == "girl" and younger == "girl": 111 | both_girls += 1 112 | if older == "girl" or younger == "girl": 113 | either_girl += 1 114 | 115 | print "P(both | older):", both_girls / older_girl # 0.514 ~ 1/2 116 | print "P(both | either): ", both_girls / either_girl # 0.342 ~ 1/3 -------------------------------------------------------------------------------- /first-edition/code-python3/README.md: -------------------------------------------------------------------------------- 1 | # Updating the code from Python 2 to Python 3 2 | 3 | After many requests, here's the code from the book updated from Python 2 to Python 3. 4 | I have been telling people that there aren't too many changes required, but it turned 5 | out there were quite a few. Start-to-finish I'd say the porting took me about 4 hours, 6 | and I'm pretty familiar with the code. I think I got everything, let me know if you find something 7 | that doesn't work in Python 3. 8 | 9 | (For the most part my goal was to get everything to *work* in Python 3, I didn't spend any time on trying to make it *idiomatic* Python 3. Later.) 10 | 11 | Here's a fairly comprehensive list of the issues I ran into. 12 | 13 | ## `print` 14 | 15 | The first and most obvious difference is that in Python 3 `print` takes parentheses. 16 | This means that every 17 | 18 | ``` 19 | print "stuff", 1 20 | ``` 21 | 22 | had to be replaced with 23 | 24 | ``` 25 | print("stuff", 1) 26 | ``` 27 | 28 | This was mostly just tedious. I should have used 2to3. 29 | 30 | ## tuple unpacking 31 | 32 | PEP-3113 eliminates 33 | tuple unpacking in function parameters. In particular, that means that code like 34 | 35 | ``` 36 | lambda (a, b): b 37 | ``` 38 | 39 | has to be replaced with 40 | 41 | ``` 42 | lambda pair: pair[1] 43 | ``` 44 | 45 | This is unfortunate, as I tend to write a lot of code like 46 | 47 | ``` 48 | sorted(words_and_counts, key=lambda (word, count): count, reverse=True) 49 | ``` 50 | 51 | Probably I should have just created a `helpers.py` with a few functions like 52 | 53 | ``` 54 | def fst(pair): return pair[0] 55 | def snd(pair): return pair[1] 56 | ``` 57 | 58 | Maybe next time. 59 | 60 | ## laziness 61 | 62 | In Python 3, laziness is the order of the day. 
In particular, `dict`-like 63 | objects no longer have `.iteritems()` properties, so those all have to be replaced 64 | with `.items()` 65 | 66 | Similarly, `filter` now returns an iterator, so that code like 67 | 68 | ``` 69 | filter(is_even, my_list)[0] 70 | ``` 71 | 72 | doesn't work, and needs to be replaced with 73 | 74 | ``` 75 | list(filter(is_even, my_list))[0] 76 | ``` 77 | 78 | And likewise with `zip`, which in many instances needs to be replaced with `list(zip(...))`. (In particular, this uglies up my magic unzip trick.) 79 | 80 | At least when you try to index into an iterator you get an error. It's potentially worse if you iterate over it expecting `list` behavior. 81 | 82 | In the most subtle case this bit me at (in essence): 83 | 84 | ``` 85 | data = map(clean, data) 86 | x = [row[0] for row in data] 87 | y = [row[1] for row in data] 88 | ``` 89 | 90 | in this case the `map` makes `data` a generator, and once the `x` definition iterates 91 | over it, it's gone. The solution is 92 | 93 | ``` 94 | data = list(map(clean, data)) 95 | ``` 96 | 97 | Similarly, if you have a `dict` then its `.keys()` is lazy, so you have to wrap 98 | it in `list` as well. This is possibly my least favorite change in Python 3. 99 | 100 | A better solution is probably to replace most of these with list comprehensions. 101 | 102 | ## binary mode for CSVs 103 | 104 | In Python 2 it was best practice to open CSV files in binary mode to 105 | make sure you dealt properly with Windows line endings: 106 | 107 | ``` 108 | f = open("some.csv", "rb") 109 | ``` 110 | 111 | In Python 3 that doesn't work for various reasons having to do with raw bytes 112 | and string encodings. Instead you need to open them in text mode and 113 | specify the line ending types: 114 | 115 | ``` 116 | f = open("some.csv", 'r', encoding='utf8', newline='') 117 | ``` 118 | 119 | ## `reduce` 120 | 121 | Guido doesn't like `reduce`, so in Python 3 it's hidden in `functools`. So any code 122 | that uses it needs to add a 123 | 124 | ``` 125 | from functools import reduce 126 | ``` 127 | 128 | ## bad spam characters 129 | 130 | The Spam Assassin corpus files from the naive bayes chapter (are old and) 131 | contain some ugly characters that caused me problems until I tried opening the 132 | files with 133 | 134 | ``` 135 | encoding='ISO-8859-1' 136 | ``` 137 | 138 | # Bugs 139 | 140 | For some reason, my Python 3 topic model in `natural_language_processing` gives slightly different results from the Python 2 version. I suspect this means there is a bug in the port, but I haven't figured out what it is yet. Let me know if you find any more bugs, it's possible there's a lazy `zip` or `map` that I missed. 
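
## worked example: porting the tuple-unpacking sort

For reference, here is one way the tuple-unpacking sort from the section above can be written in Python 3. This is just a minimal sketch with made-up sample data, not code from the book:

```
from operator import itemgetter

# made-up sample data, just to have something to sort
words_and_counts = [("data", 10), ("science", 7), ("scratch", 3)]

# index into the pair explicitly instead of unpacking it in the lambda
by_count = sorted(words_and_counts, key=lambda pair: pair[1], reverse=True)

# or let the standard library do the indexing
by_count = sorted(words_and_counts, key=itemgetter(1), reverse=True)

print(by_count)   # [('data', 10), ('science', 7), ('scratch', 3)]
```

`itemgetter(1)` reads a bit closer to the original intent than `pair[1]`, but either works.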
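
## worked example: lazy `zip` and `map`

Similarly, a quick sketch (again with made-up data) of where the laziness described above bites, and the `list(...)` wrapping that fixes it:

```
pairs = [(1, 'a'), (2, 'b'), (3, 'c')]    # made-up sample data

# the unzip trick still works for direct unpacking...
numbers, letters = zip(*pairs)            # (1, 2, 3) and ('a', 'b', 'c')

# ...but indexing or reusing the result requires an explicit list
unzipped = list(zip(*pairs))              # [(1, 2, 3), ('a', 'b', 'c')]

# a map object is good for one pass only
data = map(str.upper, ["a", "b", "c"])
first = list(data)                        # ['A', 'B', 'C']
second = list(data)                       # [] -- already exhausted

# so materialize it once, up front
data = list(map(str.upper, ["a", "b", "c"]))
```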
141 | -------------------------------------------------------------------------------- /first-edition/code-python3/naive_bayes.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | from machine_learning import split_data 3 | import math, random, re, glob 4 | 5 | def tokenize(message): 6 | message = message.lower() # convert to lowercase 7 | all_words = re.findall("[a-z0-9']+", message) # extract the words 8 | return set(all_words) # remove duplicates 9 | 10 | 11 | def count_words(training_set): 12 | """training set consists of pairs (message, is_spam)""" 13 | counts = defaultdict(lambda: [0, 0]) 14 | for message, is_spam in training_set: 15 | for word in tokenize(message): 16 | counts[word][0 if is_spam else 1] += 1 17 | return counts 18 | 19 | def word_probabilities(counts, total_spams, total_non_spams, k=0.5): 20 | """turn the word_counts into a list of triplets 21 | w, p(w | spam) and p(w | ~spam)""" 22 | return [(w, 23 | (spam + k) / (total_spams + 2 * k), 24 | (non_spam + k) / (total_non_spams + 2 * k)) 25 | for w, (spam, non_spam) in counts.items()] 26 | 27 | def spam_probability(word_probs, message): 28 | message_words = tokenize(message) 29 | log_prob_if_spam = log_prob_if_not_spam = 0.0 30 | 31 | for word, prob_if_spam, prob_if_not_spam in word_probs: 32 | 33 | # for each word in the message, 34 | # add the log probability of seeing it 35 | if word in message_words: 36 | log_prob_if_spam += math.log(prob_if_spam) 37 | log_prob_if_not_spam += math.log(prob_if_not_spam) 38 | 39 | # for each word that's not in the message 40 | # add the log probability of _not_ seeing it 41 | else: 42 | log_prob_if_spam += math.log(1.0 - prob_if_spam) 43 | log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam) 44 | 45 | prob_if_spam = math.exp(log_prob_if_spam) 46 | prob_if_not_spam = math.exp(log_prob_if_not_spam) 47 | return prob_if_spam / (prob_if_spam + prob_if_not_spam) 48 | 49 | 50 | class NaiveBayesClassifier: 51 | 52 | def __init__(self, k=0.5): 53 | self.k = k 54 | self.word_probs = [] 55 | 56 | def train(self, training_set): 57 | 58 | # count spam and non-spam messages 59 | num_spams = len([is_spam 60 | for message, is_spam in training_set 61 | if is_spam]) 62 | num_non_spams = len(training_set) - num_spams 63 | 64 | # run training data through our "pipeline" 65 | word_counts = count_words(training_set) 66 | self.word_probs = word_probabilities(word_counts, 67 | num_spams, 68 | num_non_spams, 69 | self.k) 70 | 71 | def classify(self, message): 72 | return spam_probability(self.word_probs, message) 73 | 74 | 75 | def get_subject_data(path): 76 | 77 | data = [] 78 | 79 | # regex for stripping out the leading "Subject:" and any spaces after it 80 | subject_regex = re.compile(r"^Subject:\s+") 81 | 82 | # glob.glob returns every filename that matches the wildcarded path 83 | for fn in glob.glob(path): 84 | is_spam = "ham" not in fn 85 | 86 | with open(fn,'r',encoding='ISO-8859-1') as file: 87 | for line in file: 88 | if line.startswith("Subject:"): 89 | subject = subject_regex.sub("", line).strip() 90 | data.append((subject, is_spam)) 91 | 92 | return data 93 | 94 | def p_spam_given_word(word_prob): 95 | word, prob_if_spam, prob_if_not_spam = word_prob 96 | return prob_if_spam / (prob_if_spam + prob_if_not_spam) 97 | 98 | def train_and_test_model(path): 99 | 100 | data = get_subject_data(path) 101 | random.seed(0) # just so you get the same answers as me 102 | train_data, test_data = split_data(data, 0.75) 103 
| 104 | classifier = NaiveBayesClassifier() 105 | classifier.train(train_data) 106 | 107 | classified = [(subject, is_spam, classifier.classify(subject)) 108 | for subject, is_spam in test_data] 109 | 110 | counts = Counter((is_spam, spam_probability > 0.5) # (actual, predicted) 111 | for _, is_spam, spam_probability in classified) 112 | 113 | print(counts) 114 | 115 | classified.sort(key=lambda row: row[2]) 116 | spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:] 117 | hammiest_spams = list(filter(lambda row: row[1], classified))[:5] 118 | 119 | print("spammiest_hams", spammiest_hams) 120 | print("hammiest_spams", hammiest_spams) 121 | 122 | words = sorted(classifier.word_probs, key=p_spam_given_word) 123 | 124 | spammiest_words = words[-5:] 125 | hammiest_words = words[:5] 126 | 127 | print("spammiest_words", spammiest_words) 128 | print("hammiest_words", hammiest_words) 129 | 130 | 131 | if __name__ == "__main__": 132 | #train_and_test_model(r"c:\spam\*\*") 133 | train_and_test_model(r"/home/joel/src/spam/*/*") 134 | -------------------------------------------------------------------------------- /first-edition/code/naive_bayes.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter, defaultdict 3 | from machine_learning import split_data 4 | import math, random, re, glob 5 | 6 | def tokenize(message): 7 | message = message.lower() # convert to lowercase 8 | all_words = re.findall("[a-z0-9']+", message) # extract the words 9 | return set(all_words) # remove duplicates 10 | 11 | 12 | def count_words(training_set): 13 | """training set consists of pairs (message, is_spam)""" 14 | counts = defaultdict(lambda: [0, 0]) 15 | for message, is_spam in training_set: 16 | for word in tokenize(message): 17 | counts[word][0 if is_spam else 1] += 1 18 | return counts 19 | 20 | def word_probabilities(counts, total_spams, total_non_spams, k=0.5): 21 | """turn the word_counts into a list of triplets 22 | w, p(w | spam) and p(w | ~spam)""" 23 | return [(w, 24 | (spam + k) / (total_spams + 2 * k), 25 | (non_spam + k) / (total_non_spams + 2 * k)) 26 | for w, (spam, non_spam) in counts.iteritems()] 27 | 28 | def spam_probability(word_probs, message): 29 | message_words = tokenize(message) 30 | log_prob_if_spam = log_prob_if_not_spam = 0.0 31 | 32 | for word, prob_if_spam, prob_if_not_spam in word_probs: 33 | 34 | # for each word in the message, 35 | # add the log probability of seeing it 36 | if word in message_words: 37 | log_prob_if_spam += math.log(prob_if_spam) 38 | log_prob_if_not_spam += math.log(prob_if_not_spam) 39 | 40 | # for each word that's not in the message 41 | # add the log probability of _not_ seeing it 42 | else: 43 | log_prob_if_spam += math.log(1.0 - prob_if_spam) 44 | log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam) 45 | 46 | prob_if_spam = math.exp(log_prob_if_spam) 47 | prob_if_not_spam = math.exp(log_prob_if_not_spam) 48 | return prob_if_spam / (prob_if_spam + prob_if_not_spam) 49 | 50 | 51 | class NaiveBayesClassifier: 52 | 53 | def __init__(self, k=0.5): 54 | self.k = k 55 | self.word_probs = [] 56 | 57 | def train(self, training_set): 58 | 59 | # count spam and non-spam messages 60 | num_spams = len([is_spam 61 | for message, is_spam in training_set 62 | if is_spam]) 63 | num_non_spams = len(training_set) - num_spams 64 | 65 | # run training data through our "pipeline" 66 | word_counts = count_words(training_set) 67 | self.word_probs = 
word_probabilities(word_counts, 68 | num_spams, 69 | num_non_spams, 70 | self.k) 71 | 72 | def classify(self, message): 73 | return spam_probability(self.word_probs, message) 74 | 75 | 76 | def get_subject_data(path): 77 | 78 | data = [] 79 | 80 | # regex for stripping out the leading "Subject:" and any spaces after it 81 | subject_regex = re.compile(r"^Subject:\s+") 82 | 83 | # glob.glob returns every filename that matches the wildcarded path 84 | for fn in glob.glob(path): 85 | is_spam = "ham" not in fn 86 | 87 | with open(fn,'r') as file: 88 | for line in file: 89 | if line.startswith("Subject:"): 90 | subject = subject_regex.sub("", line).strip() 91 | data.append((subject, is_spam)) 92 | 93 | return data 94 | 95 | def p_spam_given_word(word_prob): 96 | word, prob_if_spam, prob_if_not_spam = word_prob 97 | return prob_if_spam / (prob_if_spam + prob_if_not_spam) 98 | 99 | def train_and_test_model(path): 100 | 101 | data = get_subject_data(path) 102 | random.seed(0) # just so you get the same answers as me 103 | train_data, test_data = split_data(data, 0.75) 104 | 105 | classifier = NaiveBayesClassifier() 106 | classifier.train(train_data) 107 | 108 | classified = [(subject, is_spam, classifier.classify(subject)) 109 | for subject, is_spam in test_data] 110 | 111 | counts = Counter((is_spam, spam_probability > 0.5) # (actual, predicted) 112 | for _, is_spam, spam_probability in classified) 113 | 114 | print counts 115 | 116 | classified.sort(key=lambda row: row[2]) 117 | spammiest_hams = filter(lambda row: not row[1], classified)[-5:] 118 | hammiest_spams = filter(lambda row: row[1], classified)[:5] 119 | 120 | print "spammiest_hams", spammiest_hams 121 | print "hammiest_spams", hammiest_spams 122 | 123 | words = sorted(classifier.word_probs, key=p_spam_given_word) 124 | 125 | spammiest_words = words[-5:] 126 | hammiest_words = words[:5] 127 | 128 | print "spammiest_words", spammiest_words 129 | print "hammiest_words", hammiest_words 130 | 131 | 132 | if __name__ == "__main__": 133 | train_and_test_model(r"c:\spam\*\*") -------------------------------------------------------------------------------- /scratch/probability.py: -------------------------------------------------------------------------------- 1 | def uniform_cdf(x: float) -> float: 2 | """Returns the probability that a uniform random variable is <= x""" 3 | if x < 0: return 0 # uniform random is never less than 0 4 | elif x < 1: return x # e.g. 
P(X <= 0.4) = 0.4 5 | else: return 1 # uniform random is always less than 1 6 | 7 | import math 8 | SQRT_TWO_PI = math.sqrt(2 * math.pi) 9 | 10 | def normal_pdf(x: float, mu: float = 0, sigma: float = 1) -> float: 11 | return (math.exp(-(x-mu) ** 2 / 2 / sigma ** 2) / (SQRT_TWO_PI * sigma)) 12 | 13 | import matplotlib.pyplot as plt 14 | xs = [x / 10.0 for x in range(-50, 50)] 15 | plt.plot(xs,[normal_pdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1') 16 | plt.plot(xs,[normal_pdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2') 17 | plt.plot(xs,[normal_pdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5') 18 | plt.plot(xs,[normal_pdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1') 19 | plt.legend() 20 | plt.title("Various Normal pdfs") 21 | # plt.show() 22 | 23 | 24 | # plt.savefig('im/various_normal_pdfs.png') 25 | plt.gca().clear() 26 | plt.close() 27 | plt.clf() 28 | 29 | def normal_cdf(x: float, mu: float = 0, sigma: float = 1) -> float: 30 | return (1 + math.erf((x - mu) / math.sqrt(2) / sigma)) / 2 31 | 32 | xs = [x / 10.0 for x in range(-50, 50)] 33 | plt.plot(xs,[normal_cdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1') 34 | plt.plot(xs,[normal_cdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2') 35 | plt.plot(xs,[normal_cdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5') 36 | plt.plot(xs,[normal_cdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1') 37 | plt.legend(loc=4) # bottom right 38 | plt.title("Various Normal cdfs") 39 | # plt.show() 40 | 41 | 42 | plt.close() 43 | plt.gca().clear() 44 | plt.clf() 45 | 46 | def inverse_normal_cdf(p: float, 47 | mu: float = 0, 48 | sigma: float = 1, 49 | tolerance: float = 0.00001) -> float: 50 | """Find approximate inverse using binary search""" 51 | 52 | # if not standard, compute standard and rescale 53 | if mu != 0 or sigma != 1: 54 | return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance) 55 | 56 | low_z = -10.0 # normal_cdf(-10) is (very close to) 0 57 | hi_z = 10.0 # normal_cdf(10) is (very close to) 1 58 | while hi_z - low_z > tolerance: 59 | mid_z = (low_z + hi_z) / 2 # Consider the midpoint 60 | mid_p = normal_cdf(mid_z) # and the cdf's value there 61 | if mid_p < p: 62 | low_z = mid_z # Midpoint too low, search above it 63 | else: 64 | hi_z = mid_z # Midpoint too high, search below it 65 | 66 | return mid_z 67 | 68 | 69 | import random 70 | 71 | def bernoulli_trial(p: float) -> int: 72 | """Returns 1 with probability p and 0 with probability 1-p""" 73 | return 1 if random.random() < p else 0 74 | 75 | def binomial(n: int, p: float) -> int: 76 | """Returns the sum of n bernoulli(p) trials""" 77 | return sum(bernoulli_trial(p) for _ in range(n)) 78 | 79 | from collections import Counter 80 | 81 | def binomial_histogram(p: float, n: int, num_points: int) -> None: 82 | """Picks points from a Binomial(n, p) and plots their histogram""" 83 | data = [binomial(n, p) for _ in range(num_points)] 84 | 85 | # use a bar chart to show the actual binomial samples 86 | histogram = Counter(data) 87 | plt.bar([x - 0.4 for x in histogram.keys()], 88 | [v / num_points for v in histogram.values()], 89 | 0.8, 90 | color='0.75') 91 | 92 | mu = p * n 93 | sigma = math.sqrt(n * p * (1 - p)) 94 | 95 | # use a line chart to show the normal approximation 96 | xs = range(min(data), max(data) + 1) 97 | ys = [normal_cdf(i + 0.5, mu, sigma) - normal_cdf(i - 0.5, mu, sigma) 98 | for i in xs] 99 | plt.plot(xs,ys) 100 | plt.title("Binomial Distribution vs. 
Normal Approximation") 101 | # plt.show() 102 | 103 | def main(): 104 | import enum, random 105 | 106 | # An Enum is a typed set of enumerated values. We can use them 107 | # to make our code more descriptive and readable. 108 | class Kid(enum.Enum): 109 | BOY = 0 110 | GIRL = 1 111 | 112 | def random_kid() -> Kid: 113 | return random.choice([Kid.BOY, Kid.GIRL]) 114 | 115 | both_girls = 0 116 | older_girl = 0 117 | either_girl = 0 118 | 119 | random.seed(0) 120 | 121 | for _ in range(10000): 122 | younger = random_kid() 123 | older = random_kid() 124 | if older == Kid.GIRL: 125 | older_girl += 1 126 | if older == Kid.GIRL and younger == Kid.GIRL: 127 | both_girls += 1 128 | if older == Kid.GIRL or younger == Kid.GIRL: 129 | either_girl += 1 130 | 131 | print("P(both | older):", both_girls / older_girl) # 0.514 ~ 1/2 132 | print("P(both | either): ", both_girls / either_girl) # 0.342 ~ 1/3 133 | 134 | 135 | 136 | assert 0.48 < both_girls / older_girl < 0.52 137 | assert 0.30 < both_girls / either_girl < 0.35 138 | 139 | def uniform_pdf(x: float) -> float: 140 | return 1 if 0 <= x < 1 else 0 141 | 142 | if __name__ == "__main__": main() 143 | -------------------------------------------------------------------------------- /scratch/visualization.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | 3 | years = [1950, 1960, 1970, 1980, 1990, 2000, 2010] 4 | gdp = [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14958.3] 5 | 6 | # create a line chart, years on x-axis, gdp on y-axis 7 | plt.plot(years, gdp, color='green', marker='o', linestyle='solid') 8 | 9 | # add a title 10 | plt.title("Nominal GDP") 11 | 12 | # add a label to the y-axis 13 | plt.ylabel("Billions of $") 14 | # plt.show() 15 | 16 | 17 | plt.savefig('im/viz_gdp.png') 18 | plt.gca().clear() 19 | 20 | movies = ["Annie Hall", "Ben-Hur", "Casablanca", "Gandhi", "West Side Story"] 21 | num_oscars = [5, 11, 3, 8, 10] 22 | 23 | # plot bars with left x-coordinates [0, 1, 2, 3, 4], heights [num_oscars] 24 | plt.bar(range(len(movies)), num_oscars) 25 | 26 | plt.title("My Favorite Movies") # add a title 27 | plt.ylabel("# of Academy Awards") # label the y-axis 28 | 29 | # label x-axis with movie names at bar centers 30 | plt.xticks(range(len(movies)), movies) 31 | 32 | # plt.show() 33 | 34 | 35 | plt.savefig('im/viz_movies.png') 36 | plt.gca().clear() 37 | 38 | from collections import Counter 39 | grades = [83, 95, 91, 87, 70, 0, 85, 82, 100, 67, 73, 77, 0] 40 | 41 | # Bucket grades by decile, but put 100 in with the 90s 42 | histogram = Counter(min(grade // 10 * 10, 90) for grade in grades) 43 | 44 | plt.bar([x + 5 for x in histogram.keys()], # Shift bars right by 5 45 | histogram.values(), # Give each bar its correct height 46 | 10, # Give each bar a width of 8 47 | edgecolor=(0, 0, 0)) # Black edges for each bar 48 | 49 | plt.axis([-5, 105, 0, 5]) # x-axis from -5 to 105, 50 | # y-axis from 0 to 5 51 | 52 | plt.xticks([10 * i for i in range(11)]) # x-axis labels at 0, 10, ..., 100 53 | plt.xlabel("Decile") 54 | plt.ylabel("# of Students") 55 | plt.title("Distribution of Exam 1 Grades") 56 | # plt.show() 57 | 58 | 59 | plt.savefig('im/viz_grades.png') 60 | plt.gca().clear() 61 | 62 | mentions = [500, 505] 63 | years = [2017, 2018] 64 | 65 | plt.bar(years, mentions, 0.8) 66 | plt.xticks(years) 67 | plt.ylabel("# of times I heard someone say 'data science'") 68 | 69 | # if you don't do this, matplotlib will label the x-axis 0, 1 70 | # and then add a +2.013e3 off 
in the corner (bad matplotlib!) 71 | plt.ticklabel_format(useOffset=False) 72 | 73 | # misleading y-axis only shows the part above 500 74 | plt.axis([2016.5, 2018.5, 499, 506]) 75 | plt.title("Look at the 'Huge' Increase!") 76 | # plt.show() 77 | 78 | 79 | plt.savefig('im/viz_misleading_y_axis.png') 80 | plt.gca().clear() 81 | 82 | 83 | plt.bar(years, mentions, 0.8) 84 | plt.xticks(years) 85 | plt.ylabel("# of times I heard someone say 'data science'") 86 | plt.ticklabel_format(useOffset=False) 87 | 88 | plt.axis([2016.5, 2018.5, 0, 550]) 89 | plt.title("Not So Huge Anymore") 90 | # plt.show() 91 | 92 | 93 | plt.savefig('im/viz_non_misleading_y_axis.png') 94 | plt.gca().clear() 95 | 96 | variance = [1, 2, 4, 8, 16, 32, 64, 128, 256] 97 | bias_squared = [256, 128, 64, 32, 16, 8, 4, 2, 1] 98 | total_error = [x + y for x, y in zip(variance, bias_squared)] 99 | xs = [i for i, _ in enumerate(variance)] 100 | 101 | # We can make multiple calls to plt.plot 102 | # to show multiple series on the same chart 103 | plt.plot(xs, variance, 'g-', label='variance') # green solid line 104 | plt.plot(xs, bias_squared, 'r-.', label='bias^2') # red dot-dashed line 105 | plt.plot(xs, total_error, 'b:', label='total error') # blue dotted line 106 | 107 | # Because we've assigned labels to each series, 108 | # we can get a legend for free (loc=9 means "top center") 109 | plt.legend(loc=9) 110 | plt.xlabel("model complexity") 111 | plt.xticks([]) 112 | plt.title("The Bias-Variance Tradeoff") 113 | # plt.show() 114 | 115 | 116 | plt.savefig('im/viz_line_chart.png') 117 | plt.gca().clear() 118 | 119 | friends = [ 70, 65, 72, 63, 71, 64, 60, 64, 67] 120 | minutes = [175, 170, 205, 120, 220, 130, 105, 145, 190] 121 | labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'] 122 | 123 | plt.scatter(friends, minutes) 124 | 125 | # label each point 126 | for label, friend_count, minute_count in zip(labels, friends, minutes): 127 | plt.annotate(label, 128 | xy=(friend_count, minute_count), # Put the label with its point 129 | xytext=(5, -5), # but slightly offset 130 | textcoords='offset points') 131 | 132 | plt.title("Daily Minutes vs. 
Number of Friends") 133 | plt.xlabel("# of friends") 134 | plt.ylabel("daily minutes spent on the site") 135 | # plt.show() 136 | 137 | 138 | plt.savefig('im/viz_scatterplot.png') 139 | plt.gca().clear() 140 | 141 | test_1_grades = [ 99, 90, 85, 97, 80] 142 | test_2_grades = [100, 85, 60, 90, 70] 143 | 144 | plt.scatter(test_1_grades, test_2_grades) 145 | plt.title("Axes Aren't Comparable") 146 | plt.xlabel("test 1 grade") 147 | plt.ylabel("test 2 grade") 148 | # plt.show() 149 | 150 | 151 | plt.savefig('im/viz_scatterplot_axes_not_comparable.png') 152 | plt.gca().clear() 153 | 154 | 155 | test_1_grades = [ 99, 90, 85, 97, 80] 156 | test_2_grades = [100, 85, 60, 90, 70] 157 | plt.scatter(test_1_grades, test_2_grades) 158 | plt.title("Axes Are Comparable") 159 | plt.axis("equal") 160 | plt.xlabel("test 1 grade") 161 | plt.ylabel("test 2 grade") 162 | plt.savefig('im/viz_scatterplot_axes_comparable.png') 163 | plt.gca().clear() 164 | 165 | -------------------------------------------------------------------------------- /scratch/k_nearest_neighbors.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from collections import Counter 3 | 4 | def raw_majority_vote(labels: List[str]) -> str: 5 | votes = Counter(labels) 6 | winner, _ = votes.most_common(1)[0] 7 | return winner 8 | 9 | assert raw_majority_vote(['a', 'b', 'c', 'b']) == 'b' 10 | 11 | def majority_vote(labels: List[str]) -> str: 12 | """Assumes that labels are ordered from nearest to farthest.""" 13 | vote_counts = Counter(labels) 14 | winner, winner_count = vote_counts.most_common(1)[0] 15 | num_winners = len([count 16 | for count in vote_counts.values() 17 | if count == winner_count]) 18 | 19 | if num_winners == 1: 20 | return winner # unique winner, so return it 21 | else: 22 | return majority_vote(labels[:-1]) # try again without the farthest 23 | 24 | # Tie, so look at first 4, then 'b' 25 | assert majority_vote(['a', 'b', 'c', 'b', 'a']) == 'b' 26 | 27 | from typing import NamedTuple 28 | from scratch.linear_algebra import Vector, distance 29 | 30 | class LabeledPoint(NamedTuple): 31 | point: Vector 32 | label: str 33 | 34 | def knn_classify(k: int, 35 | labeled_points: List[LabeledPoint], 36 | new_point: Vector) -> str: 37 | 38 | # Order the labeled points from nearest to farthest. 39 | by_distance = sorted(labeled_points, 40 | key=lambda lp: distance(lp.point, new_point)) 41 | 42 | # Find the labels for the k closest 43 | k_nearest_labels = [lp.label for lp in by_distance[:k]] 44 | 45 | # and let them vote. 46 | return majority_vote(k_nearest_labels) 47 | 48 | 49 | import random 50 | 51 | def random_point(dim: int) -> Vector: 52 | return [random.random() for _ in range(dim)] 53 | 54 | def random_distances(dim: int, num_pairs: int) -> List[float]: 55 | return [distance(random_point(dim), random_point(dim)) 56 | for _ in range(num_pairs)] 57 | 58 | def main(): 59 | from typing import Dict 60 | import csv 61 | from collections import defaultdict 62 | 63 | def parse_iris_row(row: List[str]) -> LabeledPoint: 64 | """ 65 | sepal_length, sepal_width, petal_length, petal_width, class 66 | """ 67 | measurements = [float(value) for value in row[:-1]] 68 | # class is e.g. 
"Iris-virginica"; we just want "virginica" 69 | label = row[-1].split("-")[-1] 70 | 71 | return LabeledPoint(measurements, label) 72 | 73 | with open('iris.data') as f: 74 | reader = csv.reader(f) 75 | iris_data = [parse_iris_row(row) for row in reader] 76 | 77 | # We'll also group just the points by species/label so we can plot them. 78 | points_by_species: Dict[str, List[Vector]] = defaultdict(list) 79 | for iris in iris_data: 80 | points_by_species[iris.label].append(iris.point) 81 | 82 | from matplotlib import pyplot as plt 83 | metrics = ['sepal length', 'sepal width', 'petal length', 'petal width'] 84 | pairs = [(i, j) for i in range(4) for j in range(4) if i < j] 85 | marks = ['+', '.', 'x'] # we have 3 classes, so 3 markers 86 | 87 | fig, ax = plt.subplots(2, 3) 88 | 89 | for row in range(2): 90 | for col in range(3): 91 | i, j = pairs[3 * row + col] 92 | ax[row][col].set_title(f"{metrics[i]} vs {metrics[j]}", fontsize=8) 93 | ax[row][col].set_xticks([]) 94 | ax[row][col].set_yticks([]) 95 | 96 | for mark, (species, points) in zip(marks, points_by_species.items()): 97 | xs = [point[i] for point in points] 98 | ys = [point[j] for point in points] 99 | ax[row][col].scatter(xs, ys, marker=mark, label=species) 100 | 101 | ax[-1][-1].legend(loc='lower right', prop={'size': 6}) 102 | # plt.show() 103 | 104 | 105 | 106 | plt.savefig('im/iris_scatter.png') 107 | plt.gca().clear() 108 | 109 | import random 110 | from scratch.machine_learning import split_data 111 | 112 | random.seed(12) 113 | iris_train, iris_test = split_data(iris_data, 0.70) 114 | assert len(iris_train) == 0.7 * 150 115 | assert len(iris_test) == 0.3 * 150 116 | 117 | from typing import Tuple 118 | 119 | # track how many times we see (predicted, actual) 120 | confusion_matrix: Dict[Tuple[str, str], int] = defaultdict(int) 121 | num_correct = 0 122 | 123 | for iris in iris_test: 124 | predicted = knn_classify(5, iris_train, iris.point) 125 | actual = iris.label 126 | 127 | if predicted == actual: 128 | num_correct += 1 129 | 130 | confusion_matrix[(predicted, actual)] += 1 131 | 132 | pct_correct = num_correct / len(iris_test) 133 | print(pct_correct, confusion_matrix) 134 | 135 | import tqdm 136 | dimensions = range(1, 101) 137 | 138 | avg_distances = [] 139 | min_distances = [] 140 | 141 | random.seed(0) 142 | for dim in tqdm.tqdm(dimensions, desc="Curse of Dimensionality"): 143 | distances = random_distances(dim, 10000) # 10,000 random pairs 144 | avg_distances.append(sum(distances) / 10000) # track the average 145 | min_distances.append(min(distances)) # track the minimum 146 | 147 | min_avg_ratio = [min_dist / avg_dist 148 | for min_dist, avg_dist in zip(min_distances, avg_distances)] 149 | 150 | if __name__ == "__main__": main() -------------------------------------------------------------------------------- /first-edition/code-python3/visualizing_data.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from collections import Counter 3 | 4 | def make_chart_simple_line_chart(): 5 | 6 | years = [1950, 1960, 1970, 1980, 1990, 2000, 2010] 7 | gdp = [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14958.3] 8 | 9 | # create a line chart, years on x-axis, gdp on y-axis 10 | plt.plot(years, gdp, color='green', marker='o', linestyle='solid') 11 | 12 | # add a title 13 | plt.title("Nominal GDP") 14 | 15 | # add a label to the y-axis 16 | plt.ylabel("Billions of $") 17 | plt.show() 18 | 19 | 20 | def make_chart_simple_bar_chart(): 21 | 22 | movies = 
["Annie Hall", "Ben-Hur", "Casablanca", "Gandhi", "West Side Story"] 23 | num_oscars = [5, 11, 3, 8, 10] 24 | 25 | # bars are by default width 0.8, so we'll add 0.1 to the left coordinates 26 | # so that each bar is centered 27 | xs = [i + 0.1 for i, _ in enumerate(movies)] 28 | 29 | # plot bars with left x-coordinates [xs], heights [num_oscars] 30 | plt.bar(xs, num_oscars) 31 | plt.ylabel("# of Academy Awards") 32 | plt.title("My Favorite Movies") 33 | 34 | # label x-axis with movie names at bar centers 35 | plt.xticks([i + 0.5 for i, _ in enumerate(movies)], movies) 36 | 37 | plt.show() 38 | 39 | def make_chart_histogram(): 40 | grades = [83,95,91,87,70,0,85,82,100,67,73,77,0] 41 | decile = lambda grade: grade // 10 * 10 42 | histogram = Counter(decile(grade) for grade in grades) 43 | 44 | plt.bar([x - 4 for x in histogram.keys()], # shift each bar to the left by 4 45 | histogram.values(), # give each bar its correct height 46 | 8) # give each bar a width of 8 47 | plt.axis([-5, 105, 0, 5]) # x-axis from -5 to 105, 48 | # y-axis from 0 to 5 49 | plt.xticks([10 * i for i in range(11)]) # x-axis labels at 0, 10, ..., 100 50 | plt.xlabel("Decile") 51 | plt.ylabel("# of Students") 52 | plt.title("Distribution of Exam 1 Grades") 53 | plt.show() 54 | 55 | def make_chart_misleading_y_axis(mislead=True): 56 | 57 | mentions = [500, 505] 58 | years = [2013, 2014] 59 | 60 | plt.bar([2012.6, 2013.6], mentions, 0.8) 61 | plt.xticks(years) 62 | plt.ylabel("# of times I heard someone say 'data science'") 63 | 64 | # if you don't do this, matplotlib will label the x-axis 0, 1 65 | # and then add a +2.013e3 off in the corner (bad matplotlib!) 66 | plt.ticklabel_format(useOffset=False) 67 | 68 | if mislead: 69 | # misleading y-axis only shows the part above 500 70 | plt.axis([2012.5,2014.5,499,506]) 71 | plt.title("Look at the 'Huge' Increase!") 72 | else: 73 | plt.axis([2012.5,2014.5,0,550]) 74 | plt.title("Not So Huge Anymore.") 75 | plt.show() 76 | 77 | def make_chart_several_line_charts(): 78 | 79 | variance = [1,2,4,8,16,32,64,128,256] 80 | bias_squared = [256,128,64,32,16,8,4,2,1] 81 | total_error = [x + y for x, y in zip(variance, bias_squared)] 82 | 83 | xs = range(len(variance)) 84 | 85 | # we can make multiple calls to plt.plot 86 | # to show multiple series on the same chart 87 | plt.plot(xs, variance, 'g-', label='variance') # green solid line 88 | plt.plot(xs, bias_squared, 'r-.', label='bias^2') # red dot-dashed line 89 | plt.plot(xs, total_error, 'b:', label='total error') # blue dotted line 90 | 91 | # because we've assigned labels to each series 92 | # we can get a legend for free 93 | # loc=9 means "top center" 94 | plt.legend(loc=9) 95 | plt.xlabel("model complexity") 96 | plt.title("The Bias-Variance Tradeoff") 97 | plt.show() 98 | 99 | def make_chart_scatter_plot(): 100 | 101 | friends = [ 70, 65, 72, 63, 71, 64, 60, 64, 67] 102 | minutes = [175, 170, 205, 120, 220, 130, 105, 145, 190] 103 | labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'] 104 | 105 | plt.scatter(friends, minutes) 106 | 107 | # label each point 108 | for label, friend_count, minute_count in zip(labels, friends, minutes): 109 | plt.annotate(label, 110 | xy=(friend_count, minute_count), # put the label with its point 111 | xytext=(5, -5), # but slightly offset 112 | textcoords='offset points') 113 | 114 | plt.title("Daily Minutes vs. 
Number of Friends") 115 | plt.xlabel("# of friends") 116 | plt.ylabel("daily minutes spent on the site") 117 | plt.show() 118 | 119 | def make_chart_scatterplot_axes(equal_axes=False): 120 | 121 | test_1_grades = [ 99, 90, 85, 97, 80] 122 | test_2_grades = [100, 85, 60, 90, 70] 123 | 124 | plt.scatter(test_1_grades, test_2_grades) 125 | plt.xlabel("test 1 grade") 126 | plt.ylabel("test 2 grade") 127 | 128 | if equal_axes: 129 | plt.title("Axes Are Comparable") 130 | plt.axis("equal") 131 | else: 132 | plt.title("Axes Aren't Comparable") 133 | 134 | plt.show() 135 | 136 | def make_chart_pie_chart(): 137 | 138 | plt.pie([0.95, 0.05], labels=["Uses pie charts", "Knows better"]) 139 | 140 | # make sure pie is a circle and not an oval 141 | plt.axis("equal") 142 | plt.show() 143 | 144 | 145 | if __name__ == "__main__": 146 | 147 | make_chart_simple_line_chart() 148 | 149 | make_chart_simple_bar_chart() 150 | 151 | make_chart_histogram() 152 | 153 | make_chart_misleading_y_axis(mislead=True) 154 | 155 | make_chart_misleading_y_axis(mislead=False) 156 | 157 | make_chart_several_line_charts() 158 | 159 | make_chart_scatterplot_axes(equal_axes=False) 160 | 161 | make_chart_scatterplot_axes(equal_axes=True) 162 | 163 | make_chart_pie_chart() 164 | -------------------------------------------------------------------------------- /first-edition/code/visualizing_data.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from collections import Counter 3 | 4 | def make_chart_simple_line_chart(plt): 5 | 6 | years = [1950, 1960, 1970, 1980, 1990, 2000, 2010] 7 | gdp = [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14958.3] 8 | 9 | # create a line chart, years on x-axis, gdp on y-axis 10 | plt.plot(years, gdp, color='green', marker='o', linestyle='solid') 11 | 12 | # add a title 13 | plt.title("Nominal GDP") 14 | 15 | # add a label to the y-axis 16 | plt.ylabel("Billions of $") 17 | plt.show() 18 | 19 | 20 | def make_chart_simple_bar_chart(plt): 21 | 22 | movies = ["Annie Hall", "Ben-Hur", "Casablanca", "Gandhi", "West Side Story"] 23 | num_oscars = [5, 11, 3, 8, 10] 24 | 25 | # bars are by default width 0.8, so we'll add 0.1 to the left coordinates 26 | # so that each bar is centered 27 | xs = [i + 0.1 for i, _ in enumerate(movies)] 28 | 29 | # plot bars with left x-coordinates [xs], heights [num_oscars] 30 | plt.bar(xs, num_oscars) 31 | plt.ylabel("# of Academy Awards") 32 | plt.title("My Favorite Movies") 33 | 34 | # label x-axis with movie names at bar centers 35 | plt.xticks([i + 0.5 for i, _ in enumerate(movies)], movies) 36 | 37 | plt.show() 38 | 39 | def make_chart_histogram(plt): 40 | grades = [83,95,91,87,70,0,85,82,100,67,73,77,0] 41 | decile = lambda grade: grade // 10 * 10 42 | histogram = Counter(decile(grade) for grade in grades) 43 | 44 | plt.bar([x - 4 for x in histogram.keys()], # shift each bar to the left by 4 45 | histogram.values(), # give each bar its correct height 46 | 8) # give each bar a width of 8 47 | plt.axis([-5, 105, 0, 5]) # x-axis from -5 to 105, 48 | # y-axis from 0 to 5 49 | plt.xticks([10 * i for i in range(11)]) # x-axis labels at 0, 10, ..., 100 50 | plt.xlabel("Decile") 51 | plt.ylabel("# of Students") 52 | plt.title("Distribution of Exam 1 Grades") 53 | plt.show() 54 | 55 | def make_chart_misleading_y_axis(plt, mislead=True): 56 | 57 | mentions = [500, 505] 58 | years = [2013, 2014] 59 | 60 | plt.bar([2012.6, 2013.6], mentions, 0.8) 61 | plt.xticks(years) 62 | plt.ylabel("# of times I heard 
someone say 'data science'") 63 | 64 | # if you don't do this, matplotlib will label the x-axis 0, 1 65 | # and then add a +2.013e3 off in the corner (bad matplotlib!) 66 | plt.ticklabel_format(useOffset=False) 67 | 68 | if mislead: 69 | # misleading y-axis only shows the part above 500 70 | plt.axis([2012.5,2014.5,499,506]) 71 | plt.title("Look at the 'Huge' Increase!") 72 | else: 73 | plt.axis([2012.5,2014.5,0,550]) 74 | plt.title("Not So Huge Anymore.") 75 | plt.show() 76 | 77 | def make_chart_several_line_charts(plt): 78 | 79 | variance = [1,2,4,8,16,32,64,128,256] 80 | bias_squared = [256,128,64,32,16,8,4,2,1] 81 | total_error = [x + y for x, y in zip(variance, bias_squared)] 82 | 83 | xs = range(len(variance)) 84 | 85 | # we can make multiple calls to plt.plot 86 | # to show multiple series on the same chart 87 | plt.plot(xs, variance, 'g-', label='variance') # green solid line 88 | plt.plot(xs, bias_squared, 'r-.', label='bias^2') # red dot-dashed line 89 | plt.plot(xs, total_error, 'b:', label='total error') # blue dotted line 90 | 91 | # because we've assigned labels to each series 92 | # we can get a legend for free 93 | # loc=9 means "top center" 94 | plt.legend(loc=9) 95 | plt.xlabel("model complexity") 96 | plt.title("The Bias-Variance Tradeoff") 97 | plt.show() 98 | 99 | def make_chart_scatter_plot(plt): 100 | 101 | friends = [ 70, 65, 72, 63, 71, 64, 60, 64, 67] 102 | minutes = [175, 170, 205, 120, 220, 130, 105, 145, 190] 103 | labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'] 104 | 105 | plt.scatter(friends, minutes) 106 | 107 | # label each point 108 | for label, friend_count, minute_count in zip(labels, friends, minutes): 109 | plt.annotate(label, 110 | xy=(friend_count, minute_count), # put the label with its point 111 | xytext=(5, -5), # but slightly offset 112 | textcoords='offset points') 113 | 114 | plt.title("Daily Minutes vs. 
Number of Friends") 115 | plt.xlabel("# of friends") 116 | plt.ylabel("daily minutes spent on the site") 117 | plt.show() 118 | 119 | def make_chart_scatterplot_axes(plt, equal_axes=False): 120 | 121 | test_1_grades = [ 99, 90, 85, 97, 80] 122 | test_2_grades = [100, 85, 60, 90, 70] 123 | 124 | plt.scatter(test_1_grades, test_2_grades) 125 | plt.xlabel("test 1 grade") 126 | plt.ylabel("test 2 grade") 127 | 128 | if equal_axes: 129 | plt.title("Axes Are Comparable") 130 | plt.axis("equal") 131 | else: 132 | plt.title("Axes Aren't Comparable") 133 | 134 | plt.show() 135 | 136 | def make_chart_pie_chart(plt): 137 | 138 | plt.pie([0.95, 0.05], labels=["Uses pie charts", "Knows better"]) 139 | 140 | # make sure pie is a circle and not an oval 141 | plt.axis("equal") 142 | plt.show() 143 | 144 | 145 | if __name__ == "__main__": 146 | 147 | make_chart_simple_line_chart(plt) 148 | 149 | make_chart_simple_bar_chart(plt) 150 | 151 | make_chart_histogram(plt) 152 | 153 | make_chart_misleading_y_axis(plt, mislead=True) 154 | 155 | make_chart_misleading_y_axis(plt, mislead=False) 156 | 157 | make_chart_several_line_charts(plt) 158 | 159 | make_chart_scatterplot_axes(plt, equal_axes=False) 160 | 161 | make_chart_scatterplot_axes(plt, equal_axes=True) 162 | 163 | make_chart_pie_chart(plt) 164 | -------------------------------------------------------------------------------- /scratch/gradient_descent.py: -------------------------------------------------------------------------------- 1 | from scratch.linear_algebra import Vector, dot 2 | 3 | def sum_of_squares(v: Vector) -> float: 4 | """Computes the sum of squared elements in v""" 5 | return dot(v, v) 6 | 7 | from typing import Callable 8 | 9 | def difference_quotient(f: Callable[[float], float], 10 | x: float, 11 | h: float) -> float: 12 | return (f(x + h) - f(x)) / h 13 | 14 | def square(x: float) -> float: 15 | return x * x 16 | 17 | def derivative(x: float) -> float: 18 | return 2 * x 19 | 20 | def estimate_gradient(f: Callable[[Vector], float], 21 | v: Vector, 22 | h: float = 0.0001): 23 | return [partial_difference_quotient(f, v, i, h) 24 | for i in range(len(v))] 25 | 26 | import random 27 | from scratch.linear_algebra import distance, add, scalar_multiply 28 | 29 | def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector: 30 | """Moves `step_size` in the `gradient` direction from `v`""" 31 | assert len(v) == len(gradient) 32 | step = scalar_multiply(step_size, gradient) 33 | return add(v, step) 34 | 35 | def sum_of_squares_gradient(v: Vector) -> Vector: 36 | return [2 * v_i for v_i in v] 37 | 38 | # x ranges from -50 to 49, y is always 20 * x + 5 39 | inputs = [(x, 20 * x + 5) for x in range(-50, 50)] 40 | 41 | def linear_gradient(x: float, y: float, theta: Vector) -> Vector: 42 | slope, intercept = theta 43 | predicted = slope * x + intercept # The prediction of the model. 44 | error = (predicted - y) # error is (predicted - actual) 45 | squared_error = error ** 2 # We'll minimize squared error 46 | grad = [2 * error * x, 2 * error] # using its gradient. 47 | return grad 48 | 49 | from typing import TypeVar, List, Iterator 50 | 51 | T = TypeVar('T') # this allows us to type "generic" functions 52 | 53 | def minibatches(dataset: List[T], 54 | batch_size: int, 55 | shuffle: bool = True) -> Iterator[List[T]]: 56 | """Generates `batch_size`-sized minibatches from the dataset""" 57 | # Start indexes 0, batch_size, 2 * batch_size, ... 
58 | batch_starts = [start for start in range(0, len(dataset), batch_size)] 59 | 60 | if shuffle: random.shuffle(batch_starts) # shuffle the batches 61 | 62 | for start in batch_starts: 63 | end = start + batch_size 64 | yield dataset[start:end] 65 | 66 | def main(): 67 | xs = range(-10, 11) 68 | actuals = [derivative(x) for x in xs] 69 | estimates = [difference_quotient(square, x, h=0.001) for x in xs] 70 | 71 | # plot to show they're basically the same 72 | import matplotlib.pyplot as plt 73 | plt.title("Actual Derivatives vs. Estimates") 74 | plt.plot(xs, actuals, 'rx', label='Actual') # red x 75 | plt.plot(xs, estimates, 'b+', label='Estimate') # blue + 76 | plt.legend(loc=9) 77 | # plt.show() 78 | 79 | 80 | plt.close() 81 | 82 | def partial_difference_quotient(f: Callable[[Vector], float], 83 | v: Vector, 84 | i: int, 85 | h: float) -> float: 86 | """Returns the i-th partial difference quotient of f at v""" 87 | w = [v_j + (h if j == i else 0) # add h to just the ith element of v 88 | for j, v_j in enumerate(v)] 89 | 90 | return (f(w) - f(v)) / h 91 | 92 | 93 | # "Using the Gradient" example 94 | 95 | # pick a random starting point 96 | v = [random.uniform(-10, 10) for i in range(3)] 97 | 98 | for epoch in range(1000): 99 | grad = sum_of_squares_gradient(v) # compute the gradient at v 100 | v = gradient_step(v, grad, -0.01) # take a negative gradient step 101 | print(epoch, v) 102 | 103 | assert distance(v, [0, 0, 0]) < 0.001 # v should be close to 0 104 | 105 | 106 | # First "Using Gradient Descent to Fit Models" example 107 | 108 | from scratch.linear_algebra import vector_mean 109 | 110 | # Start with random values for slope and intercept. 111 | theta = [random.uniform(-1, 1), random.uniform(-1, 1)] 112 | 113 | learning_rate = 0.001 114 | 115 | for epoch in range(5000): 116 | # Compute the mean of the gradients 117 | grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs]) 118 | # Take a step in that direction 119 | theta = gradient_step(theta, grad, -learning_rate) 120 | print(epoch, theta) 121 | 122 | slope, intercept = theta 123 | assert 19.9 < slope < 20.1, "slope should be about 20" 124 | assert 4.9 < intercept < 5.1, "intercept should be about 5" 125 | 126 | 127 | # Minibatch gradient descent example 128 | 129 | theta = [random.uniform(-1, 1), random.uniform(-1, 1)] 130 | 131 | for epoch in range(1000): 132 | for batch in minibatches(inputs, batch_size=20): 133 | grad = vector_mean([linear_gradient(x, y, theta) for x, y in batch]) 134 | theta = gradient_step(theta, grad, -learning_rate) 135 | print(epoch, theta) 136 | 137 | slope, intercept = theta 138 | assert 19.9 < slope < 20.1, "slope should be about 20" 139 | assert 4.9 < intercept < 5.1, "intercept should be about 5" 140 | 141 | 142 | # Stochastic gradient descent example 143 | 144 | theta = [random.uniform(-1, 1), random.uniform(-1, 1)] 145 | 146 | for epoch in range(100): 147 | for x, y in inputs: 148 | grad = linear_gradient(x, y, theta) 149 | theta = gradient_step(theta, grad, -learning_rate) 150 | print(epoch, theta) 151 | 152 | slope, intercept = theta 153 | assert 19.9 < slope < 20.1, "slope should be about 20" 154 | assert 4.9 < intercept < 5.1, "intercept should be about 5" 155 | 156 | if __name__ == "__main__": main() -------------------------------------------------------------------------------- /first-edition/code-python3/stats.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from linear_algebra import sum_of_squares, 
dot 3 | import math 4 | 5 | num_friends = [100,49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 6 | 7 | def make_friend_counts_histogram(plt): 8 | friend_counts = Counter(num_friends) 9 | xs = range(101) 10 | ys = [friend_counts[x] for x in xs] 11 | plt.bar(xs, ys) 12 | plt.axis([0,101,0,25]) 13 | plt.title("Histogram of Friend Counts") 14 | plt.xlabel("# of friends") 15 | plt.ylabel("# of people") 16 | plt.show() 17 | 18 | num_points = len(num_friends) # 204 19 | 20 | largest_value = max(num_friends) # 100 21 | smallest_value = min(num_friends) # 1 22 | 23 | sorted_values = sorted(num_friends) 24 | smallest_value = sorted_values[0] # 1 25 | second_smallest_value = sorted_values[1] # 1 26 | second_largest_value = sorted_values[-2] # 49 27 | 28 | # this isn't right if you don't from __future__ import division 29 | def mean(x): 30 | return sum(x) / len(x) 31 | 32 | def median(v): 33 | """finds the 'middle-most' value of v""" 34 | n = len(v) 35 | sorted_v = sorted(v) 36 | midpoint = n // 2 37 | 38 | if n % 2 == 1: 39 | # if odd, return the middle value 40 | return sorted_v[midpoint] 41 | else: 42 | # if even, return the average of the middle values 43 | lo = midpoint - 1 44 | hi = midpoint 45 | return (sorted_v[lo] + sorted_v[hi]) / 2 46 | 47 | def quantile(x, p): 48 | """returns the pth-percentile value in x""" 49 | p_index = int(p * len(x)) 50 | return sorted(x)[p_index] 51 | 52 | def mode(x): 53 | """returns a list, might be more than one mode""" 54 | counts = Counter(x) 55 | max_count = max(counts.values()) 56 | return [x_i for x_i, count in counts.items() 57 | if count == max_count] 58 | 59 | # "range" already means something in Python, so we'll use a different name 60 | def data_range(x): 61 | return max(x) - min(x) 62 | 63 | def de_mean(x): 64 | """translate x by subtracting its mean (so the result has mean 0)""" 65 | x_bar = mean(x) 66 | return [x_i - x_bar for x_i in x] 67 | 68 | def variance(x): 69 | """assumes x has at least two elements""" 70 | n = len(x) 71 | deviations = de_mean(x) 72 | return sum_of_squares(deviations) / (n - 1) 73 | 74 | def standard_deviation(x): 75 | return math.sqrt(variance(x)) 76 | 77 | def interquartile_range(x): 78 | return quantile(x, 0.75) - quantile(x, 0.25) 79 | 80 | #### 81 | # 82 | # CORRELATION 83 | # 84 | ##### 85 | 86 | daily_minutes = 
[1,68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84] 87 | 88 | def covariance(x, y): 89 | n = len(x) 90 | return dot(de_mean(x), de_mean(y)) / (n - 1) 91 | 92 | def correlation(x, y): 93 | stdev_x = standard_deviation(x) 94 | stdev_y = standard_deviation(y) 95 | if stdev_x > 0 and stdev_y > 0: 96 | return covariance(x, y) / stdev_x / stdev_y 97 | else: 98 | return 0 # if no variation, correlation is zero 99 | 100 | outlier = num_friends.index(100) # index of outlier 101 | 102 | num_friends_good = [x 103 | for i, x in enumerate(num_friends) 104 | if i != outlier] 105 | 106 | daily_minutes_good = [x 107 | for i, x in enumerate(daily_minutes) 108 | if i != outlier] 109 | 110 | 111 | 112 | if __name__ == "__main__": 113 | 114 | print("num_points", len(num_friends)) 115 | print("largest value", max(num_friends)) 116 | print("smallest value", min(num_friends)) 117 | print("second_smallest_value", sorted_values[1]) 118 | print("second_largest_value", sorted_values[-2] ) 119 | print("mean(num_friends)", mean(num_friends)) 120 | print("median(num_friends)", median(num_friends)) 121 | print("quantile(num_friends, 0.10)", quantile(num_friends, 0.10)) 122 | print("quantile(num_friends, 0.25)", quantile(num_friends, 0.25)) 123 | print("quantile(num_friends, 0.75)", quantile(num_friends, 0.75)) 124 | print("quantile(num_friends, 0.90)", quantile(num_friends, 0.90)) 125 | print("mode(num_friends)", mode(num_friends)) 126 | print("data_range(num_friends)", data_range(num_friends)) 127 | print("variance(num_friends)", variance(num_friends)) 128 | print("standard_deviation(num_friends)", standard_deviation(num_friends)) 129 | print("interquartile_range(num_friends)", interquartile_range(num_friends)) 130 | 131 | print("covariance(num_friends, daily_minutes)", covariance(num_friends, daily_minutes)) 132 | print("correlation(num_friends, daily_minutes)", correlation(num_friends, daily_minutes)) 133 | print("correlation(num_friends_good, daily_minutes_good)", correlation(num_friends_good, daily_minutes_good)) 134 | -------------------------------------------------------------------------------- /first-edition/code/statistics.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter 3 | from linear_algebra import sum_of_squares, dot 4 | import math 5 | 6 | num_friends = 
[100,49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 7 | 8 | def make_friend_counts_histogram(plt): 9 | friend_counts = Counter(num_friends) 10 | xs = range(101) 11 | ys = [friend_counts[x] for x in xs] 12 | plt.bar(xs, ys) 13 | plt.axis([0,101,0,25]) 14 | plt.title("Histogram of Friend Counts") 15 | plt.xlabel("# of friends") 16 | plt.ylabel("# of people") 17 | plt.show() 18 | 19 | num_points = len(num_friends) # 204 20 | 21 | largest_value = max(num_friends) # 100 22 | smallest_value = min(num_friends) # 1 23 | 24 | sorted_values = sorted(num_friends) 25 | smallest_value = sorted_values[0] # 1 26 | second_smallest_value = sorted_values[1] # 1 27 | second_largest_value = sorted_values[-2] # 49 28 | 29 | # this isn't right if you don't from __future__ import division 30 | def mean(x): 31 | return sum(x) / len(x) 32 | 33 | def median(v): 34 | """finds the 'middle-most' value of v""" 35 | n = len(v) 36 | sorted_v = sorted(v) 37 | midpoint = n // 2 38 | 39 | if n % 2 == 1: 40 | # if odd, return the middle value 41 | return sorted_v[midpoint] 42 | else: 43 | # if even, return the average of the middle values 44 | lo = midpoint - 1 45 | hi = midpoint 46 | return (sorted_v[lo] + sorted_v[hi]) / 2 47 | 48 | def quantile(x, p): 49 | """returns the pth-percentile value in x""" 50 | p_index = int(p * len(x)) 51 | return sorted(x)[p_index] 52 | 53 | def mode(x): 54 | """returns a list, might be more than one mode""" 55 | counts = Counter(x) 56 | max_count = max(counts.values()) 57 | return [x_i for x_i, count in counts.iteritems() 58 | if count == max_count] 59 | 60 | # "range" already means something in Python, so we'll use a different name 61 | def data_range(x): 62 | return max(x) - min(x) 63 | 64 | def de_mean(x): 65 | """translate x by subtracting its mean (so the result has mean 0)""" 66 | x_bar = mean(x) 67 | return [x_i - x_bar for x_i in x] 68 | 69 | def variance(x): 70 | """assumes x has at least two elements""" 71 | n = len(x) 72 | deviations = de_mean(x) 73 | return sum_of_squares(deviations) / (n - 1) 74 | 75 | def standard_deviation(x): 76 | return math.sqrt(variance(x)) 77 | 78 | def interquartile_range(x): 79 | return quantile(x, 0.75) - quantile(x, 0.25) 80 | 81 | #### 82 | # 83 | # CORRELATION 84 | # 85 | ##### 86 | 87 | daily_minutes = 
[1,68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84] 88 | 89 | def covariance(x, y): 90 | n = len(x) 91 | return dot(de_mean(x), de_mean(y)) / (n - 1) 92 | 93 | def correlation(x, y): 94 | stdev_x = standard_deviation(x) 95 | stdev_y = standard_deviation(y) 96 | if stdev_x > 0 and stdev_y > 0: 97 | return covariance(x, y) / stdev_x / stdev_y 98 | else: 99 | return 0 # if no variation, correlation is zero 100 | 101 | outlier = num_friends.index(100) # index of outlier 102 | 103 | num_friends_good = [x 104 | for i, x in enumerate(num_friends) 105 | if i != outlier] 106 | 107 | daily_minutes_good = [x 108 | for i, x in enumerate(daily_minutes) 109 | if i != outlier] 110 | 111 | 112 | 113 | if __name__ == "__main__": 114 | 115 | print "num_points", len(num_friends) 116 | print "largest value", max(num_friends) 117 | print "smallest value", min(num_friends) 118 | print "second_smallest_value", sorted_values[1] 119 | print "second_largest_value", sorted_values[-2] 120 | print "mean(num_friends)", mean(num_friends) 121 | print "median(num_friends)", median(num_friends) 122 | print "quantile(num_friends, 0.10)", quantile(num_friends, 0.10) 123 | print "quantile(num_friends, 0.25)", quantile(num_friends, 0.25) 124 | print "quantile(num_friends, 0.75)", quantile(num_friends, 0.75) 125 | print "quantile(num_friends, 0.90)", quantile(num_friends, 0.90) 126 | print "mode(num_friends)", mode(num_friends) 127 | print "data_range(num_friends)", data_range(num_friends) 128 | print "variance(num_friends)", variance(num_friends) 129 | print "standard_deviation(num_friends)", standard_deviation(num_friends) 130 | print "interquartile_range(num_friends)", interquartile_range(num_friends) 131 | 132 | print "covariance(num_friends, daily_minutes)", covariance(num_friends, daily_minutes) 133 | print "correlation(num_friends, daily_minutes)", correlation(num_friends, daily_minutes) 134 | print "correlation(num_friends_good, daily_minutes_good)", correlation(num_friends_good, daily_minutes_good) 135 | -------------------------------------------------------------------------------- /scratch/linear_algebra.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | Vector = List[float] 4 | 5 | height_weight_age = [70, # inches, 6 | 170, # pounds, 7 | 40 ] # years 8 | 9 | grades = [95, # exam1 10 | 80, # exam2 11 | 75, # exam3 12 | 62 ] # exam4 13 | 
14 | def add(v: Vector, w: Vector) -> Vector: 15 | """Adds corresponding elements""" 16 | assert len(v) == len(w), "vectors must be the same length" 17 | 18 | return [v_i + w_i for v_i, w_i in zip(v, w)] 19 | 20 | assert add([1, 2, 3], [4, 5, 6]) == [5, 7, 9] 21 | 22 | def subtract(v: Vector, w: Vector) -> Vector: 23 | """Subtracts corresponding elements""" 24 | assert len(v) == len(w), "vectors must be the same length" 25 | 26 | return [v_i - w_i for v_i, w_i in zip(v, w)] 27 | 28 | assert subtract([5, 7, 9], [4, 5, 6]) == [1, 2, 3] 29 | 30 | def vector_sum(vectors: List[Vector]) -> Vector: 31 | """Sums all corresponding elements""" 32 | # Check that vectors is not empty 33 | assert vectors, "no vectors provided!" 34 | 35 | # Check the vectors are all the same size 36 | num_elements = len(vectors[0]) 37 | assert all(len(v) == num_elements for v in vectors), "different sizes!" 38 | 39 | # the i-th element of the result is the sum of every vector[i] 40 | return [sum(vector[i] for vector in vectors) 41 | for i in range(num_elements)] 42 | 43 | assert vector_sum([[1, 2], [3, 4], [5, 6], [7, 8]]) == [16, 20] 44 | 45 | def scalar_multiply(c: float, v: Vector) -> Vector: 46 | """Multiplies every element by c""" 47 | return [c * v_i for v_i in v] 48 | 49 | assert scalar_multiply(2, [1, 2, 3]) == [2, 4, 6] 50 | 51 | def vector_mean(vectors: List[Vector]) -> Vector: 52 | """Computes the element-wise average""" 53 | n = len(vectors) 54 | return scalar_multiply(1/n, vector_sum(vectors)) 55 | 56 | assert vector_mean([[1, 2], [3, 4], [5, 6]]) == [3, 4] 57 | 58 | def dot(v: Vector, w: Vector) -> float: 59 | """Computes v_1 * w_1 + ... + v_n * w_n""" 60 | assert len(v) == len(w), "vectors must be same length" 61 | 62 | return sum(v_i * w_i for v_i, w_i in zip(v, w)) 63 | 64 | assert dot([1, 2, 3], [4, 5, 6]) == 32 # 1 * 4 + 2 * 5 + 3 * 6 65 | 66 | def sum_of_squares(v: Vector) -> float: 67 | """Returns v_1 * v_1 + ... + v_n * v_n""" 68 | return dot(v, v) 69 | 70 | assert sum_of_squares([1, 2, 3]) == 14 # 1 * 1 + 2 * 2 + 3 * 3 71 | 72 | import math 73 | 74 | def magnitude(v: Vector) -> float: 75 | """Returns the magnitude (or length) of v""" 76 | return math.sqrt(sum_of_squares(v)) # math.sqrt is square root function 77 | 78 | assert magnitude([3, 4]) == 5 79 | 80 | def squared_distance(v: Vector, w: Vector) -> float: 81 | """Computes (v_1 - w_1) ** 2 + ... 
+ (v_n - w_n) ** 2""" 82 | return sum_of_squares(subtract(v, w)) 83 | 84 | def distance(v: Vector, w: Vector) -> float: 85 | """Computes the distance between v and w""" 86 | return math.sqrt(squared_distance(v, w)) 87 | 88 | 89 | def distance(v: Vector, w: Vector) -> float: # type: ignore 90 | return magnitude(subtract(v, w)) 91 | 92 | # Another type alias 93 | Matrix = List[List[float]] 94 | 95 | A = [[1, 2, 3], # A has 2 rows and 3 columns 96 | [4, 5, 6]] 97 | 98 | B = [[1, 2], # B has 3 rows and 2 columns 99 | [3, 4], 100 | [5, 6]] 101 | 102 | from typing import Tuple 103 | 104 | def shape(A: Matrix) -> Tuple[int, int]: 105 | """Returns (# of rows of A, # of columns of A)""" 106 | num_rows = len(A) 107 | num_cols = len(A[0]) if A else 0 # number of elements in first row 108 | return num_rows, num_cols 109 | 110 | assert shape([[1, 2, 3], [4, 5, 6]]) == (2, 3) # 2 rows, 3 columns 111 | 112 | def get_row(A: Matrix, i: int) -> Vector: 113 | """Returns the i-th row of A (as a Vector)""" 114 | return A[i] # A[i] is already the ith row 115 | 116 | def get_column(A: Matrix, j: int) -> Vector: 117 | """Returns the j-th column of A (as a Vector)""" 118 | return [A_i[j] # jth element of row A_i 119 | for A_i in A] # for each row A_i 120 | 121 | from typing import Callable 122 | 123 | def make_matrix(num_rows: int, 124 | num_cols: int, 125 | entry_fn: Callable[[int, int], float]) -> Matrix: 126 | """ 127 | Returns a num_rows x num_cols matrix 128 | whose (i,j)-th entry is entry_fn(i, j) 129 | """ 130 | return [[entry_fn(i, j) # given i, create a list 131 | for j in range(num_cols)] # [entry_fn(i, 0), ... ] 132 | for i in range(num_rows)] # create one list for each i 133 | 134 | def identity_matrix(n: int) -> Matrix: 135 | """Returns the n x n identity matrix""" 136 | return make_matrix(n, n, lambda i, j: 1 if i == j else 0) 137 | 138 | assert identity_matrix(5) == [[1, 0, 0, 0, 0], 139 | [0, 1, 0, 0, 0], 140 | [0, 0, 1, 0, 0], 141 | [0, 0, 0, 1, 0], 142 | [0, 0, 0, 0, 1]] 143 | 144 | data = [[70, 170, 40], 145 | [65, 120, 26], 146 | [77, 250, 19], 147 | # .... 
148 | ] 149 | 150 | friendships = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4), 151 | (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)] 152 | 153 | # user 0 1 2 3 4 5 6 7 8 9 154 | # 155 | friend_matrix = [[0, 1, 1, 0, 0, 0, 0, 0, 0, 0], # user 0 156 | [1, 0, 1, 1, 0, 0, 0, 0, 0, 0], # user 1 157 | [1, 1, 0, 1, 0, 0, 0, 0, 0, 0], # user 2 158 | [0, 1, 1, 0, 1, 0, 0, 0, 0, 0], # user 3 159 | [0, 0, 0, 1, 0, 1, 0, 0, 0, 0], # user 4 160 | [0, 0, 0, 0, 1, 0, 1, 1, 0, 0], # user 5 161 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 6 162 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 7 163 | [0, 0, 0, 0, 0, 0, 1, 1, 0, 1], # user 8 164 | [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]] # user 9 165 | 166 | assert friend_matrix[0][2] == 1, "0 and 2 are friends" 167 | assert friend_matrix[0][8] == 0, "0 and 8 are not friends" 168 | 169 | # only need to look at one row 170 | friends_of_five = [i 171 | for i, is_friend in enumerate(friend_matrix[5]) 172 | if is_friend] 173 | 174 | -------------------------------------------------------------------------------- /first-edition/code-python3/decision_trees.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | from functools import partial 3 | import math, random 4 | 5 | def entropy(class_probabilities): 6 | """given a list of class probabilities, compute the entropy""" 7 | return sum(-p * math.log(p, 2) for p in class_probabilities if p) 8 | 9 | def class_probabilities(labels): 10 | total_count = len(labels) 11 | return [count / total_count 12 | for count in Counter(labels).values()] 13 | 14 | def data_entropy(labeled_data): 15 | labels = [label for _, label in labeled_data] 16 | probabilities = class_probabilities(labels) 17 | return entropy(probabilities) 18 | 19 | def partition_entropy(subsets): 20 | """find the entropy from this partition of data into subsets""" 21 | total_count = sum(len(subset) for subset in subsets) 22 | 23 | return sum( data_entropy(subset) * len(subset) / total_count 24 | for subset in subsets ) 25 | 26 | def group_by(items, key_fn): 27 | """returns a defaultdict(list), where each input item 28 | is in the list whose key is key_fn(item)""" 29 | groups = defaultdict(list) 30 | for item in items: 31 | key = key_fn(item) 32 | groups[key].append(item) 33 | return groups 34 | 35 | def partition_by(inputs, attribute): 36 | """returns a dict of inputs partitioned by the attribute 37 | each input is a pair (attribute_dict, label)""" 38 | return group_by(inputs, lambda x: x[0][attribute]) 39 | 40 | def partition_entropy_by(inputs,attribute): 41 | """computes the entropy corresponding to the given partition""" 42 | partitions = partition_by(inputs, attribute) 43 | return partition_entropy(partitions.values()) 44 | 45 | def classify(tree, input): 46 | """classify the input using the given decision tree""" 47 | 48 | # if this is a leaf node, return its value 49 | if tree in [True, False]: 50 | return tree 51 | 52 | # otherwise find the correct subtree 53 | attribute, subtree_dict = tree 54 | 55 | subtree_key = input.get(attribute) # None if input is missing attribute 56 | 57 | if subtree_key not in subtree_dict: # if no subtree for key, 58 | subtree_key = None # we'll use the None subtree 59 | 60 | subtree = subtree_dict[subtree_key] # choose the appropriate subtree 61 | return classify(subtree, input) # and use it to classify the input 62 | 63 | def build_tree_id3(inputs, split_candidates=None): 64 | 65 | # if this is our first pass, 66 | # all keys of the first input are split 
candidates 67 | if split_candidates is None: 68 | split_candidates = inputs[0][0].keys() 69 | 70 | # count Trues and Falses in the inputs 71 | num_inputs = len(inputs) 72 | num_trues = len([label for item, label in inputs if label]) 73 | num_falses = num_inputs - num_trues 74 | 75 | if num_trues == 0: # if only Falses are left 76 | return False # return a "False" leaf 77 | 78 | if num_falses == 0: # if only Trues are left 79 | return True # return a "True" leaf 80 | 81 | if not split_candidates: # if no split candidates left 82 | return num_trues >= num_falses # return the majority leaf 83 | 84 | # otherwise, split on the best attribute 85 | best_attribute = min(split_candidates, 86 | key=partial(partition_entropy_by, inputs)) 87 | 88 | partitions = partition_by(inputs, best_attribute) 89 | new_candidates = [a for a in split_candidates 90 | if a != best_attribute] 91 | 92 | # recursively build the subtrees 93 | subtrees = { attribute : build_tree_id3(subset, new_candidates) 94 | for attribute, subset in partitions.items() } 95 | 96 | subtrees[None] = num_trues > num_falses # default case 97 | 98 | return (best_attribute, subtrees) 99 | 100 | def forest_classify(trees, input): 101 | votes = [classify(tree, input) for tree in trees] 102 | vote_counts = Counter(votes) 103 | return vote_counts.most_common(1)[0][0] 104 | 105 | 106 | if __name__ == "__main__": 107 | 108 | inputs = [ 109 | ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'}, False), 110 | ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'}, False), 111 | ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'}, True), 112 | ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'}, True), 113 | ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'}, True), 114 | ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'}, False), 115 | ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'}, True), 116 | ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False), 117 | ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'}, True), 118 | ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True), 119 | ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True), 120 | ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'}, True), 121 | ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'}, True), 122 | ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False) 123 | ] 124 | 125 | for key in ['level','lang','tweets','phd']: 126 | print(key, partition_entropy_by(inputs, key)) 127 | print() 128 | 129 | senior_inputs = [(input, label) 130 | for input, label in inputs if input["level"] == "Senior"] 131 | 132 | for key in ['lang', 'tweets', 'phd']: 133 | print(key, partition_entropy_by(senior_inputs, key)) 134 | print() 135 | 136 | print("building the tree") 137 | tree = build_tree_id3(inputs) 138 | print(tree) 139 | 140 | print("Junior / Java / tweets / no phd", classify(tree, 141 | { "level" : "Junior", 142 | "lang" : "Java", 143 | "tweets" : "yes", 144 | "phd" : "no"} )) 145 | 146 | print("Junior / Java / tweets / phd", classify(tree, 147 | { "level" : "Junior", 148 | "lang" : "Java", 149 | "tweets" : "yes", 150 | "phd" : "yes"} )) 151 | 152 | print("Intern", classify(tree, { "level" : "Intern" } )) 153 | print("Senior", classify(tree, { "level" : "Senior" } )) 154 | -------------------------------------------------------------------------------- /first-edition/code-python3/logistic_regression.py: 
-------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from functools import partial, reduce 3 | from linear_algebra import dot, vector_add 4 | from gradient_descent import maximize_stochastic, maximize_batch 5 | from working_with_data import rescale 6 | from machine_learning import train_test_split 7 | from multiple_regression import estimate_beta, predict 8 | import math, random 9 | 10 | def logistic(x): 11 | return 1.0 / (1 + math.exp(-x)) 12 | 13 | def logistic_prime(x): 14 | return logistic(x) * (1 - logistic(x)) 15 | 16 | def logistic_log_likelihood_i(x_i, y_i, beta): 17 | if y_i == 1: 18 | return math.log(logistic(dot(x_i, beta))) 19 | else: 20 | return math.log(1 - logistic(dot(x_i, beta))) 21 | 22 | def logistic_log_likelihood(x, y, beta): 23 | return sum(logistic_log_likelihood_i(x_i, y_i, beta) 24 | for x_i, y_i in zip(x, y)) 25 | 26 | def logistic_log_partial_ij(x_i, y_i, beta, j): 27 | """here i is the index of the data point, 28 | j the index of the derivative""" 29 | 30 | return (y_i - logistic(dot(x_i, beta))) * x_i[j] 31 | 32 | def logistic_log_gradient_i(x_i, y_i, beta): 33 | """the gradient of the log likelihood 34 | corresponding to the i-th data point""" 35 | 36 | return [logistic_log_partial_ij(x_i, y_i, beta, j) 37 | for j, _ in enumerate(beta)] 38 | 39 | def logistic_log_gradient(x, y, beta): 40 | return reduce(vector_add, 41 | [logistic_log_gradient_i(x_i, y_i, beta) 42 | for x_i, y_i in zip(x,y)]) 43 | 44 | if __name__ == "__main__": 45 | 46 | data = [(0.7,48000,1),(1.9,48000,0),(2.5,60000,1),(4.2,63000,0),(6,76000,0),(6.5,69000,0),(7.5,76000,0),(8.1,88000,0),(8.7,83000,1),(10,83000,1),(0.8,43000,0),(1.8,60000,0),(10,79000,1),(6.1,76000,0),(1.4,50000,0),(9.1,92000,0),(5.8,75000,0),(5.2,69000,0),(1,56000,0),(6,67000,0),(4.9,74000,0),(6.4,63000,1),(6.2,82000,0),(3.3,58000,0),(9.3,90000,1),(5.5,57000,1),(9.1,102000,0),(2.4,54000,0),(8.2,65000,1),(5.3,82000,0),(9.8,107000,0),(1.8,64000,0),(0.6,46000,1),(0.8,48000,0),(8.6,84000,1),(0.6,45000,0),(0.5,30000,1),(7.3,89000,0),(2.5,48000,1),(5.6,76000,0),(7.4,77000,0),(2.7,56000,0),(0.7,48000,0),(1.2,42000,0),(0.2,32000,1),(4.7,56000,1),(2.8,44000,1),(7.6,78000,0),(1.1,63000,0),(8,79000,1),(2.7,56000,0),(6,52000,1),(4.6,56000,0),(2.5,51000,0),(5.7,71000,0),(2.9,65000,0),(1.1,33000,1),(3,62000,0),(4,71000,0),(2.4,61000,0),(7.5,75000,0),(9.7,81000,1),(3.2,62000,0),(7.9,88000,0),(4.7,44000,1),(2.5,55000,0),(1.6,41000,0),(6.7,64000,1),(6.9,66000,1),(7.9,78000,1),(8.1,102000,0),(5.3,48000,1),(8.5,66000,1),(0.2,56000,0),(6,69000,0),(7.5,77000,0),(8,86000,0),(4.4,68000,0),(4.9,75000,0),(1.5,60000,0),(2.2,50000,0),(3.4,49000,1),(4.2,70000,0),(7.7,98000,0),(8.2,85000,0),(5.4,88000,0),(0.1,46000,0),(1.5,37000,0),(6.3,86000,0),(3.7,57000,0),(8.4,85000,0),(2,42000,0),(5.8,69000,1),(2.7,64000,0),(3.1,63000,0),(1.9,48000,0),(10,72000,1),(0.2,45000,0),(8.6,95000,0),(1.5,64000,0),(9.8,95000,0),(5.3,65000,0),(7.5,80000,0),(9.9,91000,0),(9.7,50000,1),(2.8,68000,0),(3.6,58000,0),(3.9,74000,0),(4.4,76000,0),(2.5,49000,0),(7.2,81000,0),(5.2,60000,1),(2.4,62000,0),(8.9,94000,0),(2.4,63000,0),(6.8,69000,1),(6.5,77000,0),(7,86000,0),(9.4,94000,0),(7.8,72000,1),(0.2,53000,0),(10,97000,0),(5.5,65000,0),(7.7,71000,1),(8.1,66000,1),(9.8,91000,0),(8,84000,0),(2.7,55000,0),(2.8,62000,0),(9.4,79000,0),(2.5,57000,0),(7.4,70000,1),(2.1,47000,0),(5.3,62000,1),(6.3,79000,0),(6.8,58000,1),(5.7,80000,0),(2.2,61000,0),(4.8,62000,0),(3.7,64000,0),(4.1,85000,0),(2.3,51000,0),(3.5,58000,0),(0.9,43000
,0),(0.9,54000,0),(4.5,74000,0),(6.5,55000,1),(4.1,41000,1),(7.1,73000,0),(1.1,66000,0),(9.1,81000,1),(8,69000,1),(7.3,72000,1),(3.3,50000,0),(3.9,58000,0),(2.6,49000,0),(1.6,78000,0),(0.7,56000,0),(2.1,36000,1),(7.5,90000,0),(4.8,59000,1),(8.9,95000,0),(6.2,72000,0),(6.3,63000,0),(9.1,100000,0),(7.3,61000,1),(5.6,74000,0),(0.5,66000,0),(1.1,59000,0),(5.1,61000,0),(6.2,70000,0),(6.6,56000,1),(6.3,76000,0),(6.5,78000,0),(5.1,59000,0),(9.5,74000,1),(4.5,64000,0),(2,54000,0),(1,52000,0),(4,69000,0),(6.5,76000,0),(3,60000,0),(4.5,63000,0),(7.8,70000,0),(3.9,60000,1),(0.8,51000,0),(4.2,78000,0),(1.1,54000,0),(6.2,60000,0),(2.9,59000,0),(2.1,52000,0),(8.2,87000,0),(4.8,73000,0),(2.2,42000,1),(9.1,98000,0),(6.5,84000,0),(6.9,73000,0),(5.1,72000,0),(9.1,69000,1),(9.8,79000,1),] 47 | data = list(map(list, data)) # change tuples to lists 48 | 49 | x = [[1] + row[:2] for row in data] # each element is [1, experience, salary] 50 | y = [row[2] for row in data] # each element is paid_account 51 | 52 | print("linear regression:") 53 | 54 | rescaled_x = rescale(x) 55 | beta = estimate_beta(rescaled_x, y) 56 | print(beta) 57 | 58 | print("logistic regression:") 59 | 60 | random.seed(0) 61 | x_train, x_test, y_train, y_test = train_test_split(rescaled_x, y, 0.33) 62 | 63 | # want to maximize log likelihood on the training data 64 | fn = partial(logistic_log_likelihood, x_train, y_train) 65 | gradient_fn = partial(logistic_log_gradient, x_train, y_train) 66 | 67 | # pick a random starting point 68 | beta_0 = [1, 1, 1] 69 | 70 | # and maximize using gradient descent 71 | beta_hat = maximize_batch(fn, gradient_fn, beta_0) 72 | 73 | print("beta_batch", beta_hat) 74 | 75 | beta_0 = [1, 1, 1] 76 | beta_hat = maximize_stochastic(logistic_log_likelihood_i, 77 | logistic_log_gradient_i, 78 | x_train, y_train, beta_0) 79 | 80 | print("beta stochastic", beta_hat) 81 | 82 | true_positives = false_positives = true_negatives = false_negatives = 0 83 | 84 | for x_i, y_i in zip(x_test, y_test): 85 | predict = logistic(dot(beta_hat, x_i)) 86 | 87 | if y_i == 1 and predict >= 0.5: # TP: paid and we predict paid 88 | true_positives += 1 89 | elif y_i == 1: # FN: paid and we predict unpaid 90 | false_negatives += 1 91 | elif predict >= 0.5: # FP: unpaid and we predict paid 92 | false_positives += 1 93 | else: # TN: unpaid and we predict unpaid 94 | true_negatives += 1 95 | 96 | precision = true_positives / (true_positives + false_positives) 97 | recall = true_positives / (true_positives + false_negatives) 98 | 99 | print("precision", precision) 100 | print("recall", recall) 101 | -------------------------------------------------------------------------------- /first-edition/code-python3/mapreduce.py: -------------------------------------------------------------------------------- 1 | import math, random, re, datetime 2 | from collections import defaultdict, Counter 3 | from functools import partial 4 | from naive_bayes import tokenize 5 | 6 | def word_count_old(documents): 7 | """word count not using MapReduce""" 8 | return Counter(word 9 | for document in documents 10 | for word in tokenize(document)) 11 | 12 | def wc_mapper(document): 13 | """for each word in the document, emit (word,1)""" 14 | for word in tokenize(document): 15 | yield (word, 1) 16 | 17 | def wc_reducer(word, counts): 18 | """sum up the counts for a word""" 19 | yield (word, sum(counts)) 20 | 21 | def word_count(documents): 22 | """count the words in the input documents using MapReduce""" 23 | 24 | # place to store grouped values 25 | collector = 
defaultdict(list) 26 | 27 | for document in documents: 28 | for word, count in wc_mapper(document): 29 | collector[word].append(count) 30 | 31 | return [output 32 | for word, counts in collector.items() 33 | for output in wc_reducer(word, counts)] 34 | 35 | def map_reduce(inputs, mapper, reducer): 36 | """runs MapReduce on the inputs using mapper and reducer""" 37 | collector = defaultdict(list) 38 | 39 | for input in inputs: 40 | for key, value in mapper(input): 41 | collector[key].append(value) 42 | 43 | return [output 44 | for key, values in collector.items() 45 | for output in reducer(key,values)] 46 | 47 | def reduce_with(aggregation_fn, key, values): 48 | """reduces a key-values pair by applying aggregation_fn to the values""" 49 | yield (key, aggregation_fn(values)) 50 | 51 | def values_reducer(aggregation_fn): 52 | """turns a function (values -> output) into a reducer""" 53 | return partial(reduce_with, aggregation_fn) 54 | 55 | sum_reducer = values_reducer(sum) 56 | max_reducer = values_reducer(max) 57 | min_reducer = values_reducer(min) 58 | count_distinct_reducer = values_reducer(lambda values: len(set(values))) 59 | 60 | # 61 | # Analyzing Status Updates 62 | # 63 | 64 | status_updates = [ 65 | {"id": 1, 66 | "username" : "joelgrus", 67 | "text" : "Is anyone interested in a data science book?", 68 | "created_at" : datetime.datetime(2013, 12, 21, 11, 47, 0), 69 | "liked_by" : ["data_guy", "data_gal", "bill"] }, 70 | # add your own 71 | ] 72 | 73 | def data_science_day_mapper(status_update): 74 | """yields (day_of_week, 1) if status_update contains "data science" """ 75 | if "data science" in status_update["text"].lower(): 76 | day_of_week = status_update["created_at"].weekday() 77 | yield (day_of_week, 1) 78 | 79 | data_science_days = map_reduce(status_updates, 80 | data_science_day_mapper, 81 | sum_reducer) 82 | 83 | def words_per_user_mapper(status_update): 84 | user = status_update["username"] 85 | for word in tokenize(status_update["text"]): 86 | yield (user, (word, 1)) 87 | 88 | def most_popular_word_reducer(user, words_and_counts): 89 | """given a sequence of (word, count) pairs, 90 | return the word with the highest total count""" 91 | 92 | word_counts = Counter() 93 | for word, count in words_and_counts: 94 | word_counts[word] += count 95 | 96 | word, count = word_counts.most_common(1)[0] 97 | 98 | yield (user, (word, count)) 99 | 100 | user_words = map_reduce(status_updates, 101 | words_per_user_mapper, 102 | most_popular_word_reducer) 103 | 104 | def liker_mapper(status_update): 105 | user = status_update["username"] 106 | for liker in status_update["liked_by"]: 107 | yield (user, liker) 108 | 109 | distinct_likers_per_user = map_reduce(status_updates, 110 | liker_mapper, 111 | count_distinct_reducer) 112 | 113 | 114 | # 115 | # matrix multiplication 116 | # 117 | 118 | def matrix_multiply_mapper(m, element): 119 | """m is the common dimension (columns of A, rows of B) 120 | element is a tuple (matrix_name, i, j, value)""" 121 | matrix, i, j, value = element 122 | 123 | if matrix == "A": 124 | for column in range(m): 125 | # A_ij is the jth entry in the sum for each C_i_column 126 | yield((i, column), (j, value)) 127 | else: 128 | for row in range(m): 129 | # B_ij is the ith entry in the sum for each C_row_j 130 | yield((row, j), (i, value)) 131 | 132 | def matrix_multiply_reducer(m, key, indexed_values): 133 | results_by_index = defaultdict(list) 134 | for index, value in indexed_values: 135 | results_by_index[index].append(value) 136 | 137 | # sum up all the products 
of the positions with two results 138 | sum_product = sum(results[0] * results[1] 139 | for results in results_by_index.values() 140 | if len(results) == 2) 141 | 142 | if sum_product != 0.0: 143 | yield (key, sum_product) 144 | 145 | if __name__ == "__main__": 146 | 147 | documents = ["data science", "big data", "science fiction"] 148 | 149 | wc_mapper_results = [result 150 | for document in documents 151 | for result in wc_mapper(document)] 152 | 153 | print("wc_mapper results") 154 | print(wc_mapper_results) 155 | print() 156 | 157 | print("word count results") 158 | print(word_count(documents)) 159 | print() 160 | 161 | print("word count using map_reduce function") 162 | print(map_reduce(documents, wc_mapper, wc_reducer)) 163 | print() 164 | 165 | print("data science days") 166 | print(data_science_days) 167 | print() 168 | 169 | print("user words") 170 | print(user_words) 171 | print() 172 | 173 | print("distinct likers") 174 | print(distinct_likers_per_user) 175 | print() 176 | 177 | # matrix multiplication 178 | 179 | entries = [("A", 0, 0, 3), ("A", 0, 1, 2), 180 | ("B", 0, 0, 4), ("B", 0, 1, -1), ("B", 1, 0, 10)] 181 | mapper = partial(matrix_multiply_mapper, 3) 182 | reducer = partial(matrix_multiply_reducer, 3) 183 | 184 | print("map-reduce matrix multiplication") 185 | print("entries:", entries) 186 | print("result:", map_reduce(entries, mapper, reducer)) 187 | -------------------------------------------------------------------------------- /first-edition/code/logistic_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter 3 | from functools import partial 4 | from linear_algebra import dot, vector_add 5 | from gradient_descent import maximize_stochastic, maximize_batch 6 | from working_with_data import rescale 7 | from machine_learning import train_test_split 8 | from multiple_regression import estimate_beta, predict 9 | import math, random 10 | 11 | def logistic(x): 12 | return 1.0 / (1 + math.exp(-x)) 13 | 14 | def logistic_prime(x): 15 | return logistic(x) * (1 - logistic(x)) 16 | 17 | def logistic_log_likelihood_i(x_i, y_i, beta): 18 | if y_i == 1: 19 | return math.log(logistic(dot(x_i, beta))) 20 | else: 21 | return math.log(1 - logistic(dot(x_i, beta))) 22 | 23 | def logistic_log_likelihood(x, y, beta): 24 | return sum(logistic_log_likelihood_i(x_i, y_i, beta) 25 | for x_i, y_i in zip(x, y)) 26 | 27 | def logistic_log_partial_ij(x_i, y_i, beta, j): 28 | """here i is the index of the data point, 29 | j the index of the derivative""" 30 | 31 | return (y_i - logistic(dot(x_i, beta))) * x_i[j] 32 | 33 | def logistic_log_gradient_i(x_i, y_i, beta): 34 | """the gradient of the log likelihood 35 | corresponding to the i-th data point""" 36 | 37 | return [logistic_log_partial_ij(x_i, y_i, beta, j) 38 | for j, _ in enumerate(beta)] 39 | 40 | def logistic_log_gradient(x, y, beta): 41 | return reduce(vector_add, 42 | [logistic_log_gradient_i(x_i, y_i, beta) 43 | for x_i, y_i in zip(x,y)]) 44 | 45 | if __name__ == "__main__": 46 | 47 | data = 
[(0.7,48000,1),(1.9,48000,0),(2.5,60000,1),(4.2,63000,0),(6,76000,0),(6.5,69000,0),(7.5,76000,0),(8.1,88000,0),(8.7,83000,1),(10,83000,1),(0.8,43000,0),(1.8,60000,0),(10,79000,1),(6.1,76000,0),(1.4,50000,0),(9.1,92000,0),(5.8,75000,0),(5.2,69000,0),(1,56000,0),(6,67000,0),(4.9,74000,0),(6.4,63000,1),(6.2,82000,0),(3.3,58000,0),(9.3,90000,1),(5.5,57000,1),(9.1,102000,0),(2.4,54000,0),(8.2,65000,1),(5.3,82000,0),(9.8,107000,0),(1.8,64000,0),(0.6,46000,1),(0.8,48000,0),(8.6,84000,1),(0.6,45000,0),(0.5,30000,1),(7.3,89000,0),(2.5,48000,1),(5.6,76000,0),(7.4,77000,0),(2.7,56000,0),(0.7,48000,0),(1.2,42000,0),(0.2,32000,1),(4.7,56000,1),(2.8,44000,1),(7.6,78000,0),(1.1,63000,0),(8,79000,1),(2.7,56000,0),(6,52000,1),(4.6,56000,0),(2.5,51000,0),(5.7,71000,0),(2.9,65000,0),(1.1,33000,1),(3,62000,0),(4,71000,0),(2.4,61000,0),(7.5,75000,0),(9.7,81000,1),(3.2,62000,0),(7.9,88000,0),(4.7,44000,1),(2.5,55000,0),(1.6,41000,0),(6.7,64000,1),(6.9,66000,1),(7.9,78000,1),(8.1,102000,0),(5.3,48000,1),(8.5,66000,1),(0.2,56000,0),(6,69000,0),(7.5,77000,0),(8,86000,0),(4.4,68000,0),(4.9,75000,0),(1.5,60000,0),(2.2,50000,0),(3.4,49000,1),(4.2,70000,0),(7.7,98000,0),(8.2,85000,0),(5.4,88000,0),(0.1,46000,0),(1.5,37000,0),(6.3,86000,0),(3.7,57000,0),(8.4,85000,0),(2,42000,0),(5.8,69000,1),(2.7,64000,0),(3.1,63000,0),(1.9,48000,0),(10,72000,1),(0.2,45000,0),(8.6,95000,0),(1.5,64000,0),(9.8,95000,0),(5.3,65000,0),(7.5,80000,0),(9.9,91000,0),(9.7,50000,1),(2.8,68000,0),(3.6,58000,0),(3.9,74000,0),(4.4,76000,0),(2.5,49000,0),(7.2,81000,0),(5.2,60000,1),(2.4,62000,0),(8.9,94000,0),(2.4,63000,0),(6.8,69000,1),(6.5,77000,0),(7,86000,0),(9.4,94000,0),(7.8,72000,1),(0.2,53000,0),(10,97000,0),(5.5,65000,0),(7.7,71000,1),(8.1,66000,1),(9.8,91000,0),(8,84000,0),(2.7,55000,0),(2.8,62000,0),(9.4,79000,0),(2.5,57000,0),(7.4,70000,1),(2.1,47000,0),(5.3,62000,1),(6.3,79000,0),(6.8,58000,1),(5.7,80000,0),(2.2,61000,0),(4.8,62000,0),(3.7,64000,0),(4.1,85000,0),(2.3,51000,0),(3.5,58000,0),(0.9,43000,0),(0.9,54000,0),(4.5,74000,0),(6.5,55000,1),(4.1,41000,1),(7.1,73000,0),(1.1,66000,0),(9.1,81000,1),(8,69000,1),(7.3,72000,1),(3.3,50000,0),(3.9,58000,0),(2.6,49000,0),(1.6,78000,0),(0.7,56000,0),(2.1,36000,1),(7.5,90000,0),(4.8,59000,1),(8.9,95000,0),(6.2,72000,0),(6.3,63000,0),(9.1,100000,0),(7.3,61000,1),(5.6,74000,0),(0.5,66000,0),(1.1,59000,0),(5.1,61000,0),(6.2,70000,0),(6.6,56000,1),(6.3,76000,0),(6.5,78000,0),(5.1,59000,0),(9.5,74000,1),(4.5,64000,0),(2,54000,0),(1,52000,0),(4,69000,0),(6.5,76000,0),(3,60000,0),(4.5,63000,0),(7.8,70000,0),(3.9,60000,1),(0.8,51000,0),(4.2,78000,0),(1.1,54000,0),(6.2,60000,0),(2.9,59000,0),(2.1,52000,0),(8.2,87000,0),(4.8,73000,0),(2.2,42000,1),(9.1,98000,0),(6.5,84000,0),(6.9,73000,0),(5.1,72000,0),(9.1,69000,1),(9.8,79000,1),] 48 | data = map(list, data) # change tuples to lists 49 | 50 | x = [[1] + row[:2] for row in data] # each element is [1, experience, salary] 51 | y = [row[2] for row in data] # each element is paid_account 52 | 53 | print "linear regression:" 54 | 55 | rescaled_x = rescale(x) 56 | beta = estimate_beta(rescaled_x, y) 57 | print beta 58 | 59 | print "logistic regression:" 60 | 61 | random.seed(0) 62 | x_train, x_test, y_train, y_test = train_test_split(rescaled_x, y, 0.33) 63 | 64 | # want to maximize log likelihood on the training data 65 | fn = partial(logistic_log_likelihood, x_train, y_train) 66 | gradient_fn = partial(logistic_log_gradient, x_train, y_train) 67 | 68 | # pick a random starting point 69 | beta_0 = [1, 1, 1] 70 | 71 | # and maximize using gradient descent 72 
| beta_hat = maximize_batch(fn, gradient_fn, beta_0) 73 | 74 | print "beta_batch", beta_hat 75 | 76 | beta_0 = [1, 1, 1] 77 | beta_hat = maximize_stochastic(logistic_log_likelihood_i, 78 | logistic_log_gradient_i, 79 | x_train, y_train, beta_0) 80 | 81 | print "beta stochastic", beta_hat 82 | 83 | true_positives = false_positives = true_negatives = false_negatives = 0 84 | 85 | for x_i, y_i in zip(x_test, y_test): 86 | predict = logistic(dot(beta_hat, x_i)) 87 | 88 | if y_i == 1 and predict >= 0.5: # TP: paid and we predict paid 89 | true_positives += 1 90 | elif y_i == 1: # FN: paid and we predict unpaid 91 | false_negatives += 1 92 | elif predict >= 0.5: # FP: unpaid and we predict paid 93 | false_positives += 1 94 | else: # TN: unpaid and we predict unpaid 95 | true_negatives += 1 96 | 97 | precision = true_positives / (true_positives + false_positives) 98 | recall = true_positives / (true_positives + false_negatives) 99 | 100 | print "precision", precision 101 | print "recall", recall -------------------------------------------------------------------------------- /first-edition/code/decision_trees.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter, defaultdict 3 | from functools import partial 4 | import math, random 5 | 6 | def entropy(class_probabilities): 7 | """given a list of class probabilities, compute the entropy""" 8 | return sum(-p * math.log(p, 2) for p in class_probabilities if p) 9 | 10 | def class_probabilities(labels): 11 | total_count = len(labels) 12 | return [count / total_count 13 | for count in Counter(labels).values()] 14 | 15 | def data_entropy(labeled_data): 16 | labels = [label for _, label in labeled_data] 17 | probabilities = class_probabilities(labels) 18 | return entropy(probabilities) 19 | 20 | def partition_entropy(subsets): 21 | """find the entropy from this partition of data into subsets""" 22 | total_count = sum(len(subset) for subset in subsets) 23 | 24 | return sum( data_entropy(subset) * len(subset) / total_count 25 | for subset in subsets ) 26 | 27 | def group_by(items, key_fn): 28 | """returns a defaultdict(list), where each input item 29 | is in the list whose key is key_fn(item)""" 30 | groups = defaultdict(list) 31 | for item in items: 32 | key = key_fn(item) 33 | groups[key].append(item) 34 | return groups 35 | 36 | def partition_by(inputs, attribute): 37 | """returns a dict of inputs partitioned by the attribute 38 | each input is a pair (attribute_dict, label)""" 39 | return group_by(inputs, lambda x: x[0][attribute]) 40 | 41 | def partition_entropy_by(inputs,attribute): 42 | """computes the entropy corresponding to the given partition""" 43 | partitions = partition_by(inputs, attribute) 44 | return partition_entropy(partitions.values()) 45 | 46 | def classify(tree, input): 47 | """classify the input using the given decision tree""" 48 | 49 | # if this is a leaf node, return its value 50 | if tree in [True, False]: 51 | return tree 52 | 53 | # otherwise find the correct subtree 54 | attribute, subtree_dict = tree 55 | 56 | subtree_key = input.get(attribute) # None if input is missing attribute 57 | 58 | if subtree_key not in subtree_dict: # if no subtree for key, 59 | subtree_key = None # we'll use the None subtree 60 | 61 | subtree = subtree_dict[subtree_key] # choose the appropriate subtree 62 | return classify(subtree, input) # and use it to classify the input 63 | 64 | def build_tree_id3(inputs, split_candidates=None): 65 | 66 | # if 
this is our first pass, 67 | # all keys of the first input are split candidates 68 | if split_candidates is None: 69 | split_candidates = inputs[0][0].keys() 70 | 71 | # count Trues and Falses in the inputs 72 | num_inputs = len(inputs) 73 | num_trues = len([label for item, label in inputs if label]) 74 | num_falses = num_inputs - num_trues 75 | 76 | if num_trues == 0: # if only Falses are left 77 | return False # return a "False" leaf 78 | 79 | if num_falses == 0: # if only Trues are left 80 | return True # return a "True" leaf 81 | 82 | if not split_candidates: # if no split candidates left 83 | return num_trues >= num_falses # return the majority leaf 84 | 85 | # otherwise, split on the best attribute 86 | best_attribute = min(split_candidates, 87 | key=partial(partition_entropy_by, inputs)) 88 | 89 | partitions = partition_by(inputs, best_attribute) 90 | new_candidates = [a for a in split_candidates 91 | if a != best_attribute] 92 | 93 | # recursively build the subtrees 94 | subtrees = { attribute : build_tree_id3(subset, new_candidates) 95 | for attribute, subset in partitions.iteritems() } 96 | 97 | subtrees[None] = num_trues > num_falses # default case 98 | 99 | return (best_attribute, subtrees) 100 | 101 | def forest_classify(trees, input): 102 | votes = [classify(tree, input) for tree in trees] 103 | vote_counts = Counter(votes) 104 | return vote_counts.most_common(1)[0][0] 105 | 106 | 107 | if __name__ == "__main__": 108 | 109 | inputs = [ 110 | ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'}, False), 111 | ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'}, False), 112 | ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'}, True), 113 | ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'}, True), 114 | ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'}, True), 115 | ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'}, False), 116 | ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'}, True), 117 | ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False), 118 | ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'}, True), 119 | ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True), 120 | ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True), 121 | ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'}, True), 122 | ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'}, True), 123 | ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False) 124 | ] 125 | 126 | for key in ['level','lang','tweets','phd']: 127 | print key, partition_entropy_by(inputs, key) 128 | print 129 | 130 | senior_inputs = [(input, label) 131 | for input, label in inputs if input["level"] == "Senior"] 132 | 133 | for key in ['lang', 'tweets', 'phd']: 134 | print key, partition_entropy_by(senior_inputs, key) 135 | print 136 | 137 | print "building the tree" 138 | tree = build_tree_id3(inputs) 139 | print tree 140 | 141 | print "Junior / Java / tweets / no phd", classify(tree, 142 | { "level" : "Junior", 143 | "lang" : "Java", 144 | "tweets" : "yes", 145 | "phd" : "no"} ) 146 | 147 | print "Junior / Java / tweets / phd", classify(tree, 148 | { "level" : "Junior", 149 | "lang" : "Java", 150 | "tweets" : "yes", 151 | "phd" : "yes"} ) 152 | 153 | print "Intern", classify(tree, { "level" : "Intern" } ) 154 | print "Senior", classify(tree, { "level" : "Senior" } ) 155 | 156 | -------------------------------------------------------------------------------- /first-edition/code/mapreduce.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import math, random, re, datetime 3 | from collections import defaultdict, Counter 4 | from functools import partial 5 | from naive_bayes import tokenize 6 | 7 | def word_count_old(documents): 8 | """word count not using MapReduce""" 9 | return Counter(word 10 | for document in documents 11 | for word in tokenize(document)) 12 | 13 | def wc_mapper(document): 14 | """for each word in the document, emit (word,1)""" 15 | for word in tokenize(document): 16 | yield (word, 1) 17 | 18 | def wc_reducer(word, counts): 19 | """sum up the counts for a word""" 20 | yield (word, sum(counts)) 21 | 22 | def word_count(documents): 23 | """count the words in the input documents using MapReduce""" 24 | 25 | # place to store grouped values 26 | collector = defaultdict(list) 27 | 28 | for document in documents: 29 | for word, count in wc_mapper(document): 30 | collector[word].append(count) 31 | 32 | return [output 33 | for word, counts in collector.iteritems() 34 | for output in wc_reducer(word, counts)] 35 | 36 | def map_reduce(inputs, mapper, reducer): 37 | """runs MapReduce on the inputs using mapper and reducer""" 38 | collector = defaultdict(list) 39 | 40 | for input in inputs: 41 | for key, value in mapper(input): 42 | collector[key].append(value) 43 | 44 | return [output 45 | for key, values in collector.iteritems() 46 | for output in reducer(key,values)] 47 | 48 | def reduce_with(aggregation_fn, key, values): 49 | """reduces a key-values pair by applying aggregation_fn to the values""" 50 | yield (key, aggregation_fn(values)) 51 | 52 | def values_reducer(aggregation_fn): 53 | """turns a function (values -> output) into a reducer""" 54 | return partial(reduce_with, aggregation_fn) 55 | 56 | sum_reducer = values_reducer(sum) 57 | max_reducer = values_reducer(max) 58 | min_reducer = values_reducer(min) 59 | count_distinct_reducer = values_reducer(lambda values: len(set(values))) 60 | 61 | # 62 | # Analyzing Status Updates 63 | # 64 | 65 | status_updates = [ 66 | {"id": 1, 67 | "username" : "joelgrus", 68 | "text" : "Is anyone interested in a data science book?", 69 | "created_at" : datetime.datetime(2013, 12, 21, 11, 47, 0), 70 | "liked_by" : ["data_guy", "data_gal", "bill"] }, 71 | # add your own 72 | ] 73 | 74 | def data_science_day_mapper(status_update): 75 | """yields (day_of_week, 1) if status_update contains "data science" """ 76 | if "data science" in status_update["text"].lower(): 77 | day_of_week = status_update["created_at"].weekday() 78 | yield (day_of_week, 1) 79 | 80 | data_science_days = map_reduce(status_updates, 81 | data_science_day_mapper, 82 | sum_reducer) 83 | 84 | def words_per_user_mapper(status_update): 85 | user = status_update["username"] 86 | for word in tokenize(status_update["text"]): 87 | yield (user, (word, 1)) 88 | 89 | def most_popular_word_reducer(user, words_and_counts): 90 | """given a sequence of (word, count) pairs, 91 | return the word with the highest total count""" 92 | 93 | word_counts = Counter() 94 | for word, count in words_and_counts: 95 | word_counts[word] += count 96 | 97 | word, count = word_counts.most_common(1)[0] 98 | 99 | yield (user, (word, count)) 100 | 101 | user_words = map_reduce(status_updates, 102 | words_per_user_mapper, 103 | most_popular_word_reducer) 104 | 105 | def liker_mapper(status_update): 106 | user = status_update["username"] 107 | for liker in status_update["liked_by"]: 108 | yield (user, liker) 109 | 110 | 
distinct_likers_per_user = map_reduce(status_updates, 111 | liker_mapper, 112 | count_distinct_reducer) 113 | 114 | 115 | # 116 | # matrix multiplication 117 | # 118 | 119 | def matrix_multiply_mapper(m, element): 120 | """m is the common dimension (columns of A, rows of B) 121 | element is a tuple (matrix_name, i, j, value)""" 122 | matrix, i, j, value = element 123 | 124 | if matrix == "A": 125 | for column in range(m): 126 | # A_ij is the jth entry in the sum for each C_i_column 127 | yield((i, column), (j, value)) 128 | else: 129 | for row in range(m): 130 | # B_ij is the ith entry in the sum for each C_row_j 131 | yield((row, j), (i, value)) 132 | 133 | def matrix_multiply_reducer(m, key, indexed_values): 134 | results_by_index = defaultdict(list) 135 | for index, value in indexed_values: 136 | results_by_index[index].append(value) 137 | 138 | # sum up all the products of the positions with two results 139 | sum_product = sum(results[0] * results[1] 140 | for results in results_by_index.values() 141 | if len(results) == 2) 142 | 143 | if sum_product != 0.0: 144 | yield (key, sum_product) 145 | 146 | if __name__ == "__main__": 147 | 148 | documents = ["data science", "big data", "science fiction"] 149 | 150 | wc_mapper_results = [result 151 | for document in documents 152 | for result in wc_mapper(document)] 153 | 154 | print "wc_mapper results" 155 | print wc_mapper_results 156 | print 157 | 158 | print "word count results" 159 | print word_count(documents) 160 | print 161 | 162 | print "word count using map_reduce function" 163 | print map_reduce(documents, wc_mapper, wc_reducer) 164 | print 165 | 166 | print "data science days" 167 | print data_science_days 168 | print 169 | 170 | print "user words" 171 | print user_words 172 | print 173 | 174 | print "distinct likers" 175 | print distinct_likers_per_user 176 | print 177 | 178 | # matrix multiplication 179 | 180 | entries = [("A", 0, 0, 3), ("A", 0, 1, 2), 181 | ("B", 0, 0, 4), ("B", 0, 1, -1), ("B", 1, 0, 10)] 182 | mapper = partial(matrix_multiply_mapper, 3) 183 | reducer = partial(matrix_multiply_reducer, 3) 184 | 185 | print "map-reduce matrix multiplication" 186 | print "entries:", entries 187 | print "result:", map_reduce(entries, mapper, reducer) 188 | 189 | -------------------------------------------------------------------------------- /first-edition/code-python3/gradient_descent.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from linear_algebra import distance, vector_subtract, scalar_multiply 3 | from functools import reduce 4 | import math, random 5 | 6 | def sum_of_squares(v): 7 | """computes the sum of squared elements in v""" 8 | return sum(v_i ** 2 for v_i in v) 9 | 10 | def difference_quotient(f, x, h): 11 | return (f(x + h) - f(x)) / h 12 | 13 | def plot_estimated_derivative(): 14 | 15 | def square(x): 16 | return x * x 17 | 18 | def derivative(x): 19 | return 2 * x 20 | 21 | derivative_estimate = lambda x: difference_quotient(square, x, h=0.00001) 22 | 23 | # plot to show they're basically the same 24 | import matplotlib.pyplot as plt 25 | x = range(-10,10) 26 | plt.plot(x, map(derivative, x), 'rx') # red x 27 | plt.plot(x, map(derivative_estimate, x), 'b+') # blue + 28 | plt.show() # purple *, hopefully 29 | 30 | def partial_difference_quotient(f, v, i, h): 31 | 32 | # add h to just the i-th element of v 33 | w = [v_j + (h if j == i else 0) 34 | for j, v_j in enumerate(v)] 35 | 36 | return (f(w) - f(v)) / h 37 | 38 | def 
estimate_gradient(f, v, h=0.00001): 39 | return [partial_difference_quotient(f, v, i, h) 40 | for i, _ in enumerate(v)] 41 | 42 | def step(v, direction, step_size): 43 | """move step_size in the direction from v""" 44 | return [v_i + step_size * direction_i 45 | for v_i, direction_i in zip(v, direction)] 46 | 47 | def sum_of_squares_gradient(v): 48 | return [2 * v_i for v_i in v] 49 | 50 | def safe(f): 51 | """define a new function that wraps f and return it""" 52 | def safe_f(*args, **kwargs): 53 | try: 54 | return f(*args, **kwargs) 55 | except: 56 | return float('inf') # this means "infinity" in Python 57 | return safe_f 58 | 59 | 60 | # 61 | # 62 | # minimize / maximize batch 63 | # 64 | # 65 | 66 | def minimize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001): 67 | """use gradient descent to find theta that minimizes target function""" 68 | 69 | step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 70 | 71 | theta = theta_0 # set theta to initial value 72 | target_fn = safe(target_fn) # safe version of target_fn 73 | value = target_fn(theta) # value we're minimizing 74 | 75 | while True: 76 | gradient = gradient_fn(theta) 77 | next_thetas = [step(theta, gradient, -step_size) 78 | for step_size in step_sizes] 79 | 80 | # choose the one that minimizes the error function 81 | next_theta = min(next_thetas, key=target_fn) 82 | next_value = target_fn(next_theta) 83 | 84 | # stop if we're "converging" 85 | if abs(value - next_value) < tolerance: 86 | return theta 87 | else: 88 | theta, value = next_theta, next_value 89 | 90 | def negate(f): 91 | """return a function that for any input x returns -f(x)""" 92 | return lambda *args, **kwargs: -f(*args, **kwargs) 93 | 94 | def negate_all(f): 95 | """the same when f returns a list of numbers""" 96 | return lambda *args, **kwargs: [-y for y in f(*args, **kwargs)] 97 | 98 | def maximize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001): 99 | return minimize_batch(negate(target_fn), 100 | negate_all(gradient_fn), 101 | theta_0, 102 | tolerance) 103 | 104 | # 105 | # minimize / maximize stochastic 106 | # 107 | 108 | def in_random_order(data): 109 | """generator that returns the elements of data in random order""" 110 | indexes = [i for i, _ in enumerate(data)] # create a list of indexes 111 | random.shuffle(indexes) # shuffle them 112 | for i in indexes: # return the data in that order 113 | yield data[i] 114 | 115 | def minimize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01): 116 | 117 | data = list(zip(x, y)) 118 | theta = theta_0 # initial guess 119 | alpha = alpha_0 # initial step size 120 | min_theta, min_value = None, float("inf") # the minimum so far 121 | iterations_with_no_improvement = 0 122 | 123 | # if we ever go 100 iterations with no improvement, stop 124 | while iterations_with_no_improvement < 100: 125 | value = sum( target_fn(x_i, y_i, theta) for x_i, y_i in data ) 126 | 127 | if value < min_value: 128 | # if we've found a new minimum, remember it 129 | # and go back to the original step size 130 | min_theta, min_value = theta, value 131 | iterations_with_no_improvement = 0 132 | alpha = alpha_0 133 | else: 134 | # otherwise we're not improving, so try shrinking the step size 135 | iterations_with_no_improvement += 1 136 | alpha *= 0.9 137 | 138 | # and take a gradient step for each of the data points 139 | for x_i, y_i in in_random_order(data): 140 | gradient_i = gradient_fn(x_i, y_i, theta) 141 | theta = vector_subtract(theta, scalar_multiply(alpha, gradient_i)) 142 | 143 | return 
min_theta 144 | 145 | def maximize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01): 146 | return minimize_stochastic(negate(target_fn), 147 | negate_all(gradient_fn), 148 | x, y, theta_0, alpha_0) 149 | 150 | if __name__ == "__main__": 151 | 152 | print("using the gradient") 153 | 154 | v = [random.randint(-10,10) for i in range(3)] 155 | 156 | tolerance = 0.0000001 157 | 158 | while True: 159 | #print v, sum_of_squares(v) 160 | gradient = sum_of_squares_gradient(v) # compute the gradient at v 161 | next_v = step(v, gradient, -0.01) # take a negative gradient step 162 | if distance(next_v, v) < tolerance: # stop if we're converging 163 | break 164 | v = next_v # continue if we're not 165 | 166 | print("minimum v", v) 167 | print("minimum value", sum_of_squares(v)) 168 | print() 169 | 170 | 171 | print("using minimize_batch") 172 | 173 | v = [random.randint(-10,10) for i in range(3)] 174 | 175 | v = minimize_batch(sum_of_squares, sum_of_squares_gradient, v) 176 | 177 | print("minimum v", v) 178 | print("minimum value", sum_of_squares(v)) 179 | -------------------------------------------------------------------------------- /first-edition/code/gradient_descent.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import Counter 3 | from linear_algebra import distance, vector_subtract, scalar_multiply 4 | import math, random 5 | 6 | def sum_of_squares(v): 7 | """computes the sum of squared elements in v""" 8 | return sum(v_i ** 2 for v_i in v) 9 | 10 | def difference_quotient(f, x, h): 11 | return (f(x + h) - f(x)) / h 12 | 13 | def plot_estimated_derivative(): 14 | 15 | def square(x): 16 | return x * x 17 | 18 | def derivative(x): 19 | return 2 * x 20 | 21 | derivative_estimate = lambda x: difference_quotient(square, x, h=0.00001) 22 | 23 | # plot to show they're basically the same 24 | import matplotlib.pyplot as plt 25 | x = range(-10,10) 26 | plt.plot(x, map(derivative, x), 'rx') # red x 27 | plt.plot(x, map(derivative_estimate, x), 'b+') # blue + 28 | plt.show() # purple *, hopefully 29 | 30 | def partial_difference_quotient(f, v, i, h): 31 | 32 | # add h to just the i-th element of v 33 | w = [v_j + (h if j == i else 0) 34 | for j, v_j in enumerate(v)] 35 | 36 | return (f(w) - f(v)) / h 37 | 38 | def estimate_gradient(f, v, h=0.00001): 39 | return [partial_difference_quotient(f, v, i, h) 40 | for i, _ in enumerate(v)] 41 | 42 | def step(v, direction, step_size): 43 | """move step_size in the direction from v""" 44 | return [v_i + step_size * direction_i 45 | for v_i, direction_i in zip(v, direction)] 46 | 47 | def sum_of_squares_gradient(v): 48 | return [2 * v_i for v_i in v] 49 | 50 | def safe(f): 51 | """define a new function that wraps f and return it""" 52 | def safe_f(*args, **kwargs): 53 | try: 54 | return f(*args, **kwargs) 55 | except: 56 | return float('inf') # this means "infinity" in Python 57 | return safe_f 58 | 59 | 60 | # 61 | # 62 | # minimize / maximize batch 63 | # 64 | # 65 | 66 | def minimize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001): 67 | """use gradient descent to find theta that minimizes target function""" 68 | 69 | step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 70 | 71 | theta = theta_0 # set theta to initial value 72 | target_fn = safe(target_fn) # safe version of target_fn 73 | value = target_fn(theta) # value we're minimizing 74 | 75 | while True: 76 | gradient = gradient_fn(theta) 77 | next_thetas = [step(theta, gradient, 
-step_size) 78 | for step_size in step_sizes] 79 | 80 | # choose the one that minimizes the error function 81 | next_theta = min(next_thetas, key=target_fn) 82 | next_value = target_fn(next_theta) 83 | 84 | # stop if we're "converging" 85 | if abs(value - next_value) < tolerance: 86 | return theta 87 | else: 88 | theta, value = next_theta, next_value 89 | 90 | def negate(f): 91 | """return a function that for any input x returns -f(x)""" 92 | return lambda *args, **kwargs: -f(*args, **kwargs) 93 | 94 | def negate_all(f): 95 | """the same when f returns a list of numbers""" 96 | return lambda *args, **kwargs: [-y for y in f(*args, **kwargs)] 97 | 98 | def maximize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001): 99 | return minimize_batch(negate(target_fn), 100 | negate_all(gradient_fn), 101 | theta_0, 102 | tolerance) 103 | 104 | # 105 | # minimize / maximize stochastic 106 | # 107 | 108 | def in_random_order(data): 109 | """generator that returns the elements of data in random order""" 110 | indexes = [i for i, _ in enumerate(data)] # create a list of indexes 111 | random.shuffle(indexes) # shuffle them 112 | for i in indexes: # return the data in that order 113 | yield data[i] 114 | 115 | def minimize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01): 116 | 117 | data = zip(x, y) 118 | theta = theta_0 # initial guess 119 | alpha = alpha_0 # initial step size 120 | min_theta, min_value = None, float("inf") # the minimum so far 121 | iterations_with_no_improvement = 0 122 | 123 | # if we ever go 100 iterations with no improvement, stop 124 | while iterations_with_no_improvement < 100: 125 | value = sum( target_fn(x_i, y_i, theta) for x_i, y_i in data ) 126 | 127 | if value < min_value: 128 | # if we've found a new minimum, remember it 129 | # and go back to the original step size 130 | min_theta, min_value = theta, value 131 | iterations_with_no_improvement = 0 132 | alpha = alpha_0 133 | else: 134 | # otherwise we're not improving, so try shrinking the step size 135 | iterations_with_no_improvement += 1 136 | alpha *= 0.9 137 | 138 | # and take a gradient step for each of the data points 139 | for x_i, y_i in in_random_order(data): 140 | gradient_i = gradient_fn(x_i, y_i, theta) 141 | theta = vector_subtract(theta, scalar_multiply(alpha, gradient_i)) 142 | 143 | return min_theta 144 | 145 | def maximize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01): 146 | return minimize_stochastic(negate(target_fn), 147 | negate_all(gradient_fn), 148 | x, y, theta_0, alpha_0) 149 | 150 | if __name__ == "__main__": 151 | 152 | print "using the gradient" 153 | 154 | v = [random.randint(-10,10) for i in range(3)] 155 | 156 | tolerance = 0.0000001 157 | 158 | while True: 159 | #print v, sum_of_squares(v) 160 | gradient = sum_of_squares_gradient(v) # compute the gradient at v 161 | next_v = step(v, gradient, -0.01) # take a negative gradient step 162 | if distance(next_v, v) < tolerance: # stop if we're converging 163 | break 164 | v = next_v # continue if we're not 165 | 166 | print "minimum v", v 167 | print "minimum value", sum_of_squares(v) 168 | print 169 | 170 | 171 | print "using minimize_batch" 172 | 173 | v = [random.randint(-10,10) for i in range(3)] 174 | 175 | v = minimize_batch(sum_of_squares, sum_of_squares_gradient, v) 176 | 177 | print "minimum v", v 178 | print "minimum value", sum_of_squares(v) 179 | -------------------------------------------------------------------------------- /scratch/naive_bayes.py: 
-------------------------------------------------------------------------------- 1 | from typing import Set 2 | import re 3 | 4 | def tokenize(text: str) -> Set[str]: 5 | text = text.lower() # Convert to lowercase, 6 | all_words = re.findall("[a-z0-9']+", text) # extract the words, and 7 | return set(all_words) # remove duplicates. 8 | 9 | assert tokenize("Data Science is science") == {"data", "science", "is"} 10 | 11 | from typing import NamedTuple 12 | 13 | class Message(NamedTuple): 14 | text: str 15 | is_spam: bool 16 | 17 | from typing import List, Tuple, Dict, Iterable 18 | import math 19 | from collections import defaultdict 20 | 21 | class NaiveBayesClassifier: 22 | def __init__(self, k: float = 0.5) -> None: 23 | self.k = k # smoothing factor 24 | 25 | self.tokens: Set[str] = set() 26 | self.token_spam_counts: Dict[str, int] = defaultdict(int) 27 | self.token_ham_counts: Dict[str, int] = defaultdict(int) 28 | self.spam_messages = self.ham_messages = 0 29 | 30 | def train(self, messages: Iterable[Message]) -> None: 31 | for message in messages: 32 | # Increment message counts 33 | if message.is_spam: 34 | self.spam_messages += 1 35 | else: 36 | self.ham_messages += 1 37 | 38 | # Increment word counts 39 | for token in tokenize(message.text): 40 | self.tokens.add(token) 41 | if message.is_spam: 42 | self.token_spam_counts[token] += 1 43 | else: 44 | self.token_ham_counts[token] += 1 45 | 46 | def _probabilities(self, token: str) -> Tuple[float, float]: 47 | """returns P(token | spam) and P(token | not spam)""" 48 | spam = self.token_spam_counts[token] 49 | ham = self.token_ham_counts[token] 50 | 51 | p_token_spam = (spam + self.k) / (self.spam_messages + 2 * self.k) 52 | p_token_ham = (ham + self.k) / (self.ham_messages + 2 * self.k) 53 | 54 | return p_token_spam, p_token_ham 55 | 56 | def predict(self, text: str) -> float: 57 | text_tokens = tokenize(text) 58 | log_prob_if_spam = log_prob_if_ham = 0.0 59 | 60 | # Iterate through each word in our vocabulary. 
61 | for token in self.tokens: 62 | prob_if_spam, prob_if_ham = self._probabilities(token) 63 | 64 | # If *token* appears in the message, 65 | # add the log probability of seeing it; 66 | if token in text_tokens: 67 | log_prob_if_spam += math.log(prob_if_spam) 68 | log_prob_if_ham += math.log(prob_if_ham) 69 | 70 | # otherwise add the log probability of _not_ seeing it 71 | # which is log(1 - probability of seeing it) 72 | else: 73 | log_prob_if_spam += math.log(1.0 - prob_if_spam) 74 | log_prob_if_ham += math.log(1.0 - prob_if_ham) 75 | 76 | prob_if_spam = math.exp(log_prob_if_spam) 77 | prob_if_ham = math.exp(log_prob_if_ham) 78 | return prob_if_spam / (prob_if_spam + prob_if_ham) 79 | 80 | messages = [Message("spam rules", is_spam=True), 81 | Message("ham rules", is_spam=False), 82 | Message("hello ham", is_spam=False)] 83 | 84 | model = NaiveBayesClassifier(k=0.5) 85 | model.train(messages) 86 | 87 | assert model.tokens == {"spam", "ham", "rules", "hello"} 88 | assert model.spam_messages == 1 89 | assert model.ham_messages == 2 90 | assert model.token_spam_counts == {"spam": 1, "rules": 1} 91 | assert model.token_ham_counts == {"ham": 2, "rules": 1, "hello": 1} 92 | 93 | text = "hello spam" 94 | 95 | probs_if_spam = [ 96 | (1 + 0.5) / (1 + 2 * 0.5), # "spam" (present) 97 | 1 - (0 + 0.5) / (1 + 2 * 0.5), # "ham" (not present) 98 | 1 - (1 + 0.5) / (1 + 2 * 0.5), # "rules" (not present) 99 | (0 + 0.5) / (1 + 2 * 0.5) # "hello" (present) 100 | ] 101 | 102 | probs_if_ham = [ 103 | (0 + 0.5) / (2 + 2 * 0.5), # "spam" (present) 104 | 1 - (2 + 0.5) / (2 + 2 * 0.5), # "ham" (not present) 105 | 1 - (1 + 0.5) / (2 + 2 * 0.5), # "rules" (not present) 106 | (1 + 0.5) / (2 + 2 * 0.5), # "hello" (present) 107 | ] 108 | 109 | p_if_spam = math.exp(sum(math.log(p) for p in probs_if_spam)) 110 | p_if_ham = math.exp(sum(math.log(p) for p in probs_if_ham)) 111 | 112 | # Should be about 0.83 113 | assert model.predict(text) == p_if_spam / (p_if_spam + p_if_ham) 114 | 115 | def drop_final_s(word): 116 | return re.sub("s$", "", word) 117 | 118 | def main(): 119 | import glob, re 120 | 121 | # modify the path to wherever you've put the files 122 | path = 'spam_data/*/*' 123 | 124 | data: List[Message] = [] 125 | 126 | # glob.glob returns every filename that matches the wildcarded path 127 | for filename in glob.glob(path): 128 | is_spam = "ham" not in filename 129 | 130 | # There are some garbage characters in the emails, the errors='ignore' 131 | # skips them instead of raising an exception. 
132 | with open(filename, errors='ignore') as email_file: 133 | for line in email_file: 134 | if line.startswith("Subject:"): 135 | subject = line.lstrip("Subject: ") 136 | data.append(Message(subject, is_spam)) 137 | break # done with this file 138 | 139 | import random 140 | from scratch.machine_learning import split_data 141 | 142 | random.seed(0) # just so you get the same answers as me 143 | train_messages, test_messages = split_data(data, 0.75) 144 | 145 | model = NaiveBayesClassifier() 146 | model.train(train_messages) 147 | 148 | from collections import Counter 149 | 150 | predictions = [(message, model.predict(message.text)) 151 | for message in test_messages] 152 | 153 | # Assume that spam_probability > 0.5 corresponds to spam prediction 154 | # and count the combinations of (actual is_spam, predicted is_spam) 155 | confusion_matrix = Counter((message.is_spam, spam_probability > 0.5) 156 | for message, spam_probability in predictions) 157 | 158 | print(confusion_matrix) 159 | 160 | def p_spam_given_token(token: str, model: NaiveBayesClassifier) -> float: 161 | # We probably shouldn't call private methods, but it's for a good cause. 162 | prob_if_spam, prob_if_ham = model._probabilities(token) 163 | 164 | return prob_if_spam / (prob_if_spam + prob_if_ham) 165 | 166 | words = sorted(model.tokens, key=lambda t: p_spam_given_token(t, model)) 167 | 168 | print("spammiest_words", words[-10:]) 169 | print("hammiest_words", words[:10]) 170 | 171 | if __name__ == "__main__": main() -------------------------------------------------------------------------------- /first-edition/code-python3/hypothesis_and_inference.py: -------------------------------------------------------------------------------- 1 | from probability import normal_cdf, inverse_normal_cdf 2 | import math, random 3 | 4 | def normal_approximation_to_binomial(n, p): 5 | """finds mu and sigma corresponding to a Binomial(n, p)""" 6 | mu = p * n 7 | sigma = math.sqrt(p * (1 - p) * n) 8 | return mu, sigma 9 | 10 | ##### 11 | # 12 | # probabilities a normal lies in an interval 13 | # 14 | ###### 15 | 16 | # the normal cdf _is_ the probability the variable is below a threshold 17 | normal_probability_below = normal_cdf 18 | 19 | # it's above the threshold if it's not below the threshold 20 | def normal_probability_above(lo, mu=0, sigma=1): 21 | return 1 - normal_cdf(lo, mu, sigma) 22 | 23 | # it's between if it's less than hi, but not less than lo 24 | def normal_probability_between(lo, hi, mu=0, sigma=1): 25 | return normal_cdf(hi, mu, sigma) - normal_cdf(lo, mu, sigma) 26 | 27 | # it's outside if it's not between 28 | def normal_probability_outside(lo, hi, mu=0, sigma=1): 29 | return 1 - normal_probability_between(lo, hi, mu, sigma) 30 | 31 | ###### 32 | # 33 | # normal bounds 34 | # 35 | ###### 36 | 37 | 38 | def normal_upper_bound(probability, mu=0, sigma=1): 39 | """returns the z for which P(Z <= z) = probability""" 40 | return inverse_normal_cdf(probability, mu, sigma) 41 | 42 | def normal_lower_bound(probability, mu=0, sigma=1): 43 | """returns the z for which P(Z >= z) = probability""" 44 | return inverse_normal_cdf(1 - probability, mu, sigma) 45 | 46 | def normal_two_sided_bounds(probability, mu=0, sigma=1): 47 | """returns the symmetric (about the mean) bounds 48 | that contain the specified probability""" 49 | tail_probability = (1 - probability) / 2 50 | 51 | # upper bound should have tail_probability above it 52 | upper_bound = normal_lower_bound(tail_probability, mu, sigma) 53 | 54 | # lower bound should have 
tail_probability below it 55 | lower_bound = normal_upper_bound(tail_probability, mu, sigma) 56 | 57 | return lower_bound, upper_bound 58 | 59 | def two_sided_p_value(x, mu=0, sigma=1): 60 | if x >= mu: 61 | # if x is greater than the mean, the tail is above x 62 | return 2 * normal_probability_above(x, mu, sigma) 63 | else: 64 | # if x is less than the mean, the tail is below x 65 | return 2 * normal_probability_below(x, mu, sigma) 66 | 67 | def count_extreme_values(): 68 | extreme_value_count = 0 69 | for _ in range(100000): 70 | num_heads = sum(1 if random.random() < 0.5 else 0 # count # of heads 71 | for _ in range(1000)) # in 1000 flips 72 | if num_heads >= 530 or num_heads <= 470: # and count how often 73 | extreme_value_count += 1 # the # is 'extreme' 74 | 75 | return extreme_value_count / 100000 76 | 77 | upper_p_value = normal_probability_above 78 | lower_p_value = normal_probability_below 79 | 80 | ## 81 | # 82 | # P-hacking 83 | # 84 | ## 85 | 86 | def run_experiment(): 87 | """flip a fair coin 1000 times, True = heads, False = tails""" 88 | return [random.random() < 0.5 for _ in range(1000)] 89 | 90 | def reject_fairness(experiment): 91 | """using the 5% significance levels""" 92 | num_heads = len([flip for flip in experiment if flip]) 93 | return num_heads < 469 or num_heads > 531 94 | 95 | 96 | ## 97 | # 98 | # running an A/B test 99 | # 100 | ## 101 | 102 | def estimated_parameters(N, n): 103 | p = n / N 104 | sigma = math.sqrt(p * (1 - p) / N) 105 | return p, sigma 106 | 107 | def a_b_test_statistic(N_A, n_A, N_B, n_B): 108 | p_A, sigma_A = estimated_parameters(N_A, n_A) 109 | p_B, sigma_B = estimated_parameters(N_B, n_B) 110 | return (p_B - p_A) / math.sqrt(sigma_A ** 2 + sigma_B ** 2) 111 | 112 | ## 113 | # 114 | # Bayesian Inference 115 | # 116 | ## 117 | 118 | def B(alpha, beta): 119 | """a normalizing constant so that the total probability is 1""" 120 | return math.gamma(alpha) * math.gamma(beta) / math.gamma(alpha + beta) 121 | 122 | def beta_pdf(x, alpha, beta): 123 | if x < 0 or x > 1: # no weight outside of [0, 1] 124 | return 0 125 | return x ** (alpha - 1) * (1 - x) ** (beta - 1) / B(alpha, beta) 126 | 127 | 128 | if __name__ == "__main__": 129 | 130 | mu_0, sigma_0 = normal_approximation_to_binomial(1000, 0.5) 131 | print("mu_0", mu_0) 132 | print("sigma_0", sigma_0) 133 | print("normal_two_sided_bounds(0.95, mu_0, sigma_0)", normal_two_sided_bounds(0.95, mu_0, sigma_0)) 134 | print() 135 | print("power of a test") 136 | 137 | print("95% bounds based on assumption p is 0.5") 138 | 139 | lo, hi = normal_two_sided_bounds(0.95, mu_0, sigma_0) 140 | print("lo", lo) 141 | print("hi", hi) 142 | 143 | print("actual mu and sigma based on p = 0.55") 144 | mu_1, sigma_1 = normal_approximation_to_binomial(1000, 0.55) 145 | print("mu_1", mu_1) 146 | print("sigma_1", sigma_1) 147 | 148 | # a type 2 error means we fail to reject the null hypothesis 149 | # which will happen when X is still in our original interval 150 | type_2_probability = normal_probability_between(lo, hi, mu_1, sigma_1) 151 | power = 1 - type_2_probability # 0.887 152 | 153 | print("type 2 probability", type_2_probability) 154 | print("power", power) 155 | print 156 | 157 | print("one-sided test") 158 | hi = normal_upper_bound(0.95, mu_0, sigma_0) 159 | print("hi", hi) # is 526 (< 531, since we need more probability in the upper tail) 160 | type_2_probability = normal_probability_below(hi, mu_1, sigma_1) 161 | power = 1 - type_2_probability # = 0.936 162 | print("type 2 probability", type_2_probability) 
163 | print("power", power) 164 | print() 165 | 166 | print("two_sided_p_value(529.5, mu_0, sigma_0)", two_sided_p_value(529.5, mu_0, sigma_0)) 167 | 168 | print("two_sided_p_value(531.5, mu_0, sigma_0)", two_sided_p_value(531.5, mu_0, sigma_0)) 169 | 170 | print("upper_p_value(525, mu_0, sigma_0)", upper_p_value(525, mu_0, sigma_0)) 171 | print("upper_p_value(527, mu_0, sigma_0)", upper_p_value(527, mu_0, sigma_0)) 172 | print() 173 | 174 | print("P-hacking") 175 | 176 | random.seed(0) 177 | experiments = [run_experiment() for _ in range(1000)] 178 | num_rejections = len([experiment 179 | for experiment in experiments 180 | if reject_fairness(experiment)]) 181 | 182 | print(num_rejections, "rejections out of 1000") 183 | print() 184 | 185 | print("A/B testing") 186 | z = a_b_test_statistic(1000, 200, 1000, 180) 187 | print("a_b_test_statistic(1000, 200, 1000, 180)", z) 188 | print("p-value", two_sided_p_value(z)) 189 | z = a_b_test_statistic(1000, 200, 1000, 150) 190 | print("a_b_test_statistic(1000, 200, 1000, 150)", z) 191 | print("p-value", two_sided_p_value(z)) 192 | -------------------------------------------------------------------------------- /first-edition/code-python3/recommender_systems.py: -------------------------------------------------------------------------------- 1 | import math, random 2 | from collections import defaultdict, Counter 3 | from linear_algebra import dot 4 | 5 | users_interests = [ 6 | ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"], 7 | ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"], 8 | ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"], 9 | ["R", "Python", "statistics", "regression", "probability"], 10 | ["machine learning", "regression", "decision trees", "libsvm"], 11 | ["Python", "R", "Java", "C++", "Haskell", "programming languages"], 12 | ["statistics", "probability", "mathematics", "theory"], 13 | ["machine learning", "scikit-learn", "Mahout", "neural networks"], 14 | ["neural networks", "deep learning", "Big Data", "artificial intelligence"], 15 | ["Hadoop", "Java", "MapReduce", "Big Data"], 16 | ["statistics", "R", "statsmodels"], 17 | ["C++", "deep learning", "artificial intelligence", "probability"], 18 | ["pandas", "R", "Python"], 19 | ["databases", "HBase", "Postgres", "MySQL", "MongoDB"], 20 | ["libsvm", "regression", "support vector machines"] 21 | ] 22 | 23 | popular_interests = Counter(interest 24 | for user_interests in users_interests 25 | for interest in user_interests).most_common() 26 | 27 | def most_popular_new_interests(user_interests, max_results=5): 28 | suggestions = [(interest, frequency) 29 | for interest, frequency in popular_interests 30 | if interest not in user_interests] 31 | return suggestions[:max_results] 32 | 33 | # 34 | # user-based filtering 35 | # 36 | 37 | def cosine_similarity(v, w): 38 | return dot(v, w) / math.sqrt(dot(v, v) * dot(w, w)) 39 | 40 | unique_interests = sorted(list({ interest 41 | for user_interests in users_interests 42 | for interest in user_interests })) 43 | 44 | def make_user_interest_vector(user_interests): 45 | """given a list of interests, produce a vector whose i-th element is 1 46 | if unique_interests[i] is in the list, 0 otherwise""" 47 | return [1 if interest in user_interests else 0 48 | for interest in unique_interests] 49 | 50 | user_interest_matrix = list(map(make_user_interest_vector, users_interests)) 51 | 52 | user_similarities = [[cosine_similarity(interest_vector_i, interest_vector_j) 53 | for interest_vector_j in 
user_interest_matrix] 54 | for interest_vector_i in user_interest_matrix] 55 | 56 | def most_similar_users_to(user_id): 57 | pairs = [(other_user_id, similarity) # find other 58 | for other_user_id, similarity in # users with 59 | enumerate(user_similarities[user_id]) # nonzero 60 | if user_id != other_user_id and similarity > 0] # similarity 61 | 62 | return sorted(pairs, # sort them 63 | key=lambda pair: pair[1], # most similar 64 | reverse=True) # first 65 | 66 | 67 | def user_based_suggestions(user_id, include_current_interests=False): 68 | # sum up the similarities 69 | suggestions = defaultdict(float) 70 | for other_user_id, similarity in most_similar_users_to(user_id): 71 | for interest in users_interests[other_user_id]: 72 | suggestions[interest] += similarity 73 | 74 | # convert them to a sorted list 75 | suggestions = sorted(suggestions.items(), 76 | key=lambda pair: pair[1], 77 | reverse=True) 78 | 79 | # and (maybe) exclude already-interests 80 | if include_current_interests: 81 | return suggestions 82 | else: 83 | return [(suggestion, weight) 84 | for suggestion, weight in suggestions 85 | if suggestion not in users_interests[user_id]] 86 | 87 | # 88 | # Item-Based Collaborative Filtering 89 | # 90 | 91 | interest_user_matrix = [[user_interest_vector[j] 92 | for user_interest_vector in user_interest_matrix] 93 | for j, _ in enumerate(unique_interests)] 94 | 95 | interest_similarities = [[cosine_similarity(user_vector_i, user_vector_j) 96 | for user_vector_j in interest_user_matrix] 97 | for user_vector_i in interest_user_matrix] 98 | 99 | def most_similar_interests_to(interest_id): 100 | similarities = interest_similarities[interest_id] 101 | pairs = [(unique_interests[other_interest_id], similarity) 102 | for other_interest_id, similarity in enumerate(similarities) 103 | if interest_id != other_interest_id and similarity > 0] 104 | return sorted(pairs, 105 | key=lambda pair: pair[1], 106 | reverse=True) 107 | 108 | def item_based_suggestions(user_id, include_current_interests=False): 109 | suggestions = defaultdict(float) 110 | user_interest_vector = user_interest_matrix[user_id] 111 | for interest_id, is_interested in enumerate(user_interest_vector): 112 | if is_interested == 1: 113 | similar_interests = most_similar_interests_to(interest_id) 114 | for interest, similarity in similar_interests: 115 | suggestions[interest] += similarity 116 | 117 | suggestions = sorted(suggestions.items(), 118 | key=lambda pair: pair[1], 119 | reverse=True) 120 | 121 | if include_current_interests: 122 | return suggestions 123 | else: 124 | return [(suggestion, weight) 125 | for suggestion, weight in suggestions 126 | if suggestion not in users_interests[user_id]] 127 | 128 | 129 | if __name__ == "__main__": 130 | 131 | print("Popular Interests") 132 | print(popular_interests) 133 | print() 134 | 135 | print("Most Popular New Interests") 136 | print("already like:", ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"]) 137 | print(most_popular_new_interests(["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"])) 138 | print() 139 | print("already like:", ["R", "Python", "statistics", "regression", "probability"]) 140 | print(most_popular_new_interests(["R", "Python", "statistics", "regression", "probability"])) 141 | print() 142 | 143 | print("User based similarity") 144 | print("most similar to 0") 145 | print(most_similar_users_to(0)) 146 | 147 | print("Suggestions for 0") 148 | print(user_based_suggestions(0)) 149 | print() 150 | 151 | print("Item based similarity") 152 | print("most 
similar to 'Big Data'") 153 | print(most_similar_interests_to(0)) 154 | print() 155 | 156 | print("suggestions for user 0") 157 | print(item_based_suggestions(0)) 158 | -------------------------------------------------------------------------------- /first-edition/code/hypothesis_and_inference.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from probability import normal_cdf, inverse_normal_cdf 3 | import math, random 4 | 5 | def normal_approximation_to_binomial(n, p): 6 | """finds mu and sigma corresponding to a Binomial(n, p)""" 7 | mu = p * n 8 | sigma = math.sqrt(p * (1 - p) * n) 9 | return mu, sigma 10 | 11 | ##### 12 | # 13 | # probabilities a normal lies in an interval 14 | # 15 | ###### 16 | 17 | # the normal cdf _is_ the probability the variable is below a threshold 18 | normal_probability_below = normal_cdf 19 | 20 | # it's above the threshold if it's not below the threshold 21 | def normal_probability_above(lo, mu=0, sigma=1): 22 | return 1 - normal_cdf(lo, mu, sigma) 23 | 24 | # it's between if it's less than hi, but not less than lo 25 | def normal_probability_between(lo, hi, mu=0, sigma=1): 26 | return normal_cdf(hi, mu, sigma) - normal_cdf(lo, mu, sigma) 27 | 28 | # it's outside if it's not between 29 | def normal_probability_outside(lo, hi, mu=0, sigma=1): 30 | return 1 - normal_probability_between(lo, hi, mu, sigma) 31 | 32 | ###### 33 | # 34 | # normal bounds 35 | # 36 | ###### 37 | 38 | 39 | def normal_upper_bound(probability, mu=0, sigma=1): 40 | """returns the z for which P(Z <= z) = probability""" 41 | return inverse_normal_cdf(probability, mu, sigma) 42 | 43 | def normal_lower_bound(probability, mu=0, sigma=1): 44 | """returns the z for which P(Z >= z) = probability""" 45 | return inverse_normal_cdf(1 - probability, mu, sigma) 46 | 47 | def normal_two_sided_bounds(probability, mu=0, sigma=1): 48 | """returns the symmetric (about the mean) bounds 49 | that contain the specified probability""" 50 | tail_probability = (1 - probability) / 2 51 | 52 | # upper bound should have tail_probability above it 53 | upper_bound = normal_lower_bound(tail_probability, mu, sigma) 54 | 55 | # lower bound should have tail_probability below it 56 | lower_bound = normal_upper_bound(tail_probability, mu, sigma) 57 | 58 | return lower_bound, upper_bound 59 | 60 | def two_sided_p_value(x, mu=0, sigma=1): 61 | if x >= mu: 62 | # if x is greater than the mean, the tail is above x 63 | return 2 * normal_probability_above(x, mu, sigma) 64 | else: 65 | # if x is less than the mean, the tail is below x 66 | return 2 * normal_probability_below(x, mu, sigma) 67 | 68 | def count_extreme_values(): 69 | extreme_value_count = 0 70 | for _ in range(100000): 71 | num_heads = sum(1 if random.random() < 0.5 else 0 # count # of heads 72 | for _ in range(1000)) # in 1000 flips 73 | if num_heads >= 530 or num_heads <= 470: # and count how often 74 | extreme_value_count += 1 # the # is 'extreme' 75 | 76 | return extreme_value_count / 100000 77 | 78 | upper_p_value = normal_probability_above 79 | lower_p_value = normal_probability_below 80 | 81 | ## 82 | # 83 | # P-hacking 84 | # 85 | ## 86 | 87 | def run_experiment(): 88 | """flip a fair coin 1000 times, True = heads, False = tails""" 89 | return [random.random() < 0.5 for _ in range(1000)] 90 | 91 | def reject_fairness(experiment): 92 | """using the 5% significance levels""" 93 | num_heads = len([flip for flip in experiment if flip]) 94 | return num_heads < 469 or num_heads > 531 95 
| 96 | 97 | ## 98 | # 99 | # running an A/B test 100 | # 101 | ## 102 | 103 | def estimated_parameters(N, n): 104 | p = n / N 105 | sigma = math.sqrt(p * (1 - p) / N) 106 | return p, sigma 107 | 108 | def a_b_test_statistic(N_A, n_A, N_B, n_B): 109 | p_A, sigma_A = estimated_parameters(N_A, n_A) 110 | p_B, sigma_B = estimated_parameters(N_B, n_B) 111 | return (p_B - p_A) / math.sqrt(sigma_A ** 2 + sigma_B ** 2) 112 | 113 | ## 114 | # 115 | # Bayesian Inference 116 | # 117 | ## 118 | 119 | def B(alpha, beta): 120 | """a normalizing constant so that the total probability is 1""" 121 | return math.gamma(alpha) * math.gamma(beta) / math.gamma(alpha + beta) 122 | 123 | def beta_pdf(x, alpha, beta): 124 | if x < 0 or x > 1: # no weight outside of [0, 1] 125 | return 0 126 | return x ** (alpha - 1) * (1 - x) ** (beta - 1) / B(alpha, beta) 127 | 128 | 129 | if __name__ == "__main__": 130 | 131 | mu_0, sigma_0 = normal_approximation_to_binomial(1000, 0.5) 132 | print "mu_0", mu_0 133 | print "sigma_0", sigma_0 134 | print "normal_two_sided_bounds(0.95, mu_0, sigma_0)", normal_two_sided_bounds(0.95, mu_0, sigma_0) 135 | print 136 | print "power of a test" 137 | 138 | print "95% bounds based on assumption p is 0.5" 139 | 140 | lo, hi = normal_two_sided_bounds(0.95, mu_0, sigma_0) 141 | print "lo", lo 142 | print "hi", hi 143 | 144 | print "actual mu and sigma based on p = 0.55" 145 | mu_1, sigma_1 = normal_approximation_to_binomial(1000, 0.55) 146 | print "mu_1", mu_1 147 | print "sigma_1", sigma_1 148 | 149 | # a type 2 error means we fail to reject the null hypothesis 150 | # which will happen when X is still in our original interval 151 | type_2_probability = normal_probability_between(lo, hi, mu_1, sigma_1) 152 | power = 1 - type_2_probability # 0.887 153 | 154 | print "type 2 probability", type_2_probability 155 | print "power", power 156 | print 157 | 158 | print "one-sided test" 159 | hi = normal_upper_bound(0.95, mu_0, sigma_0) 160 | print "hi", hi # is 526 (< 531, since we need more probability in the upper tail) 161 | type_2_probability = normal_probability_below(hi, mu_1, sigma_1) 162 | power = 1 - type_2_probability # = 0.936 163 | print "type 2 probability", type_2_probability 164 | print "power", power 165 | print 166 | 167 | print "two_sided_p_value(529.5, mu_0, sigma_0)", two_sided_p_value(529.5, mu_0, sigma_0) 168 | 169 | print "two_sided_p_value(531.5, mu_0, sigma_0)", two_sided_p_value(531.5, mu_0, sigma_0) 170 | 171 | print "upper_p_value(525, mu_0, sigma_0)", upper_p_value(525, mu_0, sigma_0) 172 | print "upper_p_value(527, mu_0, sigma_0)", upper_p_value(527, mu_0, sigma_0) 173 | print 174 | 175 | print "P-hacking" 176 | 177 | random.seed(0) 178 | experiments = [run_experiment() for _ in range(1000)] 179 | num_rejections = len([experiment 180 | for experiment in experiments 181 | if reject_fairness(experiment)]) 182 | 183 | print num_rejections, "rejections out of 1000" 184 | print 185 | 186 | print "A/B testing" 187 | z = a_b_test_statistic(1000, 200, 1000, 180) 188 | print "a_b_test_statistic(1000, 200, 1000, 180)", z 189 | print "p-value", two_sided_p_value(z) 190 | z = a_b_test_statistic(1000, 200, 1000, 150) 191 | print "a_b_test_statistic(1000, 200, 1000, 150)", z 192 | print "p-value", two_sided_p_value(z) 193 | -------------------------------------------------------------------------------- /first-edition/code/recommender_systems.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import math, random 
3 | from collections import defaultdict, Counter 4 | from linear_algebra import dot 5 | 6 | users_interests = [ 7 | ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"], 8 | ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"], 9 | ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"], 10 | ["R", "Python", "statistics", "regression", "probability"], 11 | ["machine learning", "regression", "decision trees", "libsvm"], 12 | ["Python", "R", "Java", "C++", "Haskell", "programming languages"], 13 | ["statistics", "probability", "mathematics", "theory"], 14 | ["machine learning", "scikit-learn", "Mahout", "neural networks"], 15 | ["neural networks", "deep learning", "Big Data", "artificial intelligence"], 16 | ["Hadoop", "Java", "MapReduce", "Big Data"], 17 | ["statistics", "R", "statsmodels"], 18 | ["C++", "deep learning", "artificial intelligence", "probability"], 19 | ["pandas", "R", "Python"], 20 | ["databases", "HBase", "Postgres", "MySQL", "MongoDB"], 21 | ["libsvm", "regression", "support vector machines"] 22 | ] 23 | 24 | popular_interests = Counter(interest 25 | for user_interests in users_interests 26 | for interest in user_interests).most_common() 27 | 28 | def most_popular_new_interests(user_interests, max_results=5): 29 | suggestions = [(interest, frequency) 30 | for interest, frequency in popular_interests 31 | if interest not in user_interests] 32 | return suggestions[:max_results] 33 | 34 | # 35 | # user-based filtering 36 | # 37 | 38 | def cosine_similarity(v, w): 39 | return dot(v, w) / math.sqrt(dot(v, v) * dot(w, w)) 40 | 41 | unique_interests = sorted(list({ interest 42 | for user_interests in users_interests 43 | for interest in user_interests })) 44 | 45 | def make_user_interest_vector(user_interests): 46 | """given a list of interests, produce a vector whose i-th element is 1 47 | if unique_interests[i] is in the list, 0 otherwise""" 48 | return [1 if interest in user_interests else 0 49 | for interest in unique_interests] 50 | 51 | user_interest_matrix = map(make_user_interest_vector, users_interests) 52 | 53 | user_similarities = [[cosine_similarity(interest_vector_i, interest_vector_j) 54 | for interest_vector_j in user_interest_matrix] 55 | for interest_vector_i in user_interest_matrix] 56 | 57 | def most_similar_users_to(user_id): 58 | pairs = [(other_user_id, similarity) # find other 59 | for other_user_id, similarity in # users with 60 | enumerate(user_similarities[user_id]) # nonzero 61 | if user_id != other_user_id and similarity > 0] # similarity 62 | 63 | return sorted(pairs, # sort them 64 | key=lambda (_, similarity): similarity, # most similar 65 | reverse=True) # first 66 | 67 | 68 | def user_based_suggestions(user_id, include_current_interests=False): 69 | # sum up the similarities 70 | suggestions = defaultdict(float) 71 | for other_user_id, similarity in most_similar_users_to(user_id): 72 | for interest in users_interests[other_user_id]: 73 | suggestions[interest] += similarity 74 | 75 | # convert them to a sorted list 76 | suggestions = sorted(suggestions.items(), 77 | key=lambda (_, weight): weight, 78 | reverse=True) 79 | 80 | # and (maybe) exclude already-interests 81 | if include_current_interests: 82 | return suggestions 83 | else: 84 | return [(suggestion, weight) 85 | for suggestion, weight in suggestions 86 | if suggestion not in users_interests[user_id]] 87 | 88 | # 89 | # Item-Based Collaborative Filtering 90 | # 91 | 92 | interest_user_matrix = [[user_interest_vector[j] 93 | for user_interest_vector in 
user_interest_matrix] 94 | for j, _ in enumerate(unique_interests)] 95 | 96 | interest_similarities = [[cosine_similarity(user_vector_i, user_vector_j) 97 | for user_vector_j in interest_user_matrix] 98 | for user_vector_i in interest_user_matrix] 99 | 100 | def most_similar_interests_to(interest_id): 101 | similarities = interest_similarities[interest_id] 102 | pairs = [(unique_interests[other_interest_id], similarity) 103 | for other_interest_id, similarity in enumerate(similarities) 104 | if interest_id != other_interest_id and similarity > 0] 105 | return sorted(pairs, 106 | key=lambda (_, similarity): similarity, 107 | reverse=True) 108 | 109 | def item_based_suggestions(user_id, include_current_interests=False): 110 | suggestions = defaultdict(float) 111 | user_interest_vector = user_interest_matrix[user_id] 112 | for interest_id, is_interested in enumerate(user_interest_vector): 113 | if is_interested == 1: 114 | similar_interests = most_similar_interests_to(interest_id) 115 | for interest, similarity in similar_interests: 116 | suggestions[interest] += similarity 117 | 118 | suggestions = sorted(suggestions.items(), 119 | key=lambda (_, similarity): similarity, 120 | reverse=True) 121 | 122 | if include_current_interests: 123 | return suggestions 124 | else: 125 | return [(suggestion, weight) 126 | for suggestion, weight in suggestions 127 | if suggestion not in users_interests[user_id]] 128 | 129 | 130 | if __name__ == "__main__": 131 | 132 | print "Popular Interests" 133 | print popular_interests 134 | print 135 | 136 | print "Most Popular New Interests" 137 | print "already like:", ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"] 138 | print most_popular_new_interests(["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"]) 139 | print 140 | print "already like:", ["R", "Python", "statistics", "regression", "probability"] 141 | print most_popular_new_interests(["R", "Python", "statistics", "regression", "probability"]) 142 | print 143 | 144 | print "User based similarity" 145 | print "most similar to 0" 146 | print most_similar_users_to(0) 147 | 148 | print "Suggestions for 0" 149 | print user_based_suggestions(0) 150 | print 151 | 152 | print "Item based similarity" 153 | print "most similar to 'Big Data'" 154 | print most_similar_interests_to(0) 155 | print 156 | 157 | print "suggestions for user 0" 158 | print item_based_suggestions(0) 159 | 160 | -------------------------------------------------------------------------------- /first-edition/code-python3/clustering.py: -------------------------------------------------------------------------------- 1 | from linear_algebra import squared_distance, vector_mean, distance 2 | import math, random 3 | import matplotlib.image as mpimg 4 | import matplotlib.pyplot as plt 5 | 6 | class KMeans: 7 | """performs k-means clustering""" 8 | 9 | def __init__(self, k): 10 | self.k = k # number of clusters 11 | self.means = None # means of clusters 12 | 13 | def classify(self, input): 14 | """return the index of the cluster closest to the input""" 15 | return min(range(self.k), 16 | key=lambda i: squared_distance(input, self.means[i])) 17 | 18 | def train(self, inputs): 19 | 20 | self.means = random.sample(inputs, self.k) 21 | assignments = None 22 | 23 | while True: 24 | # Find new assignments 25 | new_assignments = list(map(self.classify, inputs)) 26 | 27 | # If no assignments have changed, we're done. 
28 |             if assignments == new_assignments:
29 |                 return
30 |
31 |             # Otherwise keep the new assignments,
32 |             assignments = new_assignments
33 |
34 |             for i in range(self.k):
35 |                 i_points = [p for p, a in zip(inputs, assignments) if a == i]
36 |                 # avoid divide-by-zero if i_points is empty
37 |                 if i_points:
38 |                     self.means[i] = vector_mean(i_points)
39 |
40 | def squared_clustering_errors(inputs, k):
41 |     """finds the total squared error from k-means clustering the inputs"""
42 |     clusterer = KMeans(k)
43 |     clusterer.train(inputs)
44 |     means = clusterer.means
45 |     assignments = list(map(clusterer.classify, inputs))
46 |
47 |     return sum(squared_distance(input,means[cluster])
48 |                for input, cluster in zip(inputs, assignments))
49 |
50 | def plot_squared_clustering_errors():
51 |
52 |     ks = range(1, len(inputs) + 1)
53 |     errors = [squared_clustering_errors(inputs, k) for k in ks]
54 |
55 |     plt.plot(ks, errors)
56 |     plt.xticks(ks)
57 |     plt.xlabel("k")
58 |     plt.ylabel("total squared error")
59 |     plt.show()
60 |
61 | #
62 | # using clustering to recolor an image
63 | #
64 |
65 | def recolor_image(input_file, k=5):
66 |
67 |     img = mpimg.imread(input_file)
68 |     pixels = [pixel for row in img for pixel in row]
69 |     clusterer = KMeans(k)
70 |     clusterer.train(pixels) # this might take a while
71 |
72 |     def recolor(pixel):
73 |         cluster = clusterer.classify(pixel) # index of the closest cluster
74 |         return clusterer.means[cluster] # mean of the closest cluster
75 |
76 |     new_img = [[recolor(pixel) for pixel in row]
77 |                for row in img]
78 |
79 |     plt.imshow(new_img)
80 |     plt.axis('off')
81 |     plt.show()
82 |
83 | #
84 | # hierarchical clustering
85 | #
86 |
87 | def is_leaf(cluster):
88 |     """a cluster is a leaf if it has length 1"""
89 |     return len(cluster) == 1
90 |
91 | def get_children(cluster):
92 |     """returns the two children of this cluster if it's a merged cluster;
93 |     raises an exception if this is a leaf cluster"""
94 |     if is_leaf(cluster):
95 |         raise TypeError("a leaf cluster has no children")
96 |     else:
97 |         return cluster[1]
98 |
99 | def get_values(cluster):
100 |     """returns the value in this cluster (if it's a leaf cluster)
101 |     or all the values in the leaf clusters below it (if it's not)"""
102 |     if is_leaf(cluster):
103 |         return cluster # is already a 1-tuple containing value
104 |     else:
105 |         return [value
106 |                 for child in get_children(cluster)
107 |                 for value in get_values(child)]
108 |
109 | def cluster_distance(cluster1, cluster2, distance_agg=min):
110 |     """finds the aggregate distance between elements of cluster1
111 |     and elements of cluster2"""
112 |     return distance_agg([distance(input1, input2)
113 |                          for input1 in get_values(cluster1)
114 |                          for input2 in get_values(cluster2)])
115 |
116 | def get_merge_order(cluster):
117 |     if is_leaf(cluster):
118 |         return float('inf')
119 |     else:
120 |         return cluster[0] # merge_order is first element of 2-tuple
121 |
122 | def bottom_up_cluster(inputs, distance_agg=min):
123 |     # start with every input a leaf cluster / 1-tuple
124 |     clusters = [(input,) for input in inputs]
125 |
126 |     # as long as we have more than one cluster left...
127 | while len(clusters) > 1: 128 | # find the two closest clusters 129 | c1, c2 = min([(cluster1, cluster2) 130 | for i, cluster1 in enumerate(clusters) 131 | for cluster2 in clusters[:i]], 132 | key=lambda p: cluster_distance(p[0], p[1], distance_agg)) 133 | 134 | # remove them from the list of clusters 135 | clusters = [c for c in clusters if c != c1 and c != c2] 136 | 137 | # merge them, using merge_order = # of clusters left 138 | merged_cluster = (len(clusters), [c1, c2]) 139 | 140 | # and add their merge 141 | clusters.append(merged_cluster) 142 | 143 | # when there's only one cluster left, return it 144 | return clusters[0] 145 | 146 | def generate_clusters(base_cluster, num_clusters): 147 | # start with a list with just the base cluster 148 | clusters = [base_cluster] 149 | 150 | # as long as we don't have enough clusters yet... 151 | while len(clusters) < num_clusters: 152 | # choose the last-merged of our clusters 153 | next_cluster = min(clusters, key=get_merge_order) 154 | # remove it from the list 155 | clusters = [c for c in clusters if c != next_cluster] 156 | # and add its children to the list (i.e., unmerge it) 157 | clusters.extend(get_children(next_cluster)) 158 | 159 | # once we have enough clusters... 160 | return clusters 161 | 162 | if __name__ == "__main__": 163 | 164 | inputs = [[-14,-5],[13,13],[20,23],[-19,-11],[-9,-16],[21,27],[-49,15],[26,13],[-46,5],[-34,-1],[11,15],[-49,0],[-22,-16],[19,28],[-12,-8],[-13,-19],[-41,8],[-11,-6],[-25,-9],[-18,-3]] 165 | 166 | random.seed(0) # so you get the same results as me 167 | clusterer = KMeans(3) 168 | clusterer.train(inputs) 169 | print("3-means:") 170 | print(clusterer.means) 171 | print() 172 | 173 | random.seed(0) 174 | clusterer = KMeans(2) 175 | clusterer.train(inputs) 176 | print("2-means:") 177 | print(clusterer.means) 178 | print() 179 | 180 | print("errors as a function of k") 181 | 182 | for k in range(1, len(inputs) + 1): 183 | print(k, squared_clustering_errors(inputs, k)) 184 | print() 185 | 186 | 187 | print("bottom up hierarchical clustering") 188 | 189 | base_cluster = bottom_up_cluster(inputs) 190 | print(base_cluster) 191 | 192 | print() 193 | print("three clusters, min:") 194 | for cluster in generate_clusters(base_cluster, 3): 195 | print(get_values(cluster)) 196 | 197 | print() 198 | print("three clusters, max:") 199 | base_cluster = bottom_up_cluster(inputs, max) 200 | for cluster in generate_clusters(base_cluster, 3): 201 | print(get_values(cluster)) 202 | -------------------------------------------------------------------------------- /first-edition/code/clustering.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from linear_algebra import squared_distance, vector_mean, distance 3 | import math, random 4 | import matplotlib.image as mpimg 5 | import matplotlib.pyplot as plt 6 | 7 | class KMeans: 8 | """performs k-means clustering""" 9 | 10 | def __init__(self, k): 11 | self.k = k # number of clusters 12 | self.means = None # means of clusters 13 | 14 | def classify(self, input): 15 | """return the index of the cluster closest to the input""" 16 | return min(range(self.k), 17 | key=lambda i: squared_distance(input, self.means[i])) 18 | 19 | def train(self, inputs): 20 | 21 | self.means = random.sample(inputs, self.k) 22 | assignments = None 23 | 24 | while True: 25 | # Find new assignments 26 | new_assignments = map(self.classify, inputs) 27 | 28 | # If no assignments have changed, we're done. 
29 |             if assignments == new_assignments:
30 |                 return
31 |
32 |             # Otherwise keep the new assignments,
33 |             assignments = new_assignments
34 |
35 |             for i in range(self.k):
36 |                 i_points = [p for p, a in zip(inputs, assignments) if a == i]
37 |                 # avoid divide-by-zero if i_points is empty
38 |                 if i_points:
39 |                     self.means[i] = vector_mean(i_points)
40 |
41 | def squared_clustering_errors(inputs, k):
42 |     """finds the total squared error from k-means clustering the inputs"""
43 |     clusterer = KMeans(k)
44 |     clusterer.train(inputs)
45 |     means = clusterer.means
46 |     assignments = map(clusterer.classify, inputs)
47 |
48 |     return sum(squared_distance(input,means[cluster])
49 |                for input, cluster in zip(inputs, assignments))
50 |
51 | def plot_squared_clustering_errors(plt):
52 |
53 |     ks = range(1, len(inputs) + 1)
54 |     errors = [squared_clustering_errors(inputs, k) for k in ks]
55 |
56 |     plt.plot(ks, errors)
57 |     plt.xticks(ks)
58 |     plt.xlabel("k")
59 |     plt.ylabel("total squared error")
60 |     plt.show()
61 |
62 | #
63 | # using clustering to recolor an image
64 | #
65 |
66 | def recolor_image(input_file, k=5):
67 |
68 |     img = mpimg.imread(input_file)
69 |     pixels = [pixel for row in img for pixel in row]
70 |     clusterer = KMeans(k)
71 |     clusterer.train(pixels) # this might take a while
72 |
73 |     def recolor(pixel):
74 |         cluster = clusterer.classify(pixel) # index of the closest cluster
75 |         return clusterer.means[cluster] # mean of the closest cluster
76 |
77 |     new_img = [[recolor(pixel) for pixel in row]
78 |                for row in img]
79 |
80 |     plt.imshow(new_img)
81 |     plt.axis('off')
82 |     plt.show()
83 |
84 | #
85 | # hierarchical clustering
86 | #
87 |
88 | def is_leaf(cluster):
89 |     """a cluster is a leaf if it has length 1"""
90 |     return len(cluster) == 1
91 |
92 | def get_children(cluster):
93 |     """returns the two children of this cluster if it's a merged cluster;
94 |     raises an exception if this is a leaf cluster"""
95 |     if is_leaf(cluster):
96 |         raise TypeError("a leaf cluster has no children")
97 |     else:
98 |         return cluster[1]
99 |
100 | def get_values(cluster):
101 |     """returns the value in this cluster (if it's a leaf cluster)
102 |     or all the values in the leaf clusters below it (if it's not)"""
103 |     if is_leaf(cluster):
104 |         return cluster # is already a 1-tuple containing value
105 |     else:
106 |         return [value
107 |                 for child in get_children(cluster)
108 |                 for value in get_values(child)]
109 |
110 | def cluster_distance(cluster1, cluster2, distance_agg=min):
111 |     """finds the aggregate distance between elements of cluster1
112 |     and elements of cluster2"""
113 |     return distance_agg([distance(input1, input2)
114 |                          for input1 in get_values(cluster1)
115 |                          for input2 in get_values(cluster2)])
116 |
117 | def get_merge_order(cluster):
118 |     if is_leaf(cluster):
119 |         return float('inf')
120 |     else:
121 |         return cluster[0] # merge_order is first element of 2-tuple
122 |
123 | def bottom_up_cluster(inputs, distance_agg=min):
124 |     # start with every input a leaf cluster / 1-tuple
125 |     clusters = [(input,) for input in inputs]
126 |
127 |     # as long as we have more than one cluster left...
128 | while len(clusters) > 1: 129 | # find the two closest clusters 130 | c1, c2 = min([(cluster1, cluster2) 131 | for i, cluster1 in enumerate(clusters) 132 | for cluster2 in clusters[:i]], 133 | key=lambda (x, y): cluster_distance(x, y, distance_agg)) 134 | 135 | # remove them from the list of clusters 136 | clusters = [c for c in clusters if c != c1 and c != c2] 137 | 138 | # merge them, using merge_order = # of clusters left 139 | merged_cluster = (len(clusters), [c1, c2]) 140 | 141 | # and add their merge 142 | clusters.append(merged_cluster) 143 | 144 | # when there's only one cluster left, return it 145 | return clusters[0] 146 | 147 | def generate_clusters(base_cluster, num_clusters): 148 | # start with a list with just the base cluster 149 | clusters = [base_cluster] 150 | 151 | # as long as we don't have enough clusters yet... 152 | while len(clusters) < num_clusters: 153 | # choose the last-merged of our clusters 154 | next_cluster = min(clusters, key=get_merge_order) 155 | # remove it from the list 156 | clusters = [c for c in clusters if c != next_cluster] 157 | # and add its children to the list (i.e., unmerge it) 158 | clusters.extend(get_children(next_cluster)) 159 | 160 | # once we have enough clusters... 161 | return clusters 162 | 163 | if __name__ == "__main__": 164 | 165 | inputs = [[-14,-5],[13,13],[20,23],[-19,-11],[-9,-16],[21,27],[-49,15],[26,13],[-46,5],[-34,-1],[11,15],[-49,0],[-22,-16],[19,28],[-12,-8],[-13,-19],[-41,8],[-11,-6],[-25,-9],[-18,-3]] 166 | 167 | random.seed(0) # so you get the same results as me 168 | clusterer = KMeans(3) 169 | clusterer.train(inputs) 170 | print "3-means:" 171 | print clusterer.means 172 | print 173 | 174 | random.seed(0) 175 | clusterer = KMeans(2) 176 | clusterer.train(inputs) 177 | print "2-means:" 178 | print clusterer.means 179 | print 180 | 181 | print "errors as a function of k" 182 | 183 | for k in range(1, len(inputs) + 1): 184 | print k, squared_clustering_errors(inputs, k) 185 | print 186 | 187 | 188 | print "bottom up hierarchical clustering" 189 | 190 | base_cluster = bottom_up_cluster(inputs) 191 | print base_cluster 192 | 193 | print 194 | print "three clusters, min:" 195 | for cluster in generate_clusters(base_cluster, 3): 196 | print get_values(cluster) 197 | 198 | print 199 | print "three clusters, max:" 200 | base_cluster = bottom_up_cluster(inputs, max) 201 | for cluster in generate_clusters(base_cluster, 3): 202 | print get_values(cluster) 203 | --------------------------------------------------------------------------------
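Neither edition's hypothesis_and_inference.py exercises its Bayesian-inference helpers (B and beta_pdf) in the __main__ block. The sketch below is illustrative only and not a file from the repository: it shows one way beta_pdf might be used, updating a uniform Beta prior with hypothetical coin-flip counts and inspecting the posterior density. It assumes it is run from inside first-edition/code-python3 with the repository's requirements installed, so that hypothesis_and_inference and its probability dependency import cleanly; the flip counts and the grid of p values are made up for the example.

# illustrative usage sketch (not part of the repository)
from hypothesis_and_inference import beta_pdf

# start from a uniform prior, Beta(1, 1)
prior_alpha, prior_beta = 1, 1

# hypothetical data: 65 heads and 35 tails from a possibly biased coin
heads, tails = 65, 35

# the Beta prior is conjugate to the binomial likelihood,
# so the posterior is Beta(prior_alpha + heads, prior_beta + tails)
posterior_alpha = prior_alpha + heads
posterior_beta = prior_beta + tails

# evaluate the posterior density at a few candidate values of p
for p in [0.4, 0.5, 0.6, 0.65, 0.7, 0.8]:
    print(p, beta_pdf(p, posterior_alpha, posterior_beta))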