├── README.md └── src ├── __init__.py ├── gradient_descent.gif ├── gradient_descent.py ├── gradient_descent2.gif ├── k_means.py ├── kmeans.gif ├── kmeans2.gif ├── kmeans_cluster.gif ├── sgd.gif └── stupid_tricks.py /README.md: -------------------------------------------------------------------------------- 1 | # Stupid Itertools Tricks For Data Science 2 | 3 | Here's the code that goes along with my 2015 PyData Seattle talk. 4 | 5 | [Slides](https://docs.google.com/presentation/d/1eI60SL3UxtWfr9ktrv48-pcIkk4S7JiDmeXGCyyGhCs/edit?usp=sharing) 6 | 7 | [Video](https://www.youtube.com/watch?v=ThS4juptJjQ) 8 | 9 | The original title was 10 | 11 | ## Learning Data Science Using Functional Python 12 | 13 | but then when I started writing it it was turning out to be 14 | 15 | ## Stupid Itertools Tricks 16 | 17 | but since the conference was PyData, I figured I better get data back into it somehow. 18 | 19 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joelgrus/stupid-itertools-tricks-pydata/ab5a552c9d5cbfddf79d77d73ca86d56ead3278d/src/__init__.py -------------------------------------------------------------------------------- /src/gradient_descent.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joelgrus/stupid-itertools-tricks-pydata/ab5a552c9d5cbfddf79d77d73ca86d56ead3278d/src/gradient_descent.gif -------------------------------------------------------------------------------- /src/gradient_descent.py: -------------------------------------------------------------------------------- 1 | from stupid_tricks import * 2 | from itertools import chain 3 | import numpy as np 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | from matplotlib import animation 7 | import random 8 | import math 9 | from functools import reduce, partial 
from operator import add


def gradient_step(df, alpha, x_i):
    """Move ``x_i`` one step along its gradient and return the new point.

    df    -- function mapping a point to its gradient (one entry per coordinate)
    alpha -- signed step size; pass a negative value to descend
    x_i   -- current point, a sequence of coordinates

    Returns the new point as a list.
    """
    return [x_ij + alpha * df_j
            for x_ij, df_j in zip(x_i, df(x_i))]


def gradient_descent(df, x_0, alpha=0.1):
    """Return the lazy, infinite sequence of descent iterates starting at x_0.

    Each element is the previous one moved by ``-alpha`` times its gradient.
    """
    return iterate(partial(gradient_step, df, -alpha), x_0)


def _random_point(width=2):
    """Return a random 2-d point with both coordinates in [-1, width - 1)."""
    return (width * random.random() - 1, width * random.random() - 1)


def _bowl_gradient(x_i):
    """Gradient of x^2 + y^2."""
    return [2 * x_ij for x_ij in x_i]


def _bump(x):
    """f(x, y) = -exp(-x^3/3 + x - y^2); min at (1,0), saddle point at (-1,0)."""
    return -math.exp(x[0]**3/-3 + x[0] - x[1]**2)


def _bump_gradient(x):
    """Gradient of ``_bump``."""
    return ((1 - x[0]**2) * _bump(x), -2 * x[1] * _bump(x))


def _save_descent_animation(df, nframes, filename):
    """Animate 50 random descent paths for gradient ``df`` and save a gif.

    Frame ``n`` draws the n-th iterate of every path; markers shrink as the
    frame number grows.
    """
    colors = list(matplotlib.colors.cnames)
    paths = [take(nframes, gradient_descent(df, _random_point()))
             for _ in range(50)]

    def animation_frame(nframe):
        markersize = 10 - 10 * nframe / nframes
        # zip stops at the 50 paths, so only the first 50 colour names are used
        for color, path in zip(colors, paths):
            plt.plot(*path[nframe], color=color, marker='*',
                     markersize=markersize)

    fig = plt.figure(figsize=(5, 4))
    anim = animation.FuncAnimation(fig, animation_frame, frames=nframes)
    anim.save(filename, writer='imagemagick', fps=4)


def run_gradient_descent(seed=0):
    """runs gradient descent to find a minimum of x^2 + y^2

    Plots the first 10 iterates of 50 random starting points, one colour per
    path, with markers shrinking along each path.
    """
    random.seed(seed)
    colors = list(matplotlib.colors.cnames)

    for color in random.sample(colors, 50):
        path = take(10, gradient_descent(_bowl_gradient, _random_point()))
        for i, (x, y) in enumerate(path):
            plt.plot(x, y, color=color, marker='*', markersize=20-2*i)

    plt.show()


def run_gradient_descent_animation(seed=0, nframes=25):
    """Save 'gradient_descent.gif': descent on x^2 + y^2 from 50 random points."""
    random.seed(seed)
    _save_descent_animation(_bowl_gradient, nframes, 'gradient_descent.gif')


def run_gradient_descent2(seed=0):
    """runs gradient descent to find a minimum of
    -exp(-x^3 / 3 + x - y^2)

    Plots the first 100 iterates of 50 starting points drawn from
    [-1, 2) x [-1, 2).
    """
    # the seed argument used to be accepted but ignored, so runs were not
    # reproducible
    random.seed(seed)
    colors = list(matplotlib.colors.cnames)

    for color in random.sample(colors, 50):
        path = take(100, gradient_descent(_bump_gradient, _random_point(3)))
        for i, (x, y) in enumerate(path):
            plt.plot(x, y, color=color, marker='*', markersize=25-i/4)

    plt.show()


def run_gradient_descent_animation2(seed=0, nframes=25):
    """Save 'gradient_descent2.gif': descent on -exp(-x^3/3 + x - y^2)."""
    random.seed(seed)
    _save_descent_animation(_bump_gradient, nframes, 'gradient_descent2.gif')


def sgd_step(df, alpha, prev_beta, xy_i):
    """df is a function of x_i, y_i, beta

    Returns ``prev_beta`` moved by ``alpha`` times the gradient evaluated at
    the single observation ``xy_i = (x_i, y_i)``, as a list.
    """
    x_i, y_i = xy_i
    gradient = df(x_i, y_i, prev_beta)
    return [beta_j + alpha * df_j
            for beta_j, df_j in zip(prev_beta, gradient)]


def sgd(df, x, y, beta_0, alpha=0.1):
    """Return the lazy, infinite sequence of SGD estimates.

    The first element is ``beta_0`` itself; every following element is the
    previous estimate after one ``-alpha`` step on the next observation,
    cycling through ``zip(x, y)`` forever.
    """
    # accumulate emits its first input unchanged, so beta_0 is prepended to
    # the observation stream to seed the fold
    xys = chain([beta_0], cycle(zip(x, y)))
    return accumulate(xys, partial(sgd_step, df, -alpha))


def sqerror_gradient(x_i, y_i, beta):
    """Gradient w.r.t. ``beta`` of (x_i . beta - y_i)^2 for 2-d x_i and beta."""
    error = x_i[0] * beta[0] + x_i[1] * beta[1] - y_i
    return (2 * x_i[0] * error,
            2 * x_i[1] * error)


def _sgd_line_fit(seed, steps, show_every):
    """Fit y = -5 + 10 x + noise by SGD on 100 random points.

    Returns ``(x, y, subresults)`` where ``x`` holds ``(1, x)`` feature pairs
    and ``subresults`` is every ``show_every``-th of the first ``steps``
    estimates.
    """
    random.seed(seed)
    x = [(1, random.random()) for _ in range(100)]
    y = [-5 * x_i[0] + 10 * x_i[1] + random.random() for x_i in x]
    beta_0 = (random.random(), random.random())
    results = take(steps, sgd(sqerror_gradient, x, y, beta_0, 0.01))
    return x, y, results[::show_every]


def run_sgd(seed=0, steps=5000, show_every=500):
    """Plot the data and the fitted line at every ``show_every``-th SGD step.

    Returns the list of plotted (intercept, slope) estimates.
    """
    x, y, subresults = _sgd_line_fit(seed, steps, show_every)

    _, xs = zip(*x)
    plt.scatter(xs, y)

    for a, b in subresults:
        plt.plot([0, 1], [a, a+b])

    plt.show()
    return subresults


def run_sgd_animation(seed=0, steps=5000, show_every=250):
    """Save 'sgd.gif': one frame per ``show_every``-th SGD estimate of the line.

    Uses the same (correctly signed) gradient as ``run_sgd``; the previous
    version negated it, which combined with the ``-alpha`` step in ``sgd``
    climbed the error surface instead of descending it.
    """
    x, y, subresults = _sgd_line_fit(seed, steps, show_every)
    nframes = len(subresults)

    def animation_frame(nframe):
        a, b = subresults[nframe]
        plt.plot([0, 1], [a, a+b])

    fig = plt.figure(figsize=(5, 4))
    _, xs = zip(*x)
    fig.gca().scatter(xs, y)

    anim = animation.FuncAnimation(fig, animation_frame, frames=nframes)
    anim.save('sgd.gif', writer='imagemagick', fps=4)
"""k_means.py: k-means clustering, once class-based and once as a lazy
sequence of successive means built from iterator helpers."""

import random
from functools import reduce, partial
from itertools import cycle
from operator import add

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import animation
# make_blobs is exported by sklearn.datasets; the old
# sklearn.datasets.samples_generator module no longer exists in current
# scikit-learn releases
from sklearn.datasets import make_blobs

from stupid_tricks import *


class KMeans:
    """good old class based solution"""

    def __init__(self, k):
        self.k = k                               # number of clusters
        self.means = [None for _ in range(k)]    # filled in by fit()

    def fit(self, points, num_iters=10):
        """Run ``num_iters`` rounds of k-means on ``points``.

        The initial means are ``k`` points sampled with the ``random`` module.
        A cluster that ends up with no points keeps its previous mean instead
        of crashing the update.
        """
        points = list(points)
        self.means = random.sample(points, self.k)
        for _ in range(num_iters):
            # assign every point first, then move the means
            assignments = [self.predict(point) for point in points]
            for j in range(self.k):
                cluster = [p for p, c in zip(points, assignments) if c == j]
                if cluster:
                    self.means[j] = [sum(coords) / len(cluster)
                                     for coords in zip(*cluster)]

    def predict(self, point):
        """Return the index of the mean closest to ``point`` (first on ties)."""
        return closest_index(point, self.means)


def run_kmeans(seed=1):
    """Cluster 100 random 2-d points into 5 clusters and plot the result."""
    random.seed(seed)
    # the points come from numpy, which random.seed() does not affect; a
    # seeded local generator makes the whole run reproducible
    points = np.random.RandomState(seed).random_sample((100, 2))

    model = KMeans(5)
    model.fit(points, num_iters=100)
    assignments = [model.predict(point) for point in points]

    for x, y in model.means:
        plt.plot(x, y, marker='*', markersize=10, color='black')

    for j, color in zip(range(5),
                        ['r', 'g', 'b', 'm', 'c']):
        cluster = [p
                   for p, c in zip(points, assignments)
                   if j == c]
        if not cluster:
            continue    # nothing to scatter; zip(*[]) cannot be unpacked
        xs, ys = zip(*cluster)
        plt.scatter(xs, ys, color=color)

    plt.show()

# functional version
# ------------------

def k_meanses(points, k):
    """Return the lazy, infinite sequence of successive k-means estimates.

    The first element is ``k`` points sampled from ``points`` (which must be
    a sequence); each later element is ``new_means`` of the one before.
    """
    initial_means = random.sample(points, k)
    return iterate(partial(new_means, points),
                   initial_means)


def no_repeat(prev, curr):
    """Reducer for ``accumulate``: return ``curr``, or raise StopIteration
    when it equals ``prev``."""
    if prev == curr: raise StopIteration
    else: return curr


def until_convergence(it):
    """Yield elements of ``it`` until one equals its predecessor, then stop.

    The repeated element is not yielded.  Written as a generator so that the
    result stays exhausted once it has stopped.
    """
    it = iter(it)
    try:
        prev = next(it)
    except StopIteration:
        return
    yield prev
    for curr in it:
        if prev == curr:
            return
        yield curr
        prev = curr


def new_means(points, old_means):
    """Assign each point to its closest old mean and return the new means.

    A cluster that attracts no points keeps its old mean.  Replacing it with
    an empty mean would make its squared distance to every point 0, so it
    would capture all points on the next iteration.
    """
    k = len(old_means)
    assignments = [closest_index(point, old_means)
                   for point in points]
    clusters = [[point
                 for point, c in zip(points, assignments)
                 if c == j] for j in range(k)]
    return [cluster_mean(cluster) if cluster else list(old_mean)
            for cluster, old_mean in zip(clusters, old_means)]


def closest_index(point, means):
    """Return the index of the element of ``means`` nearest to ``point``."""
    return min(enumerate(means),
               key=lambda pair: squared_distance(point, pair[1]))[0]


def squared_distance(p, q):
    """Return the squared Euclidean distance between ``p`` and ``q``."""
    return sum((p_i - q_i)**2 for p_i, q_i in zip(p, q))


def cluster_mean(points):
    """Return the coordinate-wise mean of ``points`` as a list.

    An empty ``points`` gives an empty list.
    """
    num_points = len(points)
    dim = len(points[0]) if points else 0
    sum_points = [sum(point[j] for point in points)
                  for j in range(dim)]
    return [s / num_points for s in sum_points]


def run_kmeans_functional(seed=0):
    """Plot 500 random points and every iteration's means until convergence."""
    random.seed(seed)
    data = [(random.random(), random.random()) for _ in range(500)]
    meanses = list(until_convergence(k_meanses(data, 5)))

    x, y = zip(*data)
    plt.scatter(x, y, color='black')

    colors = ['r', 'g', 'b', 'c', 'm']
    for i, means in enumerate(meanses):
        for m, color in zip(means, colors):
            # markers grow with the iteration number; iteration 0 has size 0
            plt.plot(*m, color=color,
                     marker='*',
                     markersize=3*i)

    plt.show()


def _save_cluster_animation(data, k, filename):
    """Run k-means on ``data`` to convergence and save one gif frame per
    iteration, colouring each point by its closest mean."""
    meanses = list(until_convergence(k_meanses(data, k)))

    # colors = random.sample(list(matplotlib.colors.cnames), k)
    colors = ['r', 'g', 'b', 'c', 'm']

    def animation_frame(nframe):
        means = meanses[nframe]
        plt.cla()
        assignments = [closest_index(point, means)
                       for point in data]
        clusters = [[point
                     for point, c in zip(data, assignments)
                     if c == j] for j in range(k)]

        # cycle the palette so clusters beyond the fifth are still drawn
        for cluster, color, mean in zip(clusters, cycle(colors), means):
            if cluster:     # zip(*[]) cannot be unpacked into x, y
                x, y = zip(*cluster)
                plt.scatter(x, y, color=color)
            plt.plot(*mean, color=color, marker='*', markersize=20)

    fig = plt.figure(figsize=(5, 4))
    anim = animation.FuncAnimation(fig, animation_frame, frames=len(meanses))
    anim.save(filename, writer='imagemagick', fps=4)


def run_kmeans_animation(seed=0, k=5):
    """Save 'kmeans_cluster.gif': k-means on 500 uniform random 2-d points."""
    random.seed(seed)
    data = [(random.random(), random.random()) for _ in range(500)]
    _save_cluster_animation(data, k, 'kmeans_cluster.gif')


def run_kmeans_animation2(seed=0, k=5):
    """Save 'kmeans2.gif': k-means on 500 points whose x falls in one of five
    unit-wide bands and whose y is standard normal."""
    random.seed(seed)
    data = [(random.choice([0,1,2,4,5]) + random.random(),
             random.normalvariate(0, 1)) for _ in range(500)]
    _save_cluster_animation(data, k, 'kmeans2.gif')
"""stupid_tricks.py: small functional-style helpers built on itertools."""

from itertools import accumulate, chain, count, cycle, islice, repeat, tee

# head(it) consumes and returns the first element of an iterator
head = next


def tail(it):
    """Drop the first element of ``it`` and return the remaining iterator.

    Accepts any iterable; an iterator argument is advanced in place and
    returned.  Raises StopIteration when ``it`` is empty.
    """
    it = iter(it)
    next(it)
    return it


def take(n, it):
    """Return the first ``n`` elements of ``it`` as a list (fewer if it ends)."""
    return list(islice(it, n))


def drop(n, it):
    """Return an iterator over ``it`` with its first ``n`` elements skipped."""
    return islice(it, n, None)


def iterate(f, x):
    """return (x, f(x), f(f(x)), ...)"""
    # accumulate emits x first, then folds f over an endless supply of
    # ignored inputs
    return accumulate(repeat(x), lambda fx, _: f(fx))


def _until_pair(it, should_stop):
    """Yield elements of ``it`` until ``should_stop(prev, curr)`` is true for
    an element and its predecessor; that element is not yielded."""
    it = iter(it)
    try:
        prev = next(it)
    except StopIteration:
        return
    yield prev
    for curr in it:
        if should_stop(prev, curr):
            return
        yield curr
        prev = curr


def until_convergence(it):
    """returns elements of it until the same element appears twice in a row,
    then stops"""
    # a generator stays exhausted once it has stopped, unlike an accumulate
    # whose callback raised StopIteration
    return _until_pair(it, lambda prev, curr: prev == curr)


def within_tolerance(tol, prev, curr):
    """Reducer for ``accumulate``: return ``curr``, or raise StopIteration
    when it is within ``tol`` of ``prev``."""
    if abs(prev - curr) < tol:
        raise StopIteration
    else:
        return curr


def until_nearly_convergence(it, tolerance=0.001):
    """Yield elements of ``it`` until one is within ``tolerance`` of its
    predecessor, then stop (that element is not yielded)."""
    return _until_pair(it, lambda prev, curr: abs(prev - curr) < tolerance)