├── README.md
└── src
    ├── __init__.py
    ├── gradient_descent.gif
    ├── gradient_descent.py
    ├── gradient_descent2.gif
    ├── k_means.py
    ├── kmeans.gif
    ├── kmeans2.gif
    ├── kmeans_cluster.gif
    ├── sgd.gif
    └── stupid_tricks.py


/README.md:
--------------------------------------------------------------------------------
 1 | # Stupid Itertools Tricks For Data Science
 2 | 
 3 | Here's the code that goes along with my 2015 PyData Seattle talk.  
 4 | 
 5 | [Slides](https://docs.google.com/presentation/d/1eI60SL3UxtWfr9ktrv48-pcIkk4S7JiDmeXGCyyGhCs/edit?usp=sharing)
 6 | 
 7 | [Video](https://www.youtube.com/watch?v=ThS4juptJjQ)
 8 | 
 9 | The original title was 
10 | 
11 | ## Learning Data Science Using Functional Python 
12 | 
13 | but once I started writing it, it was turning out to be
14 | 
15 | ## Stupid Itertools Tricks
16 | 
17 | but since the conference was PyData, I figured I'd better get data back into it somehow.
18 | 
19 | 


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joelgrus/stupid-itertools-tricks-pydata/ab5a552c9d5cbfddf79d77d73ca86d56ead3278d/src/__init__.py


--------------------------------------------------------------------------------
/src/gradient_descent.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joelgrus/stupid-itertools-tricks-pydata/ab5a552c9d5cbfddf79d77d73ca86d56ead3278d/src/gradient_descent.gif


--------------------------------------------------------------------------------
/src/gradient_descent.py:
--------------------------------------------------------------------------------
  1 | from stupid_tricks import *
  2 | from itertools import chain
  3 | import numpy as np
  4 | import matplotlib
  5 | import matplotlib.pyplot as plt
  6 | from matplotlib import animation
  7 | import random
  8 | import math
  9 | from functools import reduce, partial
 10 | from operator import add
 11 | 
 12 | def gradient_step(df, alpha, x_i):
 13 |   return [x_ij + alpha * df_j
 14 |           for x_ij, df_j in zip(x_i, df(x_i))]
 15 | 
 16 | def gradient_descent(df, x_0, alpha=0.1):
 17 |   return iterate(partial(gradient_step, df, -alpha), x_0)
 18 | 
 19 | def run_gradient_descent(seed=0):
 20 |   """runs gradient descent to find a minimum of x^2 + y^2"""
 21 |   random.seed(seed)
 22 |   colors = [color for color in matplotlib.colors.cnames]
 23 | 
 24 |   def random_point():
 25 |     return (2 * random.random() - 1, 2 * random.random() - 1)
 26 | 
 27 |   def df(x_i):
 28 |     """this is the gradient of x^2 + y^2"""
 29 |     return [2 * x_ij for x_ij in x_i]
 30 | 
 31 |   for color in random.sample(colors, 50):
 32 |     path = take(10, gradient_descent(df, random_point()))
 33 |     for i, (x, y) in enumerate(path):
 34 |       plt.plot(x, y, color=color, marker='*', markersize=20-2*i)
 35 | 
 36 |   plt.show()
 37 | 
 38 | def run_gradient_descent_animation(seed=0, nframes=25):
 39 |   random.seed(seed)
 40 | 
 41 |   colors = [color for color in matplotlib.colors.cnames]
 42 | 
 43 |   def random_point():
 44 |     return (2 * random.random() - 1, 2 * random.random() - 1)
 45 | 
 46 |   def df(x_i):
 47 |     """this is the gradient of x^2 + y^2"""
 48 |     return [2 * x_ij for x_ij in x_i]
 49 | 
 50 |   paths = [take(nframes, gradient_descent(df, random_point()))
 51 |            for _ in range(50)]
 52 | 
 53 |   def animation_frame(nframe):
 54 |     points = [path[nframe] for path in paths]
 55 | 
 56 |     for color, point in zip(colors, points):
 57 |       markersize = 10 - 10 * nframe / nframes
 58 |       plt.plot(*point, color=color, marker='*', markersize=markersize)
 59 | 
 60 |   fig = plt.figure(figsize=(5,4))
 61 |   anim = animation.FuncAnimation(fig, animation_frame, frames=nframes)
 62 |   anim.save('gradient_descent.gif', writer='imagemagick', fps=4)
 63 | 
 64 | 
 65 | def run_gradient_descent2(seed=0):
 66 |   """runs gradient descent to find a minimum of
 67 |   exp(x^3 / 3 + x - y^2)"""
 68 | 
 69 |   colors = [color for color in matplotlib.colors.cnames]
 70 | 
 71 |   def random_point():
 72 |     return (3 * random.random() - 1, 3 * random.random() - 1)
 73 | 
 74 |   def f(x):
 75 |     """has min at (1,0), saddle point at (-1,0)"""
 76 |     return -math.exp(x[0]**3/-3 + x[0] - x[1]**2)
 77 | 
 78 |   def df(x):
 79 |     return ((1 - x[0]**2) * f(x), -2 * x[1] * f(x))
 80 | 
 81 |   for color in random.sample(colors, 50):
 82 |     path = take(100, gradient_descent(df, random_point()))
 83 |     for i, (x, y) in enumerate(path):
 84 |       plt.plot(x, y, color=color, marker='*', markersize=25-i/4)
 85 | 
 86 |   plt.show()
 87 | 
 88 | def run_gradient_descent_animation2(seed=0, nframes=25):
 89 |   random.seed(seed)
 90 | 
 91 |   colors = [color for color in matplotlib.colors.cnames]
 92 | 
 93 |   def random_point():
 94 |     return (2 * random.random() - 1, 2 * random.random() - 1)
 95 | 
 96 |   def f(x):
 97 |     """has min at (1,0), saddle point at (-1,0)"""
 98 |     return -math.exp(x[0]**3/-3 + x[0] - x[1]**2)
 99 | 
100 |   def df(x):
101 |     return ((1 - x[0]**2) * f(x), -2 * x[1] * f(x))
102 | 
103 |   paths = [take(nframes, gradient_descent(df, random_point()))
104 |            for _ in range(50)]
105 | 
106 |   def animation_frame(nframe):
107 |     points = [path[nframe] for path in paths]
108 | 
109 |     for color, point in zip(colors, points):
110 |       markersize = 10 - 10 * nframe / nframes
111 |       plt.plot(*point, color=color, marker='*', markersize=markersize)
112 | 
113 |   fig = plt.figure(figsize=(5,4))
114 |   anim = animation.FuncAnimation(fig, animation_frame, frames=nframes)
115 |   anim.save('gradient_descent2.gif', writer='imagemagick', fps=4)
116 | 
def sgd_step(df, alpha, prev_beta, xy_i):
  """return the coefficients after one update on the single example xy_i

  df is a function of x_i, y_i, beta and xy_i is the pair (x_i, y_i).
  each new coefficient is the old one plus alpha times the matching
  gradient entry"""
  features, target = xy_i
  slopes = df(features, target, prev_beta)
  updated = []
  for coefficient, slope in zip(prev_beta, slopes):
    updated.append(coefficient + alpha * slope)
  return updated
123 | 
def sgd(df, x, y, beta_0, alpha=0.1):
  """lazily yield beta_0 followed by the coefficients after each update

  the (x_i, y_i) pairs are visited in order, over and over, and every
  update moves the coefficients by -alpha times df on the current pair"""
  update = partial(sgd_step, df, -alpha)
  examples = cycle(zip(x, y))
  # accumulate takes its first input as the starting value, so beta_0 goes
  # in front of the stream of examples
  return accumulate(chain([beta_0], examples), update)
127 | 
def run_sgd(seed=0, steps=5000, show_every=500):
  """fits a line to noisy data with stochastic gradient descent and plots it

  the data is 100 points with y = -5 + 10 * x + noise, where x and the noise
  are both drawn with random.random().  every show_every-th of the first
  steps coefficient pairs is drawn as a line over [0, 1] on top of a scatter
  of the data, and those coefficient pairs are returned"""
  random.seed(seed)
  x = [(1, random.random()) for _ in range(100)]
  y = [-5 * x_i[0] + 10 * x_i[1] + random.random() for x_i in x]

  def predict(x_i, beta):
    return x_i[0] * beta[0] + x_i[1] * beta[1]

  def error(x_i, y_i, beta):
    return predict(x_i, beta) - y_i

  def sqerror(x_i, y_i, beta):
    return error(x_i, y_i, beta) ** 2

  def sqerror_gradient(x_i, y_i, beta):
    """this is the gradient of sqerror with respect to beta"""
    residual = error(x_i, y_i, beta)
    return (2 * x_i[0] * residual, 2 * x_i[1] * residual)

  beta_0 = (random.random(), random.random())
  results = take(steps, sgd(sqerror_gradient, x, y, beta_0, 0.01))
  subresults = results[::show_every]

  # the first feature is the constant 1, only the second one is plotted
  _, xs = zip(*x)
  plt.scatter(xs, y)

  for intercept, slope in subresults:
    plt.plot([0, 1], [intercept, intercept + slope])

  plt.show()
  return subresults
156 | 
def run_sgd_animation(seed=0, steps=5000, show_every=250):
  """saves 'sgd.gif', an animation of stochastic gradient descent fitting a
  line to noisy data

  the data is 100 points with y = -5 + 10 * x + noise, where x and the noise
  are both drawn with random.random().  every show_every-th of the first
  steps coefficient pairs becomes one frame, drawn as a line over [0, 1] on
  top of a scatter of the data; earlier lines are not cleared"""
  random.seed(seed)
  x = [(1, random.random()) for _ in range(100)]
  y = [-5 * x_i[0] + 10 * x_i[1] + random.random() for x_i in x]

  def predict(x_i, beta): return x_i[0] * beta[0] + x_i[1] * beta[1]

  def error(x_i, y_i, beta): return predict(x_i, beta) - y_i

  def sqerror_gradient(x_i, y_i, beta):
    """this is the gradient of error(x_i, y_i, beta) ** 2 in beta"""
    # sgd already steps by -alpha, so the gradient must keep its true
    # (positive) sign here; negating it would climb the error instead
    residual = error(x_i, y_i, beta)
    return (2 * x_i[0] * residual,
            2 * x_i[1] * residual)

  beta_0 = (random.random(), random.random())
  results = take(steps, sgd(sqerror_gradient, x, y, beta_0, 0.01))

  subresults = results[::show_every]
  nframes = len(subresults)

  def animation_frame(nframe):
    a, b = subresults[nframe]
    plt.plot([0, 1], [a, a+b])

  fig = plt.figure(figsize=(5,4))
  # the first feature is the constant 1, only the second one is plotted
  _, xs = zip(*x)
  fig.gca().scatter(xs, y)

  anim = animation.FuncAnimation(fig, animation_frame, frames=nframes)
  anim.save('sgd.gif', writer='imagemagick', fps=4)
188 | 


--------------------------------------------------------------------------------
/src/gradient_descent2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joelgrus/stupid-itertools-tricks-pydata/ab5a552c9d5cbfddf79d77d73ca86d56ead3278d/src/gradient_descent2.gif


--------------------------------------------------------------------------------
/src/k_means.py:
--------------------------------------------------------------------------------
  1 | from stupid_tricks import *
  2 | from sklearn.datasets.samples_generator import make_blobs
  3 | import numpy as np
  4 | import matplotlib.pyplot as plt
  5 | import matplotlib
  6 | from matplotlib import animation
  7 | import random
  8 | from functools import reduce, partial
  9 | from operator import add
 10 | 
 11 | class KMeans:
 12 |   """good old class based solution"""
 13 |   def __init__(self, k):
 14 |     self.k = k
 15 |     self.means = [None for _ in range(k)]
 16 | 
 17 |   def fit(self, points, num_iters=10):
 18 |     assignments = [None for _ in points]
 19 |     self.means = random.sample(list(points), self.k)
 20 |     for _ in range(num_iters):
 21 |       for i, point in enumerate(points):
 22 |         assignments[i] = self.predict(point)
 23 |       for j in range(self.k):
 24 |         cluster = [p for p, c in zip(points, assignments) if c == j]
 25 |         self.means[j] = list(map(lambda x: x / len(cluster), reduce(partial(map, add), cluster)))
 26 | 
 27 |   def predict(self, point):
 28 |     d_min = float('inf')
 29 |     for j, m in enumerate(self.means):
 30 |       d = sum((m_i - p_i)**2 for m_i, p_i in zip(m, point))
 31 |       if d < d_min:
 32 |         prediction = j
 33 |         d_min = d
 34 |     return prediction
 35 | 
 36 | def run_kmeans(seed=1):
 37 |   random.seed(seed)
 38 |   points = np.random.random((100,2))
 39 | 
 40 |   model = KMeans(5)
 41 |   model.fit(points, num_iters=100)
 42 |   assignments = [model.predict(point) for point in points]
 43 | 
 44 |   for x, y in model.means:
 45 |     plt.plot(x, y, marker='*', markersize=10, color='black')
 46 | 
 47 |   for j, color in zip(range(5),
 48 |                       ['r', 'g', 'b', 'm', 'c']):
 49 |     cluster = [p
 50 |                for p, c in zip(points, assignments)
 51 |                if j == c]
 52 |     xs, ys = zip(*cluster)
 53 |     plt.scatter(xs, ys, color=color)
 54 | 
 55 |   plt.show()
 56 | 
 57 | # functional version
 58 | # ------------------
 59 | 
 60 | def k_meanses(points, k):
 61 |   initial_means = random.sample(points, k)
 62 |   return iterate(partial(new_means, points),
 63 |                  initial_means)
 64 | 
 65 | def no_repeat(prev, curr):
 66 |   if prev == curr: raise StopIteration
 67 |   else: return curr
 68 | 
 69 | def until_convergence(it):
 70 |   return accumulate(it, no_repeat)
 71 | 
 72 | def new_means(points, old_means):
 73 |   k = len(old_means)
 74 |   assignments = [closest_index(point, old_means)
 75 |                  for point in points]
 76 |   clusters = [[point
 77 |                for point, c in zip(points, assignments)
 78 |                if c == j] for j in range(k)]
 79 |   return [cluster_mean(cluster) for cluster in clusters]
 80 | 
 81 | def closest_index(point, means):
 82 |   return min(enumerate(means),
 83 |              key=lambda pair: squared_distance(point, pair[1]))[0]
 84 | 
 85 | def squared_distance(p, q):
 86 |   return sum((p_i - q_i)**2 for p_i, q_i in zip(p, q))
 87 | 
 88 | def cluster_mean(points):
 89 |   num_points = len(points)
 90 |   dim = len(points[0]) if points else 0
 91 |   sum_points = [sum(point[j] for point in points)
 92 |                 for j in range(dim)]
 93 |   return [s / num_points for s in sum_points]
 94 | 
 95 | 
 96 | def run_kmeans_functional(seed=0):
 97 |   random.seed(seed)
 98 |   data = [(random.random(), random.random()) for _ in range(500)]
 99 |   meanses = [mean for mean in until_convergence(k_meanses(data, 5))]
100 | 
101 |   x, y = zip(*data)
102 |   plt.scatter(x, y, color='black')
103 | 
104 |   colors = ['r', 'g', 'b', 'c', 'm']
105 |   for i, means in enumerate(meanses):
106 |     for m, color in zip(means, colors):
107 |       plt.plot(*m, color=color,
108 |                marker='*',
109 |                markersize=3*i)
110 | 
111 |   plt.show()
112 | 
113 | 
def run_kmeans_animation(seed=0, k=5):
  """saves 'kmeans_cluster.gif', an animation of k-means with k means on 500
  random points of the unit square

  each frame shows one list of means from the run as stars, together with
  the points colored by their closest mean.  there are 5 colors; they are
  reused in order when k is larger than 5"""
  random.seed(seed)
  data = [(random.random(), random.random()) for _ in range(500)]
  meanses = list(until_convergence(k_meanses(data, k)))

  colors = ['r', 'g', 'b', 'c', 'm']

  def animation_frame(nframe):
    means = meanses[nframe]
    plt.cla()
    clusters = [[] for _ in range(k)]
    for point in data:
      clusters[closest_index(point, means)].append(point)

    for j, (cluster, mean) in enumerate(zip(clusters, means)):
      # wrap around so clusters past the fifth still get drawn
      color = colors[j % len(colors)]
      if cluster:
        # zip(*[]) has nothing to unpack into x, y, so only scatter
        # clusters that actually have points
        x, y = zip(*cluster)
        plt.scatter(x, y, color=color)
      plt.plot(*mean, color=color, marker='*', markersize=20)

  fig = plt.figure(figsize=(5,4))
  anim = animation.FuncAnimation(fig, animation_frame, frames=len(meanses))
  anim.save('kmeans_cluster.gif', writer='imagemagick', fps=4)
139 | 
def run_kmeans_animation2(seed=0, k=5):
  """saves 'kmeans2.gif', an animation of k-means with k means on 500 points
  laid out in vertical bands

  a point's x is one of 0, 1, 2, 4, 5 plus random.random() and its y is
  drawn with random.normalvariate(0, 1).  each frame shows one list of means
  from the run as stars, together with the points colored by their closest
  mean.  there are 5 colors; they are reused in order when k is larger
  than 5"""
  random.seed(seed)
  data = [(random.choice([0,1,2,4,5]) + random.random(),
           random.normalvariate(0, 1)) for _ in range(500)]
  meanses = list(until_convergence(k_meanses(data, k)))

  colors = ['r', 'g', 'b', 'c', 'm']

  def animation_frame(nframe):
    means = meanses[nframe]
    plt.cla()
    clusters = [[] for _ in range(k)]
    for point in data:
      clusters[closest_index(point, means)].append(point)

    for j, (cluster, mean) in enumerate(zip(clusters, means)):
      # wrap around so clusters past the fifth still get drawn
      color = colors[j % len(colors)]
      if cluster:
        # zip(*[]) has nothing to unpack into x, y, so only scatter
        # clusters that actually have points
        x, y = zip(*cluster)
        plt.scatter(x, y, color=color)
      plt.plot(*mean, color=color, marker='*', markersize=20)

  fig = plt.figure(figsize=(5,4))
  anim = animation.FuncAnimation(fig, animation_frame, frames=len(meanses))
  anim.save('kmeans2.gif', writer='imagemagick', fps=4)
166 | 


--------------------------------------------------------------------------------
/src/kmeans.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joelgrus/stupid-itertools-tricks-pydata/ab5a552c9d5cbfddf79d77d73ca86d56ead3278d/src/kmeans.gif


--------------------------------------------------------------------------------
/src/kmeans2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joelgrus/stupid-itertools-tricks-pydata/ab5a552c9d5cbfddf79d77d73ca86d56ead3278d/src/kmeans2.gif


--------------------------------------------------------------------------------
/src/kmeans_cluster.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joelgrus/stupid-itertools-tricks-pydata/ab5a552c9d5cbfddf79d77d73ca86d56ead3278d/src/kmeans_cluster.gif


--------------------------------------------------------------------------------
/src/sgd.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joelgrus/stupid-itertools-tricks-pydata/ab5a552c9d5cbfddf79d77d73ca86d56ead3278d/src/sgd.gif


--------------------------------------------------------------------------------
/src/stupid_tricks.py:
--------------------------------------------------------------------------------
 1 | from itertools import count, repeat, cycle, islice, tee, repeat, accumulate, chain
 2 | 
# head is another name for the builtin next: head(it) returns the next
# element of the iterator it and advances it
head = next
 4 | 
 5 | def tail(it):
 6 |   next(it)
 7 |   return it
 8 | 
 9 | def take(n, it):
10 |   return [x for x in islice(it, n)]
11 | 
def drop(n, it):
  """skip the first n elements of it and lazily yield the rest"""
  start, stop = n, None
  return islice(it, start, stop)
14 | 
def iterate(f, x):
  """return (x, f(x), f(f(x)), ...)"""
  def apply_to_previous(previous, _ignored):
    # accumulate passes the running value first; the repeated x is unused
    return f(previous)

  endless_xs = repeat(x)
  return accumulate(endless_xs, apply_to_previous)
18 | 
def until_convergence(it):
  """returns elements of it until the same element appears twice in a row,
  then stops"""
  def keep_unless_repeated(previous, current):
    repeated = previous == current
    if repeated:
      # raised from inside accumulate, this ends the stream
      raise StopIteration
    return current

  return accumulate(it, keep_unless_repeated)
27 | 
def within_tolerance(tol, prev, curr):
  """return curr, unless it is less than tol away from prev, in which case
  raise StopIteration"""
  close_enough = abs(prev - curr) < tol
  if close_enough:
    raise StopIteration
  return curr
33 | 
def until_nearly_convergence(it, tolerance=0.001):
  """returns elements of it until one is less than tolerance away from the
  element before it, then stops (that close element is not returned)"""
  # partial is not among this module's imports, so bring it in here;
  # without it the call below fails with a NameError
  from functools import partial
  return accumulate(it, partial(within_tolerance, tolerance))
36 | 


--------------------------------------------------------------------------------