├── .gitignore
├── Dockerfile
├── README.md
├── requirements.txt
├── video1
    └── hello-world.py
├── video2
    ├── iris.pdf
    └── viz.py
├── video3
    └── dogs.py
├── video4
    └── pipeline.py
├── video5
    └── ep5.py
├── video6
    └── README.md
├── video7
    ├── README.md
    └── ep7.ipynb
├── video8
    └── decision_tree.py
└── video9
    └── README.md


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | 
55 | # Sphinx documentation
56 | docs/_build/
57 | 
58 | # PyBuilder
59 | target/
60 | 
61 | #Ipython Notebook
62 | .ipynb_checkpoints
63 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 | RUN apt-get update && apt-get install -y libblas3 libc6 liblapack3 gcc gfortran python3-dev\
3 |     libgcc1 libgfortran3 libstdc++6 g++ graphviz build-essential\
4 |     python3-tk tk-dev libpng12-dev curl python3-pip git && apt-get autoclean
5 | COPY requirements.txt .
6 | RUN python3 -m pip install -r requirements.txt
7 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # :space_invader: Machine Learning Recipes
 2 | 
 3 | Code developed in video series "Machine Learning Recipes", eventually with some personal comments and annotations.
 4 | 
 5 | ## Installation
 6 | 
 7 | ```bash
 8 | # Ubuntu < 16.04
 9 | sudo apt-get install libatlas-dev libatlas3-base gfortran python-dev\
10 |     libblas3 liblapack3 build-essential libatlas-base-dev graphviz\
11 |     libgraphviz-dev pkg-config build-essential python-tk tk-dev\
12 |     libpng12-dev curl
13 | 
14 | 
15 | # Ubuntu 16.04+
16 | sudo apt install libblas3 libc6 liblapack3 gcc gfortran python-dev\
17 |     libgcc1 libgfortran3 libstdc++6 g++ graphviz build-essential\
18 |     python-tk tk-dev libpng12-dev curl
19 | 
20 | # After:
21 | pip install -r requirements.txt
22 | 
23 | # Docker
24 | docker pull cassiobotaro/mlr
25 | ```
26 | ## Usage
27 | 
28 | `python <example_code>.py`
29 | 
30 | via docker
31 | 
32 | `docker run --rm  -v $(pwd):/mlr  casiobotaro/mlr python3 mlr/video<number>/<example_code>.py `
33 | 
34 | ## Contributing
35 | 
36 | 1. Fork it!
37 | 2. Create your feature branch: `git checkout -b my-new-feature`
38 | 3. Commit your changes: `git commit -am 'Add some feature'`
39 | 4. Push to the branch: `git push origin my-new-feature`
40 | 5. Submit a pull request :D
41 | 
42 | ## History
43 | 
44 | - [Hello World - Machine Learning Recipes #1](https://youtu.be/cKxRvEZd3Mw)
45 | - [Visualizing a Decision Tree - Machine Learning Recipes #2](https://www.youtube.com/watch?v=tNa99PG8hR8)
46 | - [What Makes a Good Feature? - Machine Learning Recipes #3](https://youtu.be/N9fDIAflCMY)
47 | - [Let’s Write a Pipeline - Machine Learning Recipes #4](https://youtu.be/84gqSbLcBFE)
48 | - [Writing Our First Classifier - Machine Learning Recipes #5](https://youtu.be/AoeEHqVSNOw)
49 | - [Train an Image Classifier with TensorFlow for Poets - Machine Learning Recipes #6](https://youtu.be/cSKfRcEDGUs)
50 | - [Classifying Handwritten Digits with TF.Learn - Machine Learning Recipes #7](https://youtu.be/Gj0iyo265bc)
51 | - [Let’s Write a Decision Tree Classifier from Scratch - Machine Learning Recipes #8](https://youtu.be/LDRbO9a6XPU)
52 | - [Intro to Feature Engineering with TensorFlow - Machine Learning Recipes #9](https://youtu.be/d12ra3b_M-0)
53 | 
54 | ## Credits
55 | 
56 | Subscribe to the [Google Developers](http://goo.gl/mQyv5L)
57 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | scikit-learn
4 | git+https://github.com/cassiobotaro/pydot
5 | matplotlib
6 | 


--------------------------------------------------------------------------------
/video1/hello-world.py:
--------------------------------------------------------------------------------
 1 | from sklearn import tree
 2 | 
 3 | # 1 smooth, 0 bumpy
 4 | features = [[140, 1], [130, 1], [150, 0], [170, 0]]
 5 | # 0 apple, 1 orange
 6 | labels = [0, 0, 1, 1]
 7 | 
 8 | clf = tree.DecisionTreeClassifier()
 9 | clf = clf.fit(features, labels)
10 | print(clf.predict([[160, 0]]))
11 | 


--------------------------------------------------------------------------------
/video2/iris.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobotaro/machine_learning_recipes/9cdcd23427f1752d706eab071f898667ef39779c/video2/iris.pdf


--------------------------------------------------------------------------------
/video2/viz.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pydot
 3 | from sklearn import tree
 4 | from sklearn.datasets import load_iris
 5 | from sklearn.externals.six import StringIO
 6 | 
 7 | iris = load_iris()
 8 | test_idx = [0, 50, 100]
 9 | 
10 | # training data
11 | train_target = np.delete(iris.target, test_idx)
12 | train_data = np.delete(iris.data, test_idx, axis=0)
13 | 
14 | # testing data
15 | test_target = iris.target[test_idx]
16 | test_data = iris.data[test_idx]
17 | 
18 | clf = tree.DecisionTreeClassifier()
19 | clf = clf.fit(train_data, train_target)
20 | 
21 | print(test_target)
22 | print(clf.predict(test_data))
23 | 
24 | # viz code
25 | dot_data = StringIO()
26 | tree.export_graphviz(clf, out_file=dot_data, feature_names=iris.feature_names,
27 |                      class_names=iris.target_names, filled=True, rounded=True,
28 |                      impurity=False)
29 | graph = pydot.graph_from_dot_data(dot_data.getvalue())
30 | graph.write_pdf("iris.pdf")
31 | 


--------------------------------------------------------------------------------
/video3/dogs.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import numpy as np
 3 | 
 4 | greyhounds = 500
 5 | labs = 500
 6 | 
 7 | grey_height = 28 + 4 * np.random.randn(greyhounds)
 8 | lab_height = 24 + 4 * np.random.randn(labs)
 9 | 
10 | plt.hist([grey_height, lab_height], stacked=True, color=['r', 'b'])
11 | plt.show()
12 | 


--------------------------------------------------------------------------------
/video4/pipeline.py:
--------------------------------------------------------------------------------
 1 | # import a dataset
 2 | from sklearn import datasets  # , tree
 3 | from sklearn.cross_validation import train_test_split
 4 | from sklearn.metrics import accuracy_score
 5 | from sklearn.neighbors import KNeighborsClassifier
 6 | 
 7 | iris = datasets.load_iris()
 8 | 
 9 | X = iris.data
10 | y = iris.target
11 | 
12 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5)
13 | 
14 | # my_classifier = tree.DecisionTreeClassifier()
15 | my_classifier = KNeighborsClassifier()
16 | 
17 | my_classifier.fit(X_train, y_train)
18 | 
19 | predictions = my_classifier.predict(X_test)
20 | 
21 | print(accuracy_score(y_test, predictions))
22 | 


--------------------------------------------------------------------------------
/video5/ep5.py:
--------------------------------------------------------------------------------
 1 | # import a dataset
 2 | from sklearn import datasets  # , tree
 3 | from sklearn.cross_validation import train_test_split
 4 | from sklearn.metrics import accuracy_score
 5 | from scipy.spatial import distance
 6 | # from sklearn.neighbors import KNeighborsClassifier
 7 | # import random
 8 | 
 9 | 
10 | def euc(a, b):
11 |     return distance.euclidean(a, b)
12 | 
13 | 
14 | class ScrappyKNN():
15 |     '''My fist classifier.
16 | 
17 |     Just a reminder:
18 |         X_train - training features
19 |         y_train - training labels
20 |         X_test -  test features
21 |         y_test -  test labels
22 | 
23 |     The accuracy can change beacuse of randomicity of dataset.
24 |     '''
25 | 
26 |     def fit(self, X_train, y_train):
27 |         self.X_train = X_train
28 |         self.y_train = y_train
29 | 
30 |     def predict(self, X_test):
31 |         predictions = []
32 |         for row in X_test:
33 |             label = self.closest(row)
34 |             predictions.append(label)
35 |         return predictions
36 | 
37 |     def closest(self, row):
38 |         best_dist = euc(row, self.X_train[0])
39 |         best_index = 0
40 |         for i in range(1, len(self.X_train)):
41 |             dist = euc(row, self.X_train[i])
42 |             if dist < best_dist:
43 |                 best_dist = dist
44 |                 best_index = i
45 |         return self.y_train[best_index]
46 | 
47 | iris = datasets.load_iris()
48 | 
49 | X = iris.data
50 | y = iris.target
51 | 
52 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5)
53 | 
54 | # my_classifier = tree.DecisionTreeClassifier()
55 | # my_classifier = KNeighborsClassifier()
56 | my_classifier = ScrappyKNN()
57 | 
58 | my_classifier.fit(X_train, y_train)
59 | 
60 | predictions = my_classifier.predict(X_test)
61 | 
62 | print(accuracy_score(y_test, predictions))
63 | 


--------------------------------------------------------------------------------
/video6/README.md:
--------------------------------------------------------------------------------
 1 | # Tensorflow for Poets
 2 | 
 3 | Codelab can be accessed [here](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/?utm_campaign=chrome_series_machinelearning_063016&utm_source=gdev&utm_medium=yt-desc#0)
 4 | # Notes about docker
 5 | 
 6 | ## Installation
 7 | 
 8 | `curl -sSL https://get.docker.com/ | sh`
 9 | 
10 | ## Give permissions to an user.
11 | 
12 | `sudo usermod -aG docker <username>`
13 | 
14 | # Description Links
15 | 
16 | * TensorFlow for Poets Codelab: https://goo.gl/QTwZ3v
17 | 
18 | * Google’s Udacity class on Deep Learning: https://goo.gl/iRqXsy
19 | 
20 | * TensorFlow tutorial: https://goo.gl/0Oz7B5
21 | 
22 | * Google Research blog on Inception: https://goo.gl/CSrfJ1
23 | 


--------------------------------------------------------------------------------
/video7/README.md:
--------------------------------------------------------------------------------
 1 | # Download tensorflow docker image
 2 | 
 3 | `docker run -it -p 8888:8888 tensorflow/tensorflow`
 4 | 
 5 | Go to your browser on http://localhost:8888/
 6 | 
 7 | Now upload ep7.ipynb on running notebook and have fun!
 8 | 
 9 | # Description links
10 | 
11 | Jupyter Notebook: https://goo.gl/NNlMNu
12 | 
13 | Docker images: https://goo.gl/8fmqVW
14 | 
15 | MNIST tutorial: https://goo.gl/GQ3t7n
16 | 
17 | Visualizing MNIST: http://goo.gl/ROcwpR (this blog is outstanding)
18 | 
19 | More notebooks: https://goo.gl/GgLIh7
20 | 
21 | More about linear classifiers: https://goo.gl/u2f2NE
22 | 
23 | Much more about linear classifiers: http://goo.gl/au1PdG (this course is outstanding, highly recommended)
24 | 
25 | More TF.Learn examples: https://goo.gl/szki63
26 | 


--------------------------------------------------------------------------------
/video8/decision_tree.py:
--------------------------------------------------------------------------------
  1 | """Code to accompany Machine Learning Recipes #8.
  2 | 
  3 | We'll write a Decision Tree Classifier, in pure Python.
  4 | """
  5 | 
  6 | # Toy dataset.
  7 | # Format: each row is an example.
  8 | # The last column is the label.
  9 | # The first two columns are features.
 10 | # Feel free to play with it by adding more features & examples.
 11 | # Interesting note: I've written this so the 2nd and 5th examples
 12 | # have the same features, but different labels - so we can see how the
 13 | # tree handles this case.
 14 | training_data = [
 15 |     ['Green', 3, 'Apple'],
 16 |     ['Yellow', 3, 'Apple'],
 17 |     ['Red', 1, 'Grape'],
 18 |     ['Red', 1, 'Grape'],
 19 |     ['Yellow', 3, 'Lemon'],
 20 | ]
 21 | 
 22 | # Column labels.
 23 | # These are used only to print the tree.
 24 | header = ["color", "diameter", "label"]
 25 | 
 26 | 
 27 | def unique_vals(rows, col):
 28 |     """Find the unique values for a column in a dataset."""
 29 |     return set([row[col] for row in rows])
 30 | 
 31 | #######
 32 | # Demo:
 33 | # unique_vals(training_data, 0)
 34 | # unique_vals(training_data, 1)
 35 | #######
 36 | 
 37 | 
 38 | def class_counts(rows):
 39 |     """Counts the number of each type of example in a dataset."""
 40 |     counts = {}  # a dictionary of label -> count.
 41 |     for row in rows:
 42 |         # in our dataset format, the label is always the last column
 43 |         label = row[-1]
 44 |         if label not in counts:
 45 |             counts[label] = 0
 46 |         counts[label] += 1
 47 |     return counts
 48 | 
 49 | #######
 50 | # Demo:
 51 | # class_counts(training_data)
 52 | #######
 53 | 
 54 | 
 55 | def is_numeric(value):
 56 |     """Test if a value is numeric."""
 57 |     return isinstance(value, int) or isinstance(value, float)
 58 | 
 59 | #######
 60 | # Demo:
 61 | # is_numeric(7)
 62 | # is_numeric("Red")
 63 | #######
 64 | 
 65 | 
 66 | class Question:
 67 |     """A Question is used to partition a dataset.
 68 | 
 69 |     This class just records a 'column number' (e.g., 0 for Color) and a
 70 |     'column value' (e.g., Green). The 'match' method is used to compare
 71 |     the feature value in an example to the feature value stored in the
 72 |     question. See the demo below.
 73 |     """
 74 | 
 75 |     def __init__(self, column, value):
 76 |         self.column = column
 77 |         self.value = value
 78 | 
 79 |     def match(self, example):
 80 |         # Compare the feature value in an example to the
 81 |         # feature value in this question.
 82 |         val = example[self.column]
 83 |         if is_numeric(val):
 84 |             return val >= self.value
 85 |         else:
 86 |             return val == self.value
 87 | 
 88 |     def __repr__(self):
 89 |         # This is just a helper method to print
 90 |         # the question in a readable format.
 91 |         condition = "=="
 92 |         if is_numeric(self.value):
 93 |             condition = ">="
 94 |         return "Is %s %s %s?" % (
 95 |             header[self.column], condition, str(self.value))
 96 | 
 97 | #######
 98 | # Demo:
 99 | # Let's write a question for a numeric attribute
100 | # Question(1, 3)
101 | # How about one for a categorical attribute
102 | # q = Question(0, 'Green')
103 | # Let's pick an example from the training set...
104 | # example = training_data[0]
105 | # ... and see if it matches the question
106 | # q.match(example)
107 | #######
108 | 
109 | 
110 | def partition(rows, question):
111 |     """Partitions a dataset.
112 | 
113 |     For each row in the dataset, check if it matches the question. If
114 |     so, add it to 'true rows', otherwise, add it to 'false rows'.
115 |     """
116 |     true_rows, false_rows = [], []
117 |     for row in rows:
118 |         if question.match(row):
119 |             true_rows.append(row)
120 |         else:
121 |             false_rows.append(row)
122 |     return true_rows, false_rows
123 | 
124 | 
125 | #######
126 | # Demo:
127 | # Let's partition the training data based on whether rows are Red.
128 | # true_rows, false_rows = partition(training_data, Question(0, 'Red'))
129 | # This will contain all the 'Red' rows.
130 | # true_rows
131 | # This will contain everything else.
132 | # false_rows
133 | #######
134 | 
135 | def gini(rows):
136 |     """Calculate the Gini Impurity for a list of rows.
137 | 
138 |     There are a few different ways to do this, I thought this one was
139 |     the most concise. See:
140 |     https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity
141 |     """
142 |     counts = class_counts(rows)
143 |     impurity = 1
144 |     for lbl in counts:
145 |         prob_of_lbl = counts[lbl] / float(len(rows))
146 |         impurity -= prob_of_lbl**2
147 |     return impurity
148 | 
149 | 
150 | #######
151 | # Demo:
152 | # Let's look at some example to understand how Gini Impurity works.
153 | #
154 | # First, we'll look at a dataset with no mixing.
155 | # no_mixing = [['Apple'],
156 | #              ['Apple']]
157 | # this will return 0
158 | # gini(no_mixing)
159 | #
160 | # Now, we'll look at dataset with a 50:50 apples:oranges ratio
161 | # some_mixing = [['Apple'],
162 | #               ['Orange']]
163 | # this will return 0.5 - meaning, there's a 50% chance of misclassifying
164 | # a random example we draw from the dataset.
165 | # gini(some_mixing)
166 | #
167 | # Now, we'll look at a dataset with many different labels
168 | # lots_of_mixing = [['Apple'],
169 | #                  ['Orange'],
170 | #                  ['Grape'],
171 | #                  ['Grapefruit'],
172 | #                  ['Blueberry']]
173 | # This will return 0.8
174 | # gini(lots_of_mixing)
175 | #######
176 | 
177 | def info_gain(left, right, current_uncertainty):
178 |     """Information Gain.
179 | 
180 |     The uncertainty of the starting node, minus the weighted impurity of
181 |     two child nodes.
182 |     """
183 |     p = float(len(left)) / (len(left) + len(right))
184 |     return current_uncertainty - p * gini(left) - (1 - p) * gini(right)
185 | 
186 | #######
187 | # Demo:
188 | # Calculate the uncertainy of our training data.
189 | # current_uncertainty = gini(training_data)
190 | #
191 | # How much information do we gain by partioning on 'Green'?
192 | # true_rows, false_rows = partition(training_data, Question(0, 'Green'))
193 | # info_gain(true_rows, false_rows, current_uncertainty)
194 | #
195 | # What about if we partioned on 'Red' instead?
196 | # true_rows, false_rows = partition(training_data, Question(0,'Red'))
197 | # info_gain(true_rows, false_rows, current_uncertainty)
198 | #
199 | # It looks like we learned more using 'Red' (0.37), than 'Green' (0.14).
200 | # Why? Look at the different splits that result, and see which one
201 | # looks more 'unmixed' to you.
202 | # true_rows, false_rows = partition(training_data, Question(0,'Red'))
203 | #
204 | # Here, the true_rows contain only 'Grapes'.
205 | # true_rows
206 | #
207 | # And the false rows contain two types of fruit. Not too bad.
208 | # false_rows
209 | #
210 | # On the other hand, partitioning by Green doesn't help so much.
211 | # true_rows, false_rows = partition(training_data, Question(0,'Green'))
212 | #
213 | # We've isolated one apple in the true rows.
214 | # true_rows
215 | #
216 | # But, the false-rows are badly mixed up.
217 | # false_rows
218 | #######
219 | 
220 | 
221 | def find_best_split(rows):
222 |     """Find the best question to ask by iterating over every feature / value
223 |     and calculating the information gain."""
224 |     best_gain = 0  # keep track of the best information gain
225 |     best_question = None  # keep train of the feature / value that produced it
226 |     current_uncertainty = gini(rows)
227 |     n_features = len(rows[0]) - 1  # number of columns
228 | 
229 |     for col in range(n_features):  # for each feature
230 | 
231 |         values = set([row[col] for row in rows])  # unique values in the column
232 | 
233 |         for val in values:  # for each value
234 | 
235 |             question = Question(col, val)
236 | 
237 |             # try splitting the dataset
238 |             true_rows, false_rows = partition(rows, question)
239 | 
240 |             # Skip this split if it doesn't divide the
241 |             # dataset.
242 |             if len(true_rows) == 0 or len(false_rows) == 0:
243 |                 continue
244 | 
245 |             # Calculate the information gain from this split
246 |             gain = info_gain(true_rows, false_rows, current_uncertainty)
247 | 
248 |             # You actually can use '>' instead of '>=' here
249 |             # but I wanted the tree to look a certain way for our
250 |             # toy dataset.
251 |             if gain >= best_gain:
252 |                 best_gain, best_question = gain, question
253 | 
254 |     return best_gain, best_question
255 | 
256 | #######
257 | # Demo:
258 | # Find the best question to ask first for our toy dataset.
259 | # best_gain, best_question = find_best_split(training_data)
260 | # FYI: is color == Red is just as good. See the note in the code above
261 | # where I used '>='.
262 | #######
263 | 
264 | 
265 | class Leaf:
266 |     """A Leaf node classifies data.
267 | 
268 |     This holds a dictionary of class (e.g., "Apple") -> number of times
269 |     it appears in the rows from the training data that reach this leaf.
270 |     """
271 | 
272 |     def __init__(self, rows):
273 |         self.predictions = class_counts(rows)
274 | 
275 | 
276 | class Decision_Node:
277 |     """A Decision Node asks a question.
278 | 
279 |     This holds a reference to the question, and to the two child nodes.
280 |     """
281 | 
282 |     def __init__(self,
283 |                  question,
284 |                  true_branch,
285 |                  false_branch):
286 |         self.question = question
287 |         self.true_branch = true_branch
288 |         self.false_branch = false_branch
289 | 
290 | 
291 | def build_tree(rows):
292 |     """Builds the tree.
293 | 
294 |     Rules of recursion: 1) Believe that it works. 2) Start by checking
295 |     for the base case (no further information gain). 3) Prepare for
296 |     giant stack traces.
297 |     """
298 | 
299 |     # Try partitioing the dataset on each of the unique attribute,
300 |     # calculate the information gain,
301 |     # and return the question that produces the highest gain.
302 |     gain, question = find_best_split(rows)
303 | 
304 |     # Base case: no further info gain
305 |     # Since we can ask no further questions,
306 |     # we'll return a leaf.
307 |     if gain == 0:
308 |         return Leaf(rows)
309 | 
310 |     # If we reach here, we have found a useful feature / value
311 |     # to partition on.
312 |     true_rows, false_rows = partition(rows, question)
313 | 
314 |     # Recursively build the true branch.
315 |     true_branch = build_tree(true_rows)
316 | 
317 |     # Recursively build the false branch.
318 |     false_branch = build_tree(false_rows)
319 | 
320 |     # Return a Question node.
321 |     # This records the best feature / value to ask at this point,
322 |     # as well as the branches to follow
323 |     # dependingo on the answer.
324 |     return Decision_Node(question, true_branch, false_branch)
325 | 
326 | 
327 | def print_tree(node, spacing=""):
328 |     """World's most elegant tree printing function."""
329 | 
330 |     # Base case: we've reached a leaf
331 |     if isinstance(node, Leaf):
332 |         print(spacing + "Predict", node.predictions)
333 |         return
334 | 
335 |     # Print the question at this node
336 |     print(spacing + str(node.question))
337 | 
338 |     # Call this function recursively on the true branch
339 |     print(spacing + '--> True:')
340 |     print_tree(node.true_branch, spacing + "  ")
341 | 
342 |     # Call this function recursively on the false branch
343 |     print(spacing + '--> False:')
344 |     print_tree(node.false_branch, spacing + "  ")
345 | 
346 | 
347 | def classify(row, node):
348 |     """See the 'rules of recursion' above."""
349 | 
350 |     # Base case: we've reached a leaf
351 |     if isinstance(node, Leaf):
352 |         return node.predictions
353 | 
354 |     # Decide whether to follow the true-branch or the false-branch.
355 |     # Compare the feature / value stored in the node,
356 |     # to the example we're considering.
357 |     if node.question.match(row):
358 |         return classify(row, node.true_branch)
359 |     else:
360 |         return classify(row, node.false_branch)
361 | 
362 | 
363 | #######
364 | # Demo:
365 | # The tree predicts the 1st row of our
366 | # training data is an apple with confidence 1.
367 | # my_tree = build_tree(training_data)
368 | # classify(training_data[0], my_tree)
369 | #######
370 | 
371 | def print_leaf(counts):
372 |     """A nicer way to print the predictions at a leaf."""
373 |     total = sum(counts.values()) * 1.0
374 |     probs = {}
375 |     for lbl in counts.keys():
376 |         probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"
377 |     return probs
378 | 
379 | 
380 | #######
381 | # Demo:
382 | # Printing that a bit nicer
383 | # print_leaf(classify(training_data[0], my_tree))
384 | #######
385 | 
386 | #######
387 | # Demo:
388 | # On the second example, the confidence is lower
389 | # print_leaf(classify(training_data[1], my_tree))
390 | #######
391 | 
392 | if __name__ == '__main__':
393 | 
394 |     my_tree = build_tree(training_data)
395 | 
396 |     print_tree(my_tree)
397 | 
398 |     # Evaluate
399 |     testing_data = [
400 |         ['Green', 3, 'Apple'],
401 |         ['Yellow', 4, 'Apple'],
402 |         ['Red', 2, 'Grape'],
403 |         ['Red', 1, 'Grape'],
404 |         ['Yellow', 3, 'Lemon'],
405 |     ]
406 | 
407 |     for row in testing_data:
408 |         print("Actual: %s. Predicted: %s" %
409 |               (row[-1], print_leaf(classify(row, my_tree))))
410 | 
411 | # Next steps
412 | # - add support for missing (or unseen) attributes
413 | # - prune the tree to prevent overfitting
414 | # - add support for regression
415 | 


--------------------------------------------------------------------------------
/video9/README.md:
--------------------------------------------------------------------------------
 1 | To maintain original content from video
 2 | 
 3 | Links from the video:
 4 | 
 5 | Code - https://goo.gl/K9dVqv
 6 | Facets: https://goo.gl/Dfpb7W
 7 | TensorFlow Embedding Projector: https://goo.gl/2SxrYK
 8 | 
 9 | You can find Josh on Twitter: https://twitter.com/random_forests
10 | 


--------------------------------------------------------------------------------