├── PythonCode
│   ├── .idea
│   │   ├── PythonCode.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── lesson 1.py
│   └── lesson5.py
├── README.md
└── subtitle
    ├── Chs
    │   ├── #4中文字幕
    │   ├── Hello World - Machine Learning Recipes #1_chs.srt
    │   ├── Visualizing a Decision Tree - Machine Learning Recipes #2_chs.srt
    │   ├── What Makes a Good Feature- - Machine Learning Recipes #3.srt
    │   └── [ing...]Machine Learning over Coffee with a Googler.srt
    └── Eng
        ├── Hello World - Machine Learning Recipes #1.srt
        ├── Let's Write a Pipeline - Machine Learning Recipes #4.srt
        ├── Machine Learning over Coffee with a Googler.srt
        ├── Visualizing a Decision Tree - Machine Learning Recipes #2.srt
        ├── What Makes a Good Feature - Machine Learning Recipes #3.srt
        └── Writing Our First Classifier - Machine Learning Recipes #5.srt

/PythonCode/lesson 1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | from sklearn import tree
6 | features = [[140, 1], [130, 1], [150, 0], [170, 0]]
7 | labels = [0, 0, 1, 1]
8 | clf = tree.DecisionTreeClassifier()
9 | clf = clf.fit(features, labels)
10 | print(clf.predict([[150, 0]]))
--------------------------------------------------------------------------------
/PythonCode/lesson5.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # author yyn19951228
5 | # date Sunday June 26 2016
6 | from sklearn import datasets
7 | iris = datasets.load_iris()
8 |
9 | X = iris.data
10 | Y = iris.target
11 |
12 | from sklearn.model_selection import train_test_split  # 原视频使用的 cross_validation 在新版 sklearn 中已被 model_selection 取代
13 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5)
14 | # 决策树模型
15 | # from sklearn import tree
16 | # my_classifier = tree.DecisionTreeClassifier()
17 |
18 | # KNN 分类器
19 | from sklearn.neighbors import KNeighborsClassifier
20 | my_classifier = KNeighborsClassifier()
21 |
22 | my_classifier.fit(X_train, Y_train)
23 |
24 | predictions = my_classifier.predict(X_test)
25 |
26 | from sklearn.metrics import accuracy_score
27 | print(accuracy_score(Y_test, predictions))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 官方已经出了中文字幕并已传到优酷,此项目终止
2 | 观看网址为: http://i.youku.com/i/UMjczOTc0NDkzNg==/custom?id=87105
3 |
4 | # Google-ML-Recipes-Chs-sub-and-code
5 | Google出品的机器学习入门视频的中文字幕翻译与示例代码
6 |
7 | ## 视频文件和字幕下载地址:
8 | http://pan.baidu.com/s/1boOvdT1 (里面的字幕不一定是最新的,请从这里下载字幕)
9 |
10 | ## 在线观看地址:
11 | https://www.youtube.com/playlist?list=PLOU2XLYxmsIIuiBfYad6rFYQU_jL2ryal (官方地址,最新最清晰,需要翻墙)
12 | http://list.youku.com/albumlist/show?id=27105036&ascending=1&page=1 (youku地址,翻译好的字幕我会压制到视频传到这里,不过清晰度差,广告多,建议从百度云下载视频自己加载中文字幕)
13 |
14 | ## 参与翻译:
15 | subtitle/Chs目录下是中文字幕,文件名前面的[ing...]表示尚未翻译完成的字幕。因为有现成的英文字幕,所以不用卡时间轴,直接替换英文就可以了。你可以fork一份,修改文件后push到这里;当然你也可以修改已翻译好的字幕,换成更好的翻译push过来
16 |
17 | ## 感谢
18 | **sisely** (翻译)
19 | **yyn19951228** (翻译)
20 |
21 | ## 代码说明:
22 | 代码在PythonCode目录下,使用Python3而不是视频内使用的Python2语法,这两者之间有一点小区别。你可以push你自己的代码到PythonCode里面,以帮助其他人学习
23 |
24 |
25 |
--------------------------------------------------------------------------------
/subtitle/Chs/#4中文字幕:
--------------------------------------------------------------------------------
1 | 1
2 | 00:00:00,000 --> 00:00:00,000
3 | Youtube subtitles download by mo.dbxdb.com
4 | 翻译 yyn19951228
5 |
6 | 2
7 | 00:00:00,000 --> 00:00:02,844
8 | [MUSIC PLAYING]
9 | [音乐]
10 |
11 | 3
12 | 00:00:02,844 --> 00:00:06,640
13 |
14 |
15 | 4
16 | 00:00:06,640 --> 00:00:07,447
17 | Welcome back.
18 | 欢迎回来
19 |
20 | 5
21 | 00:00:07,447 --> 00:00:09,029
22 | We've covered a lot
23 | of ground already,
24 | 我们已经学习了很多基础知识
25 |
26 | 6
27 | 00:00:09,029 --> 00:00:12,070
28 | so today I want to review
29 | and reinforce concepts.
30 | 所以今天我想再复习加强一下基础概念
31 |
32 | 7
33 | 00:00:12,070 --> 00:00:14,250
34 | To do that, we'll
35 | explore two things.
36 | 为此,我们将探讨两件事
37 |
38 | 8
39 | 00:00:14,250 --> 00:00:16,090
40 | First, we'll code
41 | up a basic pipeline
42 | 首先,我们为监督学习
43 |
44 | 9
45 | 00:00:16,090 --> 00:00:17,640
46 | for supervised learning.
47 | 来码一个基础的管道封装
48 |
49 | 10
50 | 00:00:17,640 --> 00:00:19,390
51 | I'll show you how
52 | multiple classifiers
53 | 我将会向你们展示多种分类器是如何
54 |
55 | 11
56 | 00:00:19,390 --> 00:00:21,280
57 | can solve the same problem.
58 | 能够解决同样问题的。 59 | 60 | 12 61 | 00:00:21,280 --> 00:00:23,200 62 | Next, we'll build up a 63 | little more intuition 64 | 之后,我们会获得一些对于 65 | 66 | 13 67 | 00:00:23,200 --> 00:00:25,710 68 | for what it means for an 69 | algorithm to learn something 70 | 从数据当中建立模型和算法的启发。 71 | 72 | 14 73 | 00:00:25,710 --> 00:00:29,502 74 | from data, because that sounds 75 | kind of magical, but it's not. 76 | 虽然看起来很神奇,但是完全不需要担心。 77 | 78 | 15 79 | 00:00:29,502 --> 00:00:31,710 80 | To kick things off, let's 81 | look at a common experiment 82 | 作为开始,让我们来看一个很普通的, 83 | 84 | 16 85 | 00:00:31,710 --> 00:00:33,009 86 | you might want to do. 87 | 你们也许很想做的实验。 88 | 89 | 17 90 | 00:00:33,009 --> 00:00:35,210 91 | Imagine you're building 92 | a spam classifier. 93 | 想象一下你正在垃圾邮件分类器。 94 | 95 | 18 96 | 00:00:35,210 --> 00:00:37,510 97 | That's just a function that 98 | labels an incoming email 99 | 这是一个对邮件加标签的功能, 100 | 101 | 19 102 | 00:00:37,510 --> 00:00:39,307 103 | as spam or not spam. 104 | 把邮件标为垃圾邮件或者非垃圾邮件。 105 | 106 | 20 107 | 00:00:39,307 --> 00:00:41,140 108 | Now, say you've already 109 | collected a data set 110 | 现在,假设你已经收集了一些数据集 111 | 112 | 21 113 | 00:00:41,140 --> 00:00:42,850 114 | and you're ready 115 | to train a model. 116 | 而且你准备训练一个模型。 117 | 118 | 22 119 | 00:00:42,850 --> 00:00:44,460 120 | But before you put 121 | it into production, 122 | 但是在你把这个模型投入到实际应用之前, 123 | 124 | 23 125 | 00:00:44,460 --> 00:00:46,760 126 | there's a question you 127 | need to answer first-- 128 | 需要首先关注一个问题! 129 | 130 | 24 131 | 00:00:46,760 --> 00:00:49,820 132 | how accurate will it be when you 133 | use it to classify emails that 134 | 当分类的邮件不在你收集到的数据集当中的时候, 135 | 136 | 25 137 | 00:00:49,820 --> 00:00:51,740 138 | weren't in your training data? 139 | 怎样保证你的模型预测是准确的? 140 | 141 | 26 142 | 00:00:51,740 --> 00:00:54,850 143 | As best we can, we want to 144 | verify our models work well 145 | 为了能做到最好,我们希望能够在投入实际应用前 146 | 147 | 27 148 | 00:00:54,850 --> 00:00:56,490 149 | before we deploy them. 150 | 确认我们的模型有最好的泛化能力。 151 | 152 | 28 153 | 00:00:56,490 --> 00:00:59,290 154 | And we can do an experiment 155 | to help us figure that out. 156 | 我们可以做一个实验来帮助我们确认。 157 | 158 | 29 159 | 00:00:59,290 --> 00:01:02,930 160 | One approach is to partition 161 | our data set into two parts. 162 | 我们的方法是把我们收集到的数据集一分为二。 163 | 164 | 30 165 | 00:01:02,930 --> 00:01:05,079 166 | We'll call these Train and Test. 167 | 我们将它们分别称为“训练集”和“测试集” 168 | 169 | 31 170 | 00:01:05,079 --> 00:01:07,010 171 | We'll use Train 172 | to train our model 173 | 我们将采用“训练集”去训练我们的模型。 174 | 175 | 32 176 | 00:01:07,010 --> 00:01:10,380 177 | and Test to see how 178 | accurate it is on new data. 179 | 然后用“测试集”来测试模型的泛化准确性。 180 | 181 | 33 182 | 00:01:10,380 --> 00:01:13,890 183 | That's a common pattern, so 184 | let's see how it looks in code. 185 | 这是一个很常见的套路,所以让我们来看看 186 | 如何用代码来实现。 187 | 188 | 34 189 | 00:01:13,890 --> 00:01:17,060 190 | To kick things off, let's import 191 | a data set into [? SyKit. ?] 192 | 作为开始,让我们import一个数据集到[? SyKit. ?]中 193 | 194 | 35 195 | 00:01:17,060 --> 00:01:20,019 196 | We'll use Iris again, because 197 | it's handily included. 198 | 我们将再一次的使用“鸢尾花”数据集,因为 199 | 这些使用十分方便。 200 | 201 | 36 202 | 00:01:20,019 --> 00:01:21,959 203 | Now, we already saw 204 | Iris in episode two. 205 | 现在,我们已经在第二章看过“鸢尾花”数据的表现了 206 | 207 | 37 208 | 00:01:21,959 --> 00:01:23,560 209 | But what we haven't 210 | seen before is 211 | 但是我们之前没有看过的是, 212 | 213 | 38 214 | 00:01:23,560 --> 00:01:26,831 215 | that I'm calling the 216 | features x and the labels y. 
217 | 特征x 和标签y 218 | 219 | 39 220 | 00:01:26,831 --> 00:01:28,209 221 | Why is that? 222 | 为什么是这个? 223 | 224 | 40 225 | 00:01:28,209 --> 00:01:30,670 226 | Well, that's because one 227 | way to think of a classifier 228 | 因为我们可以把分类器 229 | 230 | 231 | 41 232 | 00:01:30,670 --> 00:01:32,230 233 | is as a function. 234 | 看成是一个函数。 235 | 236 | 42 237 | 00:01:32,230 --> 00:01:34,750 238 | At a high level, you can 239 | think of x as the input 240 | 在抽象层面上,你可以把特征x 视作输入 241 | 242 | 43 243 | 00:01:34,750 --> 00:01:36,500 244 | and y as the output. 245 | 然后把分类标签y 视作输出。 246 | 247 | 44 248 | 00:01:36,500 --> 00:01:39,892 249 | I'll talk more about that in 250 | the second half of this episode. 251 | 我会在本视频的后半部分着重讨论这个。 252 | 253 | 45 254 | 00:01:39,892 --> 00:01:42,349 255 | After we import the data set, 256 | the first thing we want to do 257 | 在我们导入了数据集后,我们首先要做的 258 | 259 | 46 260 | 00:01:42,349 --> 00:01:44,590 261 | is partition it 262 | into Train and Test. 263 | 是把它们分为训练集和测试集。 264 | 265 | 47 266 | 00:01:44,590 --> 00:01:46,640 267 | And to do that, we can 268 | import a handy utility, 269 | 我们导入一个函数来分隔数据集, 270 | 271 | 48 272 | 00:01:46,640 --> 00:01:48,530 273 | and it makes the syntax clear. 274 | 让我们把程序变得优雅一点。。 275 | 276 | 49 277 | 00:01:48,530 --> 00:01:50,340 278 | We're taking our 279 | x's and our y's, 280 | 我们把我们的x和y 281 | 282 | 50 283 | 00:01:50,340 --> 00:01:52,930 284 | or our features and labels, 285 | and partitioning them 286 | 或者说特征和标签 287 | 288 | 51 289 | 00:01:52,930 --> 00:01:54,450 290 | into two sets. 291 | 分为两部分。 292 | 293 | 52 294 | 00:01:54,450 --> 00:01:56,690 295 | X_train and y_train are 296 | the features and labels 297 | X_train是训练集的特征子集 298 | 299 | 53 300 | 00:01:56,690 --> 00:01:57,980 301 | for the training set. 302 | y_train是训练集的标签子集。 303 | 304 | 54 305 | 00:01:57,980 --> 00:02:00,630 306 | And X_test and y_test are 307 | the features and labels 308 | 然后X_test是测试集的特征子集, 309 | 310 | 55 311 | 00:02:00,630 --> 00:02:02,031 312 | for the testing set. 313 | 然后Y_test是测试集的标签子集, 314 | 315 | 56 316 | 00:02:02,031 --> 00:02:04,239 317 | Here, I'm just saying that 318 | I want half the data to be 319 | 这里,我刚刚说我希望一半的数据 320 | 321 | 57 322 | 00:02:04,239 --> 00:02:05,580 323 | used for testing. 324 | 能够被用作测试集。 325 | 326 | 58 327 | 00:02:05,580 --> 00:02:09,229 328 | So if we have 150 examples 329 | in Iris, 75 will be in Train 330 | 所以如果我们有150个鸢尾花的数据, 331 | 那么75个将用作训练, 332 | 333 | 59 334 | 00:02:09,229 --> 00:02:11,520 335 | and 75 will be in Test. 336 | 还有75个将被用作测试。 337 | 338 | 60 339 | 00:02:11,520 --> 00:02:13,280 340 | Now we'll create our classifier. 341 | 现在让我们来创建我们的分类器。 342 | 343 | 61 344 | 00:02:13,280 --> 00:02:14,979 345 | I'll use two 346 | different types here 347 | 我将会用两种不同的类型(的分类器) 348 | 349 | 62 350 | 00:02:14,979 --> 00:02:17,860 351 | to show you how they 352 | accomplish the same task. 353 | 来展示他们是如何完成同样的任务的。 354 | 355 | 63 356 | 00:02:17,860 --> 00:02:20,500 357 | Let's start with the decision 358 | tree we've already seen. 359 | 让我们从我们已经见过的决策树开始。 360 | 361 | 64 362 | 00:02:20,500 --> 00:02:22,240 363 | Note there's only 364 | two lines of code 365 | 注意到这里只有两行代码 366 | 367 | 65 368 | 00:02:22,240 --> 00:02:23,448 369 | that are classifier-specific. 370 | 是和分类器有关的。 371 | 372 | 66 373 | 00:02:23,448 --> 00:02:25,650 374 | 375 | 376 | 67 377 | 00:02:25,650 --> 00:02:28,830 378 | Now let's train the classifier 379 | using our training data. 
380 | 现在让我们用训练集来训练分类器。 381 | 382 | 68 383 | 00:02:28,830 --> 00:02:31,599 384 | At this point, it's ready 385 | to be used to classify data. 386 | 在这里,分类器准备开始分类数据。 387 | 388 | 69 389 | 00:02:31,599 --> 00:02:33,330 390 | And next, we'll call 391 | the predict method 392 | 接下来,让我们来调用预测的方法 393 | 394 | 70 395 | 00:02:33,330 --> 00:02:35,805 396 | and use it to classify 397 | our testing data. 398 | 然后用它来分类我们的测试集。 399 | 400 | 71 401 | 00:02:35,805 --> 00:02:37,180 402 | If you print out 403 | the predictions, 404 | 如果你答应出预测的结果, 405 | 406 | 72 407 | 00:02:37,180 --> 00:02:38,970 408 | you'll see there are 409 | a list of numbers. 410 | 你将会看到这样一大串数字。 411 | 412 | 73 413 | 00:02:38,970 --> 00:02:40,660 414 | These correspond 415 | to the type of Iris 416 | 他们是分类器对于测试集中每一组数据 417 | 418 | 74 419 | 00:02:40,660 --> 00:02:44,009 420 | the classifier predicts for 421 | each row in the testing data. 422 | 预测的鸢尾花的花型的结果。 423 | 424 | 425 | 75 426 | 00:02:44,009 --> 00:02:46,229 427 | Now let's see how 428 | accurate our classifier 429 | 现在让我们来看看分类器 430 | 431 | 76 432 | 00:02:46,229 --> 00:02:48,280 433 | was on the testing set. 434 | 在测试集上预测的准确度。 435 | 436 | 77 437 | 00:02:48,280 --> 00:02:50,840 438 | Recall that up top, we have 439 | the true labels for the testing 440 | 回想一下,我们有测试集中每一组数据 441 | 442 | 78 443 | 00:02:50,840 --> 00:02:51,650 444 | data. 445 | 的准确的结果。 446 | 447 | 79 448 | 00:02:51,650 --> 00:02:53,460 449 | To calculate our 450 | accuracy, we can 451 | 为了计算我们的准确率, 452 | 453 | 80 454 | 00:02:53,460 --> 00:02:55,759 455 | compare the predicted 456 | labels to the true labels, 457 | 我们可以把真实的标签和预测的标签做比较, 458 | 459 | 81 460 | 00:02:55,759 --> 00:02:57,348 461 | and tally up the score. 462 | 然后计算出我们的得分。 463 | 464 | 82 465 | 00:02:57,348 --> 00:02:59,139 466 | There's a convenience 467 | method in [? Sykit ?] 468 | 在sklearn中有一个很便捷的方法 469 | 470 | 83 471 | 00:02:59,139 --> 00:03:00,830 472 | we can import to do that. 473 | 我们可以导入它来帮助我们。 474 | 475 | 84 476 | 00:03:00,830 --> 00:03:03,505 477 | Notice here, our 478 | accuracy was over 90%. 479 | 注意到我们现在的准确率已经超过90%, 480 | 481 | 85 482 | 00:03:03,505 --> 00:03:06,130 483 | If you try this on your own, it 484 | might be a little bit different 485 | 如果你自己尝试的话,会发现结果有所不同, 486 | 487 | 86 488 | 00:03:06,130 --> 00:03:08,270 489 | because of some randomness 490 | in how the Train/Test 491 | 因为电脑划分数据集和测试集的时候 492 | 493 | 87 494 | 00:03:08,270 --> 00:03:10,039 495 | data is partitioned. 496 | 有一定的随机性。 497 | 498 | 88 499 | 00:03:10,039 --> 00:03:11,880 500 | Now, here's something 501 | interesting. 502 | 现在,有一些很有趣的事情。 503 | 504 | 89 505 | 00:03:11,880 --> 00:03:14,690 506 | By replacing these two lines, we 507 | can use a different classifier 508 | 把这两行代码去掉,我们可以用一个不一样的分类器 509 | 510 | 90 511 | 00:03:14,690 --> 00:03:16,919 512 | to accomplish the same task. 513 | 来完成同样的分类任务。 514 | 515 | 91 516 | 00:03:16,919 --> 00:03:18,569 517 | Instead of using 518 | a decision tree, 519 | 我们用KNN(K-Nearest Neighbours) 520 | 521 | 92 522 | 00:03:18,569 --> 00:03:20,930 523 | we'll use one called 524 | [? KNearestNeighbors. ?] 525 | 来代替决策树。 526 | 527 | 93 528 | 00:03:20,930 --> 00:03:23,340 529 | If we run our experiment, 530 | we'll see that the code 531 | 如果我们运行,会发现代码 532 | 533 | 94 534 | 00:03:23,340 --> 00:03:25,354 535 | works in exactly the same way. 
536 | 运行过程几乎是完全一样的。 537 | 538 | 95 539 | 00:03:25,354 --> 00:03:27,270 540 | The accuracy may be 541 | different when you run it, 542 | 你在运行的时候,准确路可能会有所不同。 543 | 544 | 96 545 | 00:03:27,270 --> 00:03:29,800 546 | because this classifier works 547 | a little bit differently 548 | 因为这个分类算法的原理有一些不同, 549 | 550 | 97 551 | 00:03:29,800 --> 00:03:32,440 552 | and because of the randomness 553 | in the Train/Test split. 554 | 而且划分训练集/测试集的时候也存在随机性。 555 | 556 | 98 557 | 00:03:32,440 --> 00:03:35,419 558 | Likewise, if we wanted to use a 559 | more sophisticated classifier, 560 | 同样地,如果想使用一些更复杂的分类器, 561 | 562 | 99 563 | 00:03:35,419 --> 00:03:38,220 564 | we could just import it 565 | and change these two lines. 566 | 我们可以导入他们,然后替换掉这两行代码即可。 567 | 568 | 100 569 | 00:03:38,220 --> 00:03:40,297 570 | Otherwise, our code is the same. 571 | 其他的代码都是完全一样的。 572 | 573 | 101 574 | 00:03:40,297 --> 00:03:42,880 575 | The takeaway here is that while 576 | there are many different types 577 | 一个很方便地方就是,这些不同的分类算法 578 | 579 | 102 580 | 00:03:42,880 --> 00:03:45,919 581 | of classifiers, at a high level, 582 | they have a similar interface. 583 | 在调用的时候都遵循着同样的格式。 584 | 585 | 103 586 | 00:03:45,919 --> 00:03:49,058 587 | 588 | 589 | 104 590 | 00:03:49,058 --> 00:03:50,849 591 | Now let's talk a little 592 | bit more about what 593 | 现在让我们来看看 594 | 595 | 105 596 | 00:03:50,849 --> 00:03:53,120 597 | it means to learn from data. 598 | 从“数据中学习”是什么意思。 599 | 600 | 106 601 | 00:03:53,120 --> 00:03:56,080 602 | Earlier, I said we called the 603 | features x and the labels y, 604 | 刚开始的时候,我说我们用到特征x 和标签y 605 | 606 | 107 607 | 00:03:56,080 --> 00:03:58,717 608 | because they were the input 609 | and output of a function. 610 | 因为他们可以视作模型函数的输入和输出。 611 | 612 | 108 613 | 00:03:58,717 --> 00:04:00,800 614 | Now, of course, a function 615 | is something we already 616 | 现在,当然,函数是 617 | 618 | 109 619 | 00:04:00,800 --> 00:04:02,190 620 | know from programming. 621 | 我们在编程中很熟悉的一个概念。 622 | 623 | 110 624 | 00:04:02,190 --> 00:04:04,900 625 | def classify-- 626 | there's our function. 627 | 定义一个 分类器 628 | 这就是我们的函数。 629 | 630 | 111 631 | 00:04:04,900 --> 00:04:06,919 632 | As we already know in 633 | supervised learning, 634 | 正如我们在监督学习中已经了解到的, 635 | 636 | 112 637 | 00:04:06,919 --> 00:04:09,060 638 | we don't want to 639 | write this ourselves. 640 | 我们不想自己重写一个这样的函数。 641 | 642 | 113 643 | 00:04:09,060 --> 00:04:12,360 644 | We want an algorithm to 645 | learn it from training data. 646 | 我们希望一个算法自己从训练集中学习。 647 | 648 | 114 649 | 00:04:12,360 --> 00:04:15,240 650 | So what does it mean 651 | to learn a function? 652 | 所以把分类器写成一个函数的意义是什么? 653 | 654 | 115 655 | 00:04:15,240 --> 00:04:17,120 656 | Well, a function is just 657 | a mapping from input 658 | 其实函数就是一个输入到输出 659 | 660 | 116 661 | 00:04:17,120 --> 00:04:18,660 662 | to output values. 663 | 之间的映射。 664 | 665 | 117 666 | 00:04:18,660 --> 00:04:20,660 667 | Here's a function you 668 | might have seen before-- y 669 | 这是一个你们以前可能见过的函数 670 | 671 | 118 672 | 00:04:20,660 --> 00:04:22,699 673 | equals mx plus b. 674 | y = mx+b 675 | 676 | 119 677 | 00:04:22,699 --> 00:04:24,819 678 | That's the equation 679 | for a line, and there 680 | 这是一条线的方程, 681 | 682 | 120 683 | 00:04:24,819 --> 00:04:27,339 684 | are two parameters-- m, 685 | which gives the slope; 686 | 而且他有俩参数, 687 | m,确定了斜率; 688 | 689 | 121 690 | 00:04:27,339 --> 00:04:29,680 691 | and b, which gives 692 | the y-intercept. 
693 | 和b,即截距。 694 | 695 | 122 696 | 00:04:29,680 --> 00:04:31,110 697 | Given these 698 | parameters, of course, 699 | 在给给定了这些参数后,当然, 700 | 701 | 123 702 | 00:04:31,110 --> 00:04:34,319 703 | we can plot the function 704 | for different values of x. 705 | 我们可以得出对应不同x的不同的函数值。 706 | 707 | 124 708 | 00:04:34,319 --> 00:04:36,610 709 | Now, in supervised learning, 710 | our classified function 711 | 现在,在监督学习中,我们的分类器函数 712 | 713 | 125 714 | 00:04:36,610 --> 00:04:38,420 715 | might have some 716 | parameters as well, 717 | 也有着一些类似的参数, 718 | 719 | 126 720 | 00:04:38,420 --> 00:04:41,290 721 | but the input x are the 722 | features for an example we 723 | 但是输入x是我们希望分类的样本, 724 | 725 | 127 726 | 00:04:41,290 --> 00:04:43,630 727 | want to classify, 728 | and the output y 729 | 而输出y则是 730 | 731 | 128 732 | 00:04:43,630 --> 00:04:47,220 733 | is a label, like Spam or Not 734 | Spam, or a type of flower. 735 | 一个标签,就像是否是垃圾邮件, 736 | 或者是什么样的花。 737 | 738 | 129 739 | 00:04:47,220 --> 00:04:49,661 740 | So what could the body of 741 | the function look like? 742 | 所以,函数的主体应该怎么写呢? 743 | 744 | 130 745 | 00:04:49,661 --> 00:04:51,910 746 | Well, that's the part we 747 | want to write algorithmically 748 | well,这部分我们希望通过算法, 749 | 750 | 131 751 | 00:04:51,910 --> 00:04:53,737 752 | or in other words, learn. 753 | 或者说,学习,来实现。 754 | 755 | 132 756 | 00:04:53,737 --> 00:04:55,319 757 | The important thing 758 | to understand here 759 | 在这里我们需要理解的很重要的东西 760 | 761 | 133 762 | 00:04:55,319 --> 00:04:57,130 763 | is we're not 764 | starting from scratch 765 | 就是我们不再是凭空 766 | 767 | 134 768 | 00:04:57,130 --> 00:05:00,060 769 | and pulling the body of the 770 | function out of thin air. 771 | 写一段代码来作为函数进行分类。 772 | 773 | 135 774 | 00:05:00,060 --> 00:05:01,990 775 | Instead, we start with a model. 776 | 而是直接调用库里的模型。 777 | 778 | 136 779 | 00:05:01,990 --> 00:05:04,050 780 | And you can think of a 781 | model as the prototype for 782 | 你可以把这个方法看作是一个原型模版函数, 783 | 784 | 137 785 | 00:05:04,050 --> 00:05:07,029 786 | or the rules that define 787 | the body of our function. 788 | 或者是某种函数体必须遵行的格式。 789 | 790 | 138 791 | 00:05:07,029 --> 00:05:08,540 792 | Typically, a model 793 | has parameters 794 | 一个典型的模型有着 795 | 796 | 139 797 | 00:05:08,540 --> 00:05:10,290 798 | that we can adjust 799 | with our training data. 800 | 可以被用来通过训练集调整的参数。 801 | 802 | 140 803 | 00:05:10,290 --> 00:05:14,560 804 | And here's a high-level example 805 | of how this process works. 806 | 现在让我们通过一个高层面的例子来看看 807 | 分类器是如何工作的。 808 | 809 | 810 | 141 811 | 00:05:14,560 --> 00:05:17,380 812 | Let's look at a toy data set and 813 | think about what kind of model 814 | 让我们先来看一个十分简单的数据集, 815 | 然后确定用什么模型 816 | 817 | 142 818 | 00:05:17,380 --> 00:05:19,209 819 | we could use as a classifier. 820 | 来作为分类器。 821 | 822 | 143 823 | 00:05:19,209 --> 00:05:20,959 824 | Pretend we're interested 825 | in distinguishing 826 | 假设我们对于区分红点和绿点 827 | 828 | 144 829 | 00:05:20,959 --> 00:05:23,350 830 | between red dots and 831 | green dots, some of which 832 | 有着特殊的癖好,这些点我已经 833 | 834 | 145 835 | 00:05:23,350 --> 00:05:25,079 836 | I've drawn here on a graph. 837 | 标在了图上。 838 | 839 | 146 840 | 00:05:25,079 --> 00:05:27,209 841 | To do that, we'll use 842 | just two features-- 843 | 为了达到目的,我们将之用两个特征量 844 | 845 | 147 846 | 00:05:27,209 --> 00:05:29,449 847 | the x- and 848 | y-coordinates of a dot. 849 | x坐标 和 y坐标 来一起表示一个点。 850 | 851 | 148 852 | 00:05:29,449 --> 00:05:32,670 853 | Now let's think about how 854 | we could classify this data. 
855 | 现在让我们来想想可以如何分类这些数据。 856 | 857 | 149 858 | 00:05:32,670 --> 00:05:34,089 859 | We want a function 860 | that considers 861 | 我们希望这个函数能够做到 862 | 863 | 150 864 | 00:05:34,089 --> 00:05:35,800 865 | a new dot it's 866 | never seen before, 867 | 将一个之前从没有见过的一个点 868 | 869 | 151 870 | 00:05:35,800 --> 00:05:38,170 871 | and classifies it 872 | as red or green. 873 | 通过分类,确定为是红色点还是绿色点。 874 | 875 | 152 876 | 00:05:38,170 --> 00:05:40,990 877 | In fact, there might be a lot 878 | of data we want to classify. 879 | 事实上,我们可能希望分类大量的数据。 880 | 881 | 153 882 | 00:05:40,990 --> 00:05:42,839 883 | Here, I've drawn 884 | our testing examples 885 | 这里,我将测试数据点 886 | 887 | 154 888 | 00:05:42,839 --> 00:05:44,959 889 | in light green and light red. 890 | 标为了淡绿色和淡红色。 891 | 892 | 155 893 | 00:05:44,959 --> 00:05:47,209 894 | These are dots that weren't 895 | in our training data. 896 | 这些点事我们训练数据集中所没有的。 897 | 898 | 156 899 | 00:05:47,209 --> 00:05:49,790 900 | The classifier has never 901 | seen them before, so how can 902 | 分类器函数之前从来没有见过他们, 903 | 904 | 157 905 | 00:05:49,790 --> 00:05:51,699 906 | it predict the right label? 907 | 所以他如何能够准确预测测试集点的颜色呢? 908 | 909 | 158 910 | 00:05:51,699 --> 00:05:53,819 911 | Well, imagine if we 912 | could somehow draw a line 913 | 好的,想象一下我们可以通过某种方法, 914 | 915 | 159 916 | 00:05:53,819 --> 00:05:56,036 917 | across the data like this. 918 | 像这样,在之间划一条线。 919 | 920 | 160 921 | 00:05:56,036 --> 00:05:57,620 922 | Then we could say 923 | the dots to the left 924 | 然后我们可以分辨出,线左边的点 925 | 926 | 161 927 | 00:05:57,620 --> 00:06:00,089 928 | of the line are green and dots 929 | to the right of the line are 930 | 都是绿色的,而线右边的点都是 931 | 932 | 162 933 | 00:06:00,089 --> 00:06:00,089 934 | red. 935 | 红色的。 936 | 937 | 163 938 | 00:06:00,920 --> 00:06:03,430 939 | And this line can serve 940 | as our classifier. 941 | 这样这根线就可以作为我们的分类器了。 942 | 943 | 164 944 | 00:06:03,430 --> 00:06:05,610 945 | So how can we learn this line? 946 | 所以机器是如何通过下学习得到这条线的呢? 947 | 948 | 165 949 | 00:06:05,610 --> 00:06:08,240 950 | Well, one way is to use 951 | the training data to adjust 952 | 一种方法就是用训练集去训练出 953 | 954 | 166 955 | 00:06:08,240 --> 00:06:09,880 956 | the parameters of a model. 957 | 模型(y=mk+b)的参数(m and b)。 958 | 959 | 167 960 | 00:06:09,880 --> 00:06:12,829 961 | And let's say the model we 962 | use is a simple straight line 963 | 这个模型就是我们之前提到过的 964 | 965 | 168 966 | 00:06:12,829 --> 00:06:14,459 967 | like we saw before. 968 | 一条直线的模型。 969 | 970 | 169 971 | 00:06:14,459 --> 00:06:17,829 972 | That means we have two 973 | parameters to adjust-- m and b. 974 | 那意味着我们有两个参数需要调整 m and b。 975 | 976 | 170 977 | 00:06:17,829 --> 00:06:21,050 978 | And by changing them, we can 979 | change where the line appears. 980 | 通过调整这些参数,我们可以改变这条直线的位置。 981 | 982 | 171 983 | 00:06:21,050 --> 00:06:23,500 984 | So how could we learn 985 | the right parameters? 986 | 所以怎样保证机器能学到正确的参数呢? 987 | 988 | 172 989 | 00:06:23,500 --> 00:06:25,690 990 | Well, one idea is that 991 | we can iteratively adjust 992 | 一种思路是不断地用训练集 993 | 994 | 995 | 173 996 | 00:06:25,690 --> 00:06:27,639 997 | them using our training data. 998 | 进行迭代,来调整这两个参数。 999 | 1000 | 174 1001 | 00:06:27,639 --> 00:06:29,889 1002 | For example, we might 1003 | start with a random line 1004 | 举个例子,我们可以随机取一条直线作为开始, 1005 | 1006 | 175 1007 | 00:06:29,889 --> 00:06:32,810 1008 | and use it to classify the 1009 | first training example. 
1010 | 用这条随机线来对训练集进行分类。 1011 | 1012 | 176 1013 | 00:06:32,810 --> 00:06:35,370 1014 | If it gets it right, we don't 1015 | need to change our line, 1016 | 如果它分类争取到饿花,我们就不需要 1017 | 再改变这条线的参数, 1018 | 1019 | 177 1020 | 00:06:35,370 --> 00:06:36,968 1021 | so we move on to the next one. 1022 | 所以我们可以用它来进行分类。 1023 | 1024 | 178 1025 | 00:06:36,968 --> 00:06:38,759 1026 | But on the other hand, 1027 | if it gets it wrong, 1028 | 但是说,如果这条线是错误的, 1029 | 1030 | 179 1031 | 00:06:38,759 --> 00:06:41,300 1032 | we could slightly adjust 1033 | the parameters of our model 1034 | 我们可以稍微改变一下它的参数 1035 | 1036 | 180 1037 | 00:06:41,300 --> 00:06:43,069 1038 | to make it more accurate. 1039 | 让它分类的更加准确。 1040 | 1041 | 181 1042 | 00:06:43,069 --> 00:06:44,680 1043 | The takeaway here is this. 1044 | 机器学习在这里的定义就是, 1045 | 1046 | 182 1047 | 00:06:44,680 --> 00:06:47,490 1048 | One way to think of learning 1049 | is using training data 1050 | 机器通过使用训练集数据 1051 | 1052 | 183 1053 | 00:06:47,490 --> 00:06:50,980 1054 | to adjust the 1055 | parameters of a model. 1056 | 来为模型选择合适的参数。 1057 | 1058 | 184 1059 | 00:06:50,980 --> 00:06:52,949 1060 | Now, here's something 1061 | really special. 1062 | 现在,让我们来看一些真正特别的东西。 1063 | 1064 | 185 1065 | 00:06:52,949 --> 00:06:55,269 1066 | It's called 1067 | tensorflow/playground. 1068 | 这个网站名叫 playground.tensorflow.org 1069 | 1070 | 186 1071 | 00:06:55,269 --> 00:06:57,370 1072 | This is a beautiful 1073 | example of a neural network 1074 | 这是一个优雅的学习神经网络的网站 1075 | 1076 | 187 1077 | 00:06:57,370 --> 00:07:00,019 1078 | you can run and experiment 1079 | with right in your browser. 1080 | 你可以从浏览器左边这栏运行和测试。 1081 | 1082 | 188 1083 | 00:07:00,019 --> 00:07:02,060 1084 | Now, this deserves its 1085 | own episode for sure, 1086 | 想要理解他需要做大量的工作, 1087 | 1088 | 189 1089 | 00:07:02,060 --> 00:07:03,730 1090 | but for now, go ahead 1091 | and play with it. 1092 | 但是现在,你们只需要立刻去尝试一下就好了! 1093 | 1094 | 190 1095 | 00:07:03,730 --> 00:07:04,930 1096 | It's awesome. 1097 | 效果十分惊艳! 1098 | 1099 | 191 1100 | 00:07:04,930 --> 00:07:06,630 1101 | The playground comes 1102 | with different data 1103 | 这个playground有着许多不同的数据集 1104 | 1105 | 192 1106 | 00:07:06,630 --> 00:07:08,300 1107 | sets you can try out. 1108 | 可以尝试。 1109 | 1110 | 193 1111 | 00:07:08,300 --> 00:07:09,470 1112 | Some are very simple. 1113 | 有些十分简单。 1114 | 1115 | 194 1116 | 00:07:09,470 --> 00:07:12,620 1117 | For example, we could use our 1118 | line to classify this one. 1119 | 例如,我们可以用这根线来分类这个数据集。 1120 | 1121 | 195 1122 | 00:07:12,620 --> 00:07:15,980 1123 | Some data sets are 1124 | much more complex. 1125 | 有些数据集相当的复杂。 1126 | 1127 | 196 1128 | 00:07:15,980 --> 00:07:17,620 1129 | This data set is 1130 | especially hard. 1131 | 比如这个数据集就特别难。 1132 | 1133 | 197 1134 | 00:07:17,620 --> 00:07:20,357 1135 | And see if you can build 1136 | a network to classify it. 1137 | 看看你是否能建立一个神经网络模型来分类他。 1138 | 1139 | 198 1140 | 00:07:20,357 --> 00:07:21,940 1141 | Now, you can think 1142 | of a neural network 1143 | 现在你可以认为神经网络模型 1144 | 1145 | 199 1146 | 00:07:21,940 --> 00:07:24,170 1147 | as a more sophisticated 1148 | type of classifier, 1149 | 是一个更加更加复杂的分类器, 1150 | 1151 | 200 1152 | 00:07:24,170 --> 00:07:26,430 1153 | like a decision tree 1154 | or a simple line. 1155 | 就像决策树,或者一条简单的直线(那样的分类器)。 1156 | 1157 | 201 1158 | 00:07:26,430 --> 00:07:29,190 1159 | But in principle, 1160 | the idea is similar. 1161 | 但是准根溯源, 1162 | 本质是相似的。 1163 | 1164 | 202 1165 | 00:07:29,190 --> 00:07:29,190 1166 | OK. 1167 | OK. 
1168 | 1169 | 203 1170 | 00:07:29,690 --> 00:07:30,687 1171 | Hope that was helpful. 1172 | 希望我们的视频对你有帮助。 1173 | 1174 | 204 1175 | 00:07:30,687 --> 00:07:32,519 1176 | I just created a Twitter 1177 | that you can follow 1178 | 我们刚刚创建了Twitter,大家可以加一下, 1179 | 1180 | 205 1181 | 00:07:32,519 --> 00:07:33,834 1182 | to be notified of new episodes. 1183 | 用来通知新视频上线。 1184 | 1185 | 206 1186 | 00:07:33,834 --> 00:07:36,000 1187 | And the next one should be 1188 | out in a couple of weeks, 1189 | 下个视频应该会在几周后推出, 1190 | 1191 | 207 1192 | 00:07:36,000 --> 00:07:38,750 1193 | depending on how much work I'm 1194 | doing for Google I/O. Thanks, 1195 | 这取决于这几周我在Google工作是否繁重, 1196 | 1197 | 208 1198 | 00:07:38,750 --> 00:07:41,620 1199 | as always, for watching, 1200 | and I'll see you next time. 1201 | 再见,各位保重! 1202 | 1203 | 209 1204 | 00:07:41,620 --> 00:07:53,000 1205 | Subtitles End: mo.dbxdb.com 1206 | 译者:yyn19951228 1207 | 1208 | -------------------------------------------------------------------------------- /subtitle/Chs/Hello World - Machine Learning Recipes #1_chs.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:00,000 3 | Youtube subtitles download by mo.dbxdb.com 4 | 5 | 2 6 | 00:00:00,000 --> 00:00:02,886 7 | [MUSIC PLAYING] 8 | 9 | 3 10 | 00:00:02,886 --> 00:00:06,726 11 | 12 | 13 | 4 14 | 00:00:06,726 --> 00:00:08,100 15 | 只需要六行代码就可以 16 | 17 | 5 18 | 00:00:08,100 --> 00:00:10,130 19 | 写出你第一个机器学习的程序 20 | 21 | 6 22 | 00:00:10,130 --> 00:00:11,671 23 | 我是Josh 24 | Gordon, 今天我将会 25 | 26 | 7 27 | 00:00:11,671 --> 00:00:14,374 28 | 带领你写出机器学习的Hello World 29 | 30 | 8 31 | 00:00:14,374 --> 00:00:16,039 32 | 在这系列视频的最初几集, 33 | 34 | 9 35 | 00:00:16,039 --> 00:00:17,998 36 | 我们会教你怎么从头开始 37 | 38 | 10 39 | 00:00:17,998 --> 00:00:19,079 40 | 学习机器学习 41 | 42 | 11 43 | 00:00:19,079 --> 00:00:21,560 44 | 首先,我们需要两个开源库, 45 | 46 | 12 47 | 00:00:21,560 --> 00:00:23,706 48 | scikit-learn和TensorFlow. 
49 | 50 | 13 51 | 00:00:23,706 --> 00:00:25,330 52 | 我们即将使用scikit库 53 | 54 | 14 55 | 00:00:25,330 --> 00:00:27,830 56 | 但首先,让我们大概说一下什么是机器学习 57 | 58 | 15 59 | 00:00:27,830 --> 00:00:29,240 60 | 以及它的重要性 61 | 62 | 16 63 | 00:00:29,240 --> 00:00:31,198 64 | 你可以把机器学习看作 65 | 66 | 17 67 | 00:00:31,198 --> 00:00:32,409 68 | 人工智能科学的一个分支 69 | 70 | 18 71 | 00:00:32,409 --> 00:00:35,610 72 | 早期的AI程序通常只擅长特定的事 73 | 74 | 19 75 | 00:00:35,610 --> 00:00:37,240 76 | 比如说“深蓝”能像顶级棋手一样 77 | 78 | 20 79 | 00:00:37,240 --> 00:00:40,150 80 | 下国际象棋,不过它也只会干这个 81 | 82 | 21 83 | 00:00:40,150 --> 00:00:41,780 84 | 现在我们想写出一个能解决许多不同问题的 85 | 86 | 22 87 | 00:00:41,780 --> 00:00:45,340 88 | 程序而不是对每个问题都写一个不同的程序 89 | 90 | 23 91 | 00:00:45,340 --> 00:00:47,460 92 | AlphaGo是一个很好的例子 93 | 94 | 24 95 | 00:00:47,460 --> 00:00:50,150 96 | 这时候它正在世界围棋竞标赛上博弈学习 97 | 98 | 25 99 | 00:00:50,150 --> 00:00:53,740 100 | 但是类似的程序也能学会玩Atari游戏 101 | 102 | 26 103 | 00:00:53,740 --> 00:00:55,956 104 | 机器学习能把这一切变成可能 105 | 106 | 27 107 | 00:00:55,956 --> 00:00:57,330 108 | 它能从样本和经验中 109 | 110 | 28 111 | 00:00:57,330 --> 00:00:59,039 112 | 学习到算法 113 | 114 | 29 115 | 00:00:59,039 --> 00:01:00,909 116 | 而不是依赖于人为编写的规则 117 | 118 | 30 119 | 00:01:00,909 --> 00:01:02,200 120 | 所以这是十分先进的 121 | 122 | 31 123 | 00:01:02,200 --> 00:01:03,750 124 | 但是我们今天写的是 125 | 126 | 32 127 | 00:01:03,750 --> 00:01:05,632 128 | 一个非常简单的例子 129 | 130 | 33 131 | 00:01:05,632 --> 00:01:07,590 132 | 我会给你一个听起来很简单的问题 133 | 134 | 34 135 | 00:01:07,590 --> 00:01:09,662 136 | 但是不靠机器学习是解决不了的 137 | 138 | 35 139 | 00:01:09,662 --> 00:01:11,370 140 | 你能够写代码来分辨出 141 | 142 | 36 143 | 00:01:11,370 --> 00:01:12,774 144 | 苹果和橙子的不同之处吗? 145 | 146 | 37 147 | 00:01:12,774 --> 00:01:15,190 148 | 想象一下我叫你写一个程序, 149 | 以图像文件作为输入 150 | 151 | 38 152 | 00:01:15,190 --> 00:01:17,069 153 | 对其进行一些分析 154 | 155 | 39 156 | 00:01:17,069 --> 00:01:18,650 157 | 然后输出水果的类别 158 | 159 | 40 160 | 00:01:18,650 --> 00:01:20,040 161 | 你能解决这个问题吗? 162 | 163 | 41 164 | 00:01:20,040 --> 00:01:22,526 165 | 你得开始写很多特定的规则 166 | 167 | 42 168 | 00:01:22,526 --> 00:01:23,900 169 | 例如你可以编写代码来统计 170 | 171 | 43 172 | 00:01:23,900 --> 00:01:26,316 173 | 橙色像素的数量然后和绿色像素 174 | 175 | 44 176 | 00:01:26,316 --> 00:01:27,569 177 | 的数量作为比较 178 | 179 | 45 180 | 00:01:27,569 --> 00:01:30,920 181 | 这个比例能给你水果种类的提示 182 | 183 | 46 184 | 00:01:30,920 --> 00:01:33,043 185 | 这能应付像这种一样简单的图像 186 | 187 | 47 188 | 00:01:33,043 --> 00:01:34,709 189 | 但是当你深入研究这个问题 190 | 191 | 48 192 | 00:01:34,709 --> 00:01:37,099 193 | 你会发现这个世界是复杂的 194 | 195 | 49 196 | 00:01:37,099 --> 00:01:38,650 197 | 你编写的规则也不再适用 198 | 199 | 50 200 | 00:01:38,650 --> 00:01:41,180 201 | 你要怎么写代码处理黑白图片 202 | 203 | 51 204 | 00:01:41,180 --> 00:01:44,480 205 | 或者既没有苹果也没有橙子的图片? 
206 | 207 | 52 208 | 00:01:44,480 --> 00:01:46,360 209 | 实际上对于你编写的任何规则 210 | 211 | 53 212 | 00:01:46,360 --> 00:01:48,790 213 | 我都能找到让它不适用的图像 214 | 215 | 54 216 | 00:01:48,790 --> 00:01:50,310 217 | 你需要编写成吨的规则, 218 | 219 | 55 220 | 00:01:50,310 --> 00:01:52,518 221 | 而这仅仅是为了辨别出苹果和橙子 222 | 223 | 56 224 | 00:01:52,518 --> 00:01:53,690 225 | 图像的不同之处 226 | 227 | 57 228 | 00:01:53,690 --> 00:01:57,390 229 | 如果我给你一个新的问题,你又得重新开始 230 | 231 | 58 232 | 00:01:57,390 --> 00:01:59,079 233 | 显然我们需要更好的方法 234 | 235 | 59 236 | 00:01:59,079 --> 00:02:00,760 237 | 因此我们需要一种算法 238 | 239 | 60 240 | 00:02:00,760 --> 00:02:02,480 241 | 能为我们找出其中的规则 242 | 243 | 61 244 | 00:02:02,480 --> 00:02:04,599 245 | 让我们不必要人工地写这些规则 246 | 247 | 62 248 | 00:02:04,599 --> 00:02:07,690 249 | 为此,我们来编写一个分类器 250 | 251 | 63 252 | 00:02:07,690 --> 00:02:10,360 253 | 现在你可以把一个分类器当作一个函数 254 | 255 | 64 256 | 00:02:10,360 --> 00:02:13,160 257 | 输入一些数据然后为其分配一个标签 258 | 259 | 65 260 | 00:02:13,160 --> 00:02:14,282 261 | 作为输出 262 | 263 | 66 264 | 00:02:14,282 --> 00:02:15,740 265 | 例如我有一张图片,想对它进行 266 | 267 | 67 268 | 00:02:15,740 --> 00:02:18,235 269 | 分类,判断这是苹果还是橙子 270 | 271 | 68 272 | 00:02:18,235 --> 00:02:20,110 273 | 又或者我有一封邮件想对其分类 274 | 275 | 69 276 | 00:02:20,110 --> 00:02:22,039 277 | 看是否为垃圾邮件 278 | 279 | 70 280 | 00:02:22,039 --> 00:02:23,690 281 | 这种自动写出分类器的技术 282 | 283 | 71 284 | 00:02:23,690 --> 00:02:26,220 285 | 被称为有监督学习(supervised learning) 286 | 287 | 72 288 | 00:02:26,220 --> 00:02:29,319 289 | It begins with examples of 290 | the problem you want to solve. 291 | 292 | 73 293 | 00:02:29,319 --> 00:02:31,620 294 | 为了用代码实现它, 295 | 我们将要使用scikit-learn库 296 | 297 | 74 298 | 00:02:31,620 --> 00:02:34,094 299 | 现在,我即将下载这个库 300 | 301 | 75 302 | 00:02:34,094 --> 00:02:35,970 303 | 有好几种方法去下载它 304 | 305 | 76 306 | 00:02:35,970 --> 00:02:38,241 307 | 但对我来说最简单的就是Anaconda. 
308 | 309 | 77 310 | 00:02:38,241 --> 00:02:40,449 311 | Anaconda十分方便的为我们安装完 312 | 所有依赖的组件 313 | 314 | 78 315 | 00:02:40,449 --> 00:02:42,440 316 | 而且还是跨平台的 317 | 318 | 79 319 | 00:02:42,440 --> 00:02:44,190 320 | 时间关系,下载和安装的部分 321 | 322 | 80 323 | 00:02:44,190 --> 00:02:45,776 324 | 被跳过了 325 | 326 | 81 327 | 00:02:45,776 --> 00:02:47,150 328 | 安装完后你可以测试一下 329 | 330 | 82 331 | 00:02:47,150 --> 00:02:48,608 332 | 看看是否能正常使用 333 | 334 | 83 335 | 00:02:48,608 --> 00:02:51,364 336 | 新建一个python脚本,然后载入SK learn库 337 | 338 | 84 339 | 00:02:51,364 --> 00:02:53,780 340 | 如果没有错误,那这就是我们的第一行代码 341 | 342 | 85 343 | 00:02:53,780 --> 00:02:56,145 344 | 还剩五行而已了 345 | 346 | 86 347 | 00:02:56,145 --> 00:02:57,520 348 | 对于有监督学习, 我们会遵循 349 | 350 | 87 351 | 00:02:57,520 --> 00:03:00,280 352 | 一些固定的步骤 353 | 354 | 88 355 | 00:03:00,280 --> 00:03:02,340 356 | 第一步是收集训练数据 357 | 358 | 89 359 | 00:03:02,340 --> 00:03:04,789 360 | 这是我们要解决的问题的一些样例 361 | 362 | 90 363 | 00:03:04,789 --> 00:03:06,789 364 | 我们的目标是编写一个函数 365 | 366 | 91 367 | 00:03:06,789 --> 00:03:08,002 368 | 对一些水果进行分类 369 | 370 | 92 371 | 00:03:08,002 --> 00:03:10,210 372 | 一开始输入一个水果的描述然后判断 373 | 374 | 93 375 | 00:03:10,210 --> 00:03:11,680 376 | 这是苹果还是橙子并输出所属类别 377 | 378 | 94 379 | 00:03:11,680 --> 00:03:14,349 380 | 判断的依据是水果的特征 381 | 382 | 95 383 | 00:03:14,349 --> 00:03:16,310 384 | 比如重量(Weight)和质感(Texture) 385 | 386 | 96 387 | 00:03:16,310 --> 00:03:18,160 388 | 为了收集我们的训练数据, 389 | 390 | 97 391 | 00:03:18,160 --> 00:03:19,310 392 | 想象我们去一个果园 393 | 394 | 98 395 | 00:03:19,310 --> 00:03:21,060 396 | 我们观察不同的苹果和橙子 397 | 398 | 99 399 | 00:03:21,060 --> 00:03:23,627 400 | 然后在表格中写下观察度量的结果描述 401 | 402 | 100 403 | 00:03:23,627 --> 00:03:25,210 404 | 在机器学习中,这些度量的结果 405 | 406 | 101 407 | 00:03:25,210 --> 00:03:26,650 408 | 被称为特征(Features) 409 | 410 | 102 411 | 00:03:26,650 --> 00:03:28,970 412 | 为了简化问题,在此我们仅使用两种特征: 413 | 414 | 103 415 | 00:03:28,970 --> 00:03:31,650 416 | 每个水果重多少克和水果的质地, 417 | 418 | 104 419 | 00:03:31,650 --> 00:03:33,830 420 | 比如说粗糙(bumpy)或光滑(smooth) 421 | 422 | 105 423 | 00:03:33,830 --> 00:03:35,860 424 | 一种好的特征能更容易地 425 | 426 | 106 427 | 00:03:35,860 --> 00:03:37,960 428 | 区分出水果种类的不同之处 429 | 430 | 107 431 | 00:03:37,960 --> 00:03:40,210 432 | 每一行训练数据都是一个样本数据 433 | 434 | 108 435 | 00:03:40,210 --> 00:03:42,259 436 | 每一个样本数据描述了一个水果 437 | 438 | 109 439 | 00:03:42,259 --> 00:03:44,240 440 | 最后一列数据是标签(label). 441 | 442 | 110 443 | 00:03:44,240 --> 00:03:46,257 444 | 其定义了每一行数据是哪一种水果 445 | 446 | 111 447 | 00:03:46,257 --> 00:03:47,840 448 | 这里只有两种可能: 449 | 450 | 112 451 | 00:03:47,840 --> 00:03:49,430 452 | 苹果和橙子 453 | 454 | 113 455 | 00:03:49,430 --> 00:03:51,560 456 | 这整个表格就是训练数据(training data) 457 | 458 | 114 459 | 00:03:51,560 --> 00:03:53,069 460 | 我们想分类器从这所有的样本数据 461 | 462 | 115 463 | 00:03:53,069 --> 00:03:55,120 464 | 中学习规则 465 | 466 | 116 467 | 00:03:55,120 --> 00:03:57,660 468 | 如果你拥有更多的数据,你就能 469 | 470 | 117 471 | 00:03:57,660 --> 00:03:59,310 472 | 创建更好的分类器 473 | 474 | 118 475 | 00:03:59,310 --> 00:04:01,620 476 | 现在让我们在代码里写下训练数据 477 | 478 | 119 479 | 00:04:01,620 --> 00:04:04,150 480 | 我们使用两个变量: 481 | features 和 labels. 482 | 483 | 120 484 | 00:04:04,150 --> 00:04:06,060 485 | Features包括头两列数据, 486 | 487 | 121 488 | 00:04:06,060 --> 00:04:07,887 489 | labels包括最后一列数据. 
490 | 491 | 122 492 | 00:04:07,887 --> 00:04:09,470 493 | 你可以把这当作以特征作为分类器的输入 494 | 495 | 123 496 | 00:04:09,470 --> 00:04:13,401 497 | 然后输出我们想要的标签 498 | 499 | 124 500 | 00:04:13,401 --> 00:04:15,650 501 | 我要把所有特征的数据类型由 502 | 503 | 125 504 | 00:04:15,650 --> 00:04:18,980 505 | 字符串(strings)改为整数(ints) 506 | 因此我用0代表粗糙(bumpy), 507 | 508 | 126 509 | 00:04:18,980 --> 00:04:19,937 510 | 1代表光滑 511 | 512 | 127 513 | 00:04:19,937 --> 00:04:22,269 514 | 我还要对labels变量做同样的事: 515 | 用0代表苹果, 516 | 517 | 128 518 | 00:04:22,269 --> 00:04:23,740 519 | 1代表橙子 520 | 521 | 129 522 | 00:04:23,740 --> 00:04:26,300 523 | 这是我们程序的第二和第三行 524 | 525 | 130 526 | 00:04:26,300 --> 00:04:29,160 527 | 第二步就是使用这些样本来训练 528 | 529 | 131 530 | 00:04:29,160 --> 00:04:30,440 531 | 一个分类器 532 | 533 | 132 534 | 00:04:30,440 --> 00:04:32,350 535 | 我们的第一个分类器的类型是 536 | 537 | 133 538 | 00:04:32,350 --> 00:04:34,029 539 | 决策树(decision tree) 540 | 541 | 134 542 | 00:04:34,029 --> 00:04:35,449 543 | 我们以后会深入了解 544 | 545 | 135 546 | 00:04:35,449 --> 00:04:37,110 547 | 它是怎样工作的 548 | 549 | 136 550 | 00:04:37,110 --> 00:04:41,269 551 | 但是现在你可以把分类器当成 552 | 一个内置各种规则的黑盒子 553 | 554 | 137 555 | 00:04:41,269 --> 00:04:43,880 556 | 因为分类器有许多种 557 | 558 | 138 559 | 00:04:43,880 --> 00:04:47,740 560 | 但输入和输出基本都是相似的 561 | 562 | 139 563 | 00:04:47,740 --> 00:04:49,170 564 | 我现在就来载入决策树 565 | 566 | 140 567 | 00:04:49,170 --> 00:04:52,000 568 | 在脚本的第四行我们就创建了一个分类器 569 | 570 | 141 571 | 00:04:52,000 --> 00:04:54,459 572 | 这时候这个黑盒子里面还没有规则 573 | 574 | 142 575 | 00:04:54,459 --> 00:04:56,829 576 | 它还不知道苹果和橙子的任何信息 577 | 578 | 143 579 | 00:04:56,829 --> 00:04:58,870 580 | 为了训练它我们需要一个学习算法 581 | 582 | 144 583 | 00:04:58,870 --> 00:05:00,307 584 | 如果分类器是内置规则的黑盒子 585 | 586 | 145 587 | 00:05:00,307 --> 00:05:02,139 588 | 那你可以把学习算法当作 589 | 590 | 146 591 | 00:05:02,139 --> 00:05:04,170 592 | 通过在你的训练数据中寻找特定的模式 593 | 594 | 147 595 | 00:05:04,170 --> 00:05:06,937 596 | 来产生规则的程序 597 | 598 | 148 599 | 00:05:06,937 --> 00:05:09,269 600 | 例如,它会注意到橙子要更重一些 601 | 602 | 149 603 | 00:05:09,269 --> 00:05:11,920 604 | 所以它会建立一条规则:更重的水果 605 | 606 | 150 607 | 00:05:11,920 --> 00:05:14,269 608 | 更有可能是橙子 609 | 610 | 151 611 | 00:05:14,269 --> 00:05:16,130 612 | 在scikit库, 训练算法在 613 | 614 | 152 615 | 00:05:16,130 --> 00:05:19,315 616 | 分类器的类中,叫做Fit 617 | 618 | 153 619 | 00:05:19,315 --> 00:05:21,899 620 | 你可以认为Fit等同于 621 | 622 | 154 623 | 00:05:21,899 --> 00:05:23,136 624 | “在数据中找模式” 625 | 626 | 155 627 | 00:05:23,136 --> 00:05:24,509 628 | 在以后我们会了解到 629 | 630 | 156 631 | 00:05:24,509 --> 00:05:27,040 632 | 这个算法工作的细节 633 | 634 | 157 635 | 00:05:27,040 --> 00:05:29,100 636 | 现在,我们有一个训练好的分类器 637 | 638 | 158 639 | 00:05:29,100 --> 00:05:32,860 640 | 让我们用它来分类一个新的水果样本 641 | 642 | 159 643 | 00:05:32,860 --> 00:05:36,036 644 | 输入分类器的是一个新样本的特征 645 | 646 | 160 647 | 00:05:36,036 --> 00:05:37,660 648 | 这表示我们想要进行分类的水果是 649 | 650 | 161 651 | 00:05:37,660 --> 00:05:39,750 652 | 150克和粗糙的 653 | 654 | 162 655 | 00:05:39,750 --> 00:05:43,870 656 | 输出0代表苹果,输出1代表橙子 657 | 658 | 163 659 | 00:05:43,870 --> 00:05:46,310 660 | 在我们按下回车查看分类结果之前 661 | 662 | 164 663 | 00:05:46,310 --> 00:05:47,690 664 | 让我们来想一想 665 | 666 | 165 667 | 00:05:47,690 --> 00:05:51,160 668 | 如果让你来猜,你认为会输出什么 669 | 670 | 166 671 | 00:05:51,160 --> 00:05:53,980 672 | 为了解决这个问题,我们来对比训练数据 673 | 674 | 167 675 | 00:05:53,980 --> 00:05:55,630 676 | 这看起来更像一个橙子 677 | 678 | 168 679 | 00:05:55,630 --> 00:05:57,076 680 | 因为它重而且粗糙 681 | 682 | 169 683 | 00:05:57,076 --> 00:05:59,160 684 | 这只是我的猜测,让我们按下回车 685 | 686 | 170 687 | 00:05:59,160 --> 00:06:01,834 688 | 
发现我们的分类器分类结果是一样的 689 | 690 | 171 691 | 00:06:01,834 --> 00:06:03,250 692 | 如果你能完成这个程序, 693 | 694 | 172 695 | 00:06:03,250 --> 00:06:06,050 696 | 那这就是你第一个机器学习程序 697 | 698 | 173 699 | 00:06:06,050 --> 00:06:08,680 700 | 你可以为另外一种问题再建立一个分类器 701 | 702 | 174 703 | 00:06:08,680 --> 00:06:10,769 704 | 仅仅需要改变训练数据 705 | 706 | 175 707 | 00:06:10,769 --> 00:06:13,009 708 | 这使得这种方法比为了每个问题 709 | 710 | 176 711 | 00:06:13,009 --> 00:06:15,101 712 | 写一个新的规则更加可重复使用 713 | 714 | 177 715 | 00:06:15,101 --> 00:06:17,350 716 | 现在,你也许想知道为什么我们用 717 | 718 | 178 719 | 00:06:17,350 --> 00:06:19,790 720 | 一个表格而不是图像来描述水果 721 | 722 | 179 723 | 00:06:19,790 --> 00:06:21,759 724 | 并作为训练数据 725 | 726 | 180 727 | 00:06:21,759 --> 00:06:23,360 728 | 当然了,你可以使用图片,这个 729 | 730 | 181 731 | 00:06:23,360 --> 00:06:25,120 732 | 会在以后的视频中会学习到 733 | 734 | 182 735 | 00:06:25,120 --> 00:06:27,279 736 | 但是,你接下来会看到 737 | 738 | 183 739 | 00:06:27,279 --> 00:06:29,002 740 | 我们现在使用的方法更普遍 741 | 742 | 184 743 | 00:06:29,002 --> 00:06:30,959 744 | 你要明白的事情是 745 | 746 | 185 747 | 00:06:30,959 --> 00:06:32,028 748 | 写一个机器学习的程序并不难 749 | 750 | 186 751 | 00:06:32,028 --> 00:06:33,819 752 | 但是要让它正确地工作,你需要 753 | 754 | 187 755 | 00:06:33,819 --> 00:06:35,406 756 | 明白一些重要的概念 757 | 758 | 188 759 | 00:06:35,406 --> 00:06:37,990 760 | 接下来的几集视频我会跟你讲解这些 761 | 762 | 189 763 | 00:06:37,990 --> 00:06:40,197 764 | 非常感谢你的观看,我们很快会再见的 765 | 766 | 190 767 | 00:06:40,197 --> 00:06:43,850 768 | [中文字幕:KnightJun 来源:Google Developers] 769 | -------------------------------------------------------------------------------- /subtitle/Chs/Visualizing a Decision Tree - Machine Learning Recipes #2_chs.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:00,000 3 | Youtube subtitles download by mo.dbxdb.com 4 | 5 | 2 6 | 00:00:00,000 --> 00:00:02,802 7 | [MUSIC PLAYING] 8 | 9 | 3 10 | 00:00:02,802 --> 00:00:06,550 11 | 12 | 13 | 4 14 | 00:00:06,550 --> 00:00:09,370 15 | 上集,我们使用决策树作分类器 16 | 17 | 18 | 5 19 | 00:00:09,370 --> 00:00:10,920 20 | 这集我们将完善代码使其变得可视化 21 | 22 | 23 | 6 24 | 00:00:10,920 --> 00:00:13,032 25 | 这样我们让更容易了解它的工作过程 26 | 27 | 7 28 | 00:00:13,032 --> 00:00:14,490 29 | 有许多类型的分类器 30 | 31 | 8 32 | 00:00:14,490 --> 00:00:16,740 33 | 一些你可能已经听说过――诸如神经网络 34 | 35 | 9 36 | 00:00:16,740 --> 00:00:17,870 37 | 或支持向量机 38 | 39 | 10 40 | 00:00:17,870 --> 00:00:20,234 41 | 那么为什么我们要从使用决策树开始? 42 | 43 | 11 44 | 00:00:20,234 --> 00:00:21,900 45 | 其实是因为决策树有个非常特殊的性质 46 | 47 | 12 48 | 00:00:21,900 --> 00:00:23,907 49 | 就是它的可读性很强,易于理解 50 | 51 | 13 52 | 00:00:23,907 --> 00:00:26,490 53 | 事实上,它是为数不多的几个可判断的模型之一 54 | 55 | 14 56 | 00:00:26,490 --> 00:00:28,900 57 | 这些模型有助于我们理解为何分类器 58 | 59 | 15 60 | 00:00:28,900 --> 00:00:29,740 61 | 可以做出决策 62 | 63 | 16 64 | 00:00:29,740 --> 00:00:33,534 65 | 这是让人觉得惊奇的事,也是在实践中最有用的部分 66 | 67 | 17 68 | 00:00:33,534 --> 00:00:34,950 69 | 开始前, 70 | 我将介绍一个 71 | 72 | 18 73 | 00:00:34,950 --> 00:00:37,079 74 | 今天要用到的数据集 75 | 76 | 19 77 | 00:00:37,079 --> 00:00:38,670 78 | 叫作Iris. 
79 | 80 | 20 81 | 00:00:38,670 --> 00:00:41,170 82 | Iris 是一个经典的机器学习问题 83 | 84 | 21 85 | 00:00:41,170 --> 00:00:43,270 86 | 你想要弄清楚某朵花是属于哪一类的 87 | 88 | 22 89 | 00:00:43,270 --> 00:00:45,009 90 | 你得根据不同的度量方法 91 | 92 | 23 93 | 00:00:45,009 --> 00:00:46,980 94 | 例如花瓣的长度和宽度 95 | 96 | 24 97 | 00:00:46,980 --> 00:00:49,600 98 | 数据集里包含三种不同种类的花 99 | 100 | 25 101 | 00:00:49,600 --> 00:00:52,870 102 | 它们都是鸢尾花的品种--Setosa(山鸢尾)、Versicolour(杂色鸢尾) 103 | 104 | 26 105 | 00:00:52,870 --> 00:00:53,966 106 | 以及Virginica(维吉尼亚鸢尾)。 107 | 108 | 27 109 | 00:00:53,966 --> 00:00:55,340 110 | 接下来,你可以看到每个种类 111 | 112 | 28 113 | 00:00:55,340 --> 00:01:00,024 114 | 我们给出了50个样本, 115 | 116 | 29 117 | 00:01:00,024 --> 00:01:01,650 118 | 也就是总的有150个样本 119 | 120 | 30 121 | 00:01:01,650 --> 00:01:03,620 122 | 每个样本都有四个属性 123 | 124 | 31 125 | 00:01:03,620 --> 00:01:06,670 126 | 分别是花萼的长度,花萼的宽度,花瓣的长度,花瓣的宽度 127 | 128 | 32 129 | 00:01:06,670 --> 00:01:08,730 130 | 类似之前的苹果橘子问题 131 | 132 | 33 133 | 00:01:08,730 --> 00:01:11,780 134 | 前四列给出了四中属性,最后一列给出了标签 135 | 136 | 34 137 | 00:01:11,780 --> 00:01:15,170 138 | 也就是对应行的花的种类 139 | 140 | 35 141 | 00:01:15,170 --> 00:01:18,140 142 | 我们的目标是使用这个数据集来训练分类器。 143 | 144 | 36 145 | 00:01:18,140 --> 00:01:21,027 146 | 然后给我们一朵花 147 | 148 | 149 | 37 150 | 00:01:21,027 --> 00:01:23,610 151 | 我们就可以使用分类器预测这朵花的种类 152 | 153 | 38 154 | 00:01:23,610 --> 00:01:25,036 155 | 即使我们从没见过这种花 156 | 157 | 39 158 | 00:01:25,036 --> 00:01:26,910 159 | 了解如何处理现有的数据集是非常有用的 160 | 161 | 40 162 | 00:01:26,910 --> 00:01:29,910 163 | 那就让我们将Iris数据集导入scikit-learn中 164 | 165 | 41 166 | 00:01:29,910 --> 00:01:32,120 167 | 看看它们在代码中的表示形式 168 | 169 | 42 170 | 00:01:32,120 --> 00:01:33,870 171 | scikit中提供了一些示例数据集, 172 | 173 | 43 174 | 00:01:33,870 --> 00:01:35,770 175 | 其中就包括Iris, 176 | 177 | 44 178 | 00:01:35,770 --> 00:01:37,780 179 | 还有一些公用程式 180 | 181 | 45 182 | 00:01:37,780 --> 00:01:39,760 183 | 便于我们引用 184 | 185 | 46 186 | 00:01:39,760 --> 00:01:42,690 187 | 我们可以这样将Iris 引入代码中 188 | 189 | 47 190 | 00:01:42,690 --> 00:01:44,530 191 | 从维基百科下载的数据集包括表和一些元数据 192 | 193 | 48 194 | 00:01:44,530 --> 00:01:47,230 195 | 元数据告诉我们 196 | 197 | 49 198 | 00:01:47,230 --> 00:01:49,630 199 | 那些属性的名称 200 | 201 | 50 202 | 00:01:49,630 --> 00:01:52,430 203 | 以及不同种类的花的名字 204 | 205 | 51 206 | 00:01:52,430 --> 00:01:54,190 207 | 属性和样本都包含在 208 | 209 | 52 210 | 00:01:54,190 --> 00:01:56,300 211 | 数据变量中 212 | 213 | 53 214 | 00:01:56,300 --> 00:01:58,239 215 | 例如,如果我打印第一个条目, 216 | 217 | 54 218 | 00:01:58,239 --> 00:02:00,920 219 | 就可以得到这朵花的预测结果 220 | 221 | 55 222 | 00:02:00,920 --> 00:02:03,819 223 | 这些对应属性的值 224 | 所以第一个值指的是花萼长度 225 | 226 | 56 227 | 00:02:03,819 --> 00:02:06,760 228 | 第二个指的是花萼的宽度 229 | 230 | 57 231 | 00:02:06,760 --> 00:02:09,150 232 | 以此类推 233 | 234 | 58 235 | 00:02:09,150 --> 00:02:11,750 236 | 变量target[]指的是标签内容 237 | 238 | 59 239 | 00:02:11,750 --> 00:02:14,690 240 | 同样, 241 | 242 | 60 243 | 00:02:14,690 --> 00:02:16,000 244 | 让我们把第一个打印出来看看 245 | 246 | 61 247 | 00:02:16,000 --> 00:02:19,229 248 | 标签0表示这是一朵setosa. 
249 | 250 | 62 251 | 00:02:19,229 --> 00:02:21,449 252 | 看着这个维基百科上的图 253 | 254 | 255 | 63 256 | 00:02:21,449 --> 00:02:24,520 257 | 可以看到我们只是把第一行打印出来了 258 | 259 | 260 | 64 261 | 00:02:24,520 --> 00:02:27,967 262 | 数据和target变量都有150条 263 | 264 | 265 | 65 266 | 00:02:27,967 --> 00:02:29,550 267 | 你也可以像这样遍历数据集 268 | 269 | 66 270 | 00:02:29,550 --> 00:02:32,081 271 | 并打印出来 272 | 273 | 67 274 | 00:02:32,081 --> 00:02:34,039 275 | 既然我们已经知道了怎么处理数据集 276 | 277 | 68 278 | 00:02:34,039 --> 00:02:35,849 279 | 我们就要开始训练分类器 280 | 281 | 69 282 | 00:02:35,849 --> 00:02:39,300 283 | 但是在这之前,第一件事是要分割数据 284 | 285 | 70 286 | 00:02:39,300 --> 00:02:41,440 287 | 我把几个样本弄出来 288 | 289 | 290 | 71 291 | 00:02:41,440 --> 00:02:43,479 292 | 先放在一边 293 | 294 | 72 295 | 00:02:43,479 --> 00:02:46,330 296 | 我们把先放在一边的数据称为测试数据 297 | 298 | 73 299 | 00:02:46,330 --> 00:02:48,780 300 | 把这些数据跟我们的训练数据分开 301 | 302 | 74 303 | 00:02:48,780 --> 00:02:50,940 304 | 之后我们将用测试数据 305 | 306 | 75 307 | 00:02:50,940 --> 00:02:53,389 308 | 来验证分类器在分类没遇到过的数据的准确性 309 | 310 | 76 311 | 00:02:53,389 --> 00:02:55,679 312 | 313 | 314 | 77 315 | 00:02:55,679 --> 00:02:57,470 316 | 测试是在机器学习 317 | 318 | 319 | 78 320 | 00:02:57,470 --> 00:02:59,261 321 | 实践中非常重要的部分 322 | 323 | 79 324 | 00:02:59,261 --> 00:03:02,280 325 | 在之后的课程中我们将会更详细的介绍 326 | 327 | 328 | 80 329 | 00:03:02,280 --> 00:03:04,710 330 | 针对这个例子,我就对每种种类的花 331 | 332 | 81 333 | 00:03:04,710 --> 00:03:06,050 334 | 移除一个样本 335 | 336 | 82 337 | 00:03:06,050 --> 00:03:07,520 338 | 因为数据集是按顺序排列的 339 | 340 | 83 341 | 00:03:07,520 --> 00:03:10,009 342 | 所以第一个setosa的索引号为0 343 | 344 | 84 345 | 00:03:10,009 --> 00:03:14,270 346 | 第一个versicolor的索引号为50, 以此类推 347 | 348 | 85 349 | 00:03:14,270 --> 00:03:16,770 350 | 语法看起来有点复杂, 351 | 352 | 86 353 | 00:03:16,770 --> 00:03:21,229 354 | 但我所做就是从数据和目标变量中删除3条样本。 355 | 356 | 87 357 | 00:03:21,229 --> 00:03:24,080 358 | 然后我将设两个新的变量集 359 | 360 | 88 361 | 00:03:24,080 --> 00:03:26,586 362 | 一个用来训练,另一个用来测试 363 | 364 | 89 365 | 00:03:26,586 --> 00:03:28,419 366 | 绝大部分的数据进行训练 367 | 368 | 90 369 | 00:03:28,419 --> 00:03:31,370 370 | 只将刚刚移除的样本作测试 371 | 372 | 91 373 | 00:03:31,370 --> 00:03:33,830 374 | 现在我们就可以像上集一样创建一个决策树分类器 375 | 376 | 92 377 | 00:03:33,830 --> 00:03:36,569 378 | 然后用训练数据训练它 379 | 380 | 93 381 | 00:03:36,569 --> 00:03:40,699 382 | 383 | 384 | 94 385 | 00:03:40,699 --> 00:03:42,840 386 | 在作可视化之前,让我们用决策树 387 | 388 | 95 389 | 00:03:42,840 --> 00:03:44,960 390 | 对我们的测试数据作分类 391 | 392 | 96 393 | 00:03:44,960 --> 00:03:47,449 394 | 每种种类的花我们都留了一个样本 395 | 396 | 97 397 | 00:03:47,449 --> 00:03:50,180 398 | 然后打印出它们的标签 399 | 400 | 98 401 | 00:03:50,180 --> 00:03:52,160 402 | 现在让我们看看决策树预测的结果 403 | 404 | 99 405 | 00:03:52,160 --> 00:03:54,460 406 | 我们给决策树提供测试样本的属性 407 | 408 | 100 409 | 00:03:54,460 --> 00:03:56,349 410 | 就可以得到标签 411 | 412 | 101 413 | 00:03:56,349 --> 00:03:59,660 414 | 可以看到预测的结果和测试样本是一致的 415 | 416 | 102 417 | 00:03:59,660 --> 00:04:01,550 418 | 这说明决策树的结果是正确的 419 | 420 | 103 421 | 00:04:01,550 --> 00:04:04,039 422 | 这只是一个非常简单的测试 423 | 424 | 104 425 | 00:04:04,039 --> 00:04:07,940 426 | 后面将有更加详细的介绍 427 | 428 | 105 429 | 00:04:07,940 --> 00:04:09,819 430 | 现在我们要将决策树可视化 431 | 432 | 106 433 | 00:04:09,819 --> 00:04:11,762 434 | 这有助于我们理解分类器的工作原理 435 | 436 | 107 437 | 00:04:11,762 --> 00:04:13,220 438 | 从scikit教程中复制粘贴 439 | 440 | 108 441 | 00:04:13,220 --> 00:04:15,220 442 | 一部分代码 443 | 444 | 109 445 | 00:04:15,220 --> 00:04:16,994 446 | 因为这些代码是用来做可视化的 447 | 448 | 110 449 | 00:04:16,994 --> 00:04:18,410 450 | 与机器学习的概念无关 451 | 452 | 111 453 | 00:04:18,410 --> 
00:04:20,380 454 | 所以细节部分我就不做介绍了 455 | 456 | 112 457 | 00:04:20,380 --> 00:04:22,759 458 | 现在我们将两个样本的数据放在一起 459 | 460 | 113 461 | 00:04:22,759 --> 00:04:26,329 462 | 并创建一个PDF文件 463 | 464 | 114 465 | 00:04:26,329 --> 00:04:28,440 466 | 运行脚步可以打开pdf文件 467 | 468 | 469 | 115 470 | 00:04:28,440 --> 00:04:30,120 471 | 我们就可以看到决策树了 472 | 473 | 116 474 | 00:04:30,120 --> 00:04:33,810 475 | 用决策树来分类数据,你需要从树的根部开始看 476 | 477 | 478 | 117 479 | 00:04:33,810 --> 00:04:35,829 480 | 每个结点都回答了关于属性的 481 | 482 | 118 483 | 00:04:35,829 --> 00:04:37,504 484 | 是或否的问题 485 | 486 | 119 487 | 00:04:37,504 --> 00:04:39,420 488 | 例如,这个结点的问题是花瓣的宽度 489 | 490 | 491 | 120 492 | 00:04:39,420 --> 00:04:41,420 493 | 是否小于0.8厘米 494 | 495 | 121 496 | 00:04:41,420 --> 00:04:44,199 497 | 如果对于这个样本来说答案为“是”,则往左 498 | 499 | 500 | 122 501 | 00:04:44,199 --> 00:04:46,170 502 | 否则往右 503 | 504 | 123 505 | 00:04:46,170 --> 00:04:48,589 506 | 现在我们用这个树 507 | 508 | 509 | 124 510 | 00:04:48,589 --> 00:04:50,130 511 | 对我们的一个测试数据作分类 512 | 513 | 125 514 | 00:04:50,130 --> 00:04:53,233 515 | 这是我们第一个测试花朵的属性和标签 516 | 517 | 126 518 | 00:04:53,233 --> 00:04:54,899 519 | 通过元数据 520 | 521 | 127 522 | 00:04:54,899 --> 00:04:56,579 523 | 我们可以看到属性名称 524 | 525 | 128 526 | 00:04:56,579 --> 00:04:58,980 527 | 我们知道这朵花是setosa, 528 | 529 | 129 530 | 00:04:58,980 --> 00:05:00,779 531 | 那让我们来看看决策树的预测结果 532 | 533 | 130 534 | 00:05:00,779 --> 00:05:03,290 535 | 536 | 我调整窗口的大小好让大家看的清楚 537 | 538 | 131 539 | 00:05:03,290 --> 00:05:04,889 540 | 决策树的第一个问题是 541 | 542 | 132 543 | 00:05:04,889 --> 00:05:08,110 544 | 花朵的宽度是否小于0.8厘米 545 | 546 | 133 547 | 00:05:08,110 --> 00:05:09,540 548 | 这是第四个属性 549 | 550 | 134 551 | 00:05:09,540 --> 00:05:11,709 552 | 答案是“是,所以我们继续往左” 553 | 554 | 135 555 | 00:05:11,709 --> 00:05:14,149 556 | 这个点已经是叶子结点了 557 | 558 | 559 | 136 560 | 00:05:14,149 --> 00:05:15,860 561 | 所以接下来不需要回答其他的问题了 562 | 563 | 137 564 | 00:05:15,860 --> 00:05:18,490 565 | 决策树给出了预测结果,即 setosa, 566 | 567 | 138 568 | 00:05:18,490 --> 00:05:19,440 569 | 这是正确的答案 570 | 571 | 139 572 | 00:05:19,440 --> 00:05:23,329 573 | 注意标签值为0, 即指向了对应种类的花 574 | 575 | 140 576 | 00:05:23,329 --> 00:05:25,930 577 | 接下来继续第二条测试数据 578 | 579 | 141 580 | 00:05:25,930 --> 00:05:27,319 581 | 这个数据是versicolor的 582 | 583 | 142 584 | 00:05:27,319 --> 00:05:29,329 585 | 来看看决策树给出的答案 586 | 587 | 143 588 | 00:05:29,329 --> 00:05:31,839 589 | 我们再从树根开始, 590 | 这次花瓣的宽度 591 | 592 | 144 593 | 00:05:31,839 --> 00:05:33,750 594 | 大于0.8厘米 595 | 596 | 145 597 | 00:05:33,750 --> 00:05:35,839 598 | 即这个结点的问题答案为“否” 599 | 600 | 146 601 | 00:05:35,839 --> 00:05:36,829 602 | 所以往右走 603 | 604 | 147 605 | 00:05:36,829 --> 00:05:39,245 606 | 决策树的下一个问题是 607 | 608 | 148 609 | 00:05:39,245 --> 00:05:40,709 610 | 花瓣的宽度是否小于1.75厘米 611 | 612 | 149 613 | 00:05:40,709 --> 00:05:42,410 614 | 这是为了缩小范围 615 | 616 | 150 617 | 00:05:42,410 --> 00:05:44,440 618 | 答案为“是”,所以接着往左走 619 | 620 | 151 621 | 00:05:44,440 --> 00:05:47,319 622 | 下一个问题是花瓣的长度是否小于4.95厘米. 623 | 624 | 152 625 | 00:05:47,319 --> 00:05:49,180 626 | 答案为“是”,故接着往左 627 | 628 | 153 629 | 00:05:49,180 --> 00:05:51,130 630 | 最后一个问题是花瓣的宽度 631 | 632 | 154 633 | 00:05:51,130 --> 00:05:52,810 634 | 是否小于1.65厘米. 
635 | 636 | 155 637 | 00:05:52,810 --> 00:05:54,300 638 | 答案为"是", 所以往左 639 | 640 | 156 641 | 00:05:54,300 --> 00:05:57,029 642 | 预测的结果就是这个样本属于versicolor, 643 | 644 | 645 | 157 646 | 00:05:57,029 --> 00:05:58,610 647 | 结果又一次预测正确 648 | 649 | 158 650 | 00:05:58,610 --> 00:06:01,170 651 | 你可以把最后一个测试样本当做练习 652 | 653 | 159 654 | 00:06:01,170 --> 00:06:03,079 655 | 记住，我们利用树来预测的过程 656 | 657 | 658 | 160 659 | 00:06:03,079 --> 00:06:05,607 660 | 就是代码运行的过程 661 | 662 | 161 663 | 00:06:05,607 --> 00:06:07,440 664 | 现在你已经了解了 665 | 666 | 162 667 | 00:06:07,440 --> 00:06:08,285 668 | 决策树的过程 669 | 670 | 163 671 | 00:06:08,285 --> 00:06:09,660 672 | 这里还有更多要了解的内容 673 | 674 | 164 675 | 00:06:09,660 --> 00:06:12,720 676 | 特别是它们是如何通过样本自动创建一棵树 677 | 678 | 165 679 | 00:06:12,720 --> 00:06:14,620 680 | 我将会在接下来的课程中作深入介绍 681 | 682 | 166 683 | 00:06:14,620 --> 00:06:17,019 684 | 但是现在，让我们了解一个更加关键的问题 685 | 686 | 167 687 | 00:06:17,019 --> 00:06:19,519 688 | 决策树的每一个问题 689 | 690 | 168 691 | 00:06:19,519 --> 00:06:20,264 692 | 都是与属性有关 693 | 694 | 169 695 | 00:06:20,264 --> 00:06:22,680 696 | 这意味着属性选取得越好 697 | 698 | 170 699 | 00:06:22,680 --> 00:06:23,630 700 | 决策树就越准确 701 | 702 | 171 703 | 00:06:23,630 --> 00:06:25,300 704 | 下一节课就介绍 705 | 706 | 172 707 | 00:06:25,300 --> 00:06:26,514 708 | 怎样选取更好的属性 709 | 710 | 173 711 | 00:06:26,514 --> 00:06:28,930 712 | 谢谢观看，下节课再见 713 | 714 | 174 715 | 00:06:28,930 --> 00:06:31,980 716 | 本集中文字幕翻译:sisely 717 | 718 | 175 719 | 00:06:31,980 --> 00:06:41,000 720 | Subtitles End: mo.dbxdb.com 721 | 722 | -------------------------------------------------------------------------------- /subtitle/Chs/What Makes a Good Feature- - Machine Learning Recipes #3.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KnightJun/Google-ML-Recipes-Chs-sub-and-code/ba175c398b611f7f92feec48c393ca2c9f3e1023/subtitle/Chs/What Makes a Good Feature- - Machine Learning Recipes #3.srt -------------------------------------------------------------------------------- /subtitle/Chs/[ing...]Machine Learning over Coffee with a Googler.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:00,000 3 | Youtube subtitles download by mo.dbxdb.com 4 | 5 | 2 6 | 00:00:00,000 --> 00:00:02,350 7 | [MUSIC PLAYING] 8 | 9 | 3 10 | 00:00:02,350 --> 00:00:06,297 11 | 12 | 13 | 4 14 | 00:00:06,297 --> 00:00:08,130 15 | LAURENCE MORONEY: Today 16 | I'm in the Big Apple 17 | 18 | 5 19 | 00:00:08,130 --> 00:00:09,720 20 | meeting with Josh 21 | Gordon from Google 22 | 23 | 6 24 | 00:00:09,720 --> 00:00:11,360 25 | to talk about machine 26 | learning, where 27 | 28 | 7 29 | 00:00:11,360 --> 00:00:14,310 30 | we will dig into how it 31 | works, why it's important, 32 | 33 | 8 34 | 00:00:14,310 --> 00:00:17,437 35 | and where you can 36 | learn all about it. 37 | 38 | 9 39 | 00:00:17,437 --> 00:00:19,520 40 | Welcome to Coffee with a 41 | Googler in New York City. 42 | 43 | 10 44 | 00:00:19,520 --> 00:00:21,400 45 | I'm Laurence Moroney, 46 | and I'm here today 47 | 48 | 11 49 | 00:00:21,400 --> 00:00:23,500 50 | speaking with Joshua Gordon. 51 | 52 | 12 53 | 00:00:23,500 --> 00:00:25,180 54 | Now, it's something 55 | that a lot of people 56 | 57 | 13 58 | 00:00:25,180 --> 00:00:27,179 59 | don't really understand 60 | what machine learning is 61 | 62 | 14 63 | 00:00:27,179 --> 00:00:28,114 64 | in a concrete manner.
65 | 66 | 15 67 | 00:00:28,114 --> 00:00:29,530 68 | JOSHUA GORDON: So 69 | machine learning 70 | 71 | 16 72 | 00:00:29,530 --> 00:00:31,500 73 | is all about learning 74 | from examples 75 | 76 | 17 77 | 00:00:31,500 --> 00:00:32,969 78 | rather than writing 79 | manual rules. 80 | 81 | 18 82 | 00:00:32,969 --> 00:00:34,009 83 | LAURENCE MORONEY: Got it. 84 | 85 | 19 86 | 00:00:34,009 --> 00:00:35,510 87 | JOSHUA GORDON: So the 88 | short way of saying 89 | 90 | 20 91 | 00:00:35,510 --> 00:00:38,230 92 | that is regular programming is 93 | you write a lot of manual rules 94 | 95 | 21 96 | 00:00:38,230 --> 00:00:39,170 97 | to solve a problem. 98 | 99 | 22 100 | 00:00:39,170 --> 00:00:41,210 101 | In machine learning, 102 | you let the algorithm 103 | 104 | 23 105 | 00:00:41,210 --> 00:00:42,269 106 | find those rules for you. 107 | 108 | 24 109 | 00:00:42,269 --> 00:00:43,310 110 | LAURENCE MORONEY: Got it. 111 | 112 | 25 113 | 00:00:43,310 --> 00:00:43,310 114 | JOSHUA GORDON: From examples. 115 | 116 | 26 117 | 00:00:43,970 --> 00:00:45,200 118 | LAURENCE MORONEY: 119 | So pattern matching. 120 | 121 | 27 122 | 00:00:45,200 --> 00:00:47,520 123 | It might be visual, or it 124 | might be other patterns 125 | 126 | 28 127 | 00:00:47,520 --> 00:00:48,060 128 | that are hidden in data. 129 | 130 | 29 131 | 00:00:48,060 --> 00:00:48,060 132 | JOSHUA GORDON: Absolutely. 133 | 134 | 30 135 | 00:00:48,740 --> 00:00:51,406 136 | And so the input to machine-- so 137 | the beauty of machine learning, 138 | 139 | 31 140 | 00:00:51,406 --> 00:00:54,170 141 | and the real secret sauce, 142 | is that an algorithm that 143 | 144 | 32 145 | 00:00:54,170 --> 00:00:57,760 146 | learns patterns from 147 | data can solve thousands 148 | 149 | 33 150 | 00:00:57,760 --> 00:00:58,850 151 | of different problems. 152 | 153 | 34 154 | 00:00:58,850 --> 00:01:01,266 155 | And the reason is if I write 156 | a Python program to recognize 157 | 158 | 35 159 | 00:01:01,266 --> 00:01:03,509 160 | digits, my program is hard 161 | coded to work with digits. 162 | 163 | 36 164 | 00:01:03,509 --> 00:01:04,550 165 | LAURENCE MORONEY: Got it. 166 | 167 | 37 168 | 00:01:04,550 --> 00:01:07,049 169 | JOSHUA GORDON: But if I write 170 | an algorithm to learn patterns 171 | 172 | 38 173 | 00:01:07,049 --> 00:01:09,520 174 | from data, I can use that 175 | for speech recognition, image 176 | 177 | 39 178 | 00:01:09,520 --> 00:01:11,470 179 | recognition, medicine. 180 | 181 | 40 182 | 00:01:11,470 --> 00:01:14,010 183 | Basically, anything that 184 | you can start with examples, 185 | 186 | 41 187 | 00:01:14,010 --> 00:01:17,880 188 | just tell apart A and B, my 189 | same algorithm that I wrote just 190 | 191 | 42 192 | 00:01:17,880 --> 00:01:20,380 193 | once can tackle 194 | all these problems. 195 | 196 | 43 197 | 00:01:20,380 --> 00:01:23,049 198 | And that's a really special and 199 | actually fairly profound thing. 200 | 201 | 44 202 | 00:01:23,049 --> 00:01:24,010 203 | LAURENCE MORONEY: Absolutely. 204 | 205 | 45 206 | 00:01:24,010 --> 00:01:26,093 207 | Now, one of the things in 208 | your classes that you're 209 | 210 | 46 211 | 00:01:26,093 --> 00:01:28,500 212 | talking about that you're 213 | starting with language. 214 | 215 | 47 216 | 00:01:28,500 --> 00:01:30,260 217 | You're starting with 218 | Java and Python, 219 | 220 | 48 221 | 00:01:30,260 --> 00:01:30,260 222 | I think it was, that you said? 223 | 224 | 49 225 | 00:01:30,810 --> 00:01:31,590 226 | JOSHUA GORDON: Yes, absolutely. 
227 | 228 | 50 229 | 00:01:31,590 --> 00:01:32,590 230 | LAURENCE MORONEY: 231 | So how's the class 232 | 233 | 51 234 | 00:01:32,590 --> 00:01:33,700 235 | going to be 236 | structured for people 237 | 238 | 52 239 | 00:01:33,700 --> 00:01:35,330 240 | who want to be these data 241 | scientists of the future? 242 | 243 | 53 244 | 00:01:35,330 --> 00:01:35,330 245 | JOSHUA GORDON: Absolutely. 246 | 247 | 54 248 | 00:01:35,910 --> 00:01:37,950 249 | So first of all, there 250 | are zero prerequisites. 251 | 252 | 55 253 | 00:01:37,950 --> 00:01:38,380 254 | Well, that's not true. 255 | 256 | 56 257 | 00:01:38,380 --> 00:01:39,090 258 | There's one prerequisite. 259 | 260 | 57 261 | 00:01:39,090 --> 00:01:39,090 262 | LAURENCE MORONEY: My favorite. 263 | 264 | 58 265 | 00:01:39,780 --> 00:01:40,209 266 | Oh, OK. 267 | 268 | 59 269 | 00:01:40,209 --> 00:01:41,376 270 | Well, what's the one prereq? 271 | 272 | 60 273 | 00:01:41,376 --> 00:01:44,500 274 | JOSHUA GORDON: Basic programming 275 | ability in Java or Python. 276 | 277 | 61 278 | 00:01:44,500 --> 00:01:48,230 279 | And by basic, I mean you can run 280 | scripts and you can tweak them. 281 | 282 | 62 283 | 00:01:48,230 --> 00:01:49,770 284 | That's it. 285 | 286 | 63 287 | 00:01:49,770 --> 00:01:51,440 288 | A little bit of 289 | high school math. 290 | 291 | 64 292 | 00:01:51,440 --> 00:01:54,450 293 | And that means like basic 294 | algebra, basic geometry. 295 | 296 | 65 297 | 00:01:54,450 --> 00:01:56,470 298 | When I say basic geometry, 299 | to be totally honest, 300 | 301 | 66 302 | 00:01:56,470 --> 00:01:58,447 303 | if you asked me, like, 304 | what sine and cosine, 305 | 306 | 67 307 | 00:01:58,447 --> 00:01:59,530 308 | I would have to Google it. 309 | 310 | 68 311 | 00:01:59,530 --> 00:02:01,510 312 | I don't remember, honestly. 313 | 314 | 69 315 | 00:02:01,510 --> 00:02:04,419 316 | So just basic familiarity, 317 | and that's it. 318 | 319 | 70 320 | 00:02:04,419 --> 00:02:06,459 321 | And we're going to teach 322 | the class in three ways. 323 | 324 | 71 325 | 00:02:06,459 --> 00:02:09,030 326 | We're going to teach it 327 | totally from the ground up. 328 | 329 | 72 330 | 00:02:09,030 --> 00:02:12,205 331 | So one problem I had with some 332 | of the academic classes I took 333 | 334 | 73 335 | 00:02:12,205 --> 00:02:14,080 336 | is that they'll talk 337 | about a fancy algorithm, 338 | 339 | 74 340 | 00:02:14,080 --> 00:02:16,149 341 | like neural 342 | networks, but they'll 343 | 344 | 75 345 | 00:02:16,149 --> 00:02:17,440 346 | talk about it in terms of math. 347 | 348 | 76 349 | 00:02:17,440 --> 00:02:20,120 350 | And so at the end of the class, 351 | I don't know how to build that. 352 | 353 | 77 354 | 00:02:20,120 --> 00:02:21,107 355 | I can't really do it. 356 | 357 | 78 358 | 00:02:21,107 --> 00:02:22,440 359 | We're doing it in a reverse way. 360 | 361 | 79 362 | 00:02:22,440 --> 00:02:24,440 363 | We're building it step 364 | by step, and we're 365 | 366 | 80 367 | 00:02:24,440 --> 00:02:27,744 368 | explaining only the math that's 369 | really necessary as we go. 370 | 371 | 81 372 | 00:02:27,744 --> 00:02:30,160 373 | And instead of equations, we're 374 | going use visual examples. 375 | 376 | 82 377 | 00:02:30,160 --> 00:02:30,160 378 | LAURENCE MORONEY: Perfect. 
379 | 380 | 83 381 | 00:02:30,729 --> 00:02:32,187 382 | JOSHUA GORDON: So 383 | an equation could 384 | 385 | 84 386 | 00:02:32,187 --> 00:02:34,060 387 | be like if you talk 388 | about gradient descent, 389 | 390 | 85 391 | 00:02:34,060 --> 00:02:36,259 392 | gradient descent 393 | basically means finding 394 | 395 | 86 396 | 00:02:36,259 --> 00:02:37,883 397 | the minimum of a function. 398 | 399 | 87 400 | 00:02:37,883 --> 00:02:40,550 401 | So if I just say that, like as a 402 | developer, I'm like, all right, 403 | 404 | 88 405 | 00:02:40,550 --> 00:02:41,160 406 | what does that mean? 407 | 408 | 89 409 | 00:02:41,160 --> 00:02:42,535 410 | So you can think 411 | of any equation, 412 | 413 | 90 414 | 00:02:42,535 --> 00:02:45,550 415 | like x cubed plus y squared 416 | plus whatever equals 7. 417 | 418 | 91 419 | 00:02:45,550 --> 00:02:47,250 420 | There's some value of x and y. 421 | 422 | 92 423 | 00:02:47,250 --> 00:02:48,660 424 | LAURENCE MORONEY: That's going 425 | to be the bottom of that curve, 426 | 427 | 93 428 | 00:02:48,660 --> 00:02:48,660 429 | right? 430 | 431 | 94 432 | 00:02:48,919 --> 00:02:49,270 433 | JOSHUA GORDON: Or not equals 7. 434 | 435 | 95 436 | 00:02:49,270 --> 00:02:50,020 437 | Equals some value. 438 | 439 | 96 440 | 00:02:50,020 --> 00:02:50,020 441 | Right. 442 | 443 | 97 444 | 00:02:50,660 --> 00:02:52,720 445 | Anyway, you can find 446 | the bottom of that curve 447 | 448 | 98 449 | 00:02:52,720 --> 00:02:54,069 450 | literally by thinking as a bowl. 451 | 452 | 99 453 | 00:02:54,069 --> 00:02:56,270 454 | You can drop a piece 455 | of fruit in a bowl 456 | 457 | 100 458 | 00:02:56,270 --> 00:02:57,715 459 | and it will roll to the bottom. 460 | 461 | 101 462 | 00:02:57,715 --> 00:02:59,340 463 | And gradient descent 464 | just means finding 465 | 466 | 102 467 | 00:02:59,340 --> 00:03:00,960 468 | where this function is 0. 469 | 470 | 103 471 | 00:03:00,960 --> 00:03:03,009 472 | And you can actually 473 | describe that really simply 474 | 475 | 104 476 | 00:03:03,009 --> 00:03:05,280 477 | in only like 10 or 478 | 12 lines of Python, 479 | 480 | 105 481 | 00:03:05,280 --> 00:03:07,585 482 | actually, instead of 483 | five slides of equations. 484 | 485 | 106 486 | 00:03:07,585 --> 00:03:09,210 487 | LAURENCE MORONEY: 488 | And I think it's also 489 | 490 | 107 491 | 00:03:09,210 --> 00:03:11,300 492 | important to understand 493 | why you need to find 494 | 495 | 108 496 | 00:03:11,300 --> 00:03:12,300 497 | the bottom of the curve. 498 | 499 | 109 500 | 00:03:12,300 --> 00:03:13,340 501 | JOSHUA GORDON: Absolutely. 502 | 503 | 110 504 | 00:03:13,340 --> 00:03:14,919 505 | LAURENCE MORONEY: And just 506 | focus on that example. 507 | 508 | 111 509 | 00:03:14,919 --> 00:03:15,330 510 | JOSHUA GORDON: Absolutely. 511 | 512 | 112 513 | 00:03:15,330 --> 00:03:17,419 514 | So that's difficult 515 | to describe concisely. 516 | 517 | 113 518 | 00:03:17,419 --> 00:03:19,076 519 | LAURENCE MORONEY: Right. 520 | 521 | 114 522 | 00:03:19,076 --> 00:03:20,660 523 | JOSHUA GORDON: So 524 | in machine learning, 525 | 526 | 115 527 | 00:03:20,660 --> 00:03:22,349 528 | let's say you're 529 | writing an algorithm. 530 | 531 | 116 532 | 00:03:22,349 --> 00:03:26,240 533 | Let's say it's to distinguish 534 | apples from oranges. 535 | 536 | 117 537 | 00:03:26,240 --> 00:03:29,199 538 | You always want to know, how 539 | accurate is my algorithm? 540 | 541 | 118 542 | 00:03:29,199 --> 00:03:31,090 543 | Like, I can solve that 544 | problem in one line. 
545 | 546 | 119 547 | 00:03:31,090 --> 00:03:34,020 548 | I can just say, 549 | return math.random. 550 | 551 | 120 552 | 00:03:34,020 --> 00:03:35,389 553 | So one line, math.random. 554 | 555 | 121 556 | 00:03:35,389 --> 00:03:37,389 557 | LAURENCE MORONEY: That 558 | would be the perfect one. 559 | 560 | 122 561 | 00:03:37,389 --> 00:03:39,084 562 | JOSHUA GORDON: My 563 | accuracy is crap. 564 | 565 | 123 566 | 00:03:39,084 --> 00:03:40,000 567 | LAURENCE MORONEY: 50%. 568 | 569 | 124 570 | 00:03:40,000 --> 00:03:40,000 571 | JOSHUA GORDON: Right. 572 | 573 | 125 574 | 00:03:40,874 --> 00:03:42,160 575 | Yeah, it's 50%. 576 | 577 | 126 578 | 00:03:42,160 --> 00:03:43,190 579 | LAURENCE MORONEY: Between 580 | an apple and an orange. 581 | 582 | 127 583 | 00:03:43,190 --> 00:03:44,630 584 | JOSHUA GORDON: It's a one liner. 585 | 586 | 128 587 | 00:03:44,630 --> 00:03:47,186 588 | But really, we want 589 | to get-- another way 590 | 591 | 129 592 | 00:03:47,186 --> 00:03:48,810 593 | of describing accuracy 594 | is you can think 595 | 596 | 130 597 | 00:03:48,810 --> 00:03:50,690 598 | about it n terms of error. 599 | 600 | 131 601 | 00:03:50,690 --> 00:03:53,120 602 | High accuracy means low error. 603 | 604 | 132 605 | 00:03:53,120 --> 00:03:57,550 606 | And you can have an equation 607 | that describes your error. 608 | 609 | 133 610 | 00:03:57,550 --> 00:03:59,569 611 | And the minimum of 612 | that equation is 613 | 614 | 134 615 | 00:03:59,569 --> 00:04:01,741 616 | going to give you 617 | the highest accuracy. 618 | 619 | 135 620 | 00:04:01,741 --> 00:04:03,740 621 | So you can write your 622 | machine learning algorithm 623 | 624 | 136 625 | 00:04:03,740 --> 00:04:06,299 626 | to try and minimize the equation 627 | that describes the error. 628 | 629 | 137 630 | 00:04:06,299 --> 00:04:07,340 631 | LAURENCE MORONEY: Got it. 632 | 633 | 138 634 | 00:04:07,340 --> 00:04:09,120 635 | JOSHUA GORDON: And we'll 636 | make that super concrete 637 | 638 | 139 639 | 00:04:09,120 --> 00:04:11,319 640 | in the class, but that's 641 | where minimization comes in 642 | 643 | 140 644 | 00:04:11,319 --> 00:04:12,669 645 | and that's where gradient 646 | descent comes in. 647 | 648 | 141 649 | 00:04:12,669 --> 00:04:13,319 650 | LAURENCE MORONEY: 651 | So one of the things 652 | 653 | 142 654 | 00:04:13,319 --> 00:04:14,735 655 | you're saying in 656 | the class, you're 657 | 658 | 143 659 | 00:04:14,735 --> 00:04:16,485 660 | teaching just a pure 661 | Java, Python version. 662 | 663 | 144 664 | 00:04:16,485 --> 00:04:18,110 665 | But there's also a 666 | version where you're 667 | 668 | 145 669 | 00:04:18,110 --> 00:04:19,490 670 | bringing in 671 | preexisting libraries 672 | 673 | 146 674 | 00:04:19,490 --> 00:04:20,720 675 | that have come from academia. 676 | 677 | 147 678 | 00:04:20,720 --> 00:04:20,720 679 | JOSHUA GORDON: Absolutely. 680 | 681 | 148 682 | 00:04:20,985 --> 00:04:22,259 683 | LAURENCE MORONEY: That will 684 | solve a lot of this for you, 685 | 686 | 149 687 | 00:04:22,259 --> 00:04:22,259 688 | right? 689 | 690 | 150 691 | 00:04:22,699 --> 00:04:23,230 692 | JOSHUA GORDON: Absolutely. 693 | 694 | 151 695 | 00:04:23,230 --> 00:04:24,562 696 | So I want to do a couple things. 697 | 698 | 152 699 | 00:04:24,562 --> 00:04:27,009 700 | One is I want to 701 | provide the TLDR. 702 | 703 | 153 704 | 00:04:27,009 --> 00:04:29,720 705 | So honestly, as a 706 | developer, I like to get up 707 | 708 | 154 709 | 00:04:29,720 --> 00:04:31,089 710 | and running really fast. 
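A minimal sketch (not code from the course) of the gradient descent idea Josh describes above: finding the minimum of a function by repeatedly stepping downhill, in roughly a dozen lines of Python. The example function f(x) = (x - 3)^2 and the learning rate are illustrative choices.

```python
# Gradient descent on f(x) = (x - 3)^2, whose minimum is at x = 3.
# df is the derivative of f; each step moves a little way "downhill".

def gradient_descent(df, x=0.0, learning_rate=0.1, steps=100):
    for _ in range(steps):
        x -= learning_rate * df(x)
    return x

minimum = gradient_descent(lambda x: 2 * (x - 3))   # derivative of (x - 3)^2
print(minimum)                                      # approximately 3.0
```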
711 | 712 | 155 713 | 00:04:31,089 --> 00:04:34,632 714 | So we're also going to use 715 | open source libraries from just 716 | 717 | 156 718 | 00:04:34,632 --> 00:04:35,589 719 | different universities. 720 | 721 | 157 722 | 00:04:35,589 --> 00:04:37,990 723 | There's one in New Zealand 724 | that I really love. 725 | 726 | 158 727 | 00:04:37,990 --> 00:04:40,509 728 | We're going to you how to build, 729 | basically first, everything 730 | 731 | 159 732 | 00:04:40,509 --> 00:04:42,384 733 | from the ground up step 734 | by step from scratch. 735 | 736 | 160 737 | 00:04:42,384 --> 00:04:45,730 738 | And the reason we do that is 739 | because it keeps us honest. 740 | 741 | 161 742 | 00:04:45,730 --> 00:04:48,250 743 | If you build every 744 | single piece, you 745 | 746 | 162 747 | 00:04:48,250 --> 00:04:50,560 748 | have some understanding 749 | of every single piece. 750 | 751 | 163 752 | 00:04:50,560 --> 00:04:52,139 753 | LAURENCE MORONEY: And if 754 | you're relying on somebody else 755 | 756 | 164 757 | 00:04:52,139 --> 00:04:54,329 758 | having done the work, you don't 759 | fully get to understand it 760 | 761 | 165 762 | 00:04:54,329 --> 00:04:54,329 763 | yourself. 764 | 765 | 166 766 | 00:04:54,490 --> 00:04:55,500 767 | JOSHUA GORDON: Exactly. 768 | 769 | 167 770 | 00:04:55,500 --> 00:04:57,750 771 | Now, another thing is using 772 | the open source libraries, 773 | 774 | 168 775 | 00:04:57,750 --> 00:05:00,329 776 | honestly, you can solve 777 | probably 80% or 90% 778 | 779 | 169 780 | 00:05:00,329 --> 00:05:03,389 781 | of the machine learning problems 782 | you would as a data scientist. 783 | 784 | 170 785 | 00:05:03,389 --> 00:05:04,509 786 | LAURENCE MORONEY: Nice. 787 | 788 | 171 789 | 00:05:04,509 --> 00:05:06,800 790 | JOSHUA GORDON: Now, when you 791 | get to the really gigantic 792 | 793 | 172 794 | 00:05:06,800 --> 00:05:09,471 795 | problems, then really it 796 | makes sense to use the cloud. 797 | 798 | 173 799 | 00:05:09,471 --> 00:05:11,180 800 | So we're also going 801 | to teach how to solve 802 | 803 | 174 804 | 00:05:11,180 --> 00:05:12,470 805 | problems using Google APIs. 806 | 807 | 175 808 | 00:05:12,470 --> 00:05:14,529 809 | But that's at the 810 | very end of the class, 811 | 812 | 176 813 | 00:05:14,529 --> 00:05:16,345 814 | and it's totally optional. 815 | 816 | 177 817 | 00:05:16,345 --> 00:05:17,519 818 | LAURENCE MORONEY: This 819 | is all on YouTube, right? 820 | 821 | 178 822 | 00:05:17,519 --> 00:05:18,769 823 | JOSHUA GORDON: All on YouTube. 824 | 825 | 179 826 | 00:05:18,769 --> 00:05:21,500 827 | There might be some ads on 828 | it, but that's literally it. 829 | 830 | 180 831 | 00:05:21,500 --> 00:05:22,230 832 | We think it's going 833 | to be awesome. 834 | 835 | 181 836 | 00:05:22,230 --> 00:05:23,410 837 | LAURENCE MORONEY: Like 838 | source code and stuff 839 | 840 | 182 841 | 00:05:23,410 --> 00:05:23,410 842 | that you've done? 843 | 844 | 183 845 | 00:05:23,930 --> 00:05:25,800 846 | JOSHUA GORDON: The source 847 | code will be on GitHub. 848 | 849 | 184 850 | 00:05:25,800 --> 00:05:26,569 851 | LAURENCE MORONEY: 852 | It's all on GitHub. 853 | 854 | 185 855 | 00:05:26,569 --> 00:05:26,569 856 | Perfect. 857 | 858 | 186 859 | 00:05:26,709 --> 00:05:27,069 860 | JOSHUA GORDON: It 861 | will all be on GitHub. 
862 | 863 | 187 864 | 00:05:27,069 --> 00:05:28,019 865 | And the reason I 866 | was hesitating is 867 | 868 | 188 869 | 00:05:28,019 --> 00:05:29,644 870 | I'm writing all this 871 | as we're speaking, 872 | 873 | 189 874 | 00:05:29,644 --> 00:05:30,819 875 | so I'm totally exhausted. 876 | 877 | 190 878 | 00:05:30,819 --> 00:05:32,699 879 | But yes, it's totally, 880 | 100% out there. 881 | 882 | 191 883 | 00:05:32,699 --> 00:05:35,389 884 | LAURENCE MORONEY: Well, you're 885 | still looking energetic to me. 886 | 887 | 192 888 | 00:05:35,389 --> 00:05:38,386 889 | JOSHUA GORDON: I've had a 890 | lot of coffee with a Googler. 891 | 892 | 193 893 | 00:05:38,386 --> 00:05:39,399 894 | Good for you. 895 | 896 | 194 897 | 00:05:39,399 --> 00:05:40,774 898 | LAURENCE MORONEY: 899 | Well, I for one 900 | 901 | 195 902 | 00:05:40,774 --> 00:05:42,483 903 | am really looking 904 | forward to this course. 905 | 906 | 196 907 | 00:05:42,483 --> 00:05:45,709 908 | I'm looking forward to learning 909 | what you have to teach. 910 | 911 | 197 912 | 00:05:45,709 --> 00:05:47,269 913 | I've had the same 914 | kind of struggles 915 | 916 | 198 917 | 00:05:47,269 --> 00:05:50,096 918 | as you in trying to understand 919 | the math behind this 920 | 921 | 199 922 | 00:05:50,096 --> 00:05:51,470 923 | and why I'm doing 924 | the math, which 925 | 926 | 200 927 | 00:05:51,470 --> 00:05:53,360 928 | is why I had those 929 | pointed questions earlier. 930 | 931 | 201 932 | 00:05:53,360 --> 00:05:54,139 933 | JOSHUA GORDON: Absolutely. 934 | 935 | 202 936 | 00:05:54,139 --> 00:05:54,139 937 | LAURENCE MORONEY: 938 | So thanks, Josh. 939 | 940 | 203 941 | 00:05:54,829 --> 00:05:56,029 942 | That was a whole lot of fun. 943 | 944 | 204 945 | 00:05:56,029 --> 00:05:57,814 946 | And I've learned so 947 | much about machine 948 | 949 | 205 950 | 00:05:57,814 --> 00:05:59,730 951 | learning just from these 952 | few minutes with you, 953 | 954 | 206 955 | 00:05:59,730 --> 00:06:01,410 956 | so I'm really looking 957 | forward to your class. 958 | 959 | 207 960 | 00:06:01,410 --> 00:06:01,410 961 | JOSHUA GORDON: Thanks so much. 962 | 963 | 208 964 | 00:06:01,970 --> 00:06:03,550 965 | LAURENCE MORONEY: If you've 966 | enjoyed this episode of Coffee 967 | 968 | 209 969 | 00:06:03,550 --> 00:06:05,420 970 | with a Googler and if you 971 | want to learn machine learning 972 | 973 | 210 974 | 00:06:05,420 --> 00:06:07,694 975 | for yourself, if you have 976 | any questions for Joshua, 977 | 978 | 211 979 | 00:06:07,694 --> 00:06:09,110 980 | or if you've any 981 | questions for me, 982 | 983 | 212 984 | 00:06:09,110 --> 00:06:10,819 985 | please leave them in 986 | the comments below. 987 | 988 | 213 989 | 00:06:10,819 --> 00:06:12,610 990 | And tune into the Google 991 | Developers channel 992 | 993 | 214 994 | 00:06:12,610 --> 00:06:14,380 995 | for more great videos, 996 | including episodes 997 | 998 | 215 999 | 00:06:14,380 --> 00:06:15,529 1000 | of Coffee with a Googler. 1001 | 1002 | 216 1003 | 00:06:15,529 --> 00:06:16,605 1004 | Thank you. 1005 | 1006 | 217 1007 | 00:06:16,605 --> 00:06:17,521 1008 | [MUSIC PLAYING] 1009 | 1010 | 218 1011 | 00:06:17,521 --> 00:06:19,730 1012 | JOSHUA GORDON: You really 1013 | can learn machine learning, 1014 | 1015 | 219 1016 | 00:06:19,730 --> 00:06:21,949 1017 | and it's faster and 1018 | easier than you think. 
1019 | 1020 | 220 1021 | 00:06:21,949 --> 00:06:25,540 1022 | We've gone through a ton of 1023 | classes, textbooks, and blog 1024 | 1025 | 221 1026 | 00:06:25,540 --> 00:06:29,120 1027 | posts to bring you the clearest 1028 | and most concise explanations 1029 | 1030 | 222 1031 | 00:06:29,120 --> 00:06:30,459 1032 | of the hard concepts. 1033 | 1034 | 223 1035 | 00:06:30,459 --> 00:06:32,024 1036 | We really think you're going 1037 | to be able to learn it and have 1038 | 1039 | 224 1040 | 00:06:32,024 --> 00:06:33,290 1041 | some fun on the way. 1042 | 1043 | 225 1044 | 00:06:33,290 --> 00:06:35,410 1045 | Click here to get started. 1046 | 1047 | 226 1048 | 00:06:35,410 --> 00:06:36,000 1049 | Subtitles End: mo.dbxdb.com 1050 | 1051 | -------------------------------------------------------------------------------- /subtitle/Eng/Hello World - Machine Learning Recipes #1.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:00,000 3 | Youtube subtitles download by mo.dbxdb.com 4 | 5 | 2 6 | 00:00:00,000 --> 00:00:02,886 7 | [MUSIC PLAYING] 8 | 9 | 3 10 | 00:00:02,886 --> 00:00:06,726 11 | 12 | 13 | 4 14 | 00:00:06,726 --> 00:00:08,100 15 | Six lines of code 16 | is all it takes 17 | 18 | 5 19 | 00:00:08,100 --> 00:00:10,130 20 | to write your first 21 | Machine Learning program. 22 | 23 | 6 24 | 00:00:10,130 --> 00:00:11,671 25 | My name's Josh 26 | Gordon, and today I'll 27 | 28 | 7 29 | 00:00:11,671 --> 00:00:14,374 30 | walk you through writing Hello 31 | World for Machine learning. 32 | 33 | 8 34 | 00:00:14,374 --> 00:00:16,039 35 | In the first few 36 | episodes of the series, 37 | 38 | 9 39 | 00:00:16,039 --> 00:00:17,998 40 | we'll teach you how to 41 | get started with Machine 42 | 43 | 10 44 | 00:00:17,998 --> 00:00:19,079 45 | Learning from scratch. 46 | 47 | 11 48 | 00:00:19,079 --> 00:00:21,560 49 | To do that, we'll work with 50 | two open source libraries, 51 | 52 | 12 53 | 00:00:21,560 --> 00:00:23,706 54 | scikit-learn and TensorFlow. 55 | 56 | 13 57 | 00:00:23,706 --> 00:00:25,330 58 | We'll see scikit in 59 | action in a minute. 60 | 61 | 14 62 | 00:00:25,330 --> 00:00:27,830 63 | But first, let's talk quickly 64 | about what Machine Learning is 65 | 66 | 15 67 | 00:00:27,830 --> 00:00:29,240 68 | and why it's important. 69 | 70 | 16 71 | 00:00:29,240 --> 00:00:31,198 72 | You can think of Machine 73 | Learning as a subfield 74 | 75 | 17 76 | 00:00:31,198 --> 00:00:32,409 77 | of artificial intelligence. 78 | 79 | 18 80 | 00:00:32,409 --> 00:00:35,610 81 | Early AI programs typically 82 | excelled at just one thing. 83 | 84 | 19 85 | 00:00:35,610 --> 00:00:37,240 86 | For example, Deep 87 | Blue could play chess 88 | 89 | 20 90 | 00:00:37,240 --> 00:00:40,150 91 | at a championship level, 92 | but that's all it could do. 93 | 94 | 21 95 | 00:00:40,150 --> 00:00:41,780 96 | Today we want to 97 | write one program that 98 | 99 | 22 100 | 00:00:41,780 --> 00:00:45,340 101 | can solve many problems without 102 | needing to be rewritten. 103 | 104 | 23 105 | 00:00:45,340 --> 00:00:47,460 106 | AlphaGo is a great 107 | example of that. 108 | 109 | 24 110 | 00:00:47,460 --> 00:00:50,150 111 | As we speak, it's competing 112 | in the World Go Championship. 113 | 114 | 25 115 | 00:00:50,150 --> 00:00:53,740 116 | But similar software can also 117 | learn to play Atari games. 118 | 119 | 26 120 | 00:00:53,740 --> 00:00:55,956 121 | Machine Learning is what 122 | makes that possible. 
123 | 124 | 27 125 | 00:00:55,956 --> 00:00:57,330 126 | It's the study of 127 | algorithms that 128 | 129 | 28 130 | 00:00:57,330 --> 00:00:59,039 131 | learn from examples 132 | and experience 133 | 134 | 29 135 | 00:00:59,039 --> 00:01:00,909 136 | instead of relying 137 | on hard-coded rules. 138 | 139 | 30 140 | 00:01:00,909 --> 00:01:02,200 141 | So that's the state-of-the-art. 142 | 143 | 31 144 | 00:01:02,200 --> 00:01:03,750 145 | But here's a much 146 | simpler example 147 | 148 | 32 149 | 00:01:03,750 --> 00:01:05,632 150 | we'll start coding up today. 151 | 152 | 33 153 | 00:01:05,632 --> 00:01:07,590 154 | I'll give you a problem 155 | that sounds easy but is 156 | 157 | 34 158 | 00:01:07,590 --> 00:01:09,662 159 | impossible to solve 160 | without Machine Learning. 161 | 162 | 35 163 | 00:01:09,662 --> 00:01:11,370 164 | Can you write code to 165 | tell the difference 166 | 167 | 36 168 | 00:01:11,370 --> 00:01:12,774 169 | between an apple and an orange? 170 | 171 | 37 172 | 00:01:12,774 --> 00:01:15,190 173 | Imagine I asked you to write 174 | a program that takes an image 175 | 176 | 38 177 | 00:01:15,190 --> 00:01:17,069 178 | file as input, 179 | does some analysis, 180 | 181 | 39 182 | 00:01:17,069 --> 00:01:18,650 183 | and outputs the types of fruit. 184 | 185 | 40 186 | 00:01:18,650 --> 00:01:20,040 187 | How can you solve this? 188 | 189 | 41 190 | 00:01:20,040 --> 00:01:22,526 191 | You'd have to start by 192 | writing lots of manual rules. 193 | 194 | 42 195 | 00:01:22,526 --> 00:01:23,900 196 | For example, you 197 | could write code 198 | 199 | 43 200 | 00:01:23,900 --> 00:01:26,316 201 | to count how many orange pixels 202 | there are and compare that 203 | 204 | 44 205 | 00:01:26,316 --> 00:01:27,569 206 | to the number of green ones. 207 | 208 | 45 209 | 00:01:27,569 --> 00:01:30,920 210 | The ratio should give you a 211 | hint about the type of fruit. 212 | 213 | 46 214 | 00:01:30,920 --> 00:01:33,043 215 | That works fine for 216 | simple images like these. 217 | 218 | 47 219 | 00:01:33,043 --> 00:01:34,709 220 | But as you dive deeper 221 | into the problem, 222 | 223 | 48 224 | 00:01:34,709 --> 00:01:37,099 225 | you'll find the real world 226 | is messy, and the rules you 227 | 228 | 49 229 | 00:01:37,099 --> 00:01:38,650 230 | write start to break. 231 | 232 | 50 233 | 00:01:38,650 --> 00:01:41,180 234 | How would you write code to 235 | handle black-and-white photos 236 | 237 | 51 238 | 00:01:41,180 --> 00:01:44,480 239 | or images with no apples 240 | or oranges in them at all? 241 | 242 | 52 243 | 00:01:44,480 --> 00:01:46,360 244 | In fact, for just about 245 | any rule you write, 246 | 247 | 53 248 | 00:01:46,360 --> 00:01:48,790 249 | I can find an image 250 | where it won't work. 251 | 252 | 54 253 | 00:01:48,790 --> 00:01:50,310 254 | You'd need to write 255 | tons of rules, 256 | 257 | 55 258 | 00:01:50,310 --> 00:01:52,518 259 | and that's just to tell the 260 | difference between apples 261 | 262 | 56 263 | 00:01:52,518 --> 00:01:53,690 264 | and oranges. 265 | 266 | 57 267 | 00:01:53,690 --> 00:01:57,390 268 | If I gave you a new problem, you 269 | need to start all over again. 270 | 271 | 58 272 | 00:01:57,390 --> 00:01:59,079 273 | Clearly, we need 274 | something better. 
275 | 276 | 59 277 | 00:01:59,079 --> 00:02:00,760 278 | To solve this, we 279 | need an algorithm 280 | 281 | 60 282 | 00:02:00,760 --> 00:02:02,480 283 | that can figure out 284 | the rules for us, 285 | 286 | 61 287 | 00:02:02,480 --> 00:02:04,599 288 | so we don't have to 289 | write them by hand. 290 | 291 | 62 292 | 00:02:04,599 --> 00:02:07,690 293 | And for that, we're going 294 | to train a classifier. 295 | 296 | 63 297 | 00:02:07,690 --> 00:02:10,360 298 | For now you can think of a 299 | classifier as a function. 300 | 301 | 64 302 | 00:02:10,360 --> 00:02:13,160 303 | It takes some data as input 304 | and assigns a label to it 305 | 306 | 65 307 | 00:02:13,160 --> 00:02:14,282 308 | as output. 309 | 310 | 66 311 | 00:02:14,282 --> 00:02:15,740 312 | For example, I 313 | could have a picture 314 | 315 | 67 316 | 00:02:15,740 --> 00:02:18,235 317 | and want to classify it 318 | as an apple or an orange. 319 | 320 | 68 321 | 00:02:18,235 --> 00:02:20,110 322 | Or I have an email, and 323 | I want to classify it 324 | 325 | 69 326 | 00:02:20,110 --> 00:02:22,039 327 | as spam or not spam. 328 | 329 | 70 330 | 00:02:22,039 --> 00:02:23,690 331 | The technique to 332 | write the classifier 333 | 334 | 71 335 | 00:02:23,690 --> 00:02:26,220 336 | automatically is called 337 | supervised learning. 338 | 339 | 72 340 | 00:02:26,220 --> 00:02:29,319 341 | It begins with examples of 342 | the problem you want to solve. 343 | 344 | 73 345 | 00:02:29,319 --> 00:02:31,620 346 | To code this up, we'll 347 | work with scikit-learn. 348 | 349 | 74 350 | 00:02:31,620 --> 00:02:34,094 351 | Here, I'll download and 352 | install the library. 353 | 354 | 75 355 | 00:02:34,094 --> 00:02:35,970 356 | There are a couple 357 | different ways to do that. 358 | 359 | 76 360 | 00:02:35,970 --> 00:02:38,241 361 | But for me, the easiest 362 | has been to use Anaconda. 363 | 364 | 77 365 | 00:02:38,241 --> 00:02:40,449 366 | This makes it easy to get 367 | all the dependencies set up 368 | 369 | 78 370 | 00:02:40,449 --> 00:02:42,440 371 | and works well cross-platform. 372 | 373 | 79 374 | 00:02:42,440 --> 00:02:44,190 375 | With the magic of 376 | video, I'll fast forward 377 | 378 | 80 379 | 00:02:44,190 --> 00:02:45,776 380 | through downloading 381 | and installing it. 382 | 383 | 81 384 | 00:02:45,776 --> 00:02:47,150 385 | Once it's installed, 386 | you can test 387 | 388 | 82 389 | 00:02:47,150 --> 00:02:48,608 390 | that everything is 391 | working properly 392 | 393 | 83 394 | 00:02:48,608 --> 00:02:51,364 395 | by starting a Python script 396 | and importing SK learn. 397 | 398 | 84 399 | 00:02:51,364 --> 00:02:53,780 400 | Assuming that worked, that's 401 | line one of our program down, 402 | 403 | 85 404 | 00:02:53,780 --> 00:02:56,145 405 | five to go. 406 | 407 | 86 408 | 00:02:56,145 --> 00:02:57,520 409 | To use supervised 410 | learning, we'll 411 | 412 | 87 413 | 00:02:57,520 --> 00:03:00,280 414 | follow a recipe with 415 | a few standard steps. 416 | 417 | 88 418 | 00:03:00,280 --> 00:03:02,340 419 | Step one is to 420 | collect training data. 421 | 422 | 89 423 | 00:03:02,340 --> 00:03:04,789 424 | These are examples of the 425 | problem we want to solve. 426 | 427 | 90 428 | 00:03:04,789 --> 00:03:06,789 429 | For our problem, we're 430 | going to write a function 431 | 432 | 91 433 | 00:03:06,789 --> 00:03:08,002 434 | to classify a piece of fruit. 
435 | 436 | 92 437 | 00:03:08,002 --> 00:03:10,210 438 | For starters, it will take 439 | a description of the fruit 440 | 441 | 93 442 | 00:03:10,210 --> 00:03:11,680 443 | as input and 444 | predict whether it's 445 | 446 | 94 447 | 00:03:11,680 --> 00:03:14,349 448 | an apple or an orange as 449 | output, based on features 450 | 451 | 95 452 | 00:03:14,349 --> 00:03:16,310 453 | like its weight and texture. 454 | 455 | 96 456 | 00:03:16,310 --> 00:03:18,160 457 | To collect our 458 | training data, imagine 459 | 460 | 97 461 | 00:03:18,160 --> 00:03:19,310 462 | we head out to an orchard. 463 | 464 | 98 465 | 00:03:19,310 --> 00:03:21,060 466 | We'll look at different 467 | apples and oranges 468 | 469 | 99 470 | 00:03:21,060 --> 00:03:23,627 471 | and write down measurements 472 | that describe them in a table. 473 | 474 | 100 475 | 00:03:23,627 --> 00:03:25,210 476 | In Machine Learning 477 | these measurements 478 | 479 | 101 480 | 00:03:25,210 --> 00:03:26,650 481 | are called features. 482 | 483 | 102 484 | 00:03:26,650 --> 00:03:28,970 485 | To keep things simple, 486 | here we've used just two-- 487 | 488 | 103 489 | 00:03:28,970 --> 00:03:31,650 490 | how much each fruit weighs in 491 | grams and its texture, which 492 | 493 | 104 494 | 00:03:31,650 --> 00:03:33,830 495 | can be bumpy or smooth. 496 | 497 | 105 498 | 00:03:33,830 --> 00:03:35,860 499 | A good feature makes 500 | it easy to discriminate 501 | 502 | 106 503 | 00:03:35,860 --> 00:03:37,960 504 | between different 505 | types of fruit. 506 | 507 | 107 508 | 00:03:37,960 --> 00:03:40,210 509 | Each row in our training 510 | data is an example. 511 | 512 | 108 513 | 00:03:40,210 --> 00:03:42,259 514 | It describes one piece of fruit. 515 | 516 | 109 517 | 00:03:42,259 --> 00:03:44,240 518 | The last column is 519 | called the label. 520 | 521 | 110 522 | 00:03:44,240 --> 00:03:46,257 523 | It identifies what type 524 | of fruit is in each row, 525 | 526 | 111 527 | 00:03:46,257 --> 00:03:47,840 528 | and there are just 529 | two possibilities-- 530 | 531 | 112 532 | 00:03:47,840 --> 00:03:49,430 533 | apples and oranges. 534 | 535 | 113 536 | 00:03:49,430 --> 00:03:51,560 537 | The whole table is 538 | our training data. 539 | 540 | 114 541 | 00:03:51,560 --> 00:03:53,069 542 | Think of these as 543 | all the examples 544 | 545 | 115 546 | 00:03:53,069 --> 00:03:55,120 547 | we want the classifier 548 | to learn from. 549 | 550 | 116 551 | 00:03:55,120 --> 00:03:57,660 552 | The more training data you 553 | have, the better a classifier 554 | 555 | 117 556 | 00:03:57,660 --> 00:03:59,310 557 | you can create. 558 | 559 | 118 560 | 00:03:59,310 --> 00:04:01,620 561 | Now let's write down our 562 | training data in code. 563 | 564 | 119 565 | 00:04:01,620 --> 00:04:04,150 566 | We'll use two variables-- 567 | features and labels. 568 | 569 | 120 570 | 00:04:04,150 --> 00:04:06,060 571 | Features contains the 572 | first two columns, 573 | 574 | 121 575 | 00:04:06,060 --> 00:04:07,887 576 | and labels contains the last. 577 | 578 | 122 579 | 00:04:07,887 --> 00:04:09,470 580 | You can think of 581 | features as the input 582 | 583 | 123 584 | 00:04:09,470 --> 00:04:13,401 585 | to the classifier and labels 586 | as the output we want. 
587 | 588 | 124 589 | 00:04:13,401 --> 00:04:15,650 590 | I'm going to change the 591 | variable types of all features 592 | 593 | 125 594 | 00:04:15,650 --> 00:04:18,980 595 | to ints instead of strings, 596 | so I'll use 0 for bumpy and 1 597 | 598 | 126 599 | 00:04:18,980 --> 00:04:19,937 600 | for smooth. 601 | 602 | 127 603 | 00:04:19,937 --> 00:04:22,269 604 | I'll do the same for our 605 | labels, so I'll use 0 for apple 606 | 607 | 128 608 | 00:04:22,269 --> 00:04:23,740 609 | and 1 for orange. 610 | 611 | 129 612 | 00:04:23,740 --> 00:04:26,300 613 | These are lines two and 614 | three in our program. 615 | 616 | 130 617 | 00:04:26,300 --> 00:04:29,160 618 | Step two in our recipes to 619 | use these examples to train 620 | 621 | 131 622 | 00:04:29,160 --> 00:04:30,440 623 | a classifier. 624 | 625 | 132 626 | 00:04:30,440 --> 00:04:32,350 627 | The type of classifier 628 | we'll start with 629 | 630 | 133 631 | 00:04:32,350 --> 00:04:34,029 632 | is called a decision tree. 633 | 634 | 134 635 | 00:04:34,029 --> 00:04:35,449 636 | We'll dive into 637 | the details of how 638 | 639 | 135 640 | 00:04:35,449 --> 00:04:37,110 641 | these work in a future episode. 642 | 643 | 136 644 | 00:04:37,110 --> 00:04:41,269 645 | But for now, it's OK to think of 646 | a classifier as a box of rules. 647 | 648 | 137 649 | 00:04:41,269 --> 00:04:43,880 650 | That's because there are many 651 | different types of classifier, 652 | 653 | 138 654 | 00:04:43,880 --> 00:04:47,740 655 | but the input and output 656 | type is always the same. 657 | 658 | 139 659 | 00:04:47,740 --> 00:04:49,170 660 | I'm going to import the tree. 661 | 662 | 140 663 | 00:04:49,170 --> 00:04:52,000 664 | Then on line four of our script, 665 | we'll create the classifier. 666 | 667 | 141 668 | 00:04:52,000 --> 00:04:54,459 669 | At this point, it's just 670 | an empty box of rules. 671 | 672 | 142 673 | 00:04:54,459 --> 00:04:56,829 674 | It doesn't know anything 675 | about apples and oranges yet. 676 | 677 | 143 678 | 00:04:56,829 --> 00:04:58,870 679 | To train it, we'll need 680 | a learning algorithm. 681 | 682 | 144 683 | 00:04:58,870 --> 00:05:00,307 684 | If a classifier 685 | is a box of rules, 686 | 687 | 145 688 | 00:05:00,307 --> 00:05:02,139 689 | then you can think of 690 | the learning algorithm 691 | 692 | 146 693 | 00:05:02,139 --> 00:05:04,170 694 | as the procedure 695 | that creates them. 696 | 697 | 147 698 | 00:05:04,170 --> 00:05:06,937 699 | It does that by finding 700 | patterns in your training data. 701 | 702 | 148 703 | 00:05:06,937 --> 00:05:09,269 704 | For example, it might notice 705 | oranges tend to weigh more, 706 | 707 | 149 708 | 00:05:09,269 --> 00:05:11,920 709 | so it'll create a rule saying 710 | that the heavier fruit is, 711 | 712 | 150 713 | 00:05:11,920 --> 00:05:14,269 714 | the more likely it 715 | is to be an orange. 716 | 717 | 151 718 | 00:05:14,269 --> 00:05:16,130 719 | In scikit, the 720 | training algorithm 721 | 722 | 152 723 | 00:05:16,130 --> 00:05:19,315 724 | is included in the classifier 725 | object, and it's called Fit. 726 | 727 | 153 728 | 00:05:19,315 --> 00:05:21,899 729 | You can think of Fit as being 730 | a synonym for "find patterns 731 | 732 | 154 733 | 00:05:21,899 --> 00:05:23,136 734 | in data." 735 | 736 | 155 737 | 00:05:23,136 --> 00:05:24,509 738 | We'll get into 739 | the details of how 740 | 741 | 156 742 | 00:05:24,509 --> 00:05:27,040 743 | this happens under the 744 | hood in a future episode. 
745 | 746 | 157 747 | 00:05:27,040 --> 00:05:29,100 748 | At this point, we have 749 | a trained classifier. 750 | 751 | 158 752 | 00:05:29,100 --> 00:05:32,860 753 | So let's take it for a spin and 754 | use it to classify a new fruit. 755 | 756 | 159 757 | 00:05:32,860 --> 00:05:36,036 758 | The input to the classifier is 759 | the features for a new example. 760 | 761 | 160 762 | 00:05:36,036 --> 00:05:37,660 763 | Let's say the fruit 764 | we want to classify 765 | 766 | 161 767 | 00:05:37,660 --> 00:05:39,750 768 | is 150 grams and bumpy. 769 | 770 | 162 771 | 00:05:39,750 --> 00:05:43,870 772 | The output will be 0 if it's an 773 | apple or 1 if it's an orange. 774 | 775 | 163 776 | 00:05:43,870 --> 00:05:46,310 777 | Before we hit Enter and see 778 | what the classifier predicts, 779 | 780 | 164 781 | 00:05:46,310 --> 00:05:47,690 782 | let's think for a sec. 783 | 784 | 165 785 | 00:05:47,690 --> 00:05:51,160 786 | If you had to guess, what would 787 | you say the output should be? 788 | 789 | 166 790 | 00:05:51,160 --> 00:05:53,980 791 | To figure that out, compare 792 | this fruit to our training data. 793 | 794 | 167 795 | 00:05:53,980 --> 00:05:55,630 796 | It looks like it's 797 | similar to an orange 798 | 799 | 168 800 | 00:05:55,630 --> 00:05:57,076 801 | because it's heavy and bumpy. 802 | 803 | 169 804 | 00:05:57,076 --> 00:05:59,160 805 | That's what I'd guess 806 | anyway, and if we hit Enter, 807 | 808 | 170 809 | 00:05:59,160 --> 00:06:01,834 810 | it's what our classifier 811 | predicts as well. 812 | 813 | 171 814 | 00:06:01,834 --> 00:06:03,250 815 | If everything 816 | worked for you, then 817 | 818 | 172 819 | 00:06:03,250 --> 00:06:06,050 820 | that's it for your first 821 | Machine Learning program. 822 | 823 | 173 824 | 00:06:06,050 --> 00:06:08,680 825 | You can create a new 826 | classifier for a new problem 827 | 828 | 174 829 | 00:06:08,680 --> 00:06:10,769 830 | just by changing 831 | the training data. 832 | 833 | 175 834 | 00:06:10,769 --> 00:06:13,009 835 | That makes this approach 836 | far more reusable 837 | 838 | 176 839 | 00:06:13,009 --> 00:06:15,101 840 | than writing new rules 841 | for each problem. 842 | 843 | 177 844 | 00:06:15,101 --> 00:06:17,350 845 | Now, you might be wondering 846 | why we described our fruit 847 | 848 | 178 849 | 00:06:17,350 --> 00:06:19,790 850 | using a table of features 851 | instead of using pictures 852 | 853 | 179 854 | 00:06:19,790 --> 00:06:21,759 855 | of the fruit as training data. 856 | 857 | 180 858 | 00:06:21,759 --> 00:06:23,360 859 | Well, you can use 860 | pictures, and we'll 861 | 862 | 181 863 | 00:06:23,360 --> 00:06:25,120 864 | get to that in a future episode. 865 | 866 | 182 867 | 00:06:25,120 --> 00:06:27,279 868 | But, as you'll see later 869 | on, the way we did it here 870 | 871 | 183 872 | 00:06:27,279 --> 00:06:29,002 873 | is more general. 874 | 875 | 184 876 | 00:06:29,002 --> 00:06:30,959 877 | The neat thing is that 878 | programming with Machine 879 | 880 | 185 881 | 00:06:30,959 --> 00:06:32,028 882 | Learning isn't hard. 883 | 884 | 186 885 | 00:06:32,028 --> 00:06:33,819 886 | But to get it right, 887 | you need to understand 888 | 889 | 187 890 | 00:06:33,819 --> 00:06:35,406 891 | a few important concepts. 892 | 893 | 188 894 | 00:06:35,406 --> 00:06:37,990 895 | I'll start walking you through 896 | those in the next few episodes. 897 | 898 | 189 899 | 00:06:37,990 --> 00:06:40,197 900 | Thanks very much for watching, 901 | and I'll see you then. 
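A runnable sketch of the six-line program walked through above. The feature encoding follows the episode (weight in grams; texture 0 = bumpy, 1 = smooth; labels 0 = apple, 1 = orange), but the training measurements themselves are made-up placeholder values.

```python
from sklearn import tree

# Training data: [weight in grams, texture], with texture 0 = bumpy, 1 = smooth.
features = [[135, 1], [125, 1], [155, 0], [165, 0]]
labels = [0, 0, 1, 1]                        # 0 = apple, 1 = orange

clf = tree.DecisionTreeClassifier()          # an empty "box of rules"
clf = clf.fit(features, labels)              # Fit = find patterns in the data
print(clf.predict([[150, 0]]))               # 150 g and bumpy -> [1], an orange
```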
902 | 903 | 190 904 | 00:06:40,197 --> 00:06:43,850 905 | [MUSIC PLAYING] 906 | 907 | 191 908 | 00:06:43,850 --> 00:06:52,000 909 | Subtitles End: mo.dbxdb.com 910 | 911 | -------------------------------------------------------------------------------- /subtitle/Eng/Let°Øs Write a Pipeline - Machine Learning Recipes #4.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:00,000 3 | Youtube subtitles download by mo.dbxdb.com 4 | 5 | 2 6 | 00:00:00,000 --> 00:00:02,844 7 | [MUSIC PLAYING] 8 | 9 | 3 10 | 00:00:02,844 --> 00:00:06,640 11 | 12 | 13 | 4 14 | 00:00:06,640 --> 00:00:07,447 15 | Welcome back. 16 | 17 | 5 18 | 00:00:07,447 --> 00:00:09,029 19 | We've covered a lot 20 | of ground already, 21 | 22 | 6 23 | 00:00:09,029 --> 00:00:12,070 24 | so today I want to review 25 | and reinforce concepts. 26 | 27 | 7 28 | 00:00:12,070 --> 00:00:14,250 29 | To do that, we'll 30 | explore two things. 31 | 32 | 8 33 | 00:00:14,250 --> 00:00:16,090 34 | First, we'll code 35 | up a basic pipeline 36 | 37 | 9 38 | 00:00:16,090 --> 00:00:17,640 39 | for supervised learning. 40 | 41 | 10 42 | 00:00:17,640 --> 00:00:19,390 43 | I'll show you how 44 | multiple classifiers 45 | 46 | 11 47 | 00:00:19,390 --> 00:00:21,280 48 | can solve the same problem. 49 | 50 | 12 51 | 00:00:21,280 --> 00:00:23,200 52 | Next, we'll build up a 53 | little more intuition 54 | 55 | 13 56 | 00:00:23,200 --> 00:00:25,710 57 | for what it means for an 58 | algorithm to learn something 59 | 60 | 14 61 | 00:00:25,710 --> 00:00:29,502 62 | from data, because that sounds 63 | kind of magical, but it's not. 64 | 65 | 15 66 | 00:00:29,502 --> 00:00:31,710 67 | To kick things off, let's 68 | look at a common experiment 69 | 70 | 16 71 | 00:00:31,710 --> 00:00:33,009 72 | you might want to do. 73 | 74 | 17 75 | 00:00:33,009 --> 00:00:35,210 76 | Imagine you're building 77 | a spam classifier. 78 | 79 | 18 80 | 00:00:35,210 --> 00:00:37,510 81 | That's just a function that 82 | labels an incoming email 83 | 84 | 19 85 | 00:00:37,510 --> 00:00:39,307 86 | as spam or not spam. 87 | 88 | 20 89 | 00:00:39,307 --> 00:00:41,140 90 | Now, say you've already 91 | collected a data set 92 | 93 | 21 94 | 00:00:41,140 --> 00:00:42,850 95 | and you're ready 96 | to train a model. 97 | 98 | 22 99 | 00:00:42,850 --> 00:00:44,460 100 | But before you put 101 | it into production, 102 | 103 | 23 104 | 00:00:44,460 --> 00:00:46,760 105 | there's a question you 106 | need to answer first-- 107 | 108 | 24 109 | 00:00:46,760 --> 00:00:49,820 110 | how accurate will it be when you 111 | use it to classify emails that 112 | 113 | 25 114 | 00:00:49,820 --> 00:00:51,740 115 | weren't in your training data? 116 | 117 | 26 118 | 00:00:51,740 --> 00:00:54,850 119 | As best we can, we want to 120 | verify our models work well 121 | 122 | 27 123 | 00:00:54,850 --> 00:00:56,490 124 | before we deploy them. 125 | 126 | 28 127 | 00:00:56,490 --> 00:00:59,290 128 | And we can do an experiment 129 | to help us figure that out. 130 | 131 | 29 132 | 00:00:59,290 --> 00:01:02,930 133 | One approach is to partition 134 | our data set into two parts. 135 | 136 | 30 137 | 00:01:02,930 --> 00:01:05,079 138 | We'll call these Train and Test. 139 | 140 | 31 141 | 00:01:05,079 --> 00:01:07,010 142 | We'll use Train 143 | to train our model 144 | 145 | 32 146 | 00:01:07,010 --> 00:01:10,380 147 | and Test to see how 148 | accurate it is on new data. 
149 | 150 | 33 151 | 00:01:10,380 --> 00:01:13,890 152 | That's a common pattern, so 153 | let's see how it looks in code. 154 | 155 | 34 156 | 00:01:13,890 --> 00:01:17,060 157 | To kick things off, let's import 158 | a data set into [? SyKit. ?] 159 | 160 | 35 161 | 00:01:17,060 --> 00:01:20,019 162 | We'll use Iris again, because 163 | it's handily included. 164 | 165 | 36 166 | 00:01:20,019 --> 00:01:21,959 167 | Now, we already saw 168 | Iris in episode two. 169 | 170 | 37 171 | 00:01:21,959 --> 00:01:23,560 172 | But what we haven't 173 | seen before is 174 | 175 | 38 176 | 00:01:23,560 --> 00:01:26,831 177 | that I'm calling the 178 | features x and the labels y. 179 | 180 | 39 181 | 00:01:26,831 --> 00:01:28,209 182 | Why is that? 183 | 184 | 40 185 | 00:01:28,209 --> 00:01:30,670 186 | Well, that's because one 187 | way to think of a classifier 188 | 189 | 41 190 | 00:01:30,670 --> 00:01:32,230 191 | is as a function. 192 | 193 | 42 194 | 00:01:32,230 --> 00:01:34,750 195 | At a high level, you can 196 | think of x as the input 197 | 198 | 43 199 | 00:01:34,750 --> 00:01:36,500 200 | and y as the output. 201 | 202 | 44 203 | 00:01:36,500 --> 00:01:39,892 204 | I'll talk more about that in 205 | the second half of this episode. 206 | 207 | 45 208 | 00:01:39,892 --> 00:01:42,349 209 | After we import the data set, 210 | the first thing we want to do 211 | 212 | 46 213 | 00:01:42,349 --> 00:01:44,590 214 | is partition it 215 | into Train and Test. 216 | 217 | 47 218 | 00:01:44,590 --> 00:01:46,640 219 | And to do that, we can 220 | import a handy utility, 221 | 222 | 48 223 | 00:01:46,640 --> 00:01:48,530 224 | and it makes the syntax clear. 225 | 226 | 49 227 | 00:01:48,530 --> 00:01:50,340 228 | We're taking our 229 | x's and our y's, 230 | 231 | 50 232 | 00:01:50,340 --> 00:01:52,930 233 | or our features and labels, 234 | and partitioning them 235 | 236 | 51 237 | 00:01:52,930 --> 00:01:54,450 238 | into two sets. 239 | 240 | 52 241 | 00:01:54,450 --> 00:01:56,690 242 | X_train and y_train are 243 | the features and labels 244 | 245 | 53 246 | 00:01:56,690 --> 00:01:57,980 247 | for the training set. 248 | 249 | 54 250 | 00:01:57,980 --> 00:02:00,630 251 | And X_test and y_test are 252 | the features and labels 253 | 254 | 55 255 | 00:02:00,630 --> 00:02:02,031 256 | for the testing set. 257 | 258 | 56 259 | 00:02:02,031 --> 00:02:04,239 260 | Here, I'm just saying that 261 | I want half the data to be 262 | 263 | 57 264 | 00:02:04,239 --> 00:02:05,580 265 | used for testing. 266 | 267 | 58 268 | 00:02:05,580 --> 00:02:09,229 269 | So if we have 150 examples 270 | in Iris, 75 will be in Train 271 | 272 | 59 273 | 00:02:09,229 --> 00:02:11,520 274 | and 75 will be in Test. 275 | 276 | 60 277 | 00:02:11,520 --> 00:02:13,280 278 | Now we'll create our classifier. 279 | 280 | 61 281 | 00:02:13,280 --> 00:02:14,979 282 | I'll use two 283 | different types here 284 | 285 | 62 286 | 00:02:14,979 --> 00:02:17,860 287 | to show you how they 288 | accomplish the same task. 289 | 290 | 63 291 | 00:02:17,860 --> 00:02:20,500 292 | Let's start with the decision 293 | tree we've already seen. 294 | 295 | 64 296 | 00:02:20,500 --> 00:02:22,240 297 | Note there's only 298 | two lines of code 299 | 300 | 65 301 | 00:02:22,240 --> 00:02:23,448 302 | that are classifier-specific. 303 | 304 | 66 305 | 00:02:23,448 --> 00:02:25,650 306 | 307 | 308 | 67 309 | 00:02:25,650 --> 00:02:28,830 310 | Now let's train the classifier 311 | using our training data. 
312 | 313 | 68 314 | 00:02:28,830 --> 00:02:31,599 315 | At this point, it's ready 316 | to be used to classify data. 317 | 318 | 69 319 | 00:02:31,599 --> 00:02:33,330 320 | And next, we'll call 321 | the predict method 322 | 323 | 70 324 | 00:02:33,330 --> 00:02:35,805 325 | and use it to classify 326 | our testing data. 327 | 328 | 71 329 | 00:02:35,805 --> 00:02:37,180 330 | If you print out 331 | the predictions, 332 | 333 | 72 334 | 00:02:37,180 --> 00:02:38,970 335 | you'll see there are 336 | a list of numbers. 337 | 338 | 73 339 | 00:02:38,970 --> 00:02:40,660 340 | These correspond 341 | to the type of Iris 342 | 343 | 74 344 | 00:02:40,660 --> 00:02:44,009 345 | the classifier predicts for 346 | each row in the testing data. 347 | 348 | 75 349 | 00:02:44,009 --> 00:02:46,229 350 | Now let's see how 351 | accurate our classifier 352 | 353 | 76 354 | 00:02:46,229 --> 00:02:48,280 355 | was on the testing set. 356 | 357 | 77 358 | 00:02:48,280 --> 00:02:50,840 359 | Recall that up top, we have 360 | the true labels for the testing 361 | 362 | 78 363 | 00:02:50,840 --> 00:02:51,650 364 | data. 365 | 366 | 79 367 | 00:02:51,650 --> 00:02:53,460 368 | To calculate our 369 | accuracy, we can 370 | 371 | 80 372 | 00:02:53,460 --> 00:02:55,759 373 | compare the predicted 374 | labels to the true labels, 375 | 376 | 81 377 | 00:02:55,759 --> 00:02:57,348 378 | and tally up the score. 379 | 380 | 82 381 | 00:02:57,348 --> 00:02:59,139 382 | There's a convenience 383 | method in [? Sykit ?] 384 | 385 | 83 386 | 00:02:59,139 --> 00:03:00,830 387 | we can import to do that. 388 | 389 | 84 390 | 00:03:00,830 --> 00:03:03,505 391 | Notice here, our 392 | accuracy was over 90%. 393 | 394 | 85 395 | 00:03:03,505 --> 00:03:06,130 396 | If you try this on your own, it 397 | might be a little bit different 398 | 399 | 86 400 | 00:03:06,130 --> 00:03:08,270 401 | because of some randomness 402 | in how the Train/Test 403 | 404 | 87 405 | 00:03:08,270 --> 00:03:10,039 406 | data is partitioned. 407 | 408 | 88 409 | 00:03:10,039 --> 00:03:11,880 410 | Now, here's something 411 | interesting. 412 | 413 | 89 414 | 00:03:11,880 --> 00:03:14,690 415 | By replacing these two lines, we 416 | can use a different classifier 417 | 418 | 90 419 | 00:03:14,690 --> 00:03:16,919 420 | to accomplish the same task. 421 | 422 | 91 423 | 00:03:16,919 --> 00:03:18,569 424 | Instead of using 425 | a decision tree, 426 | 427 | 92 428 | 00:03:18,569 --> 00:03:20,930 429 | we'll use one called 430 | [? KNearestNeighbors. ?] 431 | 432 | 93 433 | 00:03:20,930 --> 00:03:23,340 434 | If we run our experiment, 435 | we'll see that the code 436 | 437 | 94 438 | 00:03:23,340 --> 00:03:25,354 439 | works in exactly the same way. 440 | 441 | 95 442 | 00:03:25,354 --> 00:03:27,270 443 | The accuracy may be 444 | different when you run it, 445 | 446 | 96 447 | 00:03:27,270 --> 00:03:29,800 448 | because this classifier works 449 | a little bit differently 450 | 451 | 97 452 | 00:03:29,800 --> 00:03:32,440 453 | and because of the randomness 454 | in the Train/Test split. 455 | 456 | 98 457 | 00:03:32,440 --> 00:03:35,419 458 | Likewise, if we wanted to use a 459 | more sophisticated classifier, 460 | 461 | 99 462 | 00:03:35,419 --> 00:03:38,220 463 | we could just import it 464 | and change these two lines. 465 | 466 | 100 467 | 00:03:38,220 --> 00:03:40,297 468 | Otherwise, our code is the same. 
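A sketch of the train/test pipeline described above. One detail the transcript leaves out: in current scikit-learn, train_test_split is imported from sklearn.model_selection (very old releases used sklearn.cross_validation instead).

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree

iris = load_iris()
X, y = iris.data, iris.target                # features x, labels y

# Partition the data: half for training, half for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# Only these two lines are classifier-specific; to use k-nearest neighbors
# instead, replace them with:
#   from sklearn.neighbors import KNeighborsClassifier
#   clf = KNeighborsClassifier()
clf = tree.DecisionTreeClassifier()

clf.fit(X_train, y_train)                    # train on the training set
predictions = clf.predict(X_test)            # classify the held-out test data
print(accuracy_score(y_test, predictions))   # typically above 0.9; varies with the random split
```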
469 | 470 | 101 471 | 00:03:40,297 --> 00:03:42,880 472 | The takeaway here is that while 473 | there are many different types 474 | 475 | 102 476 | 00:03:42,880 --> 00:03:45,919 477 | of classifiers, at a high level, 478 | they have a similar interface. 479 | 480 | 103 481 | 00:03:45,919 --> 00:03:49,058 482 | 483 | 484 | 104 485 | 00:03:49,058 --> 00:03:50,849 486 | Now let's talk a little 487 | bit more about what 488 | 489 | 105 490 | 00:03:50,849 --> 00:03:53,120 491 | it means to learn from data. 492 | 493 | 106 494 | 00:03:53,120 --> 00:03:56,080 495 | Earlier, I said we called the 496 | features x and the labels y, 497 | 498 | 107 499 | 00:03:56,080 --> 00:03:58,717 500 | because they were the input 501 | and output of a function. 502 | 503 | 108 504 | 00:03:58,717 --> 00:04:00,800 505 | Now, of course, a function 506 | is something we already 507 | 508 | 109 509 | 00:04:00,800 --> 00:04:02,190 510 | know from programming. 511 | 512 | 110 513 | 00:04:02,190 --> 00:04:04,900 514 | def classify-- 515 | there's our function. 516 | 517 | 111 518 | 00:04:04,900 --> 00:04:06,919 519 | As we already know in 520 | supervised learning, 521 | 522 | 112 523 | 00:04:06,919 --> 00:04:09,060 524 | we don't want to 525 | write this ourselves. 526 | 527 | 113 528 | 00:04:09,060 --> 00:04:12,360 529 | We want an algorithm to 530 | learn it from training data. 531 | 532 | 114 533 | 00:04:12,360 --> 00:04:15,240 534 | So what does it mean 535 | to learn a function? 536 | 537 | 115 538 | 00:04:15,240 --> 00:04:17,120 539 | Well, a function is just 540 | a mapping from input 541 | 542 | 116 543 | 00:04:17,120 --> 00:04:18,660 544 | to output values. 545 | 546 | 117 547 | 00:04:18,660 --> 00:04:20,660 548 | Here's a function you 549 | might have seen before-- y 550 | 551 | 118 552 | 00:04:20,660 --> 00:04:22,699 553 | equals mx plus b. 554 | 555 | 119 556 | 00:04:22,699 --> 00:04:24,819 557 | That's the equation 558 | for a line, and there 559 | 560 | 120 561 | 00:04:24,819 --> 00:04:27,339 562 | are two parameters-- m, 563 | which gives the slope; 564 | 565 | 121 566 | 00:04:27,339 --> 00:04:29,680 567 | and b, which gives 568 | the y-intercept. 569 | 570 | 122 571 | 00:04:29,680 --> 00:04:31,110 572 | Given these 573 | parameters, of course, 574 | 575 | 123 576 | 00:04:31,110 --> 00:04:34,319 577 | we can plot the function 578 | for different values of x. 579 | 580 | 124 581 | 00:04:34,319 --> 00:04:36,610 582 | Now, in supervised learning, 583 | our classified function 584 | 585 | 125 586 | 00:04:36,610 --> 00:04:38,420 587 | might have some 588 | parameters as well, 589 | 590 | 126 591 | 00:04:38,420 --> 00:04:41,290 592 | but the input x are the 593 | features for an example we 594 | 595 | 127 596 | 00:04:41,290 --> 00:04:43,630 597 | want to classify, 598 | and the output y 599 | 600 | 128 601 | 00:04:43,630 --> 00:04:47,220 602 | is a label, like Spam or Not 603 | Spam, or a type of flower. 604 | 605 | 129 606 | 00:04:47,220 --> 00:04:49,661 607 | So what could the body of 608 | the function look like? 609 | 610 | 130 611 | 00:04:49,661 --> 00:04:51,910 612 | Well, that's the part we 613 | want to write algorithmically 614 | 615 | 131 616 | 00:04:51,910 --> 00:04:53,737 617 | or in other words, learn. 
618 | 619 | 132 620 | 00:04:53,737 --> 00:04:55,319 621 | The important thing 622 | to understand here 623 | 624 | 133 625 | 00:04:55,319 --> 00:04:57,130 626 | is we're not 627 | starting from scratch 628 | 629 | 134 630 | 00:04:57,130 --> 00:05:00,060 631 | and pulling the body of the 632 | function out of thin air. 633 | 634 | 135 635 | 00:05:00,060 --> 00:05:01,990 636 | Instead, we start with a model. 637 | 638 | 136 639 | 00:05:01,990 --> 00:05:04,050 640 | And you can think of a 641 | model as the prototype for 642 | 643 | 137 644 | 00:05:04,050 --> 00:05:07,029 645 | or the rules that define 646 | the body of our function. 647 | 648 | 138 649 | 00:05:07,029 --> 00:05:08,540 650 | Typically, a model 651 | has parameters 652 | 653 | 139 654 | 00:05:08,540 --> 00:05:10,290 655 | that we can adjust 656 | with our training data. 657 | 658 | 140 659 | 00:05:10,290 --> 00:05:14,560 660 | And here's a high-level example 661 | of how this process works. 662 | 663 | 141 664 | 00:05:14,560 --> 00:05:17,380 665 | Let's look at a toy data set and 666 | think about what kind of model 667 | 668 | 142 669 | 00:05:17,380 --> 00:05:19,209 670 | we could use as a classifier. 671 | 672 | 143 673 | 00:05:19,209 --> 00:05:20,959 674 | Pretend we're interested 675 | in distinguishing 676 | 677 | 144 678 | 00:05:20,959 --> 00:05:23,350 679 | between red dots and 680 | green dots, some of which 681 | 682 | 145 683 | 00:05:23,350 --> 00:05:25,079 684 | I've drawn here on a graph. 685 | 686 | 146 687 | 00:05:25,079 --> 00:05:27,209 688 | To do that, we'll use 689 | just two features-- 690 | 691 | 147 692 | 00:05:27,209 --> 00:05:29,449 693 | the x- and 694 | y-coordinates of a dot. 695 | 696 | 148 697 | 00:05:29,449 --> 00:05:32,670 698 | Now let's think about how 699 | we could classify this data. 700 | 701 | 149 702 | 00:05:32,670 --> 00:05:34,089 703 | We want a function 704 | that considers 705 | 706 | 150 707 | 00:05:34,089 --> 00:05:35,800 708 | a new dot it's 709 | never seen before, 710 | 711 | 151 712 | 00:05:35,800 --> 00:05:38,170 713 | and classifies it 714 | as red or green. 715 | 716 | 152 717 | 00:05:38,170 --> 00:05:40,990 718 | In fact, there might be a lot 719 | of data we want to classify. 720 | 721 | 153 722 | 00:05:40,990 --> 00:05:42,839 723 | Here, I've drawn 724 | our testing examples 725 | 726 | 154 727 | 00:05:42,839 --> 00:05:44,959 728 | in light green and light red. 729 | 730 | 155 731 | 00:05:44,959 --> 00:05:47,209 732 | These are dots that weren't 733 | in our training data. 734 | 735 | 156 736 | 00:05:47,209 --> 00:05:49,790 737 | The classifier has never 738 | seen them before, so how can 739 | 740 | 157 741 | 00:05:49,790 --> 00:05:51,699 742 | it predict the right label? 743 | 744 | 158 745 | 00:05:51,699 --> 00:05:53,819 746 | Well, imagine if we 747 | could somehow draw a line 748 | 749 | 159 750 | 00:05:53,819 --> 00:05:56,036 751 | across the data like this. 752 | 753 | 160 754 | 00:05:56,036 --> 00:05:57,620 755 | Then we could say 756 | the dots to the left 757 | 758 | 161 759 | 00:05:57,620 --> 00:06:00,089 760 | of the line are green and dots 761 | to the right of the line are 762 | 763 | 162 764 | 00:06:00,089 --> 00:06:00,089 765 | red. 766 | 767 | 163 768 | 00:06:00,920 --> 00:06:03,430 769 | And this line can serve 770 | as our classifier. 771 | 772 | 164 773 | 00:06:03,430 --> 00:06:05,610 774 | So how can we learn this line? 
775 | 776 | 165 777 | 00:06:05,610 --> 00:06:08,240 778 | Well, one way is to use 779 | the training data to adjust 780 | 781 | 166 782 | 00:06:08,240 --> 00:06:09,880 783 | the parameters of a model. 784 | 785 | 167 786 | 00:06:09,880 --> 00:06:12,829 787 | And let's say the model we 788 | use is a simple straight line 789 | 790 | 168 791 | 00:06:12,829 --> 00:06:14,459 792 | like we saw before. 793 | 794 | 169 795 | 00:06:14,459 --> 00:06:17,829 796 | That means we have two 797 | parameters to adjust-- m and b. 798 | 799 | 170 800 | 00:06:17,829 --> 00:06:21,050 801 | And by changing them, we can 802 | change where the line appears. 803 | 804 | 171 805 | 00:06:21,050 --> 00:06:23,500 806 | So how could we learn 807 | the right parameters? 808 | 809 | 172 810 | 00:06:23,500 --> 00:06:25,690 811 | Well, one idea is that 812 | we can iteratively adjust 813 | 814 | 173 815 | 00:06:25,690 --> 00:06:27,639 816 | them using our training data. 817 | 818 | 174 819 | 00:06:27,639 --> 00:06:29,889 820 | For example, we might 821 | start with a random line 822 | 823 | 175 824 | 00:06:29,889 --> 00:06:32,810 825 | and use it to classify the 826 | first training example. 827 | 828 | 176 829 | 00:06:32,810 --> 00:06:35,370 830 | If it gets it right, we don't 831 | need to change our line, 832 | 833 | 177 834 | 00:06:35,370 --> 00:06:36,968 835 | so we move on to the next one. 836 | 837 | 178 838 | 00:06:36,968 --> 00:06:38,759 839 | But on the other hand, 840 | if it gets it wrong, 841 | 842 | 179 843 | 00:06:38,759 --> 00:06:41,300 844 | we could slightly adjust 845 | the parameters of our model 846 | 847 | 180 848 | 00:06:41,300 --> 00:06:43,069 849 | to make it more accurate. 850 | 851 | 181 852 | 00:06:43,069 --> 00:06:44,680 853 | The takeaway here is this. 854 | 855 | 182 856 | 00:06:44,680 --> 00:06:47,490 857 | One way to think of learning 858 | is using training data 859 | 860 | 183 861 | 00:06:47,490 --> 00:06:50,980 862 | to adjust the 863 | parameters of a model. 864 | 865 | 184 866 | 00:06:50,980 --> 00:06:52,949 867 | Now, here's something 868 | really special. 869 | 870 | 185 871 | 00:06:52,949 --> 00:06:55,269 872 | It's called 873 | tensorflow/playground. 874 | 875 | 186 876 | 00:06:55,269 --> 00:06:57,370 877 | This is a beautiful 878 | example of a neural network 879 | 880 | 187 881 | 00:06:57,370 --> 00:07:00,019 882 | you can run and experiment 883 | with right in your browser. 884 | 885 | 188 886 | 00:07:00,019 --> 00:07:02,060 887 | Now, this deserves its 888 | own episode for sure, 889 | 890 | 189 891 | 00:07:02,060 --> 00:07:03,730 892 | but for now, go ahead 893 | and play with it. 894 | 895 | 190 896 | 00:07:03,730 --> 00:07:04,930 897 | It's awesome. 898 | 899 | 191 900 | 00:07:04,930 --> 00:07:06,630 901 | The playground comes 902 | with different data 903 | 904 | 192 905 | 00:07:06,630 --> 00:07:08,300 906 | sets you can try out. 907 | 908 | 193 909 | 00:07:08,300 --> 00:07:09,470 910 | Some are very simple. 911 | 912 | 194 913 | 00:07:09,470 --> 00:07:12,620 914 | For example, we could use our 915 | line to classify this one. 916 | 917 | 195 918 | 00:07:12,620 --> 00:07:15,980 919 | Some data sets are 920 | much more complex. 921 | 922 | 196 923 | 00:07:15,980 --> 00:07:17,620 924 | This data set is 925 | especially hard. 926 | 927 | 197 928 | 00:07:17,620 --> 00:07:20,357 929 | And see if you can build 930 | a network to classify it. 
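A toy sketch of that iterative adjustment, with made-up data and a simple update rule (an illustration of the idea only, not the playground's actual training procedure):

import random

# Toy training data: (x, y, label), label +1 for green dots, -1 for red dots.
training_data = [(1.0, 3.5, +1), (2.0, 5.5, +1), (3.0, 2.0, -1), (4.0, 3.0, -1)]

m, b = random.uniform(-1, 1), random.uniform(-1, 1)  # start with a random line
step = 0.01                                          # how strongly to adjust

for _ in range(1000):
    for x, y, label in training_data:
        predicted = +1 if y > m * x + b else -1
        if predicted != label:
            # Wrong answer: nudge the parameters so the line moves toward
            # classifying this training example correctly.
            m -= step * label * x
            b -= step * label

print("learned line: y = %.2f x + %.2f" % (m, b))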
931 | 932 | 198 933 | 00:07:20,357 --> 00:07:21,940 934 | Now, you can think 935 | of a neural network 936 | 937 | 199 938 | 00:07:21,940 --> 00:07:24,170 939 | as a more sophisticated 940 | type of classifier, 941 | 942 | 200 943 | 00:07:24,170 --> 00:07:26,430 944 | like a decision tree 945 | or a simple line. 946 | 947 | 201 948 | 00:07:26,430 --> 00:07:29,190 949 | But in principle, 950 | the idea is similar. 951 | 952 | 202 953 | 00:07:29,190 --> 00:07:29,190 954 | OK. 955 | 956 | 203 957 | 00:07:29,690 --> 00:07:30,687 958 | Hope that was helpful. 959 | 960 | 204 961 | 00:07:30,687 --> 00:07:32,519 962 | I just created a Twitter 963 | that you can follow 964 | 965 | 205 966 | 00:07:32,519 --> 00:07:33,834 967 | to be notified of new episodes. 968 | 969 | 206 970 | 00:07:33,834 --> 00:07:36,000 971 | And the next one should be 972 | out in a couple of weeks, 973 | 974 | 207 975 | 00:07:36,000 --> 00:07:38,750 976 | depending on how much work I'm 977 | doing for Google I/O. Thanks, 978 | 979 | 208 980 | 00:07:38,750 --> 00:07:41,620 981 | as always, for watching, 982 | and I'll see you next time. 983 | 984 | 209 985 | 00:07:41,620 --> 00:07:53,000 986 | Subtitles End: mo.dbxdb.com 987 | 988 | -------------------------------------------------------------------------------- /subtitle/Eng/Machine Learning over Coffee with a Googler.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:00,000 3 | Youtube subtitles download by mo.dbxdb.com 4 | 5 | 2 6 | 00:00:00,000 --> 00:00:02,350 7 | [MUSIC PLAYING] 8 | 9 | 3 10 | 00:00:02,350 --> 00:00:06,297 11 | 12 | 13 | 4 14 | 00:00:06,297 --> 00:00:08,130 15 | LAURENCE MORONEY: Today 16 | I'm in the Big Apple 17 | 18 | 5 19 | 00:00:08,130 --> 00:00:09,720 20 | meeting with Josh 21 | Gordon from Google 22 | 23 | 6 24 | 00:00:09,720 --> 00:00:11,360 25 | to talk about machine 26 | learning, where 27 | 28 | 7 29 | 00:00:11,360 --> 00:00:14,310 30 | we will dig into how it 31 | works, why it's important, 32 | 33 | 8 34 | 00:00:14,310 --> 00:00:17,437 35 | and where you can 36 | learn all about it. 37 | 38 | 9 39 | 00:00:17,437 --> 00:00:19,520 40 | Welcome to Coffee with a 41 | Googler in New York City. 42 | 43 | 10 44 | 00:00:19,520 --> 00:00:21,400 45 | I'm Laurence Moroney, 46 | and I'm here today 47 | 48 | 11 49 | 00:00:21,400 --> 00:00:23,500 50 | speaking with Joshua Gordon. 51 | 52 | 12 53 | 00:00:23,500 --> 00:00:25,180 54 | Now, it's something 55 | that a lot of people 56 | 57 | 13 58 | 00:00:25,180 --> 00:00:27,179 59 | don't really understand 60 | what machine learning is 61 | 62 | 14 63 | 00:00:27,179 --> 00:00:28,114 64 | in a concrete manner. 65 | 66 | 15 67 | 00:00:28,114 --> 00:00:29,530 68 | JOSHUA GORDON: So 69 | machine learning 70 | 71 | 16 72 | 00:00:29,530 --> 00:00:31,500 73 | is all about learning 74 | from examples 75 | 76 | 17 77 | 00:00:31,500 --> 00:00:32,969 78 | rather than writing 79 | manual rules. 80 | 81 | 18 82 | 00:00:32,969 --> 00:00:34,009 83 | LAURENCE MORONEY: Got it. 84 | 85 | 19 86 | 00:00:34,009 --> 00:00:35,510 87 | JOSHUA GORDON: So the 88 | short way of saying 89 | 90 | 20 91 | 00:00:35,510 --> 00:00:38,230 92 | that is regular programming is 93 | you write a lot of manual rules 94 | 95 | 21 96 | 00:00:38,230 --> 00:00:39,170 97 | to solve a problem. 98 | 99 | 22 100 | 00:00:39,170 --> 00:00:41,210 101 | In machine learning, 102 | you let the algorithm 103 | 104 | 23 105 | 00:00:41,210 --> 00:00:42,269 106 | find those rules for you. 
107 | 108 | 24 109 | 00:00:42,269 --> 00:00:43,310 110 | LAURENCE MORONEY: Got it. 111 | 112 | 25 113 | 00:00:43,310 --> 00:00:43,310 114 | JOSHUA GORDON: From examples. 115 | 116 | 26 117 | 00:00:43,970 --> 00:00:45,200 118 | LAURENCE MORONEY: 119 | So pattern matching. 120 | 121 | 27 122 | 00:00:45,200 --> 00:00:47,520 123 | It might be visual, or it 124 | might be other patterns 125 | 126 | 28 127 | 00:00:47,520 --> 00:00:48,060 128 | that are hidden in data. 129 | 130 | 29 131 | 00:00:48,060 --> 00:00:48,060 132 | JOSHUA GORDON: Absolutely. 133 | 134 | 30 135 | 00:00:48,740 --> 00:00:51,406 136 | And so the input to machine-- so 137 | the beauty of machine learning, 138 | 139 | 31 140 | 00:00:51,406 --> 00:00:54,170 141 | and the real secret sauce, 142 | is that an algorithm that 143 | 144 | 32 145 | 00:00:54,170 --> 00:00:57,760 146 | learns patterns from 147 | data can solve thousands 148 | 149 | 33 150 | 00:00:57,760 --> 00:00:58,850 151 | of different problems. 152 | 153 | 34 154 | 00:00:58,850 --> 00:01:01,266 155 | And the reason is if I write 156 | a Python program to recognize 157 | 158 | 35 159 | 00:01:01,266 --> 00:01:03,509 160 | digits, my program is hard 161 | coded to work with digits. 162 | 163 | 36 164 | 00:01:03,509 --> 00:01:04,550 165 | LAURENCE MORONEY: Got it. 166 | 167 | 37 168 | 00:01:04,550 --> 00:01:07,049 169 | JOSHUA GORDON: But if I write 170 | an algorithm to learn patterns 171 | 172 | 38 173 | 00:01:07,049 --> 00:01:09,520 174 | from data, I can use that 175 | for speech recognition, image 176 | 177 | 39 178 | 00:01:09,520 --> 00:01:11,470 179 | recognition, medicine. 180 | 181 | 40 182 | 00:01:11,470 --> 00:01:14,010 183 | Basically, anything that 184 | you can start with examples, 185 | 186 | 41 187 | 00:01:14,010 --> 00:01:17,880 188 | just tell apart A and B, my 189 | same algorithm that I wrote just 190 | 191 | 42 192 | 00:01:17,880 --> 00:01:20,380 193 | once can tackle 194 | all these problems. 195 | 196 | 43 197 | 00:01:20,380 --> 00:01:23,049 198 | And that's a really special and 199 | actually fairly profound thing. 200 | 201 | 44 202 | 00:01:23,049 --> 00:01:24,010 203 | LAURENCE MORONEY: Absolutely. 204 | 205 | 45 206 | 00:01:24,010 --> 00:01:26,093 207 | Now, one of the things in 208 | your classes that you're 209 | 210 | 46 211 | 00:01:26,093 --> 00:01:28,500 212 | talking about that you're 213 | starting with language. 214 | 215 | 47 216 | 00:01:28,500 --> 00:01:30,260 217 | You're starting with 218 | Java and Python, 219 | 220 | 48 221 | 00:01:30,260 --> 00:01:30,260 222 | I think it was, that you said? 223 | 224 | 49 225 | 00:01:30,810 --> 00:01:31,590 226 | JOSHUA GORDON: Yes, absolutely. 227 | 228 | 50 229 | 00:01:31,590 --> 00:01:32,590 230 | LAURENCE MORONEY: 231 | So how's the class 232 | 233 | 51 234 | 00:01:32,590 --> 00:01:33,700 235 | going to be 236 | structured for people 237 | 238 | 52 239 | 00:01:33,700 --> 00:01:35,330 240 | who want to be these data 241 | scientists of the future? 242 | 243 | 53 244 | 00:01:35,330 --> 00:01:35,330 245 | JOSHUA GORDON: Absolutely. 246 | 247 | 54 248 | 00:01:35,910 --> 00:01:37,950 249 | So first of all, there 250 | are zero prerequisites. 251 | 252 | 55 253 | 00:01:37,950 --> 00:01:38,380 254 | Well, that's not true. 255 | 256 | 56 257 | 00:01:38,380 --> 00:01:39,090 258 | There's one prerequisite. 259 | 260 | 57 261 | 00:01:39,090 --> 00:01:39,090 262 | LAURENCE MORONEY: My favorite. 263 | 264 | 58 265 | 00:01:39,780 --> 00:01:40,209 266 | Oh, OK. 
267 | 268 | 59 269 | 00:01:40,209 --> 00:01:41,376 270 | Well, what's the one prereq? 271 | 272 | 60 273 | 00:01:41,376 --> 00:01:44,500 274 | JOSHUA GORDON: Basic programming 275 | ability in Java or Python. 276 | 277 | 61 278 | 00:01:44,500 --> 00:01:48,230 279 | And by basic, I mean you can run 280 | scripts and you can tweak them. 281 | 282 | 62 283 | 00:01:48,230 --> 00:01:49,770 284 | That's it. 285 | 286 | 63 287 | 00:01:49,770 --> 00:01:51,440 288 | A little bit of 289 | high school math. 290 | 291 | 64 292 | 00:01:51,440 --> 00:01:54,450 293 | And that means like basic 294 | algebra, basic geometry. 295 | 296 | 65 297 | 00:01:54,450 --> 00:01:56,470 298 | When I say basic geometry, 299 | to be totally honest, 300 | 301 | 66 302 | 00:01:56,470 --> 00:01:58,447 303 | if you asked me, like, 304 | what sine and cosine, 305 | 306 | 67 307 | 00:01:58,447 --> 00:01:59,530 308 | I would have to Google it. 309 | 310 | 68 311 | 00:01:59,530 --> 00:02:01,510 312 | I don't remember, honestly. 313 | 314 | 69 315 | 00:02:01,510 --> 00:02:04,419 316 | So just basic familiarity, 317 | and that's it. 318 | 319 | 70 320 | 00:02:04,419 --> 00:02:06,459 321 | And we're going to teach 322 | the class in three ways. 323 | 324 | 71 325 | 00:02:06,459 --> 00:02:09,030 326 | We're going to teach it 327 | totally from the ground up. 328 | 329 | 72 330 | 00:02:09,030 --> 00:02:12,205 331 | So one problem I had with some 332 | of the academic classes I took 333 | 334 | 73 335 | 00:02:12,205 --> 00:02:14,080 336 | is that they'll talk 337 | about a fancy algorithm, 338 | 339 | 74 340 | 00:02:14,080 --> 00:02:16,149 341 | like neural 342 | networks, but they'll 343 | 344 | 75 345 | 00:02:16,149 --> 00:02:17,440 346 | talk about it in terms of math. 347 | 348 | 76 349 | 00:02:17,440 --> 00:02:20,120 350 | And so at the end of the class, 351 | I don't know how to build that. 352 | 353 | 77 354 | 00:02:20,120 --> 00:02:21,107 355 | I can't really do it. 356 | 357 | 78 358 | 00:02:21,107 --> 00:02:22,440 359 | We're doing it in a reverse way. 360 | 361 | 79 362 | 00:02:22,440 --> 00:02:24,440 363 | We're building it step 364 | by step, and we're 365 | 366 | 80 367 | 00:02:24,440 --> 00:02:27,744 368 | explaining only the math that's 369 | really necessary as we go. 370 | 371 | 81 372 | 00:02:27,744 --> 00:02:30,160 373 | And instead of equations, we're 374 | going use visual examples. 375 | 376 | 82 377 | 00:02:30,160 --> 00:02:30,160 378 | LAURENCE MORONEY: Perfect. 379 | 380 | 83 381 | 00:02:30,729 --> 00:02:32,187 382 | JOSHUA GORDON: So 383 | an equation could 384 | 385 | 84 386 | 00:02:32,187 --> 00:02:34,060 387 | be like if you talk 388 | about gradient descent, 389 | 390 | 85 391 | 00:02:34,060 --> 00:02:36,259 392 | gradient descent 393 | basically means finding 394 | 395 | 86 396 | 00:02:36,259 --> 00:02:37,883 397 | the minimum of a function. 398 | 399 | 87 400 | 00:02:37,883 --> 00:02:40,550 401 | So if I just say that, like as a 402 | developer, I'm like, all right, 403 | 404 | 88 405 | 00:02:40,550 --> 00:02:41,160 406 | what does that mean? 407 | 408 | 89 409 | 00:02:41,160 --> 00:02:42,535 410 | So you can think 411 | of any equation, 412 | 413 | 90 414 | 00:02:42,535 --> 00:02:45,550 415 | like x cubed plus y squared 416 | plus whatever equals 7. 417 | 418 | 91 419 | 00:02:45,550 --> 00:02:47,250 420 | There's some value of x and y. 
421 | 422 | 92 423 | 00:02:47,250 --> 00:02:48,660 424 | LAURENCE MORONEY: That's going 425 | to be the bottom of that curve, 426 | 427 | 93 428 | 00:02:48,660 --> 00:02:48,660 429 | right? 430 | 431 | 94 432 | 00:02:48,919 --> 00:02:49,270 433 | JOSHUA GORDON: Or not equals 7. 434 | 435 | 95 436 | 00:02:49,270 --> 00:02:50,020 437 | Equals some value. 438 | 439 | 96 440 | 00:02:50,020 --> 00:02:50,020 441 | Right. 442 | 443 | 97 444 | 00:02:50,660 --> 00:02:52,720 445 | Anyway, you can find 446 | the bottom of that curve 447 | 448 | 98 449 | 00:02:52,720 --> 00:02:54,069 450 | literally by thinking as a bowl. 451 | 452 | 99 453 | 00:02:54,069 --> 00:02:56,270 454 | You can drop a piece 455 | of fruit in a bowl 456 | 457 | 100 458 | 00:02:56,270 --> 00:02:57,715 459 | and it will roll to the bottom. 460 | 461 | 101 462 | 00:02:57,715 --> 00:02:59,340 463 | And gradient descent 464 | just means finding 465 | 466 | 102 467 | 00:02:59,340 --> 00:03:00,960 468 | where this function is 0. 469 | 470 | 103 471 | 00:03:00,960 --> 00:03:03,009 472 | And you can actually 473 | describe that really simply 474 | 475 | 104 476 | 00:03:03,009 --> 00:03:05,280 477 | in only like 10 or 478 | 12 lines of Python, 479 | 480 | 105 481 | 00:03:05,280 --> 00:03:07,585 482 | actually, instead of 483 | five slides of equations. 484 | 485 | 106 486 | 00:03:07,585 --> 00:03:09,210 487 | LAURENCE MORONEY: 488 | And I think it's also 489 | 490 | 107 491 | 00:03:09,210 --> 00:03:11,300 492 | important to understand 493 | why you need to find 494 | 495 | 108 496 | 00:03:11,300 --> 00:03:12,300 497 | the bottom of the curve. 498 | 499 | 109 500 | 00:03:12,300 --> 00:03:13,340 501 | JOSHUA GORDON: Absolutely. 502 | 503 | 110 504 | 00:03:13,340 --> 00:03:14,919 505 | LAURENCE MORONEY: And just 506 | focus on that example. 507 | 508 | 111 509 | 00:03:14,919 --> 00:03:15,330 510 | JOSHUA GORDON: Absolutely. 511 | 512 | 112 513 | 00:03:15,330 --> 00:03:17,419 514 | So that's difficult 515 | to describe concisely. 516 | 517 | 113 518 | 00:03:17,419 --> 00:03:19,076 519 | LAURENCE MORONEY: Right. 520 | 521 | 114 522 | 00:03:19,076 --> 00:03:20,660 523 | JOSHUA GORDON: So 524 | in machine learning, 525 | 526 | 115 527 | 00:03:20,660 --> 00:03:22,349 528 | let's say you're 529 | writing an algorithm. 530 | 531 | 116 532 | 00:03:22,349 --> 00:03:26,240 533 | Let's say it's to distinguish 534 | apples from oranges. 535 | 536 | 117 537 | 00:03:26,240 --> 00:03:29,199 538 | You always want to know, how 539 | accurate is my algorithm? 540 | 541 | 118 542 | 00:03:29,199 --> 00:03:31,090 543 | Like, I can solve that 544 | problem in one line. 545 | 546 | 119 547 | 00:03:31,090 --> 00:03:34,020 548 | I can just say, 549 | return math.random. 550 | 551 | 120 552 | 00:03:34,020 --> 00:03:35,389 553 | So one line, math.random. 554 | 555 | 121 556 | 00:03:35,389 --> 00:03:37,389 557 | LAURENCE MORONEY: That 558 | would be the perfect one. 559 | 560 | 122 561 | 00:03:37,389 --> 00:03:39,084 562 | JOSHUA GORDON: My 563 | accuracy is crap. 564 | 565 | 123 566 | 00:03:39,084 --> 00:03:40,000 567 | LAURENCE MORONEY: 50%. 568 | 569 | 124 570 | 00:03:40,000 --> 00:03:40,000 571 | JOSHUA GORDON: Right. 572 | 573 | 125 574 | 00:03:40,874 --> 00:03:42,160 575 | Yeah, it's 50%. 576 | 577 | 126 578 | 00:03:42,160 --> 00:03:43,190 579 | LAURENCE MORONEY: Between 580 | an apple and an orange. 581 | 582 | 127 583 | 00:03:43,190 --> 00:03:44,630 584 | JOSHUA GORDON: It's a one liner. 
585 | 586 | 128 587 | 00:03:44,630 --> 00:03:47,186 588 | But really, we want 589 | to get-- another way 590 | 591 | 129 592 | 00:03:47,186 --> 00:03:48,810 593 | of describing accuracy 594 | is you can think 595 | 596 | 130 597 | 00:03:48,810 --> 00:03:50,690 598 | about it in terms of error. 599 | 600 | 131 601 | 00:03:50,690 --> 00:03:53,120 602 | High accuracy means low error. 603 | 604 | 132 605 | 00:03:53,120 --> 00:03:57,550 606 | And you can have an equation 607 | that describes your error. 608 | 609 | 133 610 | 00:03:57,550 --> 00:03:59,569 611 | And the minimum of 612 | that equation is 613 | 614 | 134 615 | 00:03:59,569 --> 00:04:01,741 616 | going to give you 617 | the highest accuracy. 618 | 619 | 135 620 | 00:04:01,741 --> 00:04:03,740 621 | So you can write your 622 | machine learning algorithm 623 | 624 | 136 625 | 00:04:03,740 --> 00:04:06,299 626 | to try and minimize the equation 627 | that describes the error. 628 | 629 | 137 630 | 00:04:06,299 --> 00:04:07,340 631 | LAURENCE MORONEY: Got it. 632 | 633 | 138 634 | 00:04:07,340 --> 00:04:09,120 635 | JOSHUA GORDON: And we'll 636 | make that super concrete 637 | 638 | 139 639 | 00:04:09,120 --> 00:04:11,319 640 | in the class, but that's 641 | where minimization comes in 642 | 643 | 140 644 | 00:04:11,319 --> 00:04:12,669 645 | and that's where gradient 646 | descent comes in. 647 | 648 | 141 649 | 00:04:12,669 --> 00:04:13,319 650 | LAURENCE MORONEY: 651 | So one of the things 652 | 653 | 142 654 | 00:04:13,319 --> 00:04:14,735 655 | you're saying in 656 | the class, you're 657 | 658 | 143 659 | 00:04:14,735 --> 00:04:16,485 660 | teaching just a pure 661 | Java, Python version. 662 | 663 | 144 664 | 00:04:16,485 --> 00:04:18,110 665 | But there's also a 666 | version where you're 667 | 668 | 145 669 | 00:04:18,110 --> 00:04:19,490 670 | bringing in 671 | preexisting libraries 672 | 673 | 146 674 | 00:04:19,490 --> 00:04:20,720 675 | that have come from academia. 676 | 677 | 147 678 | 00:04:20,720 --> 00:04:20,720 679 | JOSHUA GORDON: Absolutely. 680 | 681 | 148 682 | 00:04:20,985 --> 00:04:22,259 683 | LAURENCE MORONEY: That will 684 | solve a lot of this for you, 685 | 686 | 149 687 | 00:04:22,259 --> 00:04:22,259 688 | right? 689 | 690 | 150 691 | 00:04:22,699 --> 00:04:23,230 692 | JOSHUA GORDON: Absolutely. 693 | 694 | 151 695 | 00:04:23,230 --> 00:04:24,562 696 | So I want to do a couple things. 697 | 698 | 152 699 | 00:04:24,562 --> 00:04:27,009 700 | One is I want to 701 | provide the TLDR. 702 | 703 | 153 704 | 00:04:27,009 --> 00:04:29,720 705 | So honestly, as a 706 | developer, I like to get up 707 | 708 | 154 709 | 00:04:29,720 --> 00:04:31,089 710 | and running really fast. 711 | 712 | 155 713 | 00:04:31,089 --> 00:04:34,632 714 | So we're also going to use 715 | open source libraries from just 716 | 717 | 156 718 | 00:04:34,632 --> 00:04:35,589 719 | different universities. 720 | 721 | 157 722 | 00:04:35,589 --> 00:04:37,990 723 | There's one in New Zealand 724 | that I really love. 725 | 726 | 158 727 | 00:04:37,990 --> 00:04:40,509 728 | We're going to show you how to build, 729 | basically first, everything 730 | 731 | 159 732 | 00:04:40,509 --> 00:04:42,384 733 | from the ground up step 734 | by step from scratch. 735 | 736 | 160 737 | 00:04:42,384 --> 00:04:45,730 738 | And the reason we do that is 739 | because it keeps us honest.
740 | 741 | 161 742 | 00:04:45,730 --> 00:04:48,250 743 | If you build every 744 | single piece, you 745 | 746 | 162 747 | 00:04:48,250 --> 00:04:50,560 748 | have some understanding 749 | of every single piece. 750 | 751 | 163 752 | 00:04:50,560 --> 00:04:52,139 753 | LAURENCE MORONEY: And if 754 | you're relying on somebody else 755 | 756 | 164 757 | 00:04:52,139 --> 00:04:54,329 758 | having done the work, you don't 759 | fully get to understand it 760 | 761 | 165 762 | 00:04:54,329 --> 00:04:54,329 763 | yourself. 764 | 765 | 166 766 | 00:04:54,490 --> 00:04:55,500 767 | JOSHUA GORDON: Exactly. 768 | 769 | 167 770 | 00:04:55,500 --> 00:04:57,750 771 | Now, another thing is using 772 | the open source libraries, 773 | 774 | 168 775 | 00:04:57,750 --> 00:05:00,329 776 | honestly, you can solve 777 | probably 80% or 90% 778 | 779 | 169 780 | 00:05:00,329 --> 00:05:03,389 781 | of the machine learning problems 782 | you would as a data scientist. 783 | 784 | 170 785 | 00:05:03,389 --> 00:05:04,509 786 | LAURENCE MORONEY: Nice. 787 | 788 | 171 789 | 00:05:04,509 --> 00:05:06,800 790 | JOSHUA GORDON: Now, when you 791 | get to the really gigantic 792 | 793 | 172 794 | 00:05:06,800 --> 00:05:09,471 795 | problems, then really it 796 | makes sense to use the cloud. 797 | 798 | 173 799 | 00:05:09,471 --> 00:05:11,180 800 | So we're also going 801 | to teach how to solve 802 | 803 | 174 804 | 00:05:11,180 --> 00:05:12,470 805 | problems using Google APIs. 806 | 807 | 175 808 | 00:05:12,470 --> 00:05:14,529 809 | But that's at the 810 | very end of the class, 811 | 812 | 176 813 | 00:05:14,529 --> 00:05:16,345 814 | and it's totally optional. 815 | 816 | 177 817 | 00:05:16,345 --> 00:05:17,519 818 | LAURENCE MORONEY: This 819 | is all on YouTube, right? 820 | 821 | 178 822 | 00:05:17,519 --> 00:05:18,769 823 | JOSHUA GORDON: All on YouTube. 824 | 825 | 179 826 | 00:05:18,769 --> 00:05:21,500 827 | There might be some ads on 828 | it, but that's literally it. 829 | 830 | 180 831 | 00:05:21,500 --> 00:05:22,230 832 | We think it's going 833 | to be awesome. 834 | 835 | 181 836 | 00:05:22,230 --> 00:05:23,410 837 | LAURENCE MORONEY: Like 838 | source code and stuff 839 | 840 | 182 841 | 00:05:23,410 --> 00:05:23,410 842 | that you've done? 843 | 844 | 183 845 | 00:05:23,930 --> 00:05:25,800 846 | JOSHUA GORDON: The source 847 | code will be on GitHub. 848 | 849 | 184 850 | 00:05:25,800 --> 00:05:26,569 851 | LAURENCE MORONEY: 852 | It's all on GitHub. 853 | 854 | 185 855 | 00:05:26,569 --> 00:05:26,569 856 | Perfect. 857 | 858 | 186 859 | 00:05:26,709 --> 00:05:27,069 860 | JOSHUA GORDON: It 861 | will all be on GitHub. 862 | 863 | 187 864 | 00:05:27,069 --> 00:05:28,019 865 | And the reason I 866 | was hesitating is 867 | 868 | 188 869 | 00:05:28,019 --> 00:05:29,644 870 | I'm writing all this 871 | as we're speaking, 872 | 873 | 189 874 | 00:05:29,644 --> 00:05:30,819 875 | so I'm totally exhausted. 876 | 877 | 190 878 | 00:05:30,819 --> 00:05:32,699 879 | But yes, it's totally, 880 | 100% out there. 881 | 882 | 191 883 | 00:05:32,699 --> 00:05:35,389 884 | LAURENCE MORONEY: Well, you're 885 | still looking energetic to me. 886 | 887 | 192 888 | 00:05:35,389 --> 00:05:38,386 889 | JOSHUA GORDON: I've had a 890 | lot of coffee with a Googler. 891 | 892 | 193 893 | 00:05:38,386 --> 00:05:39,399 894 | Good for you. 
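As a recap of the gradient descent point above, here is one hedged sketch of the "10 or 12 lines of Python" idea, minimizing a deliberately simple one-variable error curve:

def error(x):
    return (x - 3) ** 2          # a bowl-shaped error curve with its minimum at x = 3

def slope(x):
    return 2 * (x - 3)           # the gradient of that curve at x

x = 0.0                          # start anywhere
learning_rate = 0.1
for _ in range(100):
    x -= learning_rate * slope(x)    # take a small step downhill

print(x)   # rolls to the bottom of the bowl, approaching 3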
895 | 896 | 194 897 | 00:05:39,399 --> 00:05:40,774 898 | LAURENCE MORONEY: 899 | Well, I for one 900 | 901 | 195 902 | 00:05:40,774 --> 00:05:42,483 903 | am really looking 904 | forward to this course. 905 | 906 | 196 907 | 00:05:42,483 --> 00:05:45,709 908 | I'm looking forward to learning 909 | what you have to teach. 910 | 911 | 197 912 | 00:05:45,709 --> 00:05:47,269 913 | I've had the same 914 | kind of struggles 915 | 916 | 198 917 | 00:05:47,269 --> 00:05:50,096 918 | as you in trying to understand 919 | the math behind this 920 | 921 | 199 922 | 00:05:50,096 --> 00:05:51,470 923 | and why I'm doing 924 | the math, which 925 | 926 | 200 927 | 00:05:51,470 --> 00:05:53,360 928 | is why I had those 929 | pointed questions earlier. 930 | 931 | 201 932 | 00:05:53,360 --> 00:05:54,139 933 | JOSHUA GORDON: Absolutely. 934 | 935 | 202 936 | 00:05:54,139 --> 00:05:54,139 937 | LAURENCE MORONEY: 938 | So thanks, Josh. 939 | 940 | 203 941 | 00:05:54,829 --> 00:05:56,029 942 | That was a whole lot of fun. 943 | 944 | 204 945 | 00:05:56,029 --> 00:05:57,814 946 | And I've learned so 947 | much about machine 948 | 949 | 205 950 | 00:05:57,814 --> 00:05:59,730 951 | learning just from these 952 | few minutes with you, 953 | 954 | 206 955 | 00:05:59,730 --> 00:06:01,410 956 | so I'm really looking 957 | forward to your class. 958 | 959 | 207 960 | 00:06:01,410 --> 00:06:01,410 961 | JOSHUA GORDON: Thanks so much. 962 | 963 | 208 964 | 00:06:01,970 --> 00:06:03,550 965 | LAURENCE MORONEY: If you've 966 | enjoyed this episode of Coffee 967 | 968 | 209 969 | 00:06:03,550 --> 00:06:05,420 970 | with a Googler and if you 971 | want to learn machine learning 972 | 973 | 210 974 | 00:06:05,420 --> 00:06:07,694 975 | for yourself, if you have 976 | any questions for Joshua, 977 | 978 | 211 979 | 00:06:07,694 --> 00:06:09,110 980 | or if you've any 981 | questions for me, 982 | 983 | 212 984 | 00:06:09,110 --> 00:06:10,819 985 | please leave them in 986 | the comments below. 987 | 988 | 213 989 | 00:06:10,819 --> 00:06:12,610 990 | And tune into the Google 991 | Developers channel 992 | 993 | 214 994 | 00:06:12,610 --> 00:06:14,380 995 | for more great videos, 996 | including episodes 997 | 998 | 215 999 | 00:06:14,380 --> 00:06:15,529 1000 | of Coffee with a Googler. 1001 | 1002 | 216 1003 | 00:06:15,529 --> 00:06:16,605 1004 | Thank you. 1005 | 1006 | 217 1007 | 00:06:16,605 --> 00:06:17,521 1008 | [MUSIC PLAYING] 1009 | 1010 | 218 1011 | 00:06:17,521 --> 00:06:19,730 1012 | JOSHUA GORDON: You really 1013 | can learn machine learning, 1014 | 1015 | 219 1016 | 00:06:19,730 --> 00:06:21,949 1017 | and it's faster and 1018 | easier than you think. 1019 | 1020 | 220 1021 | 00:06:21,949 --> 00:06:25,540 1022 | We've gone through a ton of 1023 | classes, textbooks, and blog 1024 | 1025 | 221 1026 | 00:06:25,540 --> 00:06:29,120 1027 | posts to bring you the clearest 1028 | and most concise explanations 1029 | 1030 | 222 1031 | 00:06:29,120 --> 00:06:30,459 1032 | of the hard concepts. 1033 | 1034 | 223 1035 | 00:06:30,459 --> 00:06:32,024 1036 | We really think you're going 1037 | to be able to learn it and have 1038 | 1039 | 224 1040 | 00:06:32,024 --> 00:06:33,290 1041 | some fun on the way. 1042 | 1043 | 225 1044 | 00:06:33,290 --> 00:06:35,410 1045 | Click here to get started. 
1046 | 1047 | 226 1048 | 00:06:35,410 --> 00:06:36,000 1049 | Subtitles End: mo.dbxdb.com 1050 | 1051 | -------------------------------------------------------------------------------- /subtitle/Eng/Visualizing a Decision Tree - Machine Learning Recipes #2.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:00,000 3 | Youtube subtitles download by mo.dbxdb.com 4 | 5 | 2 6 | 00:00:00,000 --> 00:00:02,802 7 | [MUSIC PLAYING] 8 | 9 | 3 10 | 00:00:02,802 --> 00:00:06,550 11 | 12 | 13 | 4 14 | 00:00:06,550 --> 00:00:09,370 15 | Last episode, we used a 16 | decision tree as our classifier. 17 | 18 | 5 19 | 00:00:09,370 --> 00:00:10,920 20 | Today we'll add 21 | code to visualize it 22 | 23 | 6 24 | 00:00:10,920 --> 00:00:13,032 25 | so we can see how it 26 | works under the hood. 27 | 28 | 7 29 | 00:00:13,032 --> 00:00:14,490 30 | There are many 31 | types of classifiers 32 | 33 | 8 34 | 00:00:14,490 --> 00:00:16,740 35 | you may have heard of before-- 36 | things like neural nets 37 | 38 | 9 39 | 00:00:16,740 --> 00:00:17,870 40 | or support vector machines. 41 | 42 | 10 43 | 00:00:17,870 --> 00:00:20,234 44 | So why did we use a 45 | decision tree to start? 46 | 47 | 11 48 | 00:00:20,234 --> 00:00:21,900 49 | Well, they have a 50 | very unique property-- 51 | 52 | 12 53 | 00:00:21,900 --> 00:00:23,907 54 | they're easy to 55 | read and understand. 56 | 57 | 13 58 | 00:00:23,907 --> 00:00:26,490 59 | In fact, they're one of the few 60 | models that are interpretable, 61 | 62 | 14 63 | 00:00:26,490 --> 00:00:28,900 64 | where you can understand 65 | exactly why the classifier makes 66 | 67 | 15 68 | 00:00:28,900 --> 00:00:29,740 69 | a decision. 70 | 71 | 16 72 | 00:00:29,740 --> 00:00:33,534 73 | That's amazingly 74 | useful in practice. 75 | 76 | 17 77 | 00:00:33,534 --> 00:00:34,950 78 | To get started, 79 | I'll introduce you 80 | 81 | 18 82 | 00:00:34,950 --> 00:00:37,079 83 | to a real data set 84 | we'll work with today. 85 | 86 | 19 87 | 00:00:37,079 --> 00:00:38,670 88 | It's called Iris. 89 | 90 | 20 91 | 00:00:38,670 --> 00:00:41,170 92 | Iris is a classic 93 | machine learning problem. 94 | 95 | 21 96 | 00:00:41,170 --> 00:00:43,270 97 | In it, you want to identify 98 | what type of flower 99 | 100 | 22 101 | 00:00:43,270 --> 00:00:45,009 102 | you have based on 103 | different measurements, 104 | 105 | 23 106 | 00:00:45,009 --> 00:00:46,980 107 | like the length and 108 | width of the petal. 109 | 110 | 24 111 | 00:00:46,980 --> 00:00:49,600 112 | The data set includes three 113 | different types of flowers. 114 | 115 | 25 116 | 00:00:49,600 --> 00:00:52,870 117 | They're all species of 118 | iris-- setosa, versicolor, 119 | 120 | 26 121 | 00:00:52,870 --> 00:00:53,966 122 | and virginica. 123 | 124 | 27 125 | 00:00:53,966 --> 00:00:55,340 126 | Scrolling down, 127 | you can see we're 128 | 129 | 28 130 | 00:00:55,340 --> 00:01:00,024 131 | given 50 examples of each 132 | type, so 150 examples total. 133 | 134 | 29 135 | 00:01:00,024 --> 00:01:01,650 136 | Notice there are four 137 | features that are 138 | 139 | 30 140 | 00:01:01,650 --> 00:01:03,620 141 | used to describe each example. 142 | 143 | 31 144 | 00:01:03,620 --> 00:01:06,670 145 | These are the length and 146 | width of the sepal and petal. 
147 | 148 | 32 149 | 00:01:06,670 --> 00:01:08,730 150 | And just like in our 151 | apples and oranges problem, 152 | 153 | 33 154 | 00:01:08,730 --> 00:01:11,780 155 | the first four columns give the 156 | features and the last column 157 | 158 | 34 159 | 00:01:11,780 --> 00:01:15,170 160 | gives the labels, which is the 161 | type of flower in each row. 162 | 163 | 35 164 | 00:01:15,170 --> 00:01:18,140 165 | Our goal is to use this data 166 | set to train a classifier. 167 | 168 | 36 169 | 00:01:18,140 --> 00:01:21,027 170 | Then we can use that classifier 171 | to predict what species 172 | 173 | 37 174 | 00:01:21,027 --> 00:01:23,610 175 | of flower we have if we're given 176 | a new flower that we've never 177 | 178 | 38 179 | 00:01:23,610 --> 00:01:25,036 180 | seen before. 181 | 182 | 39 183 | 00:01:25,036 --> 00:01:26,910 184 | Knowing how to work with 185 | an existing data set 186 | 187 | 40 188 | 00:01:26,910 --> 00:01:29,910 189 | is a good skill, so let's 190 | import Iris into scikit-learn 191 | 192 | 41 193 | 00:01:29,910 --> 00:01:32,120 194 | and see what it 195 | looks like in code. 196 | 197 | 42 198 | 00:01:32,120 --> 00:01:33,870 199 | Conveniently, the 200 | friendly folks at scikit 201 | 202 | 43 203 | 00:01:33,870 --> 00:01:35,770 204 | provided a bunch of 205 | sample data sets, 206 | 207 | 44 208 | 00:01:35,770 --> 00:01:37,780 209 | including Iris, as 210 | well as utilities 211 | 212 | 45 213 | 00:01:37,780 --> 00:01:39,760 214 | to make them easy to import. 215 | 216 | 46 217 | 00:01:39,760 --> 00:01:42,690 218 | We can import Iris into 219 | our code like this. 220 | 221 | 47 222 | 00:01:42,690 --> 00:01:44,530 223 | The data set includes 224 | both the table 225 | 226 | 48 227 | 00:01:44,530 --> 00:01:47,230 228 | from Wikipedia as 229 | well as some metadata. 230 | 231 | 49 232 | 00:01:47,230 --> 00:01:49,630 233 | The metadata tells you 234 | the names of the features 235 | 236 | 50 237 | 00:01:49,630 --> 00:01:52,430 238 | and the names of different 239 | types of flowers. 240 | 241 | 51 242 | 00:01:52,430 --> 00:01:54,190 243 | The features and 244 | examples themselves 245 | 246 | 52 247 | 00:01:54,190 --> 00:01:56,300 248 | are contained in 249 | the data variable. 250 | 251 | 53 252 | 00:01:56,300 --> 00:01:58,239 253 | For example, if I print 254 | out the first entry, 255 | 256 | 54 257 | 00:01:58,239 --> 00:02:00,920 258 | you can see the measurements 259 | for this flower. 260 | 261 | 55 262 | 00:02:00,920 --> 00:02:03,819 263 | These index to the feature 264 | names, so the first value 265 | 266 | 56 267 | 00:02:03,819 --> 00:02:06,760 268 | refers to the sepal length, 269 | and the second to sepal width, 270 | 271 | 57 272 | 00:02:06,760 --> 00:02:09,150 273 | and so on. 274 | 275 | 58 276 | 00:02:09,150 --> 00:02:11,750 277 | The target variable 278 | contains the labels. 279 | 280 | 59 281 | 00:02:11,750 --> 00:02:14,690 282 | Likewise, these index 283 | to the target names. 284 | 285 | 60 286 | 00:02:14,690 --> 00:02:16,000 287 | Let's print out the first one. 288 | 289 | 61 290 | 00:02:16,000 --> 00:02:19,229 291 | A label of 0 means 292 | it's a setosa. 293 | 294 | 62 295 | 00:02:19,229 --> 00:02:21,449 296 | If you look at the 297 | table from Wikipedia, 298 | 299 | 63 300 | 00:02:21,449 --> 00:02:24,520 301 | you'll notice that we just 302 | printed out the first row. 303 | 304 | 64 305 | 00:02:24,520 --> 00:02:27,967 306 | Now both the data and target 307 | variables have 150 entries. 
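That exploration looks roughly like this in code, using the copy of Iris bundled with scikit-learn:

from sklearn import datasets

iris = datasets.load_iris()

print(iris.feature_names)    # metadata: sepal length/width, petal length/width
print(iris.target_names)     # metadata: ['setosa' 'versicolor' 'virginica']

print(iris.data[0])          # measurements for the first flower
print(iris.target[0])        # its label: 0, which indexes to setosa

print(len(iris.data), len(iris.target))   # 150 entries each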
308 | 309 | 65 310 | 00:02:27,967 --> 00:02:29,550 311 | If you want, you can 312 | iterate over them 313 | 314 | 66 315 | 00:02:29,550 --> 00:02:32,081 316 | to print out the entire 317 | data set like this. 318 | 319 | 67 320 | 00:02:32,081 --> 00:02:34,039 321 | Now that we know how to 322 | work with the data set, 323 | 324 | 68 325 | 00:02:34,039 --> 00:02:35,849 326 | we're ready to 327 | train a classifier. 328 | 329 | 69 330 | 00:02:35,849 --> 00:02:39,300 331 | But before we do that, first 332 | we need to split up the data. 333 | 334 | 70 335 | 00:02:39,300 --> 00:02:41,440 336 | I'm going to remove 337 | several of the examples 338 | 339 | 71 340 | 00:02:41,440 --> 00:02:43,479 341 | and put them aside for later. 342 | 343 | 72 344 | 00:02:43,479 --> 00:02:46,330 345 | We'll call the examples I'm 346 | putting aside our testing data. 347 | 348 | 73 349 | 00:02:46,330 --> 00:02:48,780 350 | We'll keep these separate 351 | from our training data, 352 | 353 | 74 354 | 00:02:48,780 --> 00:02:50,940 355 | and later on we'll use 356 | our testing examples 357 | 358 | 75 359 | 00:02:50,940 --> 00:02:53,389 360 | to test how accurate 361 | the classifier is 362 | 363 | 76 364 | 00:02:53,389 --> 00:02:55,679 365 | on data it's never seen before. 366 | 367 | 77 368 | 00:02:55,679 --> 00:02:57,470 369 | Testing is actually a 370 | really important part 371 | 372 | 78 373 | 00:02:57,470 --> 00:02:59,261 374 | of doing machine learning 375 | well in practice, 376 | 377 | 79 378 | 00:02:59,261 --> 00:03:02,280 379 | and we'll cover it in more 380 | detail in a future episode. 381 | 382 | 80 383 | 00:03:02,280 --> 00:03:04,710 384 | Just for this exercise, 385 | I'll remove one example 386 | 387 | 81 388 | 00:03:04,710 --> 00:03:06,050 389 | of each type of flower. 390 | 391 | 82 392 | 00:03:06,050 --> 00:03:07,520 393 | And as it happens, 394 | the data set is 395 | 396 | 83 397 | 00:03:07,520 --> 00:03:10,009 398 | ordered so the first 399 | setosa is at index 0, 400 | 401 | 84 402 | 00:03:10,009 --> 00:03:14,270 403 | and the first versicolor 404 | is at 50, and so on. 405 | 406 | 85 407 | 00:03:14,270 --> 00:03:16,770 408 | The syntax looks a little bit 409 | complicated, but all I'm doing 410 | 411 | 86 412 | 00:03:16,770 --> 00:03:21,229 413 | is removing three entries from 414 | the data and target variables. 415 | 416 | 87 417 | 00:03:21,229 --> 00:03:24,080 418 | Then I'll create two new 419 | sets of variables-- one 420 | 421 | 88 422 | 00:03:24,080 --> 00:03:26,586 423 | for training and 424 | one for testing. 425 | 426 | 89 427 | 00:03:26,586 --> 00:03:28,419 428 | Training will have the 429 | majority of our data, 430 | 431 | 90 432 | 00:03:28,419 --> 00:03:31,370 433 | and testing will have just 434 | the examples I removed. 435 | 436 | 91 437 | 00:03:31,370 --> 00:03:33,830 438 | Now, just as before, we 439 | can create a decision tree 440 | 441 | 92 442 | 00:03:33,830 --> 00:03:36,569 443 | classifier and train it 444 | on our training data. 445 | 446 | 93 447 | 00:03:36,569 --> 00:03:40,699 448 | 449 | 450 | 94 451 | 00:03:40,699 --> 00:03:42,840 452 | Before we visualize 453 | it, let's use the tree 454 | 455 | 95 456 | 00:03:42,840 --> 00:03:44,960 457 | to classify our testing data. 458 | 459 | 96 460 | 00:03:44,960 --> 00:03:47,449 461 | We know we have one 462 | flower of each type, 463 | 464 | 97 465 | 00:03:47,449 --> 00:03:50,180 466 | and we can print out 467 | the labels we expect. 
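A sketch of that split-and-train step, assuming the first flower of each species sits at index 0, 50, and 100:

import numpy as np
from sklearn import datasets, tree

iris = datasets.load_iris()
test_idx = [0, 50, 100]    # one setosa, one versicolor, one virginica

# Training data: everything except the three rows we held out.
train_data = np.delete(iris.data, test_idx, axis=0)
train_target = np.delete(iris.target, test_idx)

# Testing data: just the three rows we removed.
test_data = iris.data[test_idx]
test_target = iris.target[test_idx]

clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_data, train_target)

print(test_target)    # the labels we expect: [0 1 2]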
468 | 469 | 98 470 | 00:03:50,180 --> 00:03:52,160 471 | Now let's see what 472 | the tree predicts. 473 | 474 | 99 475 | 00:03:52,160 --> 00:03:54,460 476 | We'll give it the features 477 | for our testing data, 478 | 479 | 100 480 | 00:03:54,460 --> 00:03:56,349 481 | and we'll get back labels. 482 | 483 | 101 484 | 00:03:56,349 --> 00:03:59,660 485 | You can see the predicted 486 | labels match our testing data. 487 | 488 | 102 489 | 00:03:59,660 --> 00:04:01,550 490 | That means it got 491 | them all right. 492 | 493 | 103 494 | 00:04:01,550 --> 00:04:04,039 495 | Now, keep in mind, this 496 | was a very simple test, 497 | 498 | 104 499 | 00:04:04,039 --> 00:04:07,940 500 | and we'll go into more 501 | detail down the road. 502 | 503 | 105 504 | 00:04:07,940 --> 00:04:09,819 505 | Now let's visualize 506 | the tree so we can 507 | 508 | 106 509 | 00:04:09,819 --> 00:04:11,762 510 | see how the classifier works. 511 | 512 | 107 513 | 00:04:11,762 --> 00:04:13,220 514 | To do that, I'm 515 | going to copy-paste 516 | 517 | 108 518 | 00:04:13,220 --> 00:04:15,220 519 | some code in from 520 | scikit's tutorials, 521 | 522 | 109 523 | 00:04:15,220 --> 00:04:16,994 524 | and because this code 525 | is for visualization 526 | 527 | 110 528 | 00:04:16,994 --> 00:04:18,410 529 | and not machine-learning 530 | concepts, 531 | 532 | 111 533 | 00:04:18,410 --> 00:04:20,380 534 | I won't cover the details here. 535 | 536 | 112 537 | 00:04:20,380 --> 00:04:22,759 538 | Note that I'm combining the 539 | code from these two examples 540 | 541 | 113 542 | 00:04:22,759 --> 00:04:26,329 543 | to create an easy-to-read PDF. 544 | 545 | 114 546 | 00:04:26,329 --> 00:04:28,440 547 | I can run our script 548 | and open up the PDF, 549 | 550 | 115 551 | 00:04:28,440 --> 00:04:30,120 552 | and we can see the tree. 553 | 554 | 116 555 | 00:04:30,120 --> 00:04:33,810 556 | To use it to classify data, you 557 | start by reading from the top. 558 | 559 | 117 560 | 00:04:33,810 --> 00:04:35,829 561 | Each node asks a 562 | yes or no question 563 | 564 | 118 565 | 00:04:35,829 --> 00:04:37,504 566 | about one of the features. 567 | 568 | 119 569 | 00:04:37,504 --> 00:04:39,420 570 | For example, this node 571 | asks if the pedal width 572 | 573 | 120 574 | 00:04:39,420 --> 00:04:41,420 575 | is less than 0.8 centimeters. 576 | 577 | 121 578 | 00:04:41,420 --> 00:04:44,199 579 | If it's true for the example 580 | you're classifying, go left. 581 | 582 | 122 583 | 00:04:44,199 --> 00:04:46,170 584 | Otherwise, go right. 585 | 586 | 123 587 | 00:04:46,170 --> 00:04:48,589 588 | Now let's use this tree 589 | to classify an example 590 | 591 | 124 592 | 00:04:48,589 --> 00:04:50,130 593 | from our testing data. 594 | 595 | 125 596 | 00:04:50,130 --> 00:04:53,233 597 | Here are the features and label 598 | for our first testing flower. 599 | 600 | 126 601 | 00:04:53,233 --> 00:04:54,899 602 | Remember, you can 603 | find the feature names 604 | 605 | 127 606 | 00:04:54,899 --> 00:04:56,579 607 | by looking at the metadata. 608 | 609 | 128 610 | 00:04:56,579 --> 00:04:58,980 611 | We know this flower is 612 | a setosa, so let's see 613 | 614 | 129 615 | 00:04:58,980 --> 00:05:00,779 616 | what the tree predicts. 617 | 618 | 130 619 | 00:05:00,779 --> 00:05:03,290 620 | I'll resize the windows to 621 | make this easier to see. 
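The prediction step plus one possible version of that visualization code (the exact exports change between scikit-learn releases, and this sketch assumes the pydotplus and graphviz packages are installed):

import numpy as np
import pydotplus
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier, export_graphviz

iris = datasets.load_iris()
test_idx = [0, 50, 100]
clf = DecisionTreeClassifier()
clf.fit(np.delete(iris.data, test_idx, axis=0), np.delete(iris.target, test_idx))

print(clf.predict(iris.data[test_idx]))   # predictions for the held-out flowers

# Write the fitted tree out as an easy-to-read PDF.
dot_data = export_graphviz(clf, out_file=None,
                           feature_names=iris.feature_names,
                           class_names=iris.target_names,
                           filled=True, rounded=True, impurity=False)
pydotplus.graph_from_dot_data(dot_data).write_pdf("iris.pdf")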
622 | 623 | 131 624 | 00:05:03,290 --> 00:05:04,889 625 | And the first 626 | question the tree asks 627 | 628 | 132 629 | 00:05:04,889 --> 00:05:08,110 630 | is whether the petal width 631 | is less than 0.8 centimeters. 632 | 633 | 133 634 | 00:05:08,110 --> 00:05:09,540 635 | That's the fourth feature. 636 | 637 | 134 638 | 00:05:09,540 --> 00:05:11,709 639 | The answer is true, 640 | so we proceed left. 641 | 642 | 135 643 | 00:05:11,709 --> 00:05:14,149 644 | At this point, we're 645 | already at a leaf node. 646 | 647 | 136 648 | 00:05:14,149 --> 00:05:15,860 649 | There are no other 650 | questions to ask, 651 | 652 | 137 653 | 00:05:15,860 --> 00:05:18,490 654 | so the tree gives us 655 | a prediction, setosa, 656 | 657 | 138 658 | 00:05:18,490 --> 00:05:19,440 659 | and it's right. 660 | 661 | 139 662 | 00:05:19,440 --> 00:05:23,329 663 | Notice the label is 0, which 664 | indexes to that type of flower. 665 | 666 | 140 667 | 00:05:23,329 --> 00:05:25,930 668 | Now let's try our 669 | second testing example. 670 | 671 | 141 672 | 00:05:25,930 --> 00:05:27,319 673 | This one is a versicolor. 674 | 675 | 142 676 | 00:05:27,319 --> 00:05:29,329 677 | Let's see what 678 | the tree predicts. 679 | 680 | 143 681 | 00:05:29,329 --> 00:05:31,839 682 | Again we read from the top, 683 | and this time the petal width 684 | 685 | 144 686 | 00:05:31,839 --> 00:05:33,750 687 | is greater than 0.8 centimeters. 688 | 689 | 145 690 | 00:05:33,750 --> 00:05:35,839 691 | The answer to the tree's 692 | question is false, 693 | 694 | 146 695 | 00:05:35,839 --> 00:05:36,829 696 | so we go right. 697 | 698 | 147 699 | 00:05:36,829 --> 00:05:39,245 700 | The next question the tree 701 | asks is whether the petal width 702 | 703 | 148 704 | 00:05:39,245 --> 00:05:40,709 705 | is less than 1.75. 706 | 707 | 149 708 | 00:05:40,709 --> 00:05:42,410 709 | It's trying to narrow it down. 710 | 711 | 150 712 | 00:05:42,410 --> 00:05:44,440 713 | That's true, so we go left. 714 | 715 | 151 716 | 00:05:44,440 --> 00:05:47,319 717 | Now it asks if the petal 718 | length is less than 4.95. 719 | 720 | 152 721 | 00:05:47,319 --> 00:05:49,180 722 | That's true, so 723 | we go left again. 724 | 725 | 153 726 | 00:05:49,180 --> 00:05:51,130 727 | And finally, the tree 728 | asks if the petal width 729 | 730 | 154 731 | 00:05:51,130 --> 00:05:52,810 732 | is less than 1.65. 733 | 734 | 155 735 | 00:05:52,810 --> 00:05:54,300 736 | That's true, so left it is. 737 | 738 | 156 739 | 00:05:54,300 --> 00:05:57,029 740 | And now we have our 741 | prediction-- it's a versicolor, 742 | 743 | 157 744 | 00:05:57,029 --> 00:05:58,610 745 | and that's right again. 746 | 747 | 158 748 | 00:05:58,610 --> 00:06:01,170 749 | You can try the last one 750 | on your own as an exercise. 751 | 752 | 159 753 | 00:06:01,170 --> 00:06:03,079 754 | And remember, the way 755 | we're using the tree 756 | 757 | 160 758 | 00:06:03,079 --> 00:06:05,607 759 | is the same way 760 | it works in code. 761 | 762 | 161 763 | 00:06:05,607 --> 00:06:07,440 764 | So that's how you quickly 765 | visualize and read 766 | 767 | 162 768 | 00:06:07,440 --> 00:06:08,285 769 | a decision tree. 770 | 771 | 163 772 | 00:06:08,285 --> 00:06:09,660 773 | There's a lot more 774 | to learn here, 775 | 776 | 164 777 | 00:06:09,660 --> 00:06:12,720 778 | especially how they're built 779 | automatically from examples. 780 | 781 | 165 782 | 00:06:12,720 --> 00:06:14,620 783 | We'll get to that 784 | in a future episode.
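Written out by hand, the questions traced above amount to something like the following (thresholds read off one particular run's PDF, so yours may differ, and the real tree has more branches than this sketch shows):

def classify_iris(sepal_length, sepal_width, petal_length, petal_width):
    if petal_width < 0.8:
        return "setosa"
    if petal_width < 1.75 and petal_length < 4.95 and petal_width < 1.65:
        return "versicolor"
    return "virginica"   # simplified fallback; the full tree keeps asking questions

print(classify_iris(5.1, 3.5, 1.4, 0.2))   # first testing flower -> setosa
print(classify_iris(7.0, 3.2, 4.7, 1.4))   # second testing flower -> versicolor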
785 | 786 | 166 787 | 00:06:14,620 --> 00:06:17,019 788 | But for now, let's close 789 | with an essential point. 790 | 791 | 167 792 | 00:06:17,019 --> 00:06:19,519 793 | Every question the tree 794 | asks must be about one 795 | 796 | 168 797 | 00:06:19,519 --> 00:06:20,264 798 | of your features. 799 | 800 | 169 801 | 00:06:20,264 --> 00:06:22,680 802 | That means the better your 803 | features are, the better a tree 804 | 805 | 170 806 | 00:06:22,680 --> 00:06:23,630 807 | you can build. 808 | 809 | 171 810 | 00:06:23,630 --> 00:06:25,300 811 | And the next episode 812 | will start looking 813 | 814 | 172 815 | 00:06:25,300 --> 00:06:26,514 816 | at what makes a good feature. 817 | 818 | 173 819 | 00:06:26,514 --> 00:06:28,930 820 | Thanks very much for watching, 821 | and I'll see you next time. 822 | 823 | 174 824 | 00:06:28,930 --> 00:06:31,980 825 | [MUSIC PLAYING] 826 | 827 | 175 828 | 00:06:31,980 --> 00:06:41,000 829 | Subtitles End: mo.dbxdb.com 830 | 831 | -------------------------------------------------------------------------------- /subtitle/Eng/What Makes a Good Feature - Machine Learning Recipes #3.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:00,000 3 | Youtube subtitles download by mo.dbxdb.com 4 | 5 | 2 6 | 00:00:00,000 --> 00:00:06,765 7 | 8 | 9 | 3 10 | 00:00:06,765 --> 00:00:08,140 11 | JOSH GORDON: 12 | Classifiers are only 13 | 14 | 4 15 | 00:00:08,140 --> 00:00:10,270 16 | as good as the 17 | features you provide. 18 | 19 | 5 20 | 00:00:10,270 --> 00:00:12,060 21 | That means coming up 22 | with good features 23 | 24 | 6 25 | 00:00:12,060 --> 00:00:14,740 26 | is one of your most important 27 | jobs in machine learning. 28 | 29 | 7 30 | 00:00:14,740 --> 00:00:17,059 31 | But what makes a good 32 | feature, and how can you tell? 33 | 34 | 8 35 | 00:00:17,059 --> 00:00:19,400 36 | If you're doing 37 | binary classification, 38 | 39 | 9 40 | 00:00:19,400 --> 00:00:21,670 41 | then a good feature 42 | makes it easy to decide 43 | 44 | 10 45 | 00:00:21,670 --> 00:00:23,270 46 | between two different things. 47 | 48 | 11 49 | 00:00:23,270 --> 00:00:26,100 50 | For example, imagine we 51 | wanted to write a classifier 52 | 53 | 12 54 | 00:00:26,100 --> 00:00:29,090 55 | to tell the difference 56 | between two types of dogs-- 57 | 58 | 13 59 | 00:00:29,090 --> 00:00:30,890 60 | greyhounds and Labradors. 61 | 62 | 14 63 | 00:00:30,890 --> 00:00:34,090 64 | Here we'll use two features-- 65 | the dog's height in inches 66 | 67 | 15 68 | 00:00:34,090 --> 00:00:35,490 69 | and their eye color. 70 | 71 | 16 72 | 00:00:35,490 --> 00:00:38,490 73 | Just for this toy example, 74 | let's make a couple assumptions 75 | 76 | 17 77 | 00:00:38,490 --> 00:00:40,930 78 | about dogs to keep 79 | things simple. 80 | 81 | 18 82 | 00:00:40,930 --> 00:00:43,049 83 | First, we'll say that 84 | greyhounds are usually 85 | 86 | 19 87 | 00:00:43,049 --> 00:00:44,180 88 | taller than Labradors. 89 | 90 | 20 91 | 00:00:44,180 --> 00:00:47,020 92 | Next, we'll pretend that 93 | dogs have only two eye 94 | 95 | 21 96 | 00:00:47,020 --> 00:00:48,750 97 | colors-- blue and brown. 98 | 99 | 22 100 | 00:00:48,750 --> 00:00:50,760 101 | And we'll say the 102 | color of their eyes 103 | 104 | 23 105 | 00:00:50,760 --> 00:00:53,160 106 | doesn't depend on 107 | the breed of dog. 
108 | 109 | 24 110 | 00:00:53,160 --> 00:00:55,520 111 | This means that one of 112 | these features is useful 113 | 114 | 25 115 | 00:00:55,520 --> 00:00:57,480 116 | and the other tells us nothing. 117 | 118 | 26 119 | 00:00:57,480 --> 00:01:01,260 120 | To understand why, we'll 121 | visualize them using a toy 122 | 123 | 27 124 | 00:01:01,260 --> 00:01:02,970 125 | dataset I'll create. 126 | 127 | 28 128 | 00:01:02,970 --> 00:01:04,300 129 | Let's begin with height. 130 | 131 | 29 132 | 00:01:04,300 --> 00:01:06,650 133 | How useful do you 134 | think this feature is? 135 | 136 | 30 137 | 00:01:06,650 --> 00:01:08,069 138 | Well, on average, 139 | greyhounds tend 140 | 141 | 31 142 | 00:01:08,069 --> 00:01:11,310 143 | to be a couple inches taller 144 | than Labradors, but not always. 145 | 146 | 32 147 | 00:01:11,310 --> 00:01:13,736 148 | There's a lot of 149 | variation in the world. 150 | 151 | 33 152 | 00:01:13,736 --> 00:01:15,110 153 | So when we think 154 | of a feature, we 155 | 156 | 34 157 | 00:01:15,110 --> 00:01:17,620 158 | have to consider how it 159 | looks for different values 160 | 161 | 35 162 | 00:01:17,620 --> 00:01:19,630 163 | in a population. 164 | 165 | 36 166 | 00:01:19,630 --> 00:01:22,360 167 | Let's head into Python for 168 | a programmatic example. 169 | 170 | 37 171 | 00:01:22,360 --> 00:01:24,440 172 | I'm creating a 173 | population of 1,000 174 | 175 | 38 176 | 00:01:24,440 --> 00:01:27,736 177 | dogs-- 50-50 greyhound Labrador. 178 | 179 | 39 180 | 00:01:27,736 --> 00:01:29,069 181 | I'll give each of them a height. 182 | 183 | 40 184 | 00:01:29,069 --> 00:01:31,500 185 | For this example, we'll 186 | say that greyhounds 187 | 188 | 41 189 | 00:01:31,500 --> 00:01:35,510 190 | are on average 28 inches 191 | tall and Labradors are 24. 192 | 193 | 42 194 | 00:01:35,510 --> 00:01:37,563 195 | Now, all dogs are 196 | a bit different. 197 | 198 | 43 199 | 00:01:37,563 --> 00:01:39,480 200 | Let's say that height 201 | is normally distributed, 202 | 203 | 44 204 | 00:01:39,480 --> 00:01:42,790 205 | so we'll make both of these 206 | plus or minus 4 inches. 207 | 208 | 45 209 | 00:01:42,790 --> 00:01:44,660 210 | This will give us two 211 | arrays of numbers, 212 | 213 | 46 214 | 00:01:44,660 --> 00:01:47,200 215 | and we can visualize 216 | them in a histogram. 217 | 218 | 47 219 | 00:01:47,200 --> 00:01:49,520 220 | I'll add a parameter so 221 | greyhounds are in red 222 | 223 | 48 224 | 00:01:49,520 --> 00:01:51,319 225 | and Labradors are in blue. 226 | 227 | 49 228 | 00:01:51,319 --> 00:01:53,319 229 | Now we can run our script. 230 | 231 | 50 232 | 00:01:53,319 --> 00:01:57,459 233 | This shows how many dogs in our 234 | population have a given height. 235 | 236 | 51 237 | 00:01:57,459 --> 00:01:58,959 238 | There's a lot of 239 | data on the screen, 240 | 241 | 52 242 | 00:01:58,959 --> 00:02:03,202 243 | so let's simplify it and 244 | look at it piece by piece. 245 | 246 | 53 247 | 00:02:03,202 --> 00:02:05,230 248 | We'll start with 249 | dogs on the far left 250 | 251 | 54 252 | 00:02:05,230 --> 00:02:08,599 253 | of the distribution-- say, 254 | who are about 20 inches tall. 255 | 256 | 55 257 | 00:02:08,599 --> 00:02:11,380 258 | Imagine I asked you to predict 259 | whether a dog with his height 260 | 261 | 56 262 | 00:02:11,380 --> 00:02:13,300 263 | was a lab or a greyhound. 264 | 265 | 57 266 | 00:02:13,300 --> 00:02:14,180 267 | What would you do? 
268 | 269 | 58 270 | 00:02:14,180 --> 00:02:16,710 271 | Well, you could figure out 272 | the probability of each type 273 | 274 | 59 275 | 00:02:16,710 --> 00:02:18,669 276 | of dog given their height. 277 | 278 | 60 279 | 00:02:18,669 --> 00:02:20,940 280 | Here, it's more likely 281 | the dog is a lab. 282 | 283 | 61 284 | 00:02:20,940 --> 00:02:22,967 285 | On the other hand, 286 | if we go all the way 287 | 288 | 62 289 | 00:02:22,967 --> 00:02:24,550 290 | to the right of the 291 | histogram and look 292 | 293 | 63 294 | 00:02:24,550 --> 00:02:26,949 295 | at a dog who is 296 | 35 inches tall, we 297 | 298 | 64 299 | 00:02:26,949 --> 00:02:29,449 300 | can be pretty confident 301 | they're a greyhound. 302 | 303 | 65 304 | 00:02:29,449 --> 00:02:31,300 305 | Now, what about a 306 | dog in the middle? 307 | 308 | 66 309 | 00:02:31,300 --> 00:02:33,520 310 | You can see the graph 311 | gives us less information 312 | 313 | 67 314 | 00:02:33,520 --> 00:02:36,750 315 | here, because the probability 316 | of each type of dog is close. 317 | 318 | 68 319 | 00:02:36,750 --> 00:02:40,220 320 | So height is a useful 321 | feature, but it's not perfect. 322 | 323 | 69 324 | 00:02:40,220 --> 00:02:42,280 325 | That's why in machine 326 | learning, you almost always 327 | 328 | 70 329 | 00:02:42,280 --> 00:02:43,482 330 | need multiple features. 331 | 332 | 71 333 | 00:02:43,482 --> 00:02:45,440 334 | Otherwise, you could just 335 | write an if statement 336 | 337 | 72 338 | 00:02:45,440 --> 00:02:47,160 339 | instead of bothering 340 | with the classifier. 341 | 342 | 73 343 | 00:02:47,160 --> 00:02:50,590 344 | To figure out what types 345 | of features you should use, 346 | 347 | 74 348 | 00:02:50,590 --> 00:02:52,389 349 | do a thought experiment. 350 | 351 | 75 352 | 00:02:52,389 --> 00:02:53,819 353 | Pretend you're the classifier. 354 | 355 | 76 356 | 00:02:53,819 --> 00:02:55,870 357 | If you were trying to 358 | figure out if this dog is 359 | 360 | 77 361 | 00:02:55,870 --> 00:03:00,167 362 | a lab or a greyhound, what other 363 | things would you want to know? 364 | 365 | 78 366 | 00:03:00,167 --> 00:03:01,750 367 | You might ask about 368 | their hair length, 369 | 370 | 79 371 | 00:03:01,750 --> 00:03:04,680 372 | or how fast they can run, 373 | or how much they weigh. 374 | 375 | 80 376 | 00:03:04,680 --> 00:03:06,979 377 | Exactly how many 378 | features you should use 379 | 380 | 81 381 | 00:03:06,979 --> 00:03:08,550 382 | is more of an art 383 | than a science, 384 | 385 | 82 386 | 00:03:08,550 --> 00:03:10,720 387 | but as a rule of thumb, 388 | think about how many you'd 389 | 390 | 83 391 | 00:03:10,720 --> 00:03:12,620 392 | need to solve the problem. 393 | 394 | 84 395 | 00:03:12,620 --> 00:03:15,590 396 | Now let's look at another 397 | feature like eye color. 398 | 399 | 85 400 | 00:03:15,590 --> 00:03:17,470 401 | Just for this toy 402 | example, let's imagine 403 | 404 | 86 405 | 00:03:17,470 --> 00:03:20,500 406 | dogs have only two eye 407 | colors, blue and brown. 408 | 409 | 87 410 | 00:03:20,500 --> 00:03:22,099 411 | And let's say the 412 | color of their eyes 413 | 414 | 88 415 | 00:03:22,099 --> 00:03:24,500 416 | doesn't depend on 417 | the breed of dog. 418 | 419 | 89 420 | 00:03:24,500 --> 00:03:28,590 421 | Here's what a histogram might 422 | look like for this example. 423 | 424 | 90 425 | 00:03:28,590 --> 00:03:32,169 426 | For most values, the 427 | distribution is about 50/50. 
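A sketch of the toy population and its histogram (the numbers follow the description above; the plotting details are one reasonable guess):

import numpy as np
import matplotlib.pyplot as plt

greyhounds = 500
labs = 500

# Heights: roughly normal, centered at 28 inches for greyhounds and
# 24 inches for Labradors, each plus or minus about 4 inches.
grey_height = 28 + 4 * np.random.randn(greyhounds)
lab_height = 24 + 4 * np.random.randn(labs)

plt.hist([grey_height, lab_height], stacked=True, color=['r', 'b'])
plt.show()

# Eye color, by contrast, comes out about 50/50 for both breeds,
# so a histogram of it carries no information about the type of dog.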
428 | 
429 | 91
430 | 00:03:32,169 --> 00:03:33,849
431 | So this feature
432 | tells us nothing,
433 | 
434 | 92
435 | 00:03:33,849 --> 00:03:36,110
436 | because it doesn't correlate
437 | with the type of dog.
438 | 
439 | 93
440 | 00:03:36,110 --> 00:03:39,199
441 | Including a useless feature
442 | like this in your training
443 | 
444 | 94
445 | 00:03:39,199 --> 00:03:41,940
446 | data can hurt your
447 | classifier's accuracy.
448 | 
449 | 95
450 | 00:03:41,940 --> 00:03:45,210
451 | That's because there's a chance
452 | they might appear useful purely
453 | 
454 | 96
455 | 00:03:45,210 --> 00:03:48,430
456 | by accident, especially if
457 | you have only a small amount
458 | 
459 | 97
460 | 00:03:48,430 --> 00:03:50,039
461 | of training data.
462 | 
463 | 98
464 | 00:03:50,039 --> 00:03:52,319
465 | You also want your
466 | features to be independent.
467 | 
468 | 99
469 | 00:03:52,319 --> 00:03:54,599
470 | And independent
471 | features give you
472 | 
473 | 100
474 | 00:03:54,599 --> 00:03:56,870
475 | different types of information.
476 | 
477 | 101
478 | 00:03:56,870 --> 00:03:59,360
479 | Imagine we already have a
480 | feature-- height in inches--
481 | 
482 | 102
483 | 00:03:59,360 --> 00:04:00,800
484 | in our dataset.
485 | 
486 | 103
487 | 00:04:00,800 --> 00:04:02,250
488 | Ask yourself,
489 | would it be helpful
490 | 
491 | 104
492 | 00:04:02,250 --> 00:04:05,800
493 | if we added another feature,
494 | like height in centimeters?
495 | 
496 | 105
497 | 00:04:05,800 --> 00:04:08,229
498 | No, because it's perfectly
499 | correlated with one
500 | 
501 | 106
502 | 00:04:08,229 --> 00:04:09,410
503 | we already have.
504 | 
505 | 107
506 | 00:04:09,410 --> 00:04:12,650
507 | It's good practice to remove
508 | highly correlated features
509 | 
510 | 108
511 | 00:04:12,650 --> 00:04:14,032
512 | from your training data.
513 | 
514 | 109
515 | 00:04:14,032 --> 00:04:15,490
516 | That's because a
517 | lot of classifiers
518 | 
519 | 110
520 | 00:04:15,490 --> 00:04:18,190
521 | aren't smart enough to
522 | realize that height in inches
523 | 
524 | 111
525 | 00:04:18,190 --> 00:04:20,199
526 | and centimeters are
527 | the same thing,
528 | 
529 | 112
530 | 00:04:20,199 --> 00:04:23,339
531 | so they might double count
532 | how important this feature is.
533 | 
534 | 113
535 | 00:04:23,339 --> 00:04:26,600
536 | Last, you want your features
537 | to be easy to understand.
538 | 
539 | 114
540 | 00:04:26,600 --> 00:04:28,730
541 | For a new example,
542 | imagine you want
543 | 
544 | 115
545 | 00:04:28,730 --> 00:04:30,329
546 | to predict how many
547 | days it will take
548 | 
549 | 116
550 | 00:04:30,329 --> 00:04:33,579
551 | to mail a letter between
552 | two different cities.
553 | 
554 | 117
555 | 00:04:33,579 --> 00:04:37,130
556 | The farther apart the cities
557 | are, the longer it will take.
558 | 
559 | 118
560 | 00:04:37,130 --> 00:04:39,649
561 | A great feature to use
562 | would be the distance
563 | 
564 | 119
565 | 00:04:39,649 --> 00:04:42,199
566 | between the cities in miles.
567 | 
568 | 120
569 | 00:04:42,199 --> 00:04:44,220
570 | A much worse pair
571 | of features to use
572 | 
573 | 121
574 | 00:04:44,220 --> 00:04:47,160
575 | would be the cities' locations
576 | given by their latitude
577 | 
578 | 122
579 | 00:04:47,160 --> 00:04:48,259
580 | and longitude.
581 | 
582 | 123
583 | 00:04:48,259 --> 00:04:48,259
584 | And here's why.
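The inches-versus-centimeters point is easy to verify numerically. A tiny sketch (the conversion and the variable names are illustrative, not from the episode):

import numpy as np

height_in = 24 + 4 * np.random.randn(1000)
height_cm = height_in * 2.54              # the same feature, just rescaled

# Correlation coefficient between the two columns is 1.0 (up to rounding),
# so the second column adds no new information.
print(np.corrcoef(height_in, height_cm)[0, 1])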
585 | 586 | 124 587 | 00:04:48,970 --> 00:04:51,120 588 | I can look at the 589 | distance and make 590 | 591 | 125 592 | 00:04:51,120 --> 00:04:54,100 593 | a good guess of how long it 594 | will take the letter to arrive. 595 | 596 | 126 597 | 00:04:54,100 --> 00:04:56,880 598 | But learning the relationship 599 | between latitude, longitude, 600 | 601 | 127 602 | 00:04:56,880 --> 00:05:00,019 603 | and time is much harder 604 | and would require many more 605 | 606 | 128 607 | 00:05:00,019 --> 00:05:01,985 608 | examples in your training data. 609 | 610 | 129 611 | 00:05:01,985 --> 00:05:03,360 612 | Now, there are 613 | techniques you can 614 | 615 | 130 616 | 00:05:03,360 --> 00:05:05,970 617 | use to figure out exactly 618 | how useful your features are, 619 | 620 | 131 621 | 00:05:05,970 --> 00:05:08,920 622 | and even what combinations 623 | of them are best, 624 | 625 | 132 626 | 00:05:08,920 --> 00:05:11,389 627 | so you never have to 628 | leave it to chance. 629 | 630 | 133 631 | 00:05:11,389 --> 00:05:13,769 632 | We'll get to those 633 | in a future episode. 634 | 635 | 134 636 | 00:05:13,769 --> 00:05:16,230 637 | Coming up next time, we'll 638 | continue building our intuition 639 | 640 | 135 641 | 00:05:16,230 --> 00:05:17,750 642 | for supervised learning. 643 | 644 | 136 645 | 00:05:17,750 --> 00:05:19,680 646 | We'll show how different 647 | types of classifiers 648 | 649 | 137 650 | 00:05:19,680 --> 00:05:22,290 651 | can be used to solve the same 652 | problem and dive a little bit 653 | 654 | 138 655 | 00:05:22,290 --> 00:05:24,240 656 | deeper into how they work. 657 | 658 | 139 659 | 00:05:24,240 --> 00:05:27,220 660 | Thanks very much for watching, 661 | and I'll see you then. 662 | 663 | 140 664 | 00:05:27,220 --> 00:05:40,000 665 | Subtitles End: mo.dbxdb.com 666 | 667 | -------------------------------------------------------------------------------- /subtitle/Eng/Writing Our First Classifier - Machine Learning Recipes #5.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:00,000 3 | Youtube subtitles download by mo.dbxdb.com 4 | 5 | 2 6 | 00:00:00,000 --> 00:00:06,030 7 | [MUSIC PLAYING] 8 | 9 | 3 10 | 00:00:06,030 --> 00:00:06,030 11 | Hey, everyone. 12 | 13 | 4 14 | 00:00:06,710 --> 00:00:07,910 15 | Welcome back. 16 | 17 | 5 18 | 00:00:07,910 --> 00:00:10,220 19 | In this episode, we're going 20 | to do something special, 21 | 22 | 6 23 | 00:00:10,220 --> 00:00:13,164 24 | and that's write our own 25 | classifier from scratch. 26 | 27 | 7 28 | 00:00:13,164 --> 00:00:14,580 29 | If you're new to 30 | machine learning, 31 | 32 | 8 33 | 00:00:14,580 --> 00:00:16,170 34 | this is a big milestone. 35 | 36 | 9 37 | 00:00:16,170 --> 00:00:18,560 38 | Because if you can follow 39 | along and do this on your own, 40 | 41 | 10 42 | 00:00:18,560 --> 00:00:21,890 43 | it means you understand an 44 | important piece of the puzzle. 45 | 46 | 11 47 | 00:00:21,890 --> 00:00:23,670 48 | The classifier we're 49 | going to write today 50 | 51 | 12 52 | 00:00:23,670 --> 00:00:26,390 53 | is a scrappy version 54 | of k-Nearest Neighbors. 55 | 56 | 13 57 | 00:00:26,390 --> 00:00:29,660 58 | That's one of the simplest 59 | classifiers around. 60 | 61 | 14 62 | 00:00:29,660 --> 00:00:32,860 63 | First, here's a quick outline of 64 | what we'll do in this episode. 
65 | 66 | 15 67 | 00:00:32,860 --> 00:00:35,170 68 | We'll start with our code 69 | from Episode 4, Let's 70 | 71 | 16 72 | 00:00:35,170 --> 00:00:36,570 73 | Write a Pipeline. 74 | 75 | 17 76 | 00:00:36,570 --> 00:00:39,290 77 | Recall in that episode we 78 | did a simple experiment. 79 | 80 | 18 81 | 00:00:39,290 --> 00:00:42,720 82 | We imported a data set and 83 | split it into train and test. 84 | 85 | 19 86 | 00:00:42,720 --> 00:00:44,580 87 | We used train to 88 | train a classifier, 89 | 90 | 20 91 | 00:00:44,580 --> 00:00:47,150 92 | and test to see how 93 | accurate it was. 94 | 95 | 21 96 | 00:00:47,150 --> 00:00:48,760 97 | Writing the 98 | classifier is the part 99 | 100 | 22 101 | 00:00:48,760 --> 00:00:50,940 102 | we're going to focus on today. 103 | 104 | 23 105 | 00:00:50,940 --> 00:00:52,740 106 | Previously we imported 107 | the classifier 108 | 109 | 24 110 | 00:00:52,740 --> 00:00:55,280 111 | from a library using 112 | these two lines. 113 | 114 | 25 115 | 00:00:55,280 --> 00:00:58,190 116 | Here we'll comment them 117 | out and write our own. 118 | 119 | 26 120 | 00:00:58,190 --> 00:01:01,539 121 | The rest of the pipeline 122 | will stay exactly the same. 123 | 124 | 27 125 | 00:01:01,539 --> 00:01:03,830 126 | I'll pop in and out of the 127 | screencast to explain things 128 | 129 | 28 130 | 00:01:03,830 --> 00:01:05,940 131 | as we go along. 132 | 133 | 29 134 | 00:01:05,940 --> 00:01:08,530 135 | To start, let's run our 136 | pipeline to remind ourselves 137 | 138 | 30 139 | 00:01:08,530 --> 00:01:10,120 140 | what the accuracy was. 141 | 142 | 31 143 | 00:01:10,120 --> 00:01:12,370 144 | As you can see, it's over 90%. 145 | 146 | 32 147 | 00:01:12,370 --> 00:01:14,260 148 | And that's the goal 149 | for the classifier 150 | 151 | 33 152 | 00:01:14,260 --> 00:01:15,660 153 | we'll write ourselves. 154 | 155 | 34 156 | 00:01:15,660 --> 00:01:17,680 157 | Now let's comment 158 | out that import. 159 | 160 | 35 161 | 00:01:17,680 --> 00:01:19,540 162 | Right off the bat, 163 | this breaks our code. 164 | 165 | 36 166 | 00:01:19,540 --> 00:01:22,250 167 | So the first thing we need 168 | to do is fix our pipeline. 169 | 170 | 37 171 | 00:01:22,250 --> 00:01:24,849 172 | And to do that, we'll implement 173 | a class for our classifier. 174 | 175 | 38 176 | 00:01:24,849 --> 00:01:27,690 177 | 178 | 179 | 39 180 | 00:01:27,690 --> 00:01:29,580 181 | I'll call it ScrappyKNN. 182 | 183 | 40 184 | 00:01:29,580 --> 00:01:31,690 185 | And by scrappy, I 186 | mean bare bones. 187 | 188 | 41 189 | 00:01:31,690 --> 00:01:33,709 190 | Just enough to get it working. 191 | 192 | 42 193 | 00:01:33,709 --> 00:01:38,110 194 | Next, I'll change our 195 | pipeline to use it. 196 | 197 | 43 198 | 00:01:38,110 --> 00:01:40,880 199 | Now let's see what methods 200 | we need to implement. 201 | 202 | 44 203 | 00:01:40,880 --> 00:01:42,950 204 | Looking at the interface 205 | for a classifier, 206 | 207 | 45 208 | 00:01:42,950 --> 00:01:45,500 209 | we see there are two 210 | we care about-- fit, 211 | 212 | 46 213 | 00:01:45,500 --> 00:01:47,470 214 | which does the 215 | training, and predict, 216 | 217 | 47 218 | 00:01:47,470 --> 00:01:49,270 219 | which does the prediction. 220 | 221 | 48 222 | 00:01:49,270 --> 00:01:51,599 223 | First we'll declare 224 | our fit method. 
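At this point the class is only a stub exposing the two methods the pipeline calls. A sketch of what the transcript describes (the pipeline itself, with these methods filled in, is sketched after the next passage):

# from sklearn.neighbors import KNeighborsClassifier   # commented out; we write our own

class ScrappyKNN():
    def fit(self, X_train, y_train):
        pass      # training will go here

    def predict(self, X_test):
        pass      # prediction will go here

my_classifier = ScrappyKNN()   # the rest of the pipeline stays exactly the same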
225 | 226 | 49 227 | 00:01:51,599 --> 00:01:54,440 228 | Remember this takes the features 229 | and labels for the training set 230 | 231 | 50 232 | 00:01:54,440 --> 00:01:57,680 233 | as input, so we'll add 234 | parameters for those. 235 | 236 | 51 237 | 00:01:57,680 --> 00:02:00,090 238 | Now let's move on to 239 | our predict method. 240 | 241 | 52 242 | 00:02:00,090 --> 00:02:03,470 243 | As input, this receives the 244 | features for our testing data. 245 | 246 | 53 247 | 00:02:03,470 --> 00:02:06,920 248 | And as output, it returns 249 | predictions for the labels. 250 | 251 | 54 252 | 00:02:06,920 --> 00:02:09,389 253 | Our first goal is to get 254 | the pipeline working, 255 | 256 | 55 257 | 00:02:09,389 --> 00:02:12,229 258 | and to understand 259 | what these methods do. 260 | 261 | 56 262 | 00:02:12,229 --> 00:02:14,180 263 | So before we write 264 | our real classifier, 265 | 266 | 57 267 | 00:02:14,180 --> 00:02:15,810 268 | we'll start with 269 | something simpler. 270 | 271 | 58 272 | 00:02:15,810 --> 00:02:18,240 273 | We'll write a random classifier. 274 | 275 | 59 276 | 00:02:18,240 --> 00:02:21,690 277 | And by random, I mean 278 | we'll just guess the label. 279 | 280 | 60 281 | 00:02:21,690 --> 00:02:25,310 282 | To start, we'll add some code 283 | to the fit and predict methods. 284 | 285 | 61 286 | 00:02:25,310 --> 00:02:28,460 287 | In fit, I'll store the 288 | training data in this class. 289 | 290 | 62 291 | 00:02:28,460 --> 00:02:30,479 292 | You can think of this 293 | as just memorizing it. 294 | 295 | 63 296 | 00:02:30,479 --> 00:02:32,840 297 | And you'll see why 298 | we do that later on. 299 | 300 | 64 301 | 00:02:32,840 --> 00:02:34,669 302 | Inside the predict 303 | method, remember 304 | 305 | 65 306 | 00:02:34,669 --> 00:02:37,410 307 | that we'll need to return 308 | a list of predictions. 309 | 310 | 66 311 | 00:02:37,410 --> 00:02:40,090 312 | That's because the parameter, 313 | X_test, is actually 314 | 315 | 67 316 | 00:02:40,090 --> 00:02:42,729 317 | a 2D array, or list of lists. 318 | 319 | 68 320 | 00:02:42,729 --> 00:02:46,410 321 | Each row contains the features 322 | for one testing example. 323 | 324 | 69 325 | 00:02:46,410 --> 00:02:48,169 326 | To make a prediction 327 | for each row, 328 | 329 | 70 330 | 00:02:48,169 --> 00:02:50,830 331 | I'll just randomly pick a 332 | label from the training data 333 | 334 | 71 335 | 00:02:50,830 --> 00:02:53,340 336 | and append that to 337 | our predictions. 338 | 339 | 72 340 | 00:02:53,340 --> 00:02:55,660 341 | At this point, our 342 | pipeline is working again. 343 | 344 | 73 345 | 00:02:55,660 --> 00:02:58,028 346 | So let's run it and 347 | see how well it does. 348 | 349 | 74 350 | 00:02:58,028 --> 00:03:00,069 351 | Recall there are three 352 | different types of flowers 353 | 354 | 75 355 | 00:03:00,069 --> 00:03:05,069 356 | in the iris dataset, so 357 | accuracy should be about 33%. 358 | 359 | 76 360 | 00:03:05,069 --> 00:03:07,110 361 | Now we know the interface 362 | for a classifier. 363 | 364 | 77 365 | 00:03:07,110 --> 00:03:09,060 366 | But when we started 367 | this exercise, 368 | 369 | 78 370 | 00:03:09,060 --> 00:03:11,770 371 | our accuracy was above 90%. 372 | 373 | 79 374 | 00:03:11,770 --> 00:03:14,500 375 | So let's see if 376 | we can do better. 377 | 378 | 80 379 | 00:03:14,500 --> 00:03:16,849 380 | To do that, we'll 381 | implement our classifier, 382 | 383 | 81 384 | 00:03:16,849 --> 00:03:19,380 385 | which is based on 386 | k-Nearest Neighbors. 
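A runnable sketch of the random-guess version described above: fit memorizes the training set, and predict picks a random training label for each test row, so on iris (three classes) accuracy lands near 33%. (In recent scikit-learn releases train_test_split lives in sklearn.model_selection; older versions exposed it from sklearn.cross_validation.)

import random

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class ScrappyKNN():
    def fit(self, X_train, y_train):
        # "Memorize" the training data
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        # X_test is a 2D array: one row of features per testing example
        predictions = []
        for row in X_test:
            label = random.choice(self.y_train)   # just guess for now
            predictions.append(label)
        return predictions

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=.5)

my_classifier = ScrappyKNN()
my_classifier.fit(X_train, y_train)
predictions = my_classifier.predict(X_test)
print(accuracy_score(y_test, predictions))   # roughly 0.33 for random guessing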
387 | 388 | 82 389 | 00:03:19,380 --> 00:03:22,468 390 | Here's the intuition for 391 | how that algorithm works. 392 | 393 | 83 394 | 00:03:22,468 --> 00:03:24,759 395 | Let's return to our drawings 396 | of green dots and red dots 397 | 398 | 84 399 | 00:03:24,759 --> 00:03:26,400 400 | from the last episode. 401 | 402 | 85 403 | 00:03:26,400 --> 00:03:27,990 404 | Imagine the dots we 405 | see on the screen 406 | 407 | 86 408 | 00:03:27,990 --> 00:03:30,880 409 | are the training data we 410 | memorized in the fit method, 411 | 412 | 87 413 | 00:03:30,880 --> 00:03:33,419 414 | say for a toy dataset. 415 | 416 | 88 417 | 00:03:33,419 --> 00:03:36,220 418 | Now imagine we're asked to make 419 | a prediction for this testing 420 | 421 | 89 422 | 00:03:36,220 --> 00:03:38,259 423 | point that I'll 424 | draw here in gray. 425 | 426 | 90 427 | 00:03:38,259 --> 00:03:39,960 428 | How can we do that? 429 | 430 | 91 431 | 00:03:39,960 --> 00:03:41,889 432 | Well in a nearest 433 | neighbor classifier, 434 | 435 | 92 436 | 00:03:41,889 --> 00:03:44,220 437 | it works exactly like it sounds. 438 | 439 | 93 440 | 00:03:44,220 --> 00:03:45,940 441 | We'll find the 442 | training point that's 443 | 444 | 94 445 | 00:03:45,940 --> 00:03:48,069 446 | closest to the testing point. 447 | 448 | 95 449 | 00:03:48,069 --> 00:03:50,392 450 | This point is the 451 | nearest neighbor. 452 | 453 | 96 454 | 00:03:50,392 --> 00:03:51,849 455 | Then we'll predict 456 | that the testing 457 | 458 | 97 459 | 00:03:51,849 --> 00:03:54,169 460 | point has the same label. 461 | 462 | 98 463 | 00:03:54,169 --> 00:03:56,880 464 | For example, we'll guess that 465 | this testing dot is green, 466 | 467 | 99 468 | 00:03:56,880 --> 00:03:59,880 469 | because that's the color 470 | of its nearest neighbor. 471 | 472 | 100 473 | 00:03:59,880 --> 00:04:02,430 474 | As another example, if we 475 | had a testing dot over here, 476 | 477 | 101 478 | 00:04:02,430 --> 00:04:04,169 479 | we'd guess that it's red. 480 | 481 | 102 482 | 00:04:04,169 --> 00:04:06,400 483 | Now what about this one 484 | right in the middle? 485 | 486 | 103 487 | 00:04:06,400 --> 00:04:08,729 488 | Imagine that this dot is 489 | equidistant to the nearest 490 | 491 | 104 492 | 00:04:08,729 --> 00:04:10,750 493 | green dot and the 494 | nearest red one. 495 | 496 | 105 497 | 00:04:10,750 --> 00:04:13,569 498 | There's a tie, so how 499 | could we classify it? 500 | 501 | 106 502 | 00:04:13,569 --> 00:04:15,960 503 | Well one way is we could 504 | randomly break the tie. 505 | 506 | 107 507 | 00:04:15,960 --> 00:04:18,970 508 | But there's another way, 509 | and that's where k comes in. 510 | 511 | 108 512 | 00:04:18,970 --> 00:04:20,680 513 | K is the number of 514 | neighbors we consider 515 | 516 | 109 517 | 00:04:20,680 --> 00:04:22,339 518 | when making our prediction. 519 | 520 | 110 521 | 00:04:22,339 --> 00:04:25,529 522 | If k was 1, we'd just look at 523 | the closest training point. 524 | 525 | 111 526 | 00:04:25,529 --> 00:04:28,170 527 | But if k was 3, we'd look 528 | at the three closest. 529 | 530 | 112 531 | 00:04:28,170 --> 00:04:30,860 532 | In this case, two of those 533 | are green and one is red. 534 | 535 | 113 536 | 00:04:30,860 --> 00:04:34,410 537 | To predict, we could vote and 538 | predict the majority class. 539 | 540 | 114 541 | 00:04:34,410 --> 00:04:36,230 542 | Now there's more detail 543 | to this algorithm, 544 | 545 | 115 546 | 00:04:36,230 --> 00:04:38,540 547 | but that's enough 548 | to get us started. 
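The voting idea in miniature: collect the labels of the k closest points and predict the majority. (The scrappy classifier written later in this episode hard-codes k to 1, so this step is shown here only for intuition.)

from collections import Counter

neighbor_labels = ['green', 'green', 'red']   # labels of the 3 nearest neighbors
prediction = Counter(neighbor_labels).most_common(1)[0][0]
print(prediction)   # 'green' wins the vote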
549 | 550 | 116 551 | 00:04:38,540 --> 00:04:40,199 552 | To code this up, 553 | first we'll need a way 554 | 555 | 117 556 | 00:04:40,199 --> 00:04:42,699 557 | to find the nearest neighbor. 558 | 559 | 118 560 | 00:04:42,699 --> 00:04:44,839 561 | And to do that, we'll 562 | measure the straight line 563 | 564 | 119 565 | 00:04:44,839 --> 00:04:48,949 566 | distance between two points, 567 | just like you do with a ruler. 568 | 569 | 120 570 | 00:04:48,949 --> 00:04:52,100 571 | There's a formula for that 572 | called the Euclidean Distance, 573 | 574 | 121 575 | 00:04:52,100 --> 00:04:54,310 576 | and here's what the 577 | formula looks like. 578 | 579 | 122 580 | 00:04:54,310 --> 00:04:56,589 581 | It measures the distance 582 | between two points, 583 | 584 | 123 585 | 00:04:56,589 --> 00:04:59,250 586 | and it works a bit like 587 | the Pythagorean Theorem. 588 | 589 | 124 590 | 00:04:59,250 --> 00:05:02,350 591 | A squared plus B squared 592 | equals C squared. 593 | 594 | 125 595 | 00:05:02,350 --> 00:05:04,790 596 | You can think of this term 597 | as A, or the difference 598 | 599 | 126 600 | 00:05:04,790 --> 00:05:06,839 601 | between the first two features. 602 | 603 | 127 604 | 00:05:06,839 --> 00:05:08,670 605 | Likewise, you can think 606 | of this term as B, 607 | 608 | 128 609 | 00:05:08,670 --> 00:05:11,170 610 | or the difference between 611 | the second pair of features. 612 | 613 | 129 614 | 00:05:11,170 --> 00:05:14,740 615 | And the distance we compute is 616 | the length of the hypotenuse. 617 | 618 | 130 619 | 00:05:14,740 --> 00:05:16,459 620 | Now here's something cool. 621 | 622 | 131 623 | 00:05:16,459 --> 00:05:17,940 624 | Right now we're 625 | computing distance 626 | 627 | 132 628 | 00:05:17,940 --> 00:05:20,670 629 | in two-dimensional space, 630 | because we have just two 631 | 632 | 133 633 | 00:05:20,670 --> 00:05:22,779 634 | features in our toy dataset. 635 | 636 | 134 637 | 00:05:22,779 --> 00:05:25,970 638 | But what if we had three 639 | features or three dimensions? 640 | 641 | 135 642 | 00:05:25,970 --> 00:05:28,199 643 | Well then we'd be in a cube. 644 | 645 | 136 646 | 00:05:28,199 --> 00:05:30,449 647 | We can still visualize 648 | how to measure distance 649 | 650 | 137 651 | 00:05:30,449 --> 00:05:32,500 652 | in the space with a ruler. 653 | 654 | 138 655 | 00:05:32,500 --> 00:05:35,269 656 | But what if we had four 657 | features or four dimensions, 658 | 659 | 139 660 | 00:05:35,269 --> 00:05:36,709 661 | like we do in iris? 662 | 663 | 140 664 | 00:05:36,709 --> 00:05:38,500 665 | Well, now we're in 666 | a hypercube, and we 667 | 668 | 141 669 | 00:05:38,500 --> 00:05:40,740 670 | can't visualize this very easy. 671 | 672 | 142 673 | 00:05:40,740 --> 00:05:42,449 674 | The good news is the 675 | Euclidean Distance 676 | 677 | 143 678 | 00:05:42,449 --> 00:05:46,009 679 | works the same way regardless 680 | of the number of dimensions. 681 | 682 | 144 683 | 00:05:46,009 --> 00:05:50,050 684 | With more features, we can just 685 | add more terms to the equation. 686 | 687 | 145 688 | 00:05:50,050 --> 00:05:52,060 689 | You can find more details 690 | about this online. 691 | 692 | 146 693 | 00:05:52,060 --> 00:05:54,670 694 | 695 | 696 | 147 697 | 00:05:54,670 --> 00:05:56,769 698 | Now let's code up 699 | Euclidean distance. 
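Written out, the distance between points a and b is sqrt((a1 - b1)^2 + (a2 - b2)^2 + ...), with one squared term per feature. A hand-rolled version that works for any number of dimensions (the episode itself uses scipy, as described next):

from math import sqrt

def euclidean_distance(a, b):
    # One squared difference per feature, however many features there are
    return sqrt(sum((ai - bi) ** 2 for ai, bi in zip(a, b)))

print(euclidean_distance([0, 0], [3, 4]))   # 5.0 -- the classic 3-4-5 right triangle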
700 | 
701 | 148
702 | 00:05:56,769 --> 00:05:58,269
703 | There are plenty
704 | of ways to do that,
705 | 
706 | 149
707 | 00:05:58,269 --> 00:06:00,370
708 | but we'll use a library
709 | called scipy that's
710 | 
711 | 150
712 | 00:06:00,370 --> 00:06:02,459
713 | already installed by Anaconda.
714 | 
715 | 151
716 | 00:06:02,459 --> 00:06:05,380
717 | Here, A and B are lists
718 | of numeric features.
719 | 
720 | 152
721 | 00:06:05,380 --> 00:06:07,329
722 | Say A is a point from
723 | our training data,
724 | 
725 | 153
726 | 00:06:07,329 --> 00:06:09,800
727 | and B is a point from
728 | our testing data.
729 | 
730 | 154
731 | 00:06:09,800 --> 00:06:12,860
732 | This function returns the
733 | distance between them.
734 | 
735 | 155
736 | 00:06:12,860 --> 00:06:14,440
737 | That's all the math
738 | we need, so now
739 | 
740 | 156
741 | 00:06:14,440 --> 00:06:17,389
742 | let's take a look at the
743 | algorithm for a classifier.
744 | 
745 | 157
746 | 00:06:17,389 --> 00:06:19,480
747 | To make a prediction
748 | for a test point,
749 | 
750 | 158
751 | 00:06:19,480 --> 00:06:22,250
752 | we'll calculate the distance
753 | to all the training points.
754 | 
755 | 159
756 | 00:06:22,250 --> 00:06:25,029
757 | Then we'll predict the testing
758 | point has the same label
759 | 
760 | 160
761 | 00:06:25,029 --> 00:06:27,139
762 | as the closest one.
763 | 
764 | 161
765 | 00:06:27,139 --> 00:06:28,910
766 | I'll delete the random
767 | prediction we made,
768 | 
769 | 162
770 | 00:06:28,910 --> 00:06:31,610
771 | and replace it with a method
772 | that finds the closest training
773 | 
774 | 163
775 | 00:06:31,610 --> 00:06:33,470
776 | point to the test point.
777 | 
778 | 164
779 | 00:06:33,470 --> 00:06:35,579
780 | For this video,
781 | I'll hard-code k to 1,
782 | 
783 | 165
784 | 00:06:35,579 --> 00:06:38,089
785 | so we're writing a nearest
786 | neighbor classifier.
787 | 
788 | 166
789 | 00:06:38,089 --> 00:06:40,100
790 | The k variable won't
791 | appear in our code,
792 | 
793 | 167
794 | 00:06:40,100 --> 00:06:42,949
795 | since we'll always just
796 | find the closest point.
797 | 
798 | 168
799 | 00:06:42,949 --> 00:06:45,699
800 | Inside this method, we'll loop
801 | over all the training points
802 | 
803 | 169
804 | 00:06:45,699 --> 00:06:48,250
805 | and keep track of the
806 | closest one so far.
807 | 
808 | 170
809 | 00:06:48,250 --> 00:06:50,649
810 | Remember that we memorized
811 | the training data in our fit
812 | 
813 | 171
814 | 00:06:50,649 --> 00:06:54,190
815 | function, and X_train
816 | contains the features.
817 | 
818 | 172
819 | 00:06:54,190 --> 00:06:56,910
820 | To start, I'll calculate the
821 | distance from the test point
822 | 
823 | 173
824 | 00:06:56,910 --> 00:06:58,740
825 | to the first training point.
826 | 
827 | 174
828 | 00:06:58,740 --> 00:07:00,990
829 | I'll use this variable to
830 | keep track of the shortest
831 | 
832 | 175
833 | 00:07:00,990 --> 00:07:02,584
834 | distance we've found so far.
835 | 
836 | 176
837 | 00:07:02,584 --> 00:07:04,000
838 | And I'll use this
839 | variable to keep
840 | 
841 | 177
842 | 00:07:04,000 --> 00:07:07,399
843 | track of the index of the
844 | training point that's closest.
845 | 
846 | 178
847 | 00:07:07,399 --> 00:07:09,910
848 | We'll need this later
849 | to retrieve its label.
850 | 
851 | 179
852 | 00:07:09,910 --> 00:07:12,370
853 | Now we'll iterate over all
854 | the other training points.
855 | 856 | 180 857 | 00:07:12,370 --> 00:07:14,410 858 | And every time we 859 | find a closer one, 860 | 861 | 181 862 | 00:07:14,410 --> 00:07:16,149 863 | we'll update our variables. 864 | 865 | 182 866 | 00:07:16,149 --> 00:07:18,100 867 | Finally, we'll use 868 | the index to return 869 | 870 | 183 871 | 00:07:18,100 --> 00:07:22,110 872 | the label for the 873 | closest training example. 874 | 875 | 184 876 | 00:07:22,110 --> 00:07:24,779 877 | At this point, we have a working 878 | nearest neighbor classifier, 879 | 880 | 185 881 | 00:07:24,779 --> 00:07:29,509 882 | so let's run it and see 883 | what the accuracy is. 884 | 885 | 186 886 | 00:07:29,509 --> 00:07:31,329 887 | As you can see, it's over 90%. 888 | 889 | 187 890 | 00:07:31,329 --> 00:07:32,250 891 | And we did it. 892 | 893 | 188 894 | 00:07:32,250 --> 00:07:34,240 895 | When you run this on 896 | your own, the accuracy 897 | 898 | 189 899 | 00:07:34,240 --> 00:07:36,906 900 | might be a bit different because 901 | of randomness in the train test 902 | 903 | 190 904 | 00:07:36,906 --> 00:07:38,529 905 | split. 906 | 907 | 191 908 | 00:07:38,529 --> 00:07:40,740 909 | Now if you can code this 910 | up and understand it, 911 | 912 | 192 913 | 00:07:40,740 --> 00:07:42,670 914 | that's a big 915 | accomplishment because it 916 | 917 | 193 918 | 00:07:42,670 --> 00:07:46,254 919 | means you can write a simple 920 | classifier from scratch. 921 | 922 | 194 923 | 00:07:46,254 --> 00:07:47,920 924 | Now, there are a 925 | number of pros and cons 926 | 927 | 195 928 | 00:07:47,920 --> 00:07:50,850 929 | to this algorithm, many of 930 | which you can find online. 931 | 932 | 196 933 | 00:07:50,850 --> 00:07:53,579 934 | The basic pro is that it's 935 | relatively easy to understand, 936 | 937 | 197 938 | 00:07:53,579 --> 00:07:56,000 939 | and works reasonably 940 | well for some problems. 941 | 942 | 198 943 | 00:07:56,000 --> 00:07:57,670 944 | And the basic cons 945 | are that it's slow, 946 | 947 | 199 948 | 00:07:57,670 --> 00:07:59,990 949 | because it has to iterate 950 | over every training point 951 | 952 | 200 953 | 00:07:59,990 --> 00:08:01,550 954 | to make a prediction. 955 | 956 | 201 957 | 00:08:01,550 --> 00:08:04,069 958 | And importantly, as 959 | we saw in Episode 3, 960 | 961 | 202 962 | 00:08:04,069 --> 00:08:06,384 963 | some features are more 964 | informative than others. 965 | 966 | 203 967 | 00:08:06,384 --> 00:08:08,050 968 | But there's not an 969 | easy way to represent 970 | 971 | 204 972 | 00:08:08,050 --> 00:08:10,120 973 | that in k-Nearest Neighbors. 974 | 975 | 205 976 | 00:08:10,120 --> 00:08:12,149 977 | In the long run, we 978 | want a classifier 979 | 980 | 206 981 | 00:08:12,149 --> 00:08:15,259 982 | that learns more complex 983 | relationships between features 984 | 985 | 207 986 | 00:08:15,259 --> 00:08:17,290 987 | and the label we're 988 | trying to predict. 989 | 990 | 208 991 | 00:08:17,290 --> 00:08:19,490 992 | A decision tree is a 993 | good example of that. 994 | 995 | 209 996 | 00:08:19,490 --> 00:08:22,120 997 | And a neural network like we 998 | saw in TensorFlow Playground 999 | 1000 | 210 1001 | 00:08:22,120 --> 00:08:23,730 1002 | is even better. 1003 | 1004 | 211 1005 | 00:08:23,730 --> 00:08:24,940 1006 | OK, hope that was helpful. 1007 | 1008 | 212 1009 | 00:08:24,940 --> 00:08:26,226 1010 | Thanks as always for watching. 
1011 | 1012 | 213 1013 | 00:08:26,226 --> 00:08:28,560 1014 | You can follow me on Twitter 1015 | for updates and, of course, 1016 | 1017 | 214 1018 | 00:08:28,560 --> 00:08:29,310 1019 | Google Developers. 1020 | 1021 | 215 1022 | 00:08:29,310 --> 00:08:32,190 1023 | And I'll see you guys next time. 1024 | 1025 | 216 1026 | 00:08:32,190 --> 00:08:35,539 1027 | [MUSIC PLAYING] 1028 | 1029 | 217 1030 | 00:08:35,539 --> 00:08:43,000 1031 | Subtitles End: mo.dbxdb.com 1032 | 1033 | --------------------------------------------------------------------------------
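For reference, here is a sketch of the finished nearest-neighbor classifier this episode builds up to, with k effectively fixed at 1. euc wraps scipy's distance function, and closest scans every training point and returns the label of the nearest one. This is a reconstruction from the narration, not the episode's original file, and the exact accuracy varies with the random train/test split.

from scipy.spatial import distance
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def euc(a, b):
    # Straight-line (Euclidean) distance between two feature vectors
    return distance.euclidean(a, b)

class ScrappyKNN():
    def fit(self, X_train, y_train):
        # Memorize the training data
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        predictions = []
        for row in X_test:
            predictions.append(self.closest(row))
        return predictions

    def closest(self, row):
        best_dist = euc(row, self.X_train[0])   # shortest distance found so far
        best_index = 0                          # index of the closest training point
        for i in range(1, len(self.X_train)):
            dist = euc(row, self.X_train[i])
            if dist < best_dist:
                best_dist = dist
                best_index = i
        return self.y_train[best_index]

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=.5)

my_classifier = ScrappyKNN()
my_classifier.fit(X_train, y_train)
predictions = my_classifier.predict(X_test)
print(accuracy_score(y_test, predictions))   # typically above 0.9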