├── .gitignore ├── 01. Introduction to Jupyter Notebooks and Data - R.ipynb ├── 02. Linear Regression - R.ipynb ├── 03. Multiple Linear Regression - R.ipynb ├── 04. Polynomial Regression - R.ipynb ├── 05. Logistic Regression - R.ipynb ├── 06. Support Vector Machines - R.ipynb ├── 07. Advanced SVMs - R.ipynb ├── 08. Neural Networks Introduction - R.ipynb ├── 09. Neural Networks Advanced - R.ipynb ├── 10. Convolutional Neural Networks - R.ipynb ├── 11. Recurrent Neural Networks - R.ipynb ├── 12. Clustering - R.ipynb ├── CODE_OF_CONDUCT.md ├── Data ├── Arthur tales.txt ├── PrionData.csv ├── The Time Machine.txt ├── chocolate data multiple linear regression.txt ├── chocolate data.txt ├── dog_data.csv ├── football data.txt ├── football_data.csv ├── time-edit.txt ├── traffic_by_hour.csv └── trees.csv ├── LICENSE ├── Models └── arthur-model-epoch-30.hdf5 ├── README.md └── SECURITY.md /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015/2017 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # Visual Studio 2017 auto generated files 33 | Generated\ Files/ 34 | 35 | # MSTest test Results 36 | [Tt]est[Rr]esult*/ 37 | [Bb]uild[Ll]og.* 38 | 39 | # NUNIT 40 | *.VisualState.xml 41 | TestResult.xml 42 | 43 | # Build Results of an ATL Project 44 | [Dd]ebugPS/ 45 | [Rr]eleasePS/ 46 | dlldata.c 47 | 48 | # Benchmark Results 49 | BenchmarkDotNet.Artifacts/ 50 | 51 | # .NET Core 52 | project.lock.json 53 | project.fragment.lock.json 54 | artifacts/ 55 | **/Properties/launchSettings.json 56 | 57 | # StyleCop 58 | StyleCopReport.xml 59 | 60 | # Files built by Visual Studio 61 | *_i.c 62 | *_p.c 63 | *_i.h 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.iobj 68 | *.pch 69 | *.pdb 70 | *.ipdb 71 | *.pgc 72 | *.pgd 73 | *.rsp 74 | *.sbr 75 | *.tlb 76 | *.tli 77 | *.tlh 78 | *.tmp 79 | *.tmp_proj 80 | *.log 81 | *.vspscc 82 | *.vssscc 83 | .builds 84 | *.pidb 85 | *.svclog 86 | *.scc 87 | 88 | # Chutzpah Test files 89 | _Chutzpah* 90 | 91 | # Visual C++ cache files 92 | ipch/ 93 | *.aps 94 | *.ncb 95 | *.opendb 96 | *.opensdf 97 | *.sdf 98 | *.cachefile 99 | *.VC.db 100 | *.VC.VC.opendb 101 | 102 | # Visual Studio profiler 103 | *.psess 104 | *.vsp 105 | *.vspx 106 | *.sap 107 | 108 | # Visual Studio Trace Files 109 | *.e2e 110 | 111 | # TFS 2012 Local Workspace 112 | $tf/ 113 | 114 | # Guidance Automation Toolkit 115 | *.gpState 116 | 117 | # ReSharper is a .NET coding add-in 118 | _ReSharper*/ 119 | *.[Rr]e[Ss]harper 120 | *.DotSettings.user 121 | 122 | # JustCode is a .NET coding add-in 123 | .JustCode 124 | 125 | # TeamCity is a build add-in 126 | _TeamCity* 127 | 128 | # DotCover is a Code Coverage Tool 129 | *.dotCover 130 | 131 | # AxoCover is a Code Coverage Tool 132 | .axoCover/* 133 | !.axoCover/settings.json 134 | 135 | # Visual Studio code coverage results 136 | *.coverage 137 | *.coveragexml 138 | 139 
| # NCrunch 140 | _NCrunch_* 141 | .*crunch*.local.xml 142 | nCrunchTemp_* 143 | 144 | # MightyMoose 145 | *.mm.* 146 | AutoTest.Net/ 147 | 148 | # Web workbench (sass) 149 | .sass-cache/ 150 | 151 | # Installshield output folder 152 | [Ee]xpress/ 153 | 154 | # DocProject is a documentation generator add-in 155 | DocProject/buildhelp/ 156 | DocProject/Help/*.HxT 157 | DocProject/Help/*.HxC 158 | DocProject/Help/*.hhc 159 | DocProject/Help/*.hhk 160 | DocProject/Help/*.hhp 161 | DocProject/Help/Html2 162 | DocProject/Help/html 163 | 164 | # Click-Once directory 165 | publish/ 166 | 167 | # Publish Web Output 168 | *.[Pp]ublish.xml 169 | *.azurePubxml 170 | # Note: Comment the next line if you want to checkin your web deploy settings, 171 | # but database connection strings (with potential passwords) will be unencrypted 172 | *.pubxml 173 | *.publishproj 174 | 175 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 176 | # checkin your Azure Web App publish settings, but sensitive information contained 177 | # in these scripts will be unencrypted 178 | PublishScripts/ 179 | 180 | # NuGet Packages 181 | *.nupkg 182 | # The packages folder can be ignored because of Package Restore 183 | **/[Pp]ackages/* 184 | # except build/, which is used as an MSBuild target. 185 | !**/[Pp]ackages/build/ 186 | # Uncomment if necessary however generally it will be regenerated when needed 187 | #!**/[Pp]ackages/repositories.config 188 | # NuGet v3's project.json files produces more ignorable files 189 | *.nuget.props 190 | *.nuget.targets 191 | 192 | # Microsoft Azure Build Output 193 | csx/ 194 | *.build.csdef 195 | 196 | # Microsoft Azure Emulator 197 | ecf/ 198 | rcf/ 199 | 200 | # Windows Store app package directories and files 201 | AppPackages/ 202 | BundleArtifacts/ 203 | Package.StoreAssociation.xml 204 | _pkginfo.txt 205 | *.appx 206 | 207 | # Visual Studio cache files 208 | # files ending in .cache can be ignored 209 | *.[Cc]ache 210 | # but keep track of directories ending in .cache 211 | !*.[Cc]ache/ 212 | 213 | # Others 214 | ClientBin/ 215 | ~$* 216 | *~ 217 | *.dbmdl 218 | *.dbproj.schemaview 219 | *.jfm 220 | *.pfx 221 | *.publishsettings 222 | orleans.codegen.cs 223 | 224 | # Including strong name files can present a security risk 225 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 226 | #*.snk 227 | 228 | # Since there are multiple workflows, uncomment next line to ignore bower_components 229 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 230 | #bower_components/ 231 | 232 | # RIA/Silverlight projects 233 | Generated_Code/ 234 | 235 | # Backup & report files from converting an old project file 236 | # to a newer Visual Studio version. 
Backup files are not needed, 237 | # because we have git ;-) 238 | _UpgradeReport_Files/ 239 | Backup*/ 240 | UpgradeLog*.XML 241 | UpgradeLog*.htm 242 | ServiceFabricBackup/ 243 | *.rptproj.bak 244 | 245 | # SQL Server files 246 | *.mdf 247 | *.ldf 248 | *.ndf 249 | 250 | # Business Intelligence projects 251 | *.rdl.data 252 | *.bim.layout 253 | *.bim_*.settings 254 | *.rptproj.rsuser 255 | 256 | # Microsoft Fakes 257 | FakesAssemblies/ 258 | 259 | # GhostDoc plugin setting file 260 | *.GhostDoc.xml 261 | 262 | # Node.js Tools for Visual Studio 263 | .ntvs_analysis.dat 264 | node_modules/ 265 | 266 | # Visual Studio 6 build log 267 | *.plg 268 | 269 | # Visual Studio 6 workspace options file 270 | *.opt 271 | 272 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 273 | *.vbw 274 | 275 | # Visual Studio LightSwitch build output 276 | **/*.HTMLClient/GeneratedArtifacts 277 | **/*.DesktopClient/GeneratedArtifacts 278 | **/*.DesktopClient/ModelManifest.xml 279 | **/*.Server/GeneratedArtifacts 280 | **/*.Server/ModelManifest.xml 281 | _Pvt_Extensions 282 | 283 | # Paket dependency manager 284 | .paket/paket.exe 285 | paket-files/ 286 | 287 | # FAKE - F# Make 288 | .fake/ 289 | 290 | # JetBrains Rider 291 | .idea/ 292 | *.sln.iml 293 | 294 | # CodeRush 295 | .cr/ 296 | 297 | # Python Tools for Visual Studio (PTVS) 298 | __pycache__/ 299 | *.pyc 300 | 301 | # Cake - Uncomment if you are using it 302 | # tools/** 303 | # !tools/packages.config 304 | 305 | # Tabs Studio 306 | *.tss 307 | 308 | # Telerik's JustMock configuration file 309 | *.jmconfig 310 | 311 | # BizTalk build output 312 | *.btp.cs 313 | *.btm.cs 314 | *.odx.cs 315 | *.xsd.cs 316 | 317 | # OpenCover UI analysis results 318 | OpenCover/ 319 | 320 | # Azure Stream Analytics local run output 321 | ASALocalRun/ 322 | 323 | # MSBuild Binary and Structured Log 324 | *.binlog 325 | 326 | # NVidia Nsight GPU debugger configuration file 327 | *.nvuser 328 | 329 | # MFractors (Xamarin productivity tool) working folder 330 | .mfractor/ 331 | -------------------------------------------------------------------------------- /01. Introduction to Jupyter Notebooks and Data - R.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "# Welcome to Azure Notebooks!\n\n[R](https://www.r-project.org/) is a free programming language and software environment for statistical computing and graphics. The R language is very popular for statistical analysis and data analysis.\n\nIn the following exercise we will give you a taste of what using R is like.\n\nWe have provided some data for you, and tidied it up so it’s ready to analyse. You can move through the blocks of code below by clicking on the code (within the grey boxes), then clicking the `Run` button above." 7 | }, 8 | { 9 | "metadata": {}, 10 | "cell_type": "markdown", 11 | "source": "Exercise 1: Introduction To Jupyter Notebooks\n===\n\nThe purpose of this exercise is to get you familiar with using Jupyter Notebooks. Don't worry if you find the code difficult to understand, as this is not an R course. 
You will slowly learn more about the R programming language as you go, and you definitely don't need to understand every line of code.\n\nStep 1\n---\n\nNotebooks contain blocks of code that you can execute, such as the grey box below.\n\nGive it a go.\n\n** Click on the code below, then press `Run` in the toolbar above (or press __Shift+Enter__) to run the code. **" 12 | }, 13 | { 14 | "metadata": { 15 | "scrolled": true, 16 | "trusted": true 17 | }, 18 | "cell_type": "code", 19 | "source": "print(\"The code ran successfully!\")", 20 | "execution_count": null, 21 | "outputs": [] 22 | }, 23 | { 24 | "metadata": {}, 25 | "cell_type": "markdown", 26 | "source": "If all went well, the code should have printed a message for you." 27 | }, 28 | { 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "slide" 32 | } 33 | }, 34 | "cell_type": "markdown", 35 | "source": "Step 2\n---\n\nLet's print a message you choose this time. \n\n** In the block of code below, write a message between the quotation marks. It is OK to use spaces, numbers, and letters. Your message should appear red in colour.**" 36 | }, 37 | { 38 | "metadata": { 39 | "trusted": true 40 | }, 41 | "cell_type": "code", 42 | "source": "###\n# WRITE A MESSAGE BETWEEN THE SPEECH MARKS IN THE LINE BELOW, THEN HIT RUN.\n###\nprint(\"type something here!\")\n###\n\n# It's ok to use spaces, numbers, or letters. Your message should look red.\n# For example: print(\"this is my message\")", 43 | "execution_count": null, 44 | "outputs": [] 45 | }, 46 | { 47 | "metadata": {}, 48 | "cell_type": "markdown", 49 | "source": "You will notice the hash symbols `#` in the code block above. Anything after a `#` is ignored by the computer (within Jupyter Notebook the text appears blue in colour). Using `#` at the start of a line allows you to comment the code so that it is human readable and easier to follow." 50 | }, 51 | { 52 | "metadata": {}, 53 | "cell_type": "markdown", 54 | "source": "Step 3\n---\n\nR allows us to save data to use later. In this exercise, we will save a message you create.\n\n** In the code below, write a message within the quotation marks. Again, it is OK to use spaces, numbers, and letters, as long as they are within the quotation marks. **" 55 | }, 56 | { 57 | "metadata": { 58 | "trusted": true 59 | }, 60 | "cell_type": "code", 61 | "source": "###\n# WRITE A MESSAGE BETWEEN THE SPEECH MARKS IN THE LINE BELOW, THEN PRESS RUN\n###\nmy_msg <- \"type something here!\"\n###\n\nprint(my_msg) ", 62 | "execution_count": null, 63 | "outputs": [] 64 | }, 65 | { 66 | "metadata": {}, 67 | "cell_type": "markdown", 68 | "source": "OK, what happened here? \n\nIn the real world, we might put items into a cardboard box for storage, like toys, DVDs, or photo albums. 
We label the box, say \"My DVDs\", to identify what is inside the box.\n\nIn R, we can do something similar: when we want to store information, we use **variables** (the cardboard box in our analogy), and the variable is given a name to help us identify what it stores so we can refer back to it.\n\nThis is what you've just done in the code block above.\n\nYou created a message inside the quotation marks, then you saved it to a **variable** called `my_msg`.\n\n```\nmy_msg <- \"This is my message that I'm going to forget so I want to save it for later!\"\n ↑↑↑\n The message you created\n \nmy_msg <- \"This is my message that I'm going to forget so I want to save it for later!\"\n ↑↑↑\n The arrow (pointing left) is called the assignment symbol, and saves the information on the right\n \nmy_msg <- \"This is my message that I'm going to forget so I want to save it for later!\"\n↑↑↑\nThe name of your variable (what the arrow is pointing towards)\n```\n\nNote that in R, variable names cannot contain spaces, or begin with a number. As per the [R FAQ](https://cran.r-project.org/doc/FAQ/R-FAQ.html#What-are-valid-names_003f), a syntactically valid name:\n\n> _... consists of letters, numbers and the dot or underline characters and starts with a letter or the dot not followed by a number. Names such as \".2way\" are not valid, and neither are the reserved words_\n\nReserved words include: `if` `else` `repeat` `while` `function` `for` `in` `next` `break` `TRUE` `FALSE` `NULL` `Inf` `NaN` `NA` `NA_integer_` `NA_real_` `NA_complex_` `NA_character_` `...` `..1` `..2` `..3` (etc)\n\nBe mindful that variable names should help describe the information you are saving; you should aim for variable names that are descriptive." 69 | }, 70 | { 71 | "metadata": {}, 72 | "cell_type": "markdown", 73 | "source": "Step 4\n---\n\nLet's try using variables again, but save a number inside our variable this time. Remember, the variable is on the *left hand side* of the `<-` assignment symbol and is the equivalent of a labelled box. The information on the *right hand side* is the information we want to store inside the variable (or a box in our analogy).\n\n** In the cell below replace `` with any number you choose. ** The number should not contain spaces or commas.\n\nThen __run the code__." 74 | }, 75 | { 76 | "metadata": { 77 | "trusted": true 78 | }, 79 | "cell_type": "code", 80 | "source": "###\n# REPLACE THE WITH A NUMBER OF YOUR CHOICE\n###\nmy_first_number <- \n###\n\n# Typing the name of the variable prints the information it stores to screen\nmy_first_number\n\n# Add 1 to our variable\nmy_first_number + 1\n\n# Did this calculation affect our variable? Let's check...\nmy_first_number\n\n# What's the square root of our number?\nsqrt(my_first_number)", 81 | "execution_count": null, 82 | "outputs": [] 83 | }, 84 | { 85 | "metadata": {}, 86 | "cell_type": "markdown", 87 | "source": "What happened here?\n\nIn our real world example, we might store spare coins inside a cardboard box. We can use the money in different ways, for example, we may want to count how much money is in the box, take some money out of the box then deposit it in the bank, or add more money to the box.\n\nSimilarly, in R, when we save numbers inside a variable, we can perform various calculations to the numbers we store inside the variable. Above, we asked R to add a value of one to the number that we stored in the `my_first_number` variable. We also asked R to calculate the square root of our variable using the function `sqrt()`.\n\nN.B. 
Performing calculations on the `my_first_number` variable will not change the value it has stored. If you want to change the value of `my_first_number`, you need to use the assignment symbol `<-`. If you do not use the assignment symbol, your information/results won't be saved." 88 | }, 89 | { 90 | "metadata": {}, 91 | "cell_type": "markdown", 92 | "source": "How does the `sqrt` function work?\n\n```\nsqrt(...)\n↑↑↑\n```\nYou are calling R to perform a **function** called `sqrt`, which computes the square root of the value supplied to the function. There are many functions available in R, stored within **libraries**.\n\n```\nsqrt(...)\n ↑ ↑\n```\nTo use functions, you need to specify the name of the function, followed by round brackets (parentheses). The pieces of information you provide within the brackets are known as **arguments**. The `sqrt` function only takes one argument.\n```\nsqrt(my_first_number)\n ↑↑↑\n```\nIn the example above, we supplied the `sqrt()` function a variable name `my_first_number` between the brackets, and the result, i.e. the square root of `my_first_number`, is printed to the screen. \n \n\nStep 5\n---\n\nLet's make a graph from some data. First we need to load the appropriate library to create a graph. The first line of code below loads the `ggplot2` library that contains the functions for graphing capabilities in this exercise. At the start of (most) R programming exercises, we have to load **libraries** to help us perform tasks. \n\n** Click on the code below, then hit the `Run` button to create a scatter plot using the `ggplot2` library. You do not need to edit any of the code. **" 93 | }, 94 | { 95 | "metadata": { 96 | "trusted": true 97 | }, 98 | "cell_type": "code", 99 | "source": "# Load the required library for plotting functions\nlibrary(\"ggplot2\")\n\n# Create data to plot\n# N.B. Input to ggplot2 needs to be a data frame, i.e. x and y must be stored in the same variable\ntest_data <- data.frame(x.values = c(1, 2, 3), \n y.values = c(5, 4, 6))\n\n# The following code makes a scatter plot, using our continuous x and y values specified above\nggplot(data = test_data, aes(x = x.values, y = y.values)) +\n# Specify type of plot as scatter plot\ngeom_point() +\n# x-axis label\nxlab(\"x value\") +\n# y-axis label\nylab(\"y value\") +\n# Title of plot\nggtitle(\"My test plot using the ggplot2 library\") +\n# Align title to centre\ntheme(plot.title = element_text(hjust = 0.5))", 100 | "execution_count": null, 101 | "outputs": [] 102 | }, 103 | { 104 | "metadata": {}, 105 | "cell_type": "markdown", 106 | "source": "If you'd like, have a play with the code:\n\n* Change the `x.values` and `y.values` stored within the variable `test_data` and see how the graph changes. Make sure they have the same count of numbers in them (i.e. currently `x.values` and `y.values` have three numbers each).\n\n\nStep 6\n---\n\nFrom time to time, we will load data from text files, rather than create it ourselves. You can't see the text files in your browser because they are saved on the server running this website. We can load the files using R though. Let's load a text file, inspect it, then graph it.\n\n#### Run the code block below to load data about chocolate bars and inspect the data. You do not need to edit the code." 
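Before that cell runs, a brief note on `read.delim`, the file-reading function it uses: it reads a tab-separated text file into a data frame. The minimal sketch below simply writes out the default arguments that are implied when the function is called with just a file path (`header = TRUE` and a tab separator are `read.delim`'s defaults in base R):

```
# read.delim reads a tab-separated text file into a data frame.
# The explicit arguments below are read.delim's own defaults, written out:
# header = TRUE treats the first line of the file as column names,
# sep = "\t" splits columns on tab characters.
choc_data <- read.delim("Data/chocolate data.txt", header = TRUE, sep = "\t")

# A data frame stores rows (observations) and columns (variables)
nrow(choc_data)   # number of rows
ncol(choc_data)   # number of columns
```

The `str` and `head` calls in the cell below then summarise what was read in.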
107 | }, 108 | { 109 | "metadata": { 110 | "trusted": true 111 | }, 112 | "cell_type": "code", 113 | "source": "# Load data with information about chocolate bars, and save it to a variable called 'choc_data'\nchoc_data <- read.delim(\"Data/chocolate data.txt\")\n\n# Use the function str() below to inspect the data\n# str displays the structure of the data\nstr(choc_data)\n\n# To view the data, use the head() function\n# head returns the first part of the data\nhead(choc_data)", 114 | "execution_count": null, 115 | "outputs": [] 116 | }, 117 | { 118 | "metadata": {}, 119 | "cell_type": "markdown", 120 | "source": "The `str` function returns a compact display of the structure of an object. It informs us of the **class** (type) of the object, and its contents. \n\nBy performing the `str` function on `choc_data`, we have determined our data is of the class `data.frame`, has 5 variables, and 100 observations (abbreviated \"obs.\"). The names of each variable are shown after the $ symbol:\n\n* weight;\n* cocoa_percent;\n* sugar_percent;\n* milk_percent;\n* customer_happiness.\n\nThe `head` function, by default, returns the first six rows of our object. For our object `choc_data`, each row (horizontal) represents the information about one chocolate bar, and each column (vertical) represents the different variables. For example, the first chocolate bar:\n* weighs 185 grams;\n* is 65% cocoa;\n* is 11% sugar;\n* is 24% milk;\n* and a customer said they were 47% happy with it.\n\n\nStep 7\n---\n\nLet's graph features from the `choc_data` variable we saved earlier. We can graph some of these features in a scatter plot, created with the `geom_point` function in the library `ggplot2`. Let's place `customer_happiness` on the x-axis, and `cocoa_percent` on the y-axis. \n\n### In the cell below replace:\n#### 1. `` with `customer_happiness`\n#### 2. `` with `cocoa_percent`\n#### then __run the code__." 121 | }, 122 | { 123 | "metadata": { 124 | "trusted": true 125 | }, 126 | "cell_type": "code", 127 | "source": "###\n# REPLACE WITH customer_happiness and WITH cocoa_percent\n###\nggplot(data = choc_data, aes(x = , y = )) +\n###\ngeom_point() +\nxlab(\"Customer happiness\") +\nylab(\"Cocoa percent\") +\nggtitle(\"Customer satisfaction with chocolate bars given cocoa percentage\") +\ntheme(plot.title = element_text(hjust = 0.5))", 128 | "execution_count": null, 129 | "outputs": [] 130 | }, 131 | { 132 | "metadata": {}, 133 | "cell_type": "markdown", 134 | "source": "In this graph, every chocolate bar is represented by a single point. Later, we will analyse this data with AI.\n\nConclusion\n---\n\n__Well done!__ That's the end of programming Exercise 1.\n\nYou can now go back to the course and click __'Next Step'__ to move onto some key concepts of AI - models and error." 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "name": "r", 140 | "display_name": "R", 141 | "language": "R" 142 | }, 143 | "language_info": { 144 | "mimetype": "text/x-r-source", 145 | "name": "R", 146 | "pygments_lexer": "r", 147 | "version": "3.5.3", 148 | "file_extension": ".r", 149 | "codemirror_mode": "r" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 2 154 | } -------------------------------------------------------------------------------- /02.
Linear Regression - R.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "collapsed": true 6 | }, 7 | "cell_type": "markdown", 8 | "source": "Exercise 2 - Simple Linear Regression\n===\n\nIn Exercise 1, we used R within Jupyter Notebooks to load information about chocolate bars, and stored it in a variable named `choc_data`. We checked the structure of `choc_data`, and explored some of the variables we have about chocolate bars using graphs.\n\nIn this exercise, we want to know how to make our chocolate-bar customers happier. To do this, we need to know whether chocolate bar _features_ can predict customer happiness. For example, customers may be happier when chocolate bars are bigger, or when they contain more cocoa.\n\nWe have data on customer happiness when eating chocolate bars with different features. Let's explore the relationship between customer happiness and the different features we have available.\n\nStep 1\n---\n\nFirst, we need to load the required libraries and data we will use in this exercise.\n\nBelow, we'll also use the functions `str`, `head`, and `tail` to inspect the structure of `choc_data`.\n\n** In the cell below replace: **\n\n** 1. `` with `str` **\n\n** 2. `` with `head` **\n\n** 3. `` with `tail` **\n\n** then __run the code__. **" 9 | }, 10 | { 11 | "metadata": { 12 | "scrolled": true, 13 | "trusted": false 14 | }, 15 | "cell_type": "code", 16 | "source": "# Load `ggplot2` library for graphing capabilities\nlibrary(ggplot2)\n\n# Load the chocolate data and save it to the variable name `choc_data`\nchoc_data <- read.delim(\"Data/chocolate data.txt\")\n\n###\n# REPLACE WITH str, head, and tail\n###\n\n# Check the structure of `choc_data` using `str(choc_data)`\n(choc_data)\n\n# Inspect the start of the data by typing `head(choc_data)`\n(choc_data)\n\n# Inspect the end of the data by typing `tail(choc_data)`\n(choc_data)", 17 | "execution_count": null, 18 | "outputs": [] 19 | }, 20 | { 21 | "metadata": {}, 22 | "cell_type": "markdown", 23 | "source": "Our object `choc_data` contains 100 different chocolate bar observations for 5 variables: weight, cocoa percent, sugar percent, milk percent, and customer happiness.\n\nStep 2\n---\n\n\nWe want to know which chocolate bar features make customers happy.\n\nThe example below shows a linear regression between __cocoa percentage__ and __customer happiness__. \n\n** Run the code below to visualise this. You do not need to edit the code block below, just run it. 
**" 24 | }, 25 | { 26 | "metadata": { 27 | "trusted": false 28 | }, 29 | "cell_type": "code", 30 | "source": "# Run this box\n\n# DO NOT EDIT THIS CODE\n\n# Create our own function to generate a linear regression model then graph the result\nlin_reg_choc <- function(x, y, my_data){\n \n x_arg <- my_data[ , substitute(x)]\n y_arg <- my_data[ , substitute(y)]\n \n # Perform linear regression using `lm` (stands for linear models) function\n lm_choc <- lm(formula = y_arg ~ x_arg, data = my_data)\n \n # Create scatter plot of choc_data together with linear model\n ggplot(data = my_data, aes_string(x = x, y = y)) +\n geom_point() +\n # Add line based on linear model\n geom_abline(intercept = lm_choc$coefficients[1], \n slope = lm_choc$coefficients[2],\n colour = \"red\") +\n # x-axis label remains constant\n xlab(\"Customer happiness\") +\n # y-axis label; use `gsub` function to remove underscore from \n ylab(gsub(\"_\", \" \", y)) +\n # graph title\n ggtitle(paste(\"Customer satisfaction with chocolate bars given\", gsub(\"_\", \" \", y))) +\n theme(plot.title = element_text(hjust = 0.5))\n\n}\n\n# This performs the linear regression steps listed above\nlin_reg_choc(x = \"customer_happiness\", y = \"cocoa_percent\", my_data = choc_data)", 31 | "execution_count": null, 32 | "outputs": [] 33 | }, 34 | { 35 | "metadata": {}, 36 | "cell_type": "markdown", 37 | "source": "In the scatter plot above, each point represents an observation for a single chocolate bar.\n\nIt seems that __a higher percentage of cocoa increases customer happiness__. We think this because as we increase the amount of cocoa (y-axis), the amount of customer happiness (x-axis) increases, as shown by our linear model (red line). \n\nStep 3\n---\n\n** In the cell below: **\n\n** 1. replace the text `` with __`weight`__ to see if heavier chocolate bars make people happier. **\n\n** 2. Also try the variables `sugar_percent` and `milk_percent` to see if these improve customers' experiences. **\n\n** Remember to run each box when you are ready.**" 38 | }, 39 | { 40 | "metadata": { 41 | "trusted": false 42 | }, 43 | "cell_type": "code", 44 | "source": "###\n# CHANGE TO \"weight\" IN THE LINE BELOW (INCLUDING THE QUOTATION MARKS)\n###\nlin_reg_choc(x = \"customer_happiness\", y = , my_data = choc_data)\n###", 45 | "execution_count": null, 46 | "outputs": [] 47 | }, 48 | { 49 | "metadata": { 50 | "trusted": false 51 | }, 52 | "cell_type": "code", 53 | "source": "###\n# CHANGE TO \"sugar_percent\" IN THE LINE BELOW (INCLUDING THE QUOTATION MARKS)\n###\nlin_reg_choc(x = \"customer_happiness\", y = , my_data = choc_data)\n###", 54 | "execution_count": null, 55 | "outputs": [] 56 | }, 57 | { 58 | "metadata": { 59 | "trusted": false 60 | }, 61 | "cell_type": "code", 62 | "source": "###\n# CHANGE TO \"milk_percent\" IN THE LINE BELOW (INCLUDING THE QUOTATION MARKS)\n###\nlin_reg_choc(x = \"customer_happiness\", y = , my_data = choc_data)\n###", 63 | "execution_count": null, 64 | "outputs": [] 65 | }, 66 | { 67 | "metadata": {}, 68 | "cell_type": "markdown", 69 | "source": "It looks like heavier chocolate bars make customers happier, whereas larger amounts of sugar or milk don't seem to make customers happier. \n\nWe can draw this conclusion based on the slope of our linear regression models (red line): \n\n* Our linear regression model for \"weight vs. customer happiness\" reveals that as chocolate bar weight increases, customer happiness also increases;\n* Our linear regression models for \"sugar percent vs. 
customer happiness\" and \"milk percent vs. customer happiness\" reveal that as the percentage of sugar or milk increases, customer happiness decreases.\n\n> *N.B. It is possible to perform linear regression directly with `ggplot2` using the following function and arguments: `stat_smooth(method = \"lm\")`. However, we want to show you how to create linear models without the dependency of `ggplot2`.*\n\nConclusion\n---\nWell done! You have run a simple linear regression that revealed chocolate bars heavier in weight and with higher percentages of cocoa make customers happy.\n\nYou can now go back to the course and click __'Next Step'__ to move onto using linear regression with multiple features." 70 | } 71 | ], 72 | "metadata": { 73 | "kernelspec": { 74 | "name": "r", 75 | "display_name": "R", 76 | "language": "R" 77 | }, 78 | "language_info": { 79 | "mimetype": "text/x-r-source", 80 | "name": "R", 81 | "pygments_lexer": "r", 82 | "version": "3.5.3", 83 | "file_extension": ".r", 84 | "codemirror_mode": "r" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 2 89 | } -------------------------------------------------------------------------------- /03. Multiple Linear Regression - R.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "collapsed": true 6 | }, 7 | "cell_type": "markdown", 8 | "source": "Exercise 3 - Multiple Linear Regression\n===\n\nFrom the previous exercise, we know that customers are happier with chocolate bars that are heavier and have a high percentage of cocoa. Customers may feel differently when they have to pay more for these bars though.\n\nIn this exercise, we will try to find the chocolate bar that best suits customers, taking into account cocoa percentage, weight, and cost, using multiple linear regression.\n\n#### Run the cell below to start installing the necessary packages." 9 | }, 10 | { 11 | "metadata": { 12 | "trusted": true 13 | }, 14 | "cell_type": "code", 15 | "source": "# Run this first! It might take a little while to load...\n\n# Loads `tidyverse` and `plot3D` packages for graphing capabilities\nsuppressMessages(install.packages(\"tidyverse\"))\nsuppressMessages(library(\"tidyverse\"))\nsuppressMessages(library(\"plot3D\"))", 16 | "execution_count": null, 17 | "outputs": [] 18 | }, 19 | { 20 | "metadata": {}, 21 | "cell_type": "markdown", 22 | "source": "Step 1\n---\n\nFirst, let's load the libraries we will require for this exercise, and a new chocolate bar data set that includes the feature `cost`. We will then inspect the data that we loaded.\n\n> _It is good practice to \"sanity check\" your data objects regularly in R, i.e. check the structure of your object using `str`, and print samples of your output regularly using `head` and `tail`. Sanity checking helps prevent errors downstream, and is useful for debugging code._\n\n### In the cell below replace:\n#### 1. `` with `str(choc_data)`\n#### 2. `` with `head(choc_data)`\n#### 3. `` with `tail(choc_data)`\n#### then __run the code__." 
23 | }, 24 | { 25 | "metadata": { 26 | "trusted": true 27 | }, 28 | "cell_type": "code", 29 | "source": "# Loads the new chocolate data for multiple linear regression and save it to the variable name `choc_data`\nchoc_data <- read.delim(\"Data/chocolate data multiple linear regression.txt\")\n\n### \n# REPLACE WITH str(choc_data) TO CHECK THE STRUCTURE OF choc_data.\n###\n\n###\n\n###\n# REPLACE WITH head(choc_data) TO INSPECT THE START OF THE DATA.\n###\n\n###\n\n###\n# REPLACE WITH tail(choc_data) TO INSPECT THE END OF THE DATA.\n###\n\n###", 30 | "execution_count": null, 31 | "outputs": [] 32 | }, 33 | { 34 | "metadata": {}, 35 | "cell_type": "markdown", 36 | "source": "Our new data set `choc_data` contains 100 observations and 4 variables: weight, cocoa percent, cost, and customer happiness.\n\nStep 2\n---\n\nPreviously we found that customers like a high percentage of cocoa and heavier bars of chocolate. Large bars of chocolate cost more money, though, which might make customers less inclined to purchase them.\n\nLet's perform a simple linear regression to see the relationship between __customer happiness__ and chocolate bar __weight__ when the cost of the chocolate was taken into consideration for the survey.\n\n** Run the code block below. You do not need to edit the code before running it. **" 37 | }, 38 | { 39 | "metadata": { 40 | "trusted": true 41 | }, 42 | "cell_type": "code", 43 | "source": "# Run this!\n\n# DO NOT EDIT THIS CODE BLOCK\n\n# Use function from Exercise 2 to generate a simple linear regression model then graph the result\nlin_reg_choc <- function(x, y, my_data){\n \n x_arg <- my_data[ , substitute(x)]\n y_arg <- my_data[ , substitute(y)]\n \n # Perform linear regression using `lm` (stands for linear models) function\n lm_choc <- lm(formula = y_arg ~ x_arg, data = my_data)\n \n # Save lm_choc to the workspace\n lm_choc <<- lm_choc\n \n # Create scatter plot of choc_data together with linear model\n ggplot(data = my_data, aes_string(x = x, y = y)) +\n geom_point() +\n # Add line based on linear model\n geom_abline(intercept = lm_choc$coefficients[1], \n slope = lm_choc$coefficients[2],\n colour = \"red\") +\n # x-axis label remains constant\n xlab(\"Customer happiness\") +\n # y-axis label; use `gsub` function to remove underscore from \n ylab(gsub(\"_\", \" \", y)) +\n # graph title\n ggtitle(paste(\"Customer satisfaction with chocolate bars given\", gsub(\"_\", \" \", y))) +\n theme(plot.title = element_text(hjust = 0.5))\n \n}", 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "metadata": {}, 49 | "cell_type": "markdown", 50 | "source": "#### Change `` to `\"weight\"` and run the code." 51 | }, 52 | { 53 | "metadata": { 54 | "trusted": true 55 | }, 56 | "cell_type": "code", 57 | "source": "# Call our custom function to run simple linear regression and plot the results\n\n###\n# REPLACE BELOW WITH \"weight\" (INCLUDING THE QUOTATION MARKS)\n###\nlin_reg_choc(x = \"customer_happiness\", y = , my_data = choc_data)\n###", 58 | "execution_count": null, 59 | "outputs": [] 60 | }, 61 | { 62 | "metadata": {}, 63 | "cell_type": "markdown", 64 | "source": "Customer happiness still increases with larger bars of chocolate. However, many data points (black) are a long way from our linear regression model (red line). This means that our model doesn't describe the data very well. 
It is likely that there are other features of the chocolate bars that are influencing customer happiness.\n\nRepeat the exercise looking at `cocoa_percent`; you should see a similar trend.\n\n#### Replace the `` with `\"cocoa_percent\"` and run the code." 65 | }, 66 | { 67 | "metadata": { 68 | "trusted": true 69 | }, 70 | "cell_type": "code", 71 | "source": "# Call our custom function to run simple linear regression and plot the results\n###\n# REPLACE BELOW WITH \"cocoa_percent\" (INCLUDING THE QUOTATION MARKS)\n###\nlin_reg_choc(x = \"customer_happiness\", y = , my_data = choc_data)\n###", 72 | "execution_count": null, 73 | "outputs": [] 74 | }, 75 | { 76 | "metadata": {}, 77 | "cell_type": "markdown", 78 | "source": "Step 3\n---\n\nWe can check how well our data fit our simple linear regression model by obtaining the R² values. R² values range between 0 and 1, where 1 is a perfect fit. What is a \"good\" or \"bad\" fit depends on several things, but for this exercise, numbers below 0.3 will mean a poor fit.\n\nThe linear model for simple linear regression we just ran, \"cocoa percent vs. customer happiness\", is saved under `lm_choc`. Let's determine the R² value of this model. \n\n** Run the code below (you don't need to edit it).**" 79 | }, 80 | { 81 | "metadata": { 82 | "trusted": true 83 | }, 84 | "cell_type": "code", 85 | "source": "# Extract R² value from linear model\nprint(summary(lm_choc)$r.squared)", 86 | "execution_count": null, 87 | "outputs": [] 88 | }, 89 | { 90 | "metadata": {}, 91 | "cell_type": "markdown", 92 | "source": "We obtain an R² value < 0.3, which means our simple linear model \"cocoa percent vs. customer happiness\" is a poor fit of the data. \n\nTo check the R² value for the \"weight vs. customer happiness\" model:\n\n** 1. In the 2nd code block in Step 2, replace `cocoa_percent` with `weight`, then run this block.**\n\n** 2. Run the code block in Step 3. **\n\nThis will overwrite the linear model saved to `lm_choc`. You should obtain an R² value of 0.1887701, which is also < 0.3, so we should create a better model for our data.\n\nStep 4\n---\n\nThe problem with our chocolate bar survey is that the chocolate bar variables aren't controlled; cost, bar weight, and cocoa percent are different for every chocolate bar.\n\nWe want to see the relationship between cocoa content and customer happiness, but cost and block weight are also influencing customer happiness.\n\nWe *could* run another survey, giving away chocolate bars that are all the same weight for free (i.e. weight and cost are constant), and ask people how happy they are with the chocolate bar given varying percentages of cocoa. However, this would be expensive and time-consuming.\n\n__Alternatively, we can use multiple linear regression__. Multiple linear regression gives us the relationship between each _feature_ and customer happiness. These are provided as _coefficients_ (slopes). Positive numbers indicate a positive relationship (i.e. customer happiness increases as this feature increases), negative numbers indicate a negative relationship (customer happiness decreases as this feature increases). Unlike _simple_ linear regression, these relationships should be independent. That means that our relationship between cocoa percentage and customer happiness should not be influenced strongly by bar weight or cost. \n\n#### Change the formula for multiple linear regression by replacing `` with `cost` and run the code. (A completed version is sketched just below for reference.)"
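For reference, a minimal sketch of the completed call, with `cost` filled in as instructed above:

```
# Multiple linear regression: model customer happiness as a function of
# weight, cocoa percentage, and cost all at once
lm_choc_mlr <- lm(formula = customer_happiness ~ weight + cocoa_percent + cost,
                  data = choc_data)

# Print the coefficients (slopes) of the new model
summary(lm_choc_mlr)
```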
93 | }, 94 | { 95 | "metadata": { 96 | "trusted": true 97 | }, 98 | "cell_type": "code", 99 | "source": "###\n# IN THE LINE BELOW REPLACE WITH cost\n###\nlm_choc_mlr <- lm(formula = customer_happiness ~ weight + cocoa_percent + , \n data = choc_data)\n###\n\n# Print the coefficients (slopes) of our new model\nsummary(lm_choc_mlr)", 100 | "execution_count": null, 101 | "outputs": [] 102 | }, 103 | { 104 | "metadata": {}, 105 | "cell_type": "markdown", 106 | "source": "Inspect the \"Coefficients\" heading within the results summary of our multiple linear regression model. In particular, look at the values in the \"Estimate\" column, which represent the estimates of the coefficients. Are the values positive or negative?\n\nThe coefficients for `weight` and `cocoa_percent` are both positive, which means they both independently increase customer happiness. However, the coefficient for cost is negative, which means increases in cost decrease customer happiness.\n\nThe R² value (second-to-last line of the results summary) is also higher than before. This means our multiple linear regression model fits the data better than our simple linear regression models.\n\nStep 5\n---\n\nFrom our multiple linear regression, we have an equation, `lm_choc_mlr`, that predicts customer happiness. It looks like so:\n\n`customer_happiness = -9.34 + weight * 0.106 + cocoa_percent * 31.9 + cost * -1.31`\n\nWe might also know that, for our company, the cost of manufacturing and shipping each bar can be calculated as:\n\n`cost = (0.05 * weight + weight * cocoa_percent)^2 * 0.0004`\n\nFrom this, we can calculate the best bar for our customers, by balancing the cost against how happy the customer is likely to be with this product. Let's plot this in 3D to see what our optimum chocolate bar should be.\n\n** Run the code below. You do not need to edit the code. **" 107 | }, 108 | { 109 | "metadata": { 110 | "trusted": true 111 | }, 112 | "cell_type": "code", 113 | "source": "# DO NOT EDIT THE CODE\n\n# Calculate customer happiness for a given bar of chocolate \n# Use our multiple linear regression model `lm_choc_mlr`\nchoc_data_mlr <- \n choc_data %>%\n # Calculate adjusted cost as stated in equation above\n mutate(cost_adj = (0.05 * weight + weight * cocoa_percent)^2 * 0.0004) %>% \n # Calculate customer happiness based on multiple linear regression model `lm_choc_mlr`\n mutate(cust_happ_mlr = coef(lm_choc_mlr)[\"(Intercept)\"] + (weight * coef(lm_choc_mlr)[\"weight\"]) +\n (cocoa_percent * coef(lm_choc_mlr)[\"cocoa_percent\"]) + \n (cost_adj * coef(lm_choc_mlr)[\"cost\"]))\n\n# Sanity check our data\nhead(choc_data_mlr)\n\n# Load package `plot3D` to create 3D scatter plot\n\nscatter3D(x = choc_data_mlr$weight, y = choc_data_mlr$cocoa_percent, \n z = choc_data_mlr$cust_happ_mlr, bty = \"g\", \n col = gg2.col(alpha = 0.75), pch = 16, theta = 45, phi = 30,\n xlab = \"Weight (x)\", ylab = \"Cocoa percent (y)\", zlab = \"Customer happiness (z)\",\n clab = \"Customer\\nhappiness\", ticktype = \"detailed\")", 114 | "execution_count": null, 115 | "outputs": [] 116 | }, 117 | { 118 | "metadata": {}, 119 | "cell_type": "markdown", 120 | "source": "In the 3D scatter plot above, the colour scheme represents customer happiness, with higher values coloured as dark grey. We can see that our optimum chocolate bar should be around 100 g and contain a high amount of cocoa. For large bars of chocolate, a cocoa content of around 50% appears to be ideal.\n\nNote how this result is different to our earlier exercise using _simple_ linear regression.
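To make the two equations from Step 5 concrete, here is a minimal worked sketch that plugs the optimum suggested by the plot (a 100 g bar at 50% cocoa) into the rounded coefficients quoted above. The specific bar is an illustrative choice, not part of the course code:

```
# Illustrative bar suggested by the 3D plot: 100 g at 50% cocoa
weight <- 100
cocoa_percent <- 0.5

# Cost of manufacturing and shipping, from the equation above
cost <- (0.05 * weight + weight * cocoa_percent)^2 * 0.0004

# Predicted customer happiness, from the rounded regression equation above
happiness <- -9.34 + weight * 0.106 + cocoa_percent * 31.9 + cost * -1.31
happiness
```

Swapping in other weights and cocoa percentages traces out the same surface shown in the 3D plot.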
With the simple linear regression, we assumed a large bar with a very high amount of cocoa was what customers would want.\n\nConclusion\n---\nThat's it! You can go back to the course now and click on __'Next Step'__ to carry on with our introduction to regression." 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "name": "r", 126 | "display_name": "R", 127 | "language": "R" 128 | }, 129 | "language_info": { 130 | "mimetype": "text/x-r-source", 131 | "name": "R", 132 | "pygments_lexer": "r", 133 | "version": "3.5.3", 134 | "file_extension": ".r", 135 | "codemirror_mode": "r" 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 2 140 | } -------------------------------------------------------------------------------- /04. Polynomial Regression - R.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "Exercise 4 - Polynomial Regression\n===\n\nSometimes our data doesn't have a linear relationship, but we still want to predict an outcome.\n\nSuppose we want to predict how satisfied people might be with a piece of fruit. We would expect satisfaction would be low if the fruit is under-ripe or over-ripe, and satisfaction would be high if the fruit is ripe, i.e. between under-ripe and over-ripe.\n\nThis is not something linear regression will help us with; however, polynomial regression can help us make predictions for more complex non-linear relationships such as these.\n\nStep 1\n---\n\nIn this exercise, we will look at a dataset analysing internet traffic over the course of the day. Observations were made every hour, on the hour, over the course of several days. Suppose we want to predict the level of internet traffic we might see at any time during the day. How might we do this?\n\nLet's start by loading the required libraries for this session, and loading our data to have a look at it.\n\n** In the cell below replace: **\n\n** 1. `` with `str` **\n\n** 2. `` with `head` **\n\n** then __run the code__. **" 7 | }, 8 | { 9 | "metadata": { 10 | "trusted": false 11 | }, 12 | "cell_type": "code", 13 | "source": "# Load required libraries\nsuppressMessages(library(\"tidyverse\"))\n\ntraf_by_hr <- read.csv(\"Data/traffic_by_hour.csv\", check.names = FALSE)\n\n###\n# REPLACE WITH str AND WITH head\n###\n(traf_by_hr)\n(traf_by_hr)\n###", 14 | "execution_count": null, 15 | "outputs": [] 16 | }, 17 | { 18 | "metadata": {}, 19 | "cell_type": "markdown", 20 | "source": "By inspecting the structure of `traf_by_hr`, we can see that we have the following variables: \n\n* `Hour` 00 - 23, spread across the columns;\n* `Observation` 1 - 6, with each observation representing one row;\n* `Traffic` (in Gbps), representing the values of each observation made every hour. \n\nStep 2\n---\n\nNext, we need to reshape the data using the package `tidyr` so we can plot our data using `ggplot2`. The `ggplot2` functions expect our data input to be in \"long\" format, i.e. a column for each feature name, and a row for each observation. Currently, our data is stored in \"wide\" format, with one feature, i.e. the hour, spread across multiple columns. We need to reshape the `traf_by_hr` data so that our variables, `Hour`, `Observation`, and `Traffic`, are the column names.\n\n> \"Long\" and \"wide\" format are visual metaphors that describe two ways of presenting the exact same information.
Hopefully you will understand the metaphor once you see \"long\" and \"wide\" data for yourself, as per below!\n\n#### Run the following code block to reshape `traf_by_hr` (\"wide\" format) to `traf_by_hr_tall` (\"long\" format). You do not need to edit the code block." 21 | }, 22 | { 23 | "metadata": { 24 | "trusted": false 25 | }, 26 | "cell_type": "code", 27 | "source": "# Run this!\n\n# DO NOT EDIT\n\n# Reshape data to long format using tidyr's gather function\ntraf_by_hr_tall <- traf_by_hr %>%\ngather(key = \"Hour\", value = \"Traffic\") %>%\nmutate(Observation = as.factor(rep(1:6, 24))) %>% \nmutate(Hour = as.numeric(Hour))\n\n# Check structure of reshaped data\nstr(traf_by_hr_tall)\nhead(traf_by_hr_tall, n = 10)", 28 | "execution_count": null, 29 | "outputs": [] 30 | }, 31 | { 32 | "metadata": {}, 33 | "cell_type": "markdown", 34 | "source": "Compare the structural difference between `traf_by_hr_tall` and `traf_by_hr` for yourself, particularly the dimensions. You should observe the following:\n\n* `traf_by_hr` (\"wide\" data): 6 observations (rows) x 24 variables (columns);\n* `traf_by_hr_tall` (\"long\" data): 144 observations (rows) x 3 variables (columns).\n\nNow with our data in \"long\" format, we can use `ggplot2` functions for plotting.\n\nIn the `ggplot` function call within the code block below:\n\n** In the cell below replace: ** \n** 1. `` with `Hour` ** \n** 2. `` with `Traffic` ** \n** then __run the code__. **" 35 | }, 36 | { 37 | "metadata": { 38 | "trusted": false 39 | }, 40 | "cell_type": "code", 41 | "source": "# Visualise the data using ggplot2\ntraf_by_hr_tall %>% \n\n###\n# REPLACE THE TO Hour AND TO Traffic\n###\nggplot(aes(x = , y = , colour = Observation)) +\n###\ngeom_line() +\nggtitle(\"Internet traffic for each hour of the day\") +\nxlab(\"Hour of the day (24 hour time)\") +\nylab(\"Internet traffic (Gbps)\") +\ntheme(plot.title = element_text(hjust = 0.5))", 42 | "execution_count": null, 43 | "outputs": [] 44 | }, 45 | { 46 | "metadata": {}, 47 | "cell_type": "markdown", 48 | "source": "This plot looks a bit busy due to overplotting. We should summarize the data to help us visualize trends.\n\nStep 3\n---\n\nLet's see if we can visualize a clearer pattern by taking the __average values__ for each hour.\n\nIn the code block below, find the `geom_point` function call, then:\n\n** In the cell below replace:** \n\n**1. `` with `Hour`** \n\n**2. `` with `Traffic_mean`** \n\n**then __run the code__.
**" 49 | }, 50 | { 51 | "metadata": { 52 | "trusted": false 53 | }, 54 | "cell_type": "code", 55 | "source": "# Find mean values for each hour using `dplyr` function `mutate`\ntraf_by_hr_tall <- traf_by_hr_tall %>% \ngroup_by(Hour) %>% \nmutate(Traffic_mean = mean(Traffic)) %>% \nungroup() %>% \nas.data.frame()\n\n# Check structure of data\nstr(traf_by_hr_tall)\nhead(traf_by_hr_tall, n = 10)\n\n# Create plot\ntraf_by_hr_tall %>% \nggplot() +\n\n# Plot the average of the 6 observations as points\n\n###\n# REPLACE TO Hour and to Traffic_mean\n###\ngeom_point(aes(x = , y = )) +\n###\n# Plot each observation 1 - 6 as a line\ngeom_line(aes(x = Hour, y = Traffic, colour = Observation), alpha = 0.5) +\nylab(\"Average internet traffic (Gbps)\") +\ntheme(plot.title = element_text(hjust = 0.5))", 56 | "execution_count": null, 57 | "outputs": [] 58 | }, 59 | { 60 | "metadata": {}, 61 | "cell_type": "markdown", 62 | "source": "The plot above shows the average value for each hour as points (black), together with observations 1 - 6 as lines.\n\nWe can also plot our data using a graph type that summarizes the data for us, such as a box and whisker plot.\n\nIn the code below, within the `ggplot` function call, change the x and y variables to the features we want to observe.\n\n** In the cell below replace:** \n**1. `` with `Hour`** \n**2. `` with `Traffic`** \n**then __run the code__. **" 63 | }, 64 | { 65 | "metadata": { 66 | "trusted": false 67 | }, 68 | "cell_type": "code", 69 | "source": "# Create box and whisker plot\ntraf_by_hr_tall %>% \n###\n# REPLACE TO Hour and to Traffic\n###\nggplot(aes(x = , y = , group = Hour)) +\n###\ngeom_boxplot() +\nggtitle(\"Internet traffic for each hour of the day\") +\nxlab(\"Hour of the day (24 hour time)\") +\nylab(\"Internet traffic (Gbps)\") +\n# Align title to centre\ntheme(plot.title = element_text(hjust = 0.5))", 70 | "execution_count": null, 71 | "outputs": [] 72 | }, 73 | { 74 | "metadata": {}, 75 | "cell_type": "markdown", 76 | "source": "This summarization of the data could help us make a prediction if we wanted to know the expected traffic exactly on the hour.\n\nBut, we'll need to be a bit clever if we want to make a good prediction of times in between." 77 | }, 78 | { 79 | "metadata": {}, 80 | "cell_type": "markdown", 81 | "source": "Step 4\n---\n\nLet's use the midpoints in between the hours to help us analyse the relationship between the time of day and the amount of internet traffic.\n\nThe `lm` (linear model) function together with the `poly` (polynomial) function allow us to do just this. We need to specify a feature $x$ (time of day), our outcome $y$ (the amount of internet traffic), and the $degree$ of the polynomial (how curvy the line is).\n\n> You can use the `lm` function directly, but for this exercise we will use `lm` indirectly through `ggplot2`.\n\nFirst we will test polynomial functions with degrees 1, 2, 3 and 4. Note the first degree polynomial (degree = 1) has already been completed for you; first degree polynomials are linear, so we will include it for comparison.\n\n** In the cell below replace: ** \n** 1. `` with `2`** \n** 2. `` with `3`** \n** 3. 
`` with `4`** \n** then __run the code__.**" 82 | }, 83 | { 84 | "metadata": { 85 | "trusted": false 86 | }, 87 | "cell_type": "code", 88 | "source": "traf_by_hr_tall %>% \nggplot(aes(x = Hour, y = Traffic_mean)) +\ngeom_point(alpha = 0.5) +\nstat_smooth(method = \"lm\", formula = y ~ poly(x, degree = 1), colour = \"black\", linetype = \"dashed\", se = FALSE) +\n\n###\n# REPLACE WITH 2, WITH 3, AND WITH 4\n###\nstat_smooth(method = \"lm\", formula = y ~ poly(x, degree = ), colour = \"#F8766D\", se = FALSE) + # red\nstat_smooth(method = \"lm\", formula = y ~ poly(x, degree = ), colour = \"#00BFC4\", se = FALSE) + # blue\nstat_smooth(method = \"lm\", formula = y ~ poly(x, degree = ), colour = \"#7CAE00\", se = FALSE) + # green\n\nxlab(\"Hour of the day (24 hour time)\") +\nylab(\"Internet traffic (Gbps)\") +\nggtitle(\"Testing fit of polynomial functions (degrees 1 - 4) to internet traffic data\") +\ntheme(plot.title = element_text(hjust = 0.5))", 89 | "execution_count": null, 90 | "outputs": [] 91 | }, 92 | { 93 | "metadata": {}, 94 | "cell_type": "markdown", 95 | "source": "None of these polynomial functions do a great job of generalising the data. Let's try a few more.\n\nIn the code below, test polynomial functions of degree 5, 6 and 7.\n\n** In the cell below replace:** \n** 1. `` with `5`** \n** 2. `` with `6`** \n** 3. `` with `7`** \n** then __run the code__.**" 96 | }, 97 | { 98 | "metadata": { 99 | "trusted": false 100 | }, 101 | "cell_type": "code", 102 | "source": "traf_by_hr_tall %>% \nggplot(aes(x = Hour, y = Traffic_mean)) +\ngeom_point(alpha = 0.5) +\n# Polynomials of degree 1 are linear, keep this for comparison\nstat_smooth(method = \"lm\", formula = y ~ poly(x, degree = 1), colour = \"black\", linetype = \"dashed\", se = FALSE) +\n# Change degree = ? to degree = 5\n\n###\n# REPLACE WITH 5, WITH 6, AND WITH 7\n###\nstat_smooth(method = \"lm\", formula = y ~ poly(x, degree = ), colour = \"#F8766D\", se = FALSE) + # red\nstat_smooth(method = \"lm\", formula = y ~ poly(x, degree = ), colour = \"#00BFC4\", se = FALSE) + # blue\nstat_smooth(method = \"lm\", formula = y ~ poly(x, degree = ), colour = \"#7CAE00\", se = FALSE) + # green\n###\n\nxlab(\"Hour of the day (24 hour time)\") +\nylab(\"Internet traffic (Gbps)\") +\nggtitle(\"Testing fit of polynomial functions (degrees 5 - 7) to internet traffic data\") +\ntheme(plot.title = element_text(hjust = 0.5))", 103 | "execution_count": null, 104 | "outputs": [] 105 | }, 106 | { 107 | "metadata": {}, 108 | "cell_type": "markdown", 109 | "source": "It looks like the 6th and 7th degree polynomials have an identical curve, so either of these polynomials will be a good model to use.\n\nWe could use an even higher degree polynomial to fit the model to our data even more tightly, but we don't want to overfit the curve, since we just want a generalization of the relationship between time of day and internet traffic.\n\nLet's see how our 6th degree polynomial alone compares to the real data.\n\n#### Run the code below." 
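As an aside, the visual comparison can be cross-checked numerically. The short loop below is our own sketch, not part of the course code: it fits each candidate degree with `lm` plus `poly` and prints the R² fit measure introduced in Exercise 3. If the visual reading is right, the gains should level off by around degree 6:

```
# Fit polynomial models of degree 1 to 7 and report R-squared for each
for (d in 1:7) {
  fit <- lm(Traffic ~ poly(Hour, degree = d), data = traf_by_hr_tall)
  cat("degree", d, "R-squared:", round(summary(fit)$r.squared, 3), "\n")
}
```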
110 | }, 111 | { 112 | "metadata": { 113 | "trusted": false 114 | }, 115 | "cell_type": "code", 116 | "source": "# Run this!\n\n# DO NOT EDIT\n\ntraf_by_hr_tall %>% \nggplot(aes(x = Hour, y = Traffic, colour = Observation)) +\ngeom_line(alpha = 0.5) +\nstat_smooth(method = \"lm\", formula = y ~ poly(x, degree = 6), colour = \"black\", se = FALSE) +\nxlab(\"Hour of the day (24 hour time)\") +\nylab(\"Internet traffic (Gbps)\") +\nggtitle(\"Testing fit of 6th degree polynomial to internet traffic data\") +\ntheme(plot.title = element_text(hjust = 0.5))", 117 | "execution_count": null, 118 | "outputs": [] 119 | }, 120 | { 121 | "metadata": {}, 122 | "cell_type": "markdown", 123 | "source": "Looking good!\n\nStep 5\n---\n\nNow let's try using this polynomial regression model to make a prediction for how much internet traffic there will be at a certain time of day. Let's choose the time 12:30 pm.\n\n** Replace the `` with `12.5`, which represents the time 12:30 pm, and run the code. **" 124 | }, 125 | { 126 | "metadata": { 127 | "trusted": false 128 | }, 129 | "cell_type": "code", 130 | "source": "###\n# REPLACE THE WITH 12.5 (THIS REPRESENTS 12:30PM)\n###\nt <- data.frame(Hour = )\n###\n\n# Save our 6th degree polynomial model\nlm_poly_6th <- lm(formula = Traffic ~ poly(Hour, degree = 6), data = traf_by_hr_tall)\n\n# Use predict function, and round the result to 2 decimal places\n# Input to predict function must be a data frame, with the column name set as the x value\nt_pred <- round(predict(lm_poly_6th, t), 2)\n\nprint(paste(\"Based on our polynomial regression model, at time t = 12.5\",\n \"the expected internet traffic is\", t_pred , \"Gbps.\"))\n\ntraf_by_hr_tall %>% \nggplot(aes(x = Hour, y = Traffic)) +\nstat_smooth(method = \"lm\", formula = y ~ poly(x, degree = 6), colour = \"black\", se = FALSE) +\n# Show predicted value as a point in red\ngeom_point(x = 12.5, y = t_pred, size = 3, colour = \"red\") +\n# Add horizontal reference line at the predicted traffic level\ngeom_hline(yintercept = t_pred, linetype = \"dashed\", colour = \"red\") +\n# Add vertical reference line at t = 12.5\ngeom_vline(xintercept = 12.5, linetype = \"dashed\", colour = \"red\") +\nggtitle(\"Prediction of expected traffic at t = 12.5\") +\nxlab(\"Hour of the day (24 hour time)\") +\nylab(\"Internet traffic (Gbps)\") +\ntheme(plot.title = element_text(hjust = 0.5))", 131 | "execution_count": null, 132 | "outputs": [] 133 | }, 134 | { 135 | "metadata": {}, 136 | "cell_type": "markdown", 137 | "source": "Conclusion\n---\n\nThere we have it! You have made a polynomial regression model and used it for analysis! This model gives us a prediction for the level of internet traffic we should expect to see at any given time of the day.\n\nYou can go back to the course and either click __'Next Step'__ to start an optional step with tips on how to better work with AI models, or you can go to the next module where instead of predicting numbers we predict categories." 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "name": "r", 143 | "display_name": "R", 144 | "language": "R" 145 | }, 146 | "language_info": { 147 | "mimetype": "text/x-r-source", 148 | "name": "R", 149 | "pygments_lexer": "r", 150 | "version": "3.4.1", 151 | "file_extension": ".r", 152 | "codemirror_mode": "r" 153 | } 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 2 157 | } -------------------------------------------------------------------------------- /05.
Logistic Regression - R.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "collapsed": true 6 | }, 7 | "cell_type": "markdown", 8 | "source": "Exercise 5 - Logistic Regression\n===\n\nSimple logistic regression predicts binary (yes/no) events. For example, we may want to predict if someone will arrive at work on time, or if a person shopping will buy a product. \n\nThis exercise will demonstrate simple logistic regression: predicting an outcome from only one feature.\n\n#### Run the code below to prepare the necessary libraries for this exercise." 9 | }, 10 | { 11 | "metadata": { 12 | "trusted": true 13 | }, 14 | "cell_type": "code", 15 | "source": "# Run this!\n\nsuppressMessages(install.packages(\"tidyverse\"))\nsuppressMessages(library(\"tidyverse\"))\nsuppressMessages(library(\"glmnet\"))", 16 | "execution_count": null, 17 | "outputs": [] 18 | }, 19 | { 20 | "metadata": { 21 | "trusted": true 22 | }, 23 | "cell_type": "markdown", 24 | "source": "Step 1\n---\n\nWe want to place a bet on the outcome of the next football (soccer) match. It is the final of a competition, so there will not be a draw. We have historical data about our favourite team playing in matches such as this. \n\nComplete the exercise below to see the structure of this data.\n\n### In the cell below replace:\n#### 1. `` with `str`\n#### 2. `` with `head`\n#### then __run the code__." 25 | }, 26 | { 27 | "metadata": { 28 | "trusted": true 29 | }, 30 | "cell_type": "code", 31 | "source": "team_stats <- read.delim(\"Data/football data.txt\")\n\n###\n# REPLACE WITH str AND WITH head\n###\n(team_stats)\n(team_stats)\n###\n\nsummary(team_stats$average_goals_per_match)", 32 | "execution_count": null, 33 | "outputs": [] 34 | }, 35 | { 36 | "metadata": {}, 37 | "cell_type": "markdown", 38 | "source": "The `team_stats` data shows the average goals per match of our team for the season in the first column, and whether the team won the competition in the second column. The `won_competition` variable is a binary outcome, where 1 represents a win, and 0 represents a loss.\n\nStep 2\n---\n\nLet's graph the data so we have a better idea of what's going on. \n\nComplete the exercise below to make a scatter plot of `team_stats`. Replace the x variable with the name of the feature we want to plot on the x-axis.\n\n#### In the cell below replace `` with `average_goals_per_match`" 39 | }, 40 | { 41 | "metadata": { 42 | "trusted": true 43 | }, 44 | "cell_type": "code", 45 | "source": "team_stats %>% \n###\n# REPLACE WITH average_goals_per_match\n###\nggplot(aes(x = , y = as.factor(won_competition), colour = as.factor(won_competition))) +\n###\n\ngeom_jitter() +\nggtitle(\"Game statistics for favourite football team\") +\nxlab(\"Average number of goals scored per match\") +\nylab(\"Competition win\") +\n# Align title to centre\ntheme(plot.title = element_text(hjust = 0.5), legend.position = \"none\")", 46 | "execution_count": null, 47 | "outputs": [] 48 | }, 49 | { 50 | "metadata": {}, 51 | "cell_type": "markdown", 52 | "source": "In the plot above, we have used ggplot2's `geom_jitter` function, which adds a small amount of random variation to the location of each point. 
Since we have binary outcomes in this dataset, using this function allows us to handle overplotting.\n\n> If you want to test this for yourself, change the `geom_jitter` call in the code block above to `geom_point`; it is harder to decipher which points overlap using the latter function.\n\nWe can see that in general, when our team has a good score average (x-axis), they tend to win the competition.\n\nStep 3\n---\n\nHow can we predict whether the team will win this season? Let's apply AI to this problem by making a logistic regression model using this data and then graphing it.\n\nWe will use the function `glm`, which stands for generalized linear models. We will set the type of model (the \"family\" argument) to binomial, to specify that we want a logistic regression model. \n\nWe'll use the standard R format for the formula, which is `labels ~ features` (if you see a `.` this means it will select all features in the dataset).\n\n### In the cell below replace:\n#### 1. `` with `won_competition ~ average_goals_per_match`\n#### 2. `` with `team_stats`\n#### then __run the code__." 53 | }, 54 | { 55 | "metadata": { 56 | "trusted": true 57 | }, 58 | "cell_type": "code", 59 | "source": "###\n# REPLACE WITH won_competition ~ average_goals_per_match AND WITH team_stats\n###\nglm_team <- glm(formula = , family = binomial(link = \"logit\"), \n data = )\n###\nsummary(glm_team)\n\n# And we'll quickly print out some predictions to make sure it's working\nhead(predict(glm_team, newdata = team_stats, type = \"response\"))", 60 | "execution_count": null, 61 | "outputs": [] 62 | }, 63 | { 64 | "metadata": {}, 65 | "cell_type": "markdown", 66 | "source": "Alright, that's the model done. Now run the code below to graph it." 67 | }, 68 | { 69 | "metadata": { 70 | "trusted": true 71 | }, 72 | "cell_type": "code", 73 | "source": "# Run this!\n\n# Plot using ggplot2\nteam_stats %>%\nggplot(aes(x = average_goals_per_match, y = won_competition)) +\ngeom_point(aes(colour = as.factor(won_competition)), alpha = 0.5, size = 3) +\ngeom_smooth(method = \"glm\", se = FALSE, method.args = list(family = \"binomial\"), \n colour = \"black\") +\nggtitle(\"Binomial logistic regression model for football team competition win\") +\nxlab(\"Average number of goals scored per match\") +\nylab(\"Competition win\") +\ntheme(plot.title = element_text(hjust = 0.5), legend.position = \"none\") + \nscale_y_continuous(labels = c(\"0\", \"\", \"\", \"\", \"1\"))", 74 | "execution_count": null, 75 | "outputs": [] 76 | }, 77 | { 78 | "metadata": {}, 79 | "cell_type": "markdown", 80 | "source": "We now have a binomial logistic regression model to fit our data. The black line represents our model.\n\nStep 4\n------\n\nWe can read the model above like so:\n* Take the average number of goals per match for the current year. Let's say it is 2.5.\n* Find 2.5 on the x-axis. \n* What value (on the y-axis) does the line have at x = 2.5?\n* If this value is above 0.5, then the model predicts that our team will win this year. If it is less than 0.5, it predicts that our team will lose.\n\nBecause this line is just a mathematical function (equation) we don't have to do this visually.\n\nIn the exercise below, choose the number of goals you want to evaluate.\n\nThe code will calculate the probability that our team will win with your chosen number of goals in the match.\n\n#### Replace `` with a number between 0 and 3, then run the code." 
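# Editor's aside (a minimal sketch, not part of the notebook): the fitted line really is
# just the logistic function, so we can compute a prediction by hand from the
# coefficients of glm_team (fitted in Step 3) and compare it with predict().
b <- coef(glm_team)                       # intercept and slope, on the log-odds scale
x <- 2.5                                  # average goals per match
p_manual <- 1 / (1 + exp(-(b[1] + b[2] * x)))
p_manual
# Should agree with the built-in prediction:
predict(glm_team, newdata = data.frame(average_goals_per_match = x), type = "response")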
81 | }, 82 | { 83 | "metadata": { 84 | "trusted": true 85 | }, 86 | "cell_type": "code", 87 | "source": "###\n# REPLACE WITH A NUMBER BETWEEN 0 AND 3\n###\ngoals <- \n###\n\n# Create data frame for input to predict function\nmean_goals <- data.frame(average_goals_per_match = c(goals))\n\n# Run predict function based on the input number of goals\nmean_goals$prediction <- predict(object = glm_team, newdata = mean_goals, type = \"response\")\n\n# View result\nmean_goals\n\n# Print out the result to screen\npaste0(\"The probability of our team winning this year is \", round(mean_goals$prediction * 100, digits = 4), \"%\")", 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "metadata": {}, 93 | "cell_type": "markdown", 94 | "source": "Now let's plot our chosen number of goals in the context of our model using ggplot2:" 95 | }, 96 | { 97 | "metadata": { 98 | "trusted": true 99 | }, 100 | "cell_type": "code", 101 | "source": "# Run this!\n\nteam_stats %>% \nggplot(aes(x = average_goals_per_match, y = won_competition)) +\ngeom_point(aes(colour = as.factor(won_competition)), alpha = 0.5, size = 3) +\ngeom_point(data = mean_goals, aes(x = average_goals_per_match, y = prediction), size = 5, colour = \"black\",\n shape = \"cross\") +\ngeom_smooth(method = \"glm\", se = FALSE, method.args = list(family = \"binomial\"), \n colour = \"black\") +\nggtitle(\"Binomial logistic regression model for football team competition win\") +\nxlab(\"Average number of goals scored per match\") +\nylab(\"Competition win\") +\ntheme(plot.title = element_text(hjust = 0.5), legend.position = \"none\") +\ngeom_hline(yintercept = mean_goals$prediction, linetype = \"dotted\") +\ngeom_vline(xintercept = mean_goals$average_goals_per_match, linetype = \"dotted\")", 102 | "execution_count": null, 103 | "outputs": [] 104 | }, 105 | { 106 | "metadata": {}, 107 | "cell_type": "markdown", 108 | "source": "Conclusion\n-----\n\nWell done! We have calculated the likelihood that our team will win this year's competition.\n\nYou can go back to the course now and click __'Next Step'__." 109 | } 110 | ], 111 | "metadata": { 112 | "kernelspec": { 113 | "name": "r", 114 | "display_name": "R", 115 | "language": "R" 116 | }, 117 | "language_info": { 118 | "mimetype": "text/x-r-source", 119 | "name": "R", 120 | "pygments_lexer": "r", 121 | "version": "3.4.1", 122 | "file_extension": ".r", 123 | "codemirror_mode": "r" 124 | } 125 | }, 126 | "nbformat": 4, 127 | "nbformat_minor": 2 128 | } -------------------------------------------------------------------------------- /06. Support Vector Machines - R.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "collapsed": true 6 | }, 7 | "cell_type": "markdown", 8 | "source": "Exercise 6 - Support Vector Machines\n===\n\nSupport vector machines (SVMs) let us predict categories. This exercise will demonstrate a simple support vector machine that can predict a category from a small number of features. \n\nOur problem is that we want to be able to categorise which type of tree a new specimen belongs to. 
To do this, we will use leaf and trunk features of three different types of trees to train SVMs.\n\n#### Run the cell below to load the required libraries." 9 | }, 10 | { 11 | "metadata": { 12 | "trusted": true 13 | }, 14 | "cell_type": "code", 15 | "source": "# Run this!\n\n# Load required packages\nsuppressMessages(install.packages(\"tidyverse\"))\nsuppressMessages(library(\"tidyverse\"))\n\nsuppressMessages(install.packages(\"e1071\"))\nsuppressMessages(library(\"e1071\"))", 16 | "execution_count": null, 17 | "outputs": [] 18 | }, 19 | { 20 | "metadata": {}, 21 | "cell_type": "markdown", 22 | "source": "Step 1\n---\n\nNow that the required packages are loaded, let's load the raw data to see what features we have.\n\n**In the code below, replace `` with `str` to view the structure of the raw data. Run the code once complete.**" 23 | }, 24 | { 25 | "metadata": { 26 | "scrolled": false, 27 | "trusted": true 28 | }, 29 | "cell_type": "code", 30 | "source": "# Load tree data and save as a new variable named `tree_data`\ntree_data <- read.csv(\"Data/trees.csv\")\n\n###\n# IN THE CODE BELOW, CHECK THE STRUCTURE OF tree_data BY REPLACING WITH str\n###\n(tree_data)\n###", 31 | "execution_count": null, 32 | "outputs": [] 33 | }, 34 | { 35 | "metadata": {}, 36 | "cell_type": "markdown", 37 | "source": "Given the results from `str(tree_data)`, we can see that we have _four features_: \n\n* `leaf_width`\n* `leaf_length`\n* `trunk_girth`\n* `trunk_height`\n\nWe also have _one label_:\n\n* `tree_type`\n\nLet's plot these features using the package `ggplot2`. We will look at the leaf features and trunk features separately using scatter plots, and colour the points based on the label `tree_type`.\n\n### In the cell below replace:\n#### 1. `` with `leaf_width`\n#### 2. `` with `leaf_length`\n#### then __run the code__." 38 | }, 39 | { 40 | "metadata": { 41 | "scrolled": false, 42 | "trusted": true 43 | }, 44 | "cell_type": "code", 45 | "source": "# Plot the leaf features, where `x = leaf_width` and `y = leaf_length`\ntree_data %>%\n\n###\n# REPLACE WITH leaf_width and with leaf_length\n###\nggplot(aes(x = , y = , colour = as.factor(tree_type))) +\n###\ngeom_point() +\nggtitle(\"Leaf length vs. leaf width coloured by tree type\") +\nlabs(x = \"Leaf width\", y = \"Leaf length\", colour = \"Tree type\") +\ntheme(plot.title = element_text(hjust = 0.5))", 46 | "execution_count": null, 47 | "outputs": [] 48 | }, 49 | { 50 | "metadata": {}, 51 | "cell_type": "markdown", 52 | "source": "Based on the features `leaf_width` and `leaf_length`, we can see three groups that separate according to the label `tree_type`: `0`, `1`, and `2` (coloured red, green, and blue, respectively).\n\nNow let's plot the trunk features in a separate plot.\n\nIn the code below, we will graph each of the trunk features.\n\n### In the cell below replace:\n#### 1. `` with `trunk_girth`\n#### 2. `` with `trunk_height`\n#### then __run the code__." 53 | }, 54 | { 55 | "metadata": { 56 | "scrolled": false, 57 | "trusted": true 58 | }, 59 | "cell_type": "code", 60 | "source": "# Plot the trunk features, where `x = trunk_girth` and `y = trunk_height`\ntree_data %>%\n\n###\n# REPLACE WITH trunk_girth and WITH trunk_height\n###\nggplot(aes(x = , y = , colour = as.factor(tree_type))) +\n###\ngeom_point() +\nggtitle(\"Trunk height vs. 
trunk girth coloured by tree type\") +\nlabs(x = \"Trunk girth\", y = \"Trunk height\", colour = \"Tree type\") +\ntheme(plot.title = element_text(hjust = 0.5))", 61 | "execution_count": null, 62 | "outputs": [] 63 | }, 64 | { 65 | "metadata": {}, 66 | "cell_type": "markdown", 67 | "source": "Based on the features `trunk_girth` and `trunk_height`, again we can see three groups that separate according to the label `tree_type`: `0`, `1`, and `2` (coloured red, green, and blue, respectively). There are some outliers, but for the most part, the features trunk girth and trunk height allow us to predict tree type.\n\nNow, say we obtain a new tree specimen and we want to figure out the tree type based on its leaf and trunk measurements. We *could* make a rough guess as to which tree type it belongs to based on where the tree data points lie in the two scatter plots we just created. Alternatively, using these same leaf and trunk measurements, SVMs can predict the tree type for us. SVMs will use the features and labels we provide for known tree types to create hyperplanes for tree type. These hyperplanes allow us to predict which tree type a new tree specimen belongs to, given its leaf and trunk measurements.\n\nIn the next step, we will use SVMs to help solve this problem.\n\nStep 2\n-----\n\nLet's make two SVMs using our data, `tree_data`: one SVM based on the leaf features, and another SVM based on the trunk features.\n\nThe syntax for a simple SVM using the package `e1071` is as follows:\n\n`svm_model <- svm(x = x, y = y, data = dataset)`\n\nwhere `x` represents the features (of class *matrix*), and `y` represents the labels (of class *factor*).\n\n> **R uses a variety of data types and data structures to describe different objects. You may have noticed a few types of objects already, including** `data.frame`, `list`, `matrix`, `factor`, **and** atomic vectors (`integer`, `numeric`, `logical`, `character`). **Knowing the structure of your data object is crucial, particularly when you are running functions that require the data object to be of a certain type.**\n\n> **For the `svm` function, we require two types of data structures: a** `matrix` **and a** `factor`**. A** `matrix` **is a two-dimensional data structure (with rows and columns) containing elements of all the same type, most commonly numbers (numeric or integer) with which you can perform further mathematical operations. Note that a** `matrix` **is different from a** `data.frame`**, as data frames can contain a mix of elements, i.e. both numbers (numeric/integer) and letters (factor/character/logical). A** `factor` **is used to categorise data, where the names of the categories are known as levels. For example, \"fruit\" could be a factor, with levels including \"apples\", \"bananas\", and \"oranges\", or in our example \"tree type\" can be a factor, with levels \"0\", \"1\", and \"2\". Also note that the levels of factors can be ordered, e.g. \"clothing size\" can be a factor, and you can order the levels \"small\", \"medium\", and \"large\", which is important when it comes to plotting the data.**\n\nFor our two SVMs, we will need to create the appropriate `x` and `y` variables based on `tree_data`.\n\nThe first SVM will be based on the leaf features `leaf_width` and `leaf_length`. We will need to create a new variable that contains only these two features, then convert it from a `data.frame` to a `matrix`, so it can be the input to the `x` argument in the `svm` function (both conversions are illustrated in the short sketch below). 
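# Editor's toy illustration (not exercise code): the two conversions discussed here,
# data.frame to matrix for x, and integer to factor for y, in miniature.
df <- data.frame(a = c(1.2, 3.4), b = c(5.6, 7.8), label = c(0L, 1L))
m <- as.matrix(df[, c("a", "b")])   # numeric matrix, suitable for the x argument
class(m)                            # "matrix" (plus "array" in recent R versions)
f <- as.factor(df$label)            # factor, suitable for the y argument
levels(f)                           # "0" "1"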
We also need to convert the labels `tree_type` into a factor for the input to the `y` variable of the `svm` function, as it is currently stored as an integer (see the results from `str` above, where `int` stands for `integer`).\n\nThe code below creates the appropriate `x` and `y` variables for the leaf features in `tree_data`. \n\n### In the cell below replace:\n#### 1. `` with `leaf_width`\n#### 2. `` with `leaf_length`\n#### then __run the code__." 68 | }, 69 | { 70 | "metadata": { 71 | "scrolled": true, 72 | "trusted": true 73 | }, 74 | "cell_type": "code", 75 | "source": "# Create new x variable input to `svm`\nx_leaf_data <- tree_data %>% \n\n###\n# REPLACE WITH leaf_width AND WITH leaf_length\n###\nselect(, ) %>%\n###\nas.matrix()\n\n# Check x variable input to `svm`\nclass(x_leaf_data)\nhead(x_leaf_data)\n\n# Change `tree_data$tree_type` to a factor\ntree_data <- tree_data %>% \nmutate(tree_type = as.factor(tree_type))\n\n# Check y variable input to `svm`\nclass(tree_data$tree_type)\nhead(tree_data$tree_type)", 76 | "execution_count": null, 77 | "outputs": [] 78 | }, 79 | { 80 | "metadata": {}, 81 | "cell_type": "markdown", 82 | "source": "Now we can run the function `svm` based on the leaf features stored in the new variable `x_leaf_data`, and the label saved in the variable `tree_data$tree_type`.\n\n### In the cell below replace:\n#### 1. `` with `x_leaf_data`\n#### 2. `` with `tree_data$tree_type`\n#### then __run the code__." 83 | }, 84 | { 85 | "metadata": { 86 | "scrolled": true, 87 | "trusted": true 88 | }, 89 | "cell_type": "code", 90 | "source": "###\n# REPLACE WITH x_leaf_data AND WITH tree_data$tree_type\n###\nsvm_leaf_data <- svm(x = , y = , type = \"C-classification\", kernel = \"radial\")\n###\nprint(\"The SVM model named svm_leaf_data is ready.\")", 91 | "execution_count": null, 92 | "outputs": [] 93 | }, 94 | { 95 | "metadata": {}, 96 | "cell_type": "markdown", 97 | "source": "To help us view the hyperplanes of the SVM based on the leaf data, we will create a fine grid of data points within the feature space to represent different combinations of leaf width and leaf length, and colour the new data points based on the predictions of `svm_leaf_data`. You do not need to edit this code block.\n\n**Run the code below**" 98 | }, 99 | { 100 | "metadata": { 101 | "scrolled": true, 102 | "trusted": true 103 | }, 104 | "cell_type": "code", 105 | "source": "# Run this box to create the grid of datapoints\n\n# Create a fine grid of the feature space\nleaf_width <- seq(from = min(tree_data$leaf_width), to = max(tree_data$leaf_width), length = 100)\nleaf_length <- seq(from = min(tree_data$leaf_length), to = max(tree_data$leaf_length), length = 100)\n\nfine_grid_leaf <- as.data.frame(expand.grid(leaf_width, leaf_length))\nfine_grid_leaf <- fine_grid_leaf %>%\n dplyr::rename(leaf_width = \"Var1\", leaf_length = \"Var2\")\n\n# Check output\nhead(fine_grid_leaf)\n\n# For every new point in `fine_grid_leaf`, predict its tree type based on the SVM `svm_leaf_data`\nfine_grid_leaf$tree_pred <- predict(svm_leaf_data, newdata = fine_grid_leaf, type = \"decision\")\n\n# Check output\nhead(fine_grid_leaf)\ntable(fine_grid_leaf$tree_pred)", 106 | "execution_count": null, 107 | "outputs": [] 108 | }, 109 | { 110 | "metadata": {}, 111 | "cell_type": "markdown", 112 | "source": "Now we can create a scatter plot that contains the new fine grid of points we created above, and also the original tree data to see which group the different trees fall into based on the SVM `svm_leaf_data`. 
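# Editor's aside (toy example, not exercise code): expand.grid, which the next cell
# relies on, simply returns every combination of its inputs as a data frame, one per row.
expand.grid(width = c(1, 2), length = c(10, 20, 30))
# 2 widths x 3 lengths = 6 rows; the notebook does the same with 100 x 100 values.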
You do not need to edit this code block.\n\n**Run the code below**" 113 | }, 114 | { 115 | "metadata": { 116 | "scrolled": false, 117 | "trusted": true 118 | }, 119 | "cell_type": "code", 120 | "source": "# Run this box to generate the scatter plot\n\n# Create scatter plot with original leaf features layered over the fine grid of data points\nggplot() +\ngeom_point(data = fine_grid_leaf, aes(x = leaf_width, y = leaf_length, colour = tree_pred), alpha = 0.25) +\nstat_contour(data = fine_grid_leaf, aes(x = leaf_width, y = leaf_length, z = as.integer(tree_pred)),\n lineend = \"round\", linejoin = \"round\", linemitre = 1, size = 0.25, colour = \"black\") +\ngeom_point(data = tree_data, aes(x = leaf_width, y = leaf_length, colour = tree_type, shape = tree_type)) +\nggtitle(\"SVM decision boundaries for leaf length vs. leaf width\") +\nlabs(x = \"Leaf width\", y = \"Leaf length\", colour = \"Actual tree type\", shape = \"Actual tree type\") +\ntheme(plot.title = element_text(hjust = 0.5))", 121 | "execution_count": null, 122 | "outputs": [] 123 | }, 124 | { 125 | "metadata": {}, 126 | "cell_type": "markdown", 127 | "source": "The graph shows three faintly coloured zones based on the SVM's predictions for the fine grid of data points (based on leaf features), and the hyperplanes for the different tree types represented by thick black lines. \n\nWe can use these coloured zones and hyperplanes to observe which tree type the SVM has chosen to place our original data points into. Note that in the graph above, our original data points are represented by both colour and shape. Also remember that the tree type of the fine grid of data points is based on the SVM model where we used leaf features as input to the SVM.\n\nSo, using the graph above, we observe two different classification scenarios:\n\n1. Our original data points are classified correctly by the SVM, as the data point falls into the zone of the same colour, e.g. a green triangle data point (an actual type 1 tree) falls into the green zone (the SVM predicted the tree as type 1).\n\n2. Our original data points are misclassified by the SVM, as the data point falls into the zone of a different colour, e.g. a red circle data point (an actual type 0 tree) falls into the green zone (the SVM predicted the tree as type 1). \n\nFor the most part, our SVM can classify tree type based on leaf features reasonably well, but let's determine the mis-classification rate. To do this, we will need to run the `predict` function again, but this time using our original data points as input. Note that this method is somewhat circular, since we used this same data to train the SVM, but we will run this just to give us an idea of how well our SVM fits our data. \n\n> **If we truly want to test the performance of our SVM, we need a *training set* with which to train the SVM, and an independent *test/validation set* with which to test the SVM.**\n\n**Run the code below to run the `predict` function. 
You do not need to edit this code block.**" 128 | }, 129 | { 130 | "metadata": { 131 | "trusted": true 132 | }, 133 | "cell_type": "code", 134 | "source": "# Run this box to run the predict function\n\npred_leaf_data <- tree_data %>% \nselect(leaf_width, leaf_length)\n\n# Predict the tree type of our original data based on the SVM `svm_leaf_data`\npred_leaf_data$tree_pred <- predict(svm_leaf_data, newdata = pred_leaf_data, type = \"decision\")\n\n# Check output\nhead(pred_leaf_data)\n\n# Add tree_data$tree_type to pred_leaf_data\npred_leaf_data <- inner_join(pred_leaf_data, tree_data, by = c(\"leaf_width\", \"leaf_length\")) %>%\nselect(-trunk_girth, -trunk_height)\n\n# Check output\nhead(pred_leaf_data)\n\n# Create a table of predictions to show mis-classification rate\ntable(pred_leaf_data$tree_pred, pred_leaf_data$tree_type)\n\n# Mis-classification rate: proportion of misclassified observations\nmean(pred_leaf_data$tree_pred != pred_leaf_data$tree_type)", 135 | "execution_count": null, 136 | "outputs": [] 137 | }, 138 | { 139 | "metadata": {}, 140 | "cell_type": "markdown", 141 | "source": "Our mis-classification rate is 6.5%, which can actually be preferable to a mis-classification rate of 0%, as the latter might indicate that the model has overfit the training data.\n\n# Step 3\n\nNow let's create our second SVM based on the trunk features. Remember, for the `e1071::svm` function, we need to create a new variable for input to the `x` argument, but we can use the same variable as before as input to `y`, `tree_data$tree_type`.\n\n### In the cell below replace:\n#### 1. `` with `trunk_girth`\n#### 2. `` with `trunk_height`\n#### then __run the code__." 142 | }, 143 | { 144 | "metadata": { 145 | "scrolled": true, 146 | "trusted": true 147 | }, 148 | "cell_type": "code", 149 | "source": "# Create new x variable input to `svm` based on trunk features\nx_trunk_data <- tree_data %>% \n\n###\n# REPLACE WITH trunk_girth and WITH trunk_height\n###\nselect(, ) %>%\n###\nas.matrix()\n\n# Check output\nhead(x_trunk_data)\n\n# Fit SVM\nsvm_trunk_data <- svm(x = x_trunk_data, y = tree_data$tree_type, type = \"C-classification\", kernel = \"radial\")\n\n# Create a fine grid of the feature space\ntrunk_girth <- seq(from = min(tree_data$trunk_girth), to = max(tree_data$trunk_girth), length = 100)\ntrunk_height <- seq(from = min(tree_data$trunk_height), to = max(tree_data$trunk_height), length = 100)\n\nfine_grid_trunk <- as.data.frame(expand.grid(trunk_girth, trunk_height))\nfine_grid_trunk <- fine_grid_trunk %>% \n dplyr::rename(trunk_girth = \"Var1\", trunk_height = \"Var2\")\n\n# Check output\nhead(fine_grid_trunk)\n\n# Predict which tree type the new points fall into\nfine_grid_trunk$tree_pred <- predict(svm_trunk_data, newdata = fine_grid_trunk, type = \"decision\")\n\n# Check output\nhead(fine_grid_trunk)\ntable(fine_grid_trunk$tree_pred)", 150 | "execution_count": null, 151 | "outputs": [] 152 | }, 153 | { 154 | "metadata": {}, 155 | "cell_type": "markdown", 156 | "source": "Now let's create a scatter plot using `ggplot2`. We will plot the fine grid as well as the original tree points.\n\n### In the cell below replace:\n#### 1. `` with `fine_grid_trunk`\n#### 2. `` with `fine_grid_trunk`\n#### 3. `` with `tree_data`\n#### then __run the code__." 
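# Editor's aside (a hypothetical helper, a sketch rather than exercise code): the
# mis-classification rate computed above is just 1 - accuracy read off the confusion
# table. A reusable version, assuming vectors of predicted and actual labels with
# matching factor levels:
misclass_summary <- function(pred, actual) {
  tab <- table(Predicted = pred, Actual = actual)
  list(confusion  = tab,
       accuracy   = sum(diag(tab)) / sum(tab),
       error_rate = mean(pred != actual))
}
# e.g. misclass_summary(pred_leaf_data$tree_pred, pred_leaf_data$tree_type)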
157 | }, 158 | { 159 | "metadata": { 160 | "scrolled": true, 161 | "trusted": true 162 | }, 163 | "cell_type": "code", 164 | "source": "# Create scatter plot with original trunk features layered over the fine grid of data points\nggplot() +\n\n# First plot the fine grid of data points;\n###\n# REPLACE WITH fine_grid_trunk\n###\ngeom_point(data = , aes(x = trunk_girth, y = trunk_height, colour = tree_pred), alpha = 0.25) +\n\n# Add contour lines based on fine grid of data points; \n###\n# REPLACE WITH fine_grid_trunk\n###\nstat_contour(data = , aes(x = trunk_girth, y = trunk_height, z = as.integer(tree_pred)),\n lineend = \"round\", linejoin = \"round\", linemitre = 1, size = 0.25, colour = \"black\") +\n###\n\n# Now plot the original data points to see where they lie in relation to the fine grid of data points;\n\n###\n# REPLACE WITH tree_data\n###\ngeom_point(data = , aes(x = trunk_girth, y = trunk_height, colour = tree_type, shape = tree_type)) +\n###\nggtitle(\"SVM decision boundaries for trunk girth vs. trunk height\") +\nlabs(x = \"Trunk girth\", y = \"Trunk height\", colour = \"Tree type\", shape = \"Tree type\") +\ntheme(plot.title = element_text(hjust = 0.5))", 165 | "execution_count": null, 166 | "outputs": [] 167 | }, 168 | { 169 | "metadata": {}, 170 | "cell_type": "markdown", 171 | "source": "Excellent! Again we can observe three faintly coloured zones based on the SVM's predictions of tree type for the fine grid of data points (based on trunk features), and the hyperplanes for the different tree types represented by thick black lines. We use these coloured zones and hyperplanes to observe which tree type the SVM has chosen to place our original data points into. Again, we observe two different classification scenarios: 1) our original data points are classified correctly by the SVM, or 2) our original data points are misclassified by the SVM.\n\n**Now let's run the `predict` function as we did earlier to determine the mis-classification rate of our SVM model based on trunk features.**" 172 | }, 173 | { 174 | "metadata": { 175 | "trusted": true 176 | }, 177 | "cell_type": "code", 178 | "source": "# Run this box to determine the mis-classification rate\n\npred_trunk_data <- tree_data %>% \nselect(trunk_girth, trunk_height)\n\n# Predict the tree type of our original data based on the SVM `svm_trunk_data`\npred_trunk_data$tree_pred <- predict(svm_trunk_data, newdata = pred_trunk_data, type = \"decision\")\n\n# Check output\nhead(pred_trunk_data)\n\n# Add tree_data$tree_type to pred_trunk_data\npred_trunk_data <- inner_join(pred_trunk_data, tree_data, by = c(\"trunk_girth\", \"trunk_height\")) %>%\nselect(-leaf_length, -leaf_width)\n\n# Check output\nhead(pred_trunk_data)\n\n# Create a table of predictions to show mis-classification rate\ntable(pred_trunk_data$tree_pred, pred_trunk_data$tree_type)\n\n# Mis-classification rate: proportion of misclassified observations\nmean(pred_trunk_data$tree_pred != pred_trunk_data$tree_type)", 179 | "execution_count": null, 180 | "outputs": [] 181 | }, 182 | { 183 | "metadata": {}, 184 | "cell_type": "markdown", 185 | "source": "Here our mis-classification rate of the training data using the `svm_trunk_data` model is 4.5%, which is lower than the mis-classification rate of the `svm_leaf_data` model.\n\n\nConclusion\n-------\n\nThat's it! 
You've made two simple SVMs that can predict the type of tree based on the leaf measurements and trunk measurements!\n\nYou can go back to the course now and click __'Next Step'__ to move onto how we can test AI models." 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "name": "r", 191 | "display_name": "R", 192 | "language": "R" 193 | }, 194 | "language_info": { 195 | "mimetype": "text/x-r-source", 196 | "name": "R", 197 | "pygments_lexer": "r", 198 | "version": "3.4.1", 199 | "file_extension": ".r", 200 | "codemirror_mode": "r" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 2 205 | } -------------------------------------------------------------------------------- /07. Advanced SVMs - R.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "collapsed": true 6 | }, 7 | "cell_type": "markdown", 8 | "source": "Exercise 7 - Advanced Support Vector Machines\n===\n\nSupport vector machines (SVMs) let us predict categories. In this exercise, we will be using an SVM, paying attention to the key steps as we go: formatting data correctly, splitting the data into training and test sets, training an SVM model using the training set, and then evaluating and visualising the SVM model using the test set.\n\nWe will be looking at __prions__: misfolded proteins that are associated with several fatal neurodegenerative diseases (kind of like Daleks, if you have seen Doctor Who). Looking at examples of protein mass and weight, we will build a predictive model to detect prions in blood samples.\n\n#### Run the code below to load the required libraries for this exercise." 9 | }, 10 | { 11 | "metadata": { 12 | "trusted": true 13 | }, 14 | "cell_type": "code", 15 | "source": "# Load required packages\nsuppressMessages(install.packages(\"tidyverse\"))\nsuppressMessages(library(\"tidyverse\"))\nsuppressMessages(install.packages(\"e1071\"))\nsuppressMessages(library(\"e1071\"))\nsuppressMessages(install.packages(\"magrittr\"))\nsuppressMessages(library(\"magrittr\"))", 16 | "execution_count": null, 17 | "outputs": [] 18 | }, 19 | { 20 | "metadata": {}, 21 | "cell_type": "markdown", 22 | "source": "Step 1\n---\n\nLet's load the prion data for this exercise.\n\n**In the code below, complete the data loading step by replacing `` with `prion_data`, and running the code.**" 23 | }, 24 | { 25 | "metadata": { 26 | "scrolled": true, 27 | "trusted": true 28 | }, 29 | "cell_type": "code", 30 | "source": "###\n# REPLACE WITH prion_data\n###\n <- read.csv(\"Data/PrionData.csv\")\n###\n\n# Check the structure of `prion_data`\nstr(prion_data)\nhead(prion_data)", 31 | "execution_count": null, 32 | "outputs": [] 33 | }, 34 | { 35 | "metadata": {}, 36 | "cell_type": "markdown", 37 | "source": "It appears that we have an extra column `X` in `prion_data` that contains the row number. By default, R has labelled the column `X` because the input didn't have a column name (it was blank). This behaviour happens regularly when exporting data sets from a program like Microsoft Excel and then importing them into R.\n\nLet's get rid of the first column from `prion_data`, and then check that it has been successfully removed. We will use the `select` function from the `dplyr` package together with the `-` symbol to \"minus\" the `X` column from our dataset.\n\n> **N.B. We have used a different assignment symbol `%<>%` from the `magrittr` package in the code below. 
The `magrittr` assignment symbol `%<>%` is a combination of the `magrittr` pipe symbol `%>%` and the base R assignment symbol `<-`. It takes the variable on the left hand side of the `%<>%` symbol, and updates the value of the variable with the result of the right hand side. So the object on the left hand side acts as both the initial value and the resulting value.**\n\n#### Replace `` with `-X` to remove the excess column X, then run the code." 38 | }, 39 | { 40 | "metadata": { 41 | "trusted": true 42 | }, 43 | "cell_type": "code", 44 | "source": "###\n# REPLACE WITH -X\n###\nprion_data %<>% select()\n###\nstr(prion_data)\nhead(prion_data)\n\n# Check frequency of `prion_status` in `prion_data`\nprion_data %>%\ngroup_by(prion_status) %>%\nsummarise(n = n()) %>% \nmutate(freq = n/sum(n))", 45 | "execution_count": null, 46 | "outputs": [] 47 | }, 48 | { 49 | "metadata": {}, 50 | "cell_type": "markdown", 51 | "source": "Excellent, we have successfully removed column `X` from `prion_data`!\n\nNow, looking at the output of `str` and `head`, we can observe that `prion_data` is a `data.frame` that contains 485 observations and 3 variables stored in the following columns:\n\n* `mass` is the first *feature*;\n* `weight` is the second *feature*;\n* `prion_status` is the *label* (or category).\n\nOf the 485 observations, 375 (77.32%) are non-prions, and 110 (22.68%) are prions.\n\nStep 2\n---\n\nLet's graph `prion_data` to better understand the features and labels.\n\n**In the cell below replace:**\n\n**1. `` with `mass`**\n\n**2. `` with `weight`**\n\n**3. `` with `prion_status`**\n\n** then __run the code__. **" 52 | }, 53 | { 54 | "metadata": { 55 | "trusted": true 56 | }, 57 | "cell_type": "code", 58 | "source": "prion_data %>% \n###\n# REPLACE WITH mass AND WITH weight AND WITH prion_status\n###\nggplot(aes(x = , y = , colour = )) +\n###\ngeom_point() +\nggtitle(\"Classification plot for prion data\") +\n# Create labels for x-axis, y-axis, and legend\nlabs(x = \"Mass\", y = \"Weight\", colour = \"Prion status\") +\n# Align title to centre\ntheme(plot.title = element_text(hjust = 0.5))", 59 | "execution_count": null, 60 | "outputs": [] 61 | }, 62 | { 63 | "metadata": {}, 64 | "cell_type": "markdown", 65 | "source": "Step 3\n---\n\nTo create an SVM model, let's split our data into training and test sets. We'll start by checking the total number of instances in our data set. If we go back to the output from `str(prion_data)` in Step 1, we have 485 observations and 3 variables.\n\nSo, let's use 400 examples for our `training` set, and the remainder for our `test` set.\n\nWe will use the `slice` function to select the first 400 rows from `prion_data`.\n\n#### Replace `` with `1:400`, and run the code." 66 | }, 67 | { 68 | "metadata": { 69 | "trusted": true 70 | }, 71 | "cell_type": "code", 72 | "source": "###\n# REPLACE WITH 1:400\n###\ntrain_prion <- slice(prion_data, )\n###\nstr(train_prion)\n\n# Check percentage of samples that are prions\ntrain_prion %>%\ngroup_by(prion_status) %>%\nsummarise(n = n()) %>% \nmutate(freq = n/sum(n))\n\n# Create test set using the remaining examples\ntest_prion <- slice(prion_data, 401:n())\nstr(test_prion)\n\n# Check percentage of samples that are prions\ntest_prion %>%\ngroup_by(prion_status) %>%\nsummarise(n = n()) %>% \nmutate(freq = n/sum(n))", 73 | "execution_count": null, 74 | "outputs": [] 75 | }, 76 | { 77 | "metadata": {}, 78 | "cell_type": "markdown", 79 | "source": "Well done! 
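# Editor's aside (a sketch, not required by the exercise): slicing rows 1:400 assumes
# the data are not ordered by prion_status. A common alternative is a random split:
set.seed(42)                                  # for reproducibility
idx <- sample(nrow(prion_data), size = 400)   # 400 random row numbers
train_random <- prion_data[idx, ]
test_random  <- prion_data[-idx, ]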
Let's look at a summary of our training data to get a better idea of what we're dealing with.\n\n#### Replace `` with `train_prion` and run the code." 80 | }, 81 | { 82 | "metadata": { 83 | "trusted": true 84 | }, 85 | "cell_type": "code", 86 | "source": "###\n# REPLACE WITH train_prion\n###\nsummary()\n###", 87 | "execution_count": null, 88 | "outputs": [] 89 | }, 90 | { 91 | "metadata": {}, 92 | "cell_type": "markdown", 93 | "source": "Using the `summary` function, we observe our training data contains 314 non-prions and 86 prions out of a total of 400 observations. This looks right, because the scatter plot we created in Step 2 showed us the majority of observations have 'non-prion' status.\n\nLet's take a look at `test_prion` too, using the `summary` function again.\n\n#### Replace `` with `test_prion` and run the code." 94 | }, 95 | { 96 | "metadata": { 97 | "trusted": true 98 | }, 99 | "cell_type": "code", 100 | "source": "###\n# REPLACE WITH test_prion\n###\nsummary()\n###", 101 | "execution_count": null, 102 | "outputs": [] 103 | }, 104 | { 105 | "metadata": {}, 106 | "cell_type": "markdown", 107 | "source": "Looking good! Alright, now to make a support vector machine.\n\nStep 4\n---\n\nBelow we will make an SVM similar to the previous exercise. Remember the syntax for SVMs using the `e1071::svm` function:\n\n**`svm_model <- svm(x = x, y = y, data = dataset)`**\n\nwhere `x` represents the features (a matrix), and `y` represents the labels (factors).\n\nAlternatively, we can use the following syntax for the `svm` function:\n\n**`model <- svm(formula = y ~ x, data = dataset)`**\n\nwhere `y` represents the labels/categories, and `x` represents the features. Note that if you have multiple `x` features in the dataset, you can simply type `.` in the `formula` argument to refer to everything in the data set except `y`. Let's try out this syntax using the training data as our input.\n\n**In the code below, create an SVM model from the `train_prion` data using the `svm` function with the `formula` argument.**\n\n#### Replace `` with `prion_status ~ .`, then run the code.\n\nNote: the `prion_status` on the left hand side of the formula selects our labels, and the `.` on the right hand side of the formula selects our features. In this case, the `.` selects all the features in our dataset `train_prion`." 108 | }, 109 | { 110 | "metadata": { 111 | "trusted": true 112 | }, 113 | "cell_type": "code", 114 | "source": "###\n# REPLACE WITH prion_status ~ .\n###\nSVM_Model <- svm(formula = , data = train_prion)\n###\n\nprint(\"Model ready!\")", 115 | "execution_count": null, 116 | "outputs": [] 117 | }, 118 | { 119 | "metadata": {}, 120 | "cell_type": "markdown", 121 | "source": "Well done! We've made an SVM model using our training set `train_prion`.\n\nStep 5\n---\n\nLet's create some custom functions to graph and evaluate SVM models. We will use these functions throughout the remainder of this exercise. 
You do not need to edit the code block below.\n\n**Run the code below**" 122 | }, 123 | { 124 | "metadata": { 125 | "trusted": true 126 | }, 127 | "cell_type": "code", 128 | "source": "# Run this box to prepare functions for later use\n\n# Create a custom function named `Graph_SVM` to plot an SVM model\n\nGraph_SVM <- function(model, data_set){\n grid <- expand.grid(\"mass\" = seq(min(data_set$mass), max(data_set$mass), length.out = 100),\n \"weight\" = seq(min(data_set$weight), max(data_set$weight), length.out = 100))\n preds <- predict(model, grid)\n df <- data.frame(grid, preds)\n ggplot() +\n geom_tile(data = df, aes(x = mass, y = weight, fill = preds)) +\n geom_point(data = data_set, aes(x = mass, y = weight, shape = prion_status, \n colour = prion_status), \n alpha = 0.75) +\n scale_colour_manual(values = c(\"grey10\", \"grey50\")) +\n labs(title = paste(\"SVM model prediction\"), x = \"Mass\", y = \"Weight\",\n fill = \"Prediction\", shape = \"Prion status\", colour = \"Prion status\") +\n theme(plot.title = element_text(hjust = 0.5))\n }\n\n# Create another custom function named `Evaluate_SVM` to evaluate the SVM model, print results to screen,\n# and run the `Graph_SVM` custom function\nEvaluate_SVM <- function(model, data_set){\n predictions <- predict(model, data_set)\n total <- 0\n for(i in 1:nrow(data_set)){\n if(toString(predictions[i]) == data_set[i, \"prion_status\"]){\n total = total + 1\n }\n }\n # Print results to screen\n print(\"SVM Model Evaluation\")\n print(paste0(\"Model name: \", deparse(substitute(model))))\n print(paste0(\"Dataset: \", deparse(substitute(data_set))))\n print(paste0(\"Accuracy: \", total/nrow(data_set)*100, \"%\"))\n print(paste0(\"Number of samples: \", nrow(data_set)))\n \n # Call our custom function for graphing SVM model\n Graph_SVM(model, data_set)\n}\n\nprint(\"Custom functions ready!\")", 129 | "execution_count": null, 130 | "outputs": [] 131 | }, 132 | { 133 | "metadata": {}, 134 | "cell_type": "markdown", 135 | "source": "Excellent! Now that we have created the custom function `Evaluate_SVM` (which incorporates the `Graph_SVM` function), let's evaluate our SVM model on the training data. \n\nIn the code below, we will change the inputs to the `Evaluate_SVM` function, where the first argument is the SVM model we will evaluate, and the second argument is the dataset we will evaluate the SVM model with.\n\n** In the cell below replace: **\n\n** 1. `` with `SVM_Model` **\n\n** 2. `` with `train_prion` **\n\n** Then __run the code__. **" 136 | }, 137 | { 138 | "metadata": { 139 | "trusted": true 140 | }, 141 | "cell_type": "code", 142 | "source": "###\n# REPLACE WITH SVM_Model AND WITH train_prion\n###\nEvaluate_SVM(, )\n###", 143 | "execution_count": null, 144 | "outputs": [] 145 | }, 146 | { 147 | "metadata": {}, 148 | "cell_type": "markdown", 149 | "source": "Step 6\n---\n\nThe SVM has performed reasonably well separating our training data set into two classes. Now let's take a look at our test set.\n\nIn the code below, we will use our custom function `Evaluate_SVM` to evaluate `SVM_Model` on the test set.\n\n** In the cell below replace: **\n\n** 1. `` with `SVM_Model` **\n\n** 2. `` with `test_prion` **\n\n** Then __run the code__. 
**" 150 | }, 151 | { 152 | "metadata": { 153 | "trusted": true 154 | }, 155 | "cell_type": "code", 156 | "source": "###\n# REPLACE WITH SVM_Model AND WITH test_prion\n###\nEvaluate_SVM(, )\n###", 157 | "execution_count": null, 158 | "outputs": [] 159 | }, 160 | { 161 | "metadata": {}, 162 | "cell_type": "markdown", 163 | "source": "That's a good result. \n\nConclusion\n---\n\nWell done! We've taken a data set, tidied it, prepared it into training and test sets, created an SVM based on the training set, and evaluated the SVM model using the test set.\n\nYou can go back to the course now, or you can try using different kernels with your SVM below.\n\nOPTIONAL: Step 7\n---\n\nWant to have a play around with different kernels for your SVM models? It's really easy!\n\nThe standard kernel is a radial basis kernel. But there are a few more you can choose from: `linear`, `polynomial`, and `sigmoid`. Let's try them out.\n\nIf you want to use a linear kernel, all you need to do is add `kernel = \"linear\"` to your model. Like this:\n\n`SVM_Model <- svm(formula = y ~ x, data = dataset, kernel = \"linear\")`\n\nGive it a go with all the different kernels below. The first kernel, `linear`, has been done for you.\n\n**Run the code below**" 164 | }, 165 | { 166 | "metadata": { 167 | "trusted": true 168 | }, 169 | "cell_type": "code", 170 | "source": "# Run this box to make a linear SVM\n\n# Make a linear SVM model\nSVM_Model_Linear <- svm(prion_status ~ . , data = train_prion, kernel = \"linear\")\nprint(\"Model ready\")", 171 | "execution_count": null, 172 | "outputs": [] 173 | }, 174 | { 175 | "metadata": {}, 176 | "cell_type": "markdown", 177 | "source": "Now that we have created the linear SVM model, let's evaluate it on our training and test sets using our custom function we created earlier, `Evaluate_SVM`. Remember the inputs to `Evaluate_SVM` are the SVM model followed by the data you wish to evaluate the model on.\n\nIn the code blocks below, we will change the inputs to our `Evaluate_SVM` function to the appropriate variable names to evaluate the linear SVM model on the training and test sets.\n\n** In the cell below replace: **\n\n** 1. `` with `SVM_Model_Linear` **\n\n** 2. `` with `train_prion` **\n\n** Then __run the code__. **" 178 | }, 179 | { 180 | "metadata": { 181 | "trusted": true 182 | }, 183 | "cell_type": "code", 184 | "source": "# Evaluate linear SVM model on training set\n\n###\n# REPLACE WITH SVM_Model_Linear AND WITH train_prion\n###\nEvaluate_SVM(, )\n###", 185 | "execution_count": null, 186 | "outputs": [] 187 | }, 188 | { 189 | "metadata": {}, 190 | "cell_type": "markdown", 191 | "source": "And now for the test set.\n\n** In the cell below replace: **\n\n** 1. `` with `SVM_Model_Linear` **\n\n** 2. `` with `test_prion` **\n\n** Then __run the code__. **" 192 | }, 193 | { 194 | "metadata": { 195 | "trusted": true 196 | }, 197 | "cell_type": "code", 198 | "source": "# Evaluate linear SVM model on test set\n\n###\n# REPLACE WITH SVM_Model_Linear AND WITH test_prion\n###\nEvaluate_SVM(, )\n###", 199 | "execution_count": null, 200 | "outputs": [] 201 | }, 202 | { 203 | "metadata": {}, 204 | "cell_type": "markdown", 205 | "source": "You can see the hyperplane is a straight line! Compare the linear SVM model results to the radial SVM model results to see the difference for yourself!\n\n## Now let's try a sigmoid kernel.\n\n** In the cell below replace: **\n\n** 1. `` with `\"sigmoid\"` **\n\n** 2. `` with `SVM_Model_Sigmoid` **\n\n** 3. `` with `train_prion` **\n\n** Then __run the code__. 
**" 206 | }, 207 | { 208 | "metadata": { 209 | "trusted": true 210 | }, 211 | "cell_type": "code", 212 | "source": "###\n# REPLACE WITH \"sigmoid\" (INCLUDING THE QUOTATION MARKS)\n###\nSVM_Model_Sigmoid <- svm(prion_status ~ . , data = train_prion, kernel = )\n###\n\n# Evaluate sigmoid SVM model on training set\n###\n# REPLACE WITH SVM_Model_Sigmoid AND WITH train_prion\n###\nEvaluate_SVM(, )\n###", 213 | "execution_count": null, 214 | "outputs": [] 215 | }, 216 | { 217 | "metadata": {}, 218 | "cell_type": "markdown", 219 | "source": "And now for the test set.\n\n** In the cell below replace: **\n\n** 1. `` with `SVM_Model_Sigmoid` **\n\n** 2. `` with `test_prion` **\n\n** Then __run the code__. **" 220 | }, 221 | { 222 | "metadata": { 223 | "trusted": true 224 | }, 225 | "cell_type": "code", 226 | "source": "# Evaluate sigmoid SVM model on test set\n###\n# REPLACE WITH SVM_Model_Sigmoid AND WITH test_prion\n###\nEvaluate_SVM(, )\n###", 227 | "execution_count": null, 228 | "outputs": [] 229 | }, 230 | { 231 | "metadata": {}, 232 | "cell_type": "markdown", 233 | "source": "Perhaps a sigmoid kernel isn't a good idea for this data set....\n\n## Let's try a polynomial kernel instead.\n\n** In the cell below replace: **\n\n** 1. `` with `\"polynomial\"` **\n\n** 2. `` with `SVM_Model_Poly` **\n\n** 3. `` with `train_prion` **\n\n** Then __run the code__. **" 234 | }, 235 | { 236 | "metadata": { 237 | "trusted": true 238 | }, 239 | "cell_type": "code", 240 | "source": "###\n# REPLACE WITH \"polynomial\" (INCLUDING THE QUOTATION MARKS)\n###\nSVM_Model_Poly <- svm(prion_status ~ . , data = train_prion, kernel = )\n###\n\n# Evaluate polynomial SVM model on training set\n###\n# REPLACE WITH SVM_Model_Poly AND WITH train_prion\n###\nEvaluate_SVM(, )\n###", 241 | "execution_count": null, 242 | "outputs": [] 243 | }, 244 | { 245 | "metadata": {}, 246 | "cell_type": "markdown", 247 | "source": "And now for the test set.\n\n** In the cell below replace: **\n\n** 1. `` with `SVM_Model_Poly` **\n\n** 2. `` with `test_prion` **\n\n** Then __run the code__. **" 248 | }, 249 | { 250 | "metadata": { 251 | "trusted": true 252 | }, 253 | "cell_type": "code", 254 | "source": "# Evaluate polynomial SVM model on test set\n###\n# REPLACE WITH SVM_Model_Poly AND WITH test_prion\n###\nEvaluate_SVM(, )\n###", 255 | "execution_count": null, 256 | "outputs": [] 257 | }, 258 | { 259 | "metadata": {}, 260 | "cell_type": "markdown", 261 | "source": "If we were to carry on analysing prions like this, a polynomial SVM looks like a good choice (based on the performance of the different models on `test_prion`). If the data set was more complicated, we could try different degrees for the polynomial to see which one was the most accurate. This is part of __`tuning`__ a model.\n\nWell done!" 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "name": "r", 267 | "display_name": "R", 268 | "language": "R" 269 | }, 270 | "language_info": { 271 | "mimetype": "text/x-r-source", 272 | "name": "R", 273 | "pygments_lexer": "r", 274 | "version": "3.4.1", 275 | "file_extension": ".r", 276 | "codemirror_mode": "r" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 2 281 | } -------------------------------------------------------------------------------- /08. 
Neural Networks Introduction - R.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "collapsed": true 6 | }, 7 | "cell_type": "markdown", 8 | "source": "Exercise 8 - Introduction to Neural Networks\n===\n\nOriginally hypothesised in the 1940s, neural networks are now one of the main tools used in modern AI. Neural networks can be used for both regression and categorisation applications. Recent advances in storage, processing power, and open-source tools have allowed many successful applications of neural networks in medical diagnosis, filtering explicit content, speech recognition, and machine translation.\n\nIn this exercise we will compare three dog breeds using their age, weight, and height. We will make a neural network model to classify the breeds of dogs based on these features.\n\nNote: It's extremely common for AI practitioners to use a template such as the one below for making neural networks quickly. After you are done, feel free to play around with the template to get a feel for how you can easily adjust a neural network to your problems using the package `keras`.\n\nLet's start by loading the libraries required for this session.\n\n**Run the code below**" 9 | }, 10 | { 11 | "metadata": { 12 | "trusted": true 13 | }, 14 | "cell_type": "code", 15 | "source": "# Run this to load the required libraries, it might take a little while.\n\nsuppressMessages(install.packages(\"tidyverse\"))\nsuppressMessages(library(\"tidyverse\"))\n\nsuppressMessages(install.packages(\"keras\"))\nsuppressMessages(library(keras))\nsuppressMessages(install_keras())", 16 | "execution_count": null, 17 | "outputs": [] 18 | }, 19 | { 20 | "metadata": {}, 21 | "cell_type": "markdown", 22 | "source": "Step 1\n---\n\nNow let's load our data and inspect it.\n\n#### Replace `` with `dog_data` and run the code." 23 | }, 24 | { 25 | "metadata": { 26 | "trusted": true 27 | }, 28 | "cell_type": "code", 29 | "source": "# Run this box to load our data\n\n# Load the dataset `dog_data.csv`\n\n###\n# REPLACE WITH dog_data\n###\n <- read.csv(\"Data/dog_data.csv\")\n###\n\n# Check the structure\nstr(dog_data)\nhead(dog_data)\nsummary(dog_data)", 30 | "execution_count": null, 31 | "outputs": [] 32 | }, 33 | { 34 | "metadata": {}, 35 | "cell_type": "markdown", 36 | "source": "Based on the output of `str(dog_data)`, we have **200 observations** on dogs stored in **4 variables**:\n\n* `age`: the first feature;\n* `weight`: the second feature;\n* `height`: the third feature;\n* `breed`: the label, represented as numbers `0`, `1`, and `2`. \n\nStep 2\n---\n\nBefore we make our model, let's get our training and test sets ready.\n\nWe've got 200 observations on dogs, so we'll use the first 160 observations for the training set, and the last 40 observations for our test set. For both the training and test sets, we will also separate `X` the features (`age`, `weight` and `height`) from `Y` the label (`breed`).\n\n### In the cell below replace:\n#### 1. `` with `1:160`\n#### 2. `` with `1:160`\n#### 3. `` with `161:200`\n#### 4. `` with `161:200`\n#### then __run the code__." 
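# Editor's aside (a quick sketch, assuming dog_data as loaded above): before trusting a
# sequential 160/40 split, it is worth checking that all three breeds appear in both parts.
table(dog_data$breed[1:160])     # breed counts in the training rows
table(dog_data$breed[161:200])   # breed counts in the test rows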
37 | }, 38 | { 39 | "metadata": { 40 | "trusted": true 41 | }, 42 | "cell_type": "code", 43 | "source": "# Run this box to split data into training and test sets\n\n###\n# REPLACE WITH 1:160\n###\ntrain_X <- as.matrix(dog_data[, 1:3]) # Rows 1 - 160, columns 1 - 3 (the features)\nraw_train_Y <- as.matrix(dog_data[, 4]) # Rows 1 - 160, column 4 (the label)\n###\n\n###\n# REPLACE WITH 161:200\n###\ntest_X <- as.matrix(dog_data[, 1:3]) # Rows 161 - 200, columns 1 - 3 (the features)\nraw_test_Y <- as.matrix(dog_data[, 4]) # Rows 161 - 200, column 4 (the label)\n###\n\n# Check first few lines of new variables to see if the output is what we expect\n# Training data\nhead(train_X)\nhead(raw_train_Y)\n\n# Test data\nhead(test_X)\nhead(raw_test_Y)", 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "metadata": {}, 49 | "cell_type": "markdown", 50 | "source": "Step 3\n---\n\nFor a neural network, indicating `breed` using `0`, `1`, and `2` is misleading, as it might imply that breed `0` is closer to breed `1` than breed `2`. But that is not the case here.\n\nTo allow the neural network to predict categories properly, we represent categories as 'one-hot vectors'. The labels (dog breeds) will go from being represented as `0`, `1`, and `2` to this:\n\n| breed 0 | breed 1 | breed 2 |\n|:------- |:------- |:------- |\n| `1 0 0` | `0 1 0` | `0 0 1` |\n\nSo if the 1 is in the first position, the neural network knows that it's breed 0.\n\nIf the 1 is in the second position, the neural network knows that it's breed 1, and so on.\n\nThe code below will turn our raw labels into one-hot vectors our neural networks will be able to use.\n\n### In the cell below replace:\n#### 1. `` with `raw_train_Y`\n#### 2. `` with `raw_test_Y`\n#### then __run the code__." 51 | }, 52 | { 53 | "metadata": { 54 | "trusted": true 55 | }, 56 | "cell_type": "code", 57 | "source": "# This box uses the keras function to_categorical to change breed from integer to categorical\n\n###\n# REPLACE WITH raw_train_Y\n###\ntrain_Y <- to_categorical(, num_classes = 3)\n###\n\n###\n# REPLACE WITH raw_test_Y\n###\ntest_Y <- to_categorical(, num_classes = 3)\n###\n\n# Print out some of our training and test data\nhead(train_Y)\nhead(test_Y)", 58 | "execution_count": null, 59 | "outputs": [] 60 | }, 61 | { 62 | "metadata": {}, 63 | "cell_type": "markdown", 64 | "source": "There we go!\n\n## Step 4\n\nThat's our data ready. Now it's time to make your first neural network model!\n\nThis is the standard syntax for a model using the `keras` package. You can always play around with adding in extra hidden layers and changing their size and activation functions later.\n\nOur **input shape** in the first dense layer is the **number of features**, which is **3** in this case.\n\nOur **final layer** has **3 units** (nodes), one for each of the dog breeds. So if we had 5 different breeds of dog in our dataset, the final layer would have 5 units.\n\n### In the cell below replace:\n#### 1. `` with `10`\n#### 2. `` with `3`\n#### 3. `` with `10`\n#### 4. `` with `3`\n#### then __run the code__." 
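# Editor's toy illustration (not exercise code): keras::to_categorical is doing
# something very simple; a base-R equivalent for labels 0, 1, 2:
raw <- c(0L, 2L, 1L, 0L)
diag(3)[raw + 1, ]   # +1 because R indexes from 1; each row is a one-hot vector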
65 | }, 66 | { 67 | "metadata": { 68 | "trusted": true 69 | }, 70 | "cell_type": "code", 71 | "source": "# Complete the replacements below, then run this!\n\nuse_session_with_seed(5)\nset.seed(5)\n\nmodel <- keras_model_sequential()\n\nmodel %>%\n\n# Add densely-connected neural network layers using `layer_dense` function\n# Our first layer has an input shape of 3 to represent 3 input features (age, weight, height)\n\n###\n# REPLACE WITH 10 AND WITH 3\n###\nlayer_dense(units = , activation = \"relu\", input_shape = ) %>% \n###\n\n# We now have a hidden layer with 10 nodes, with an input shape of 3 representing our 3 features.\n\n# Next up we'll add another layer, with 10 nodes too.\n###\n# REPLACE WITH 10\n###\nlayer_dense(units = , activation = \"relu\") %>% \n###\n\n# Uncomment the next line if you want to add another layer\n# layer_dense(units = 10, activation = \"relu\") %>% \n\n###\n# REPLACE WITH 3\n###\nlayer_dense(units = , activation = \"softmax\")\n###\n\n# Output layer has 3 nodes, one for each type of category we have\n\nmodel %>% summary", 72 | "execution_count": null, 73 | "outputs": [] 74 | }, 75 | { 76 | "metadata": {}, 77 | "cell_type": "markdown", 78 | "source": "Alright, that's our first model ready.\n\nN.B. `\"tanh\"` is another common activation function that, if you want, you can try instead of `\"relu\"`, but it doesn't perform very well here.\n\nStep 5\n---\n\nNext, we'll compile the model for training and see how it runs.\n\nThere are a few parameters you can choose that change how the model trains, and end up changing how the model performs.\n\nWe will use some standard parameters for now. \n\nFeel free to experiment with some different parameters later on. If this doesn't work, check that you input the correct size for the input and output layers in Step 4 (must have 3 nodes each).\n\n#### Replace `` with `optimizer_adagrad()` and run the code." 79 | }, 80 | { 81 | "metadata": { 82 | "trusted": true 83 | }, 84 | "cell_type": "code", 85 | "source": "###\n# REPLACE WITH optimizer_adagrad()\n###\nmodel %>% compile(\n loss = \"categorical_crossentropy\",\n optimizer = ,\n metrics = c(\"accuracy\")\n)\n###", 86 | "execution_count": null, 87 | "outputs": [] 88 | }, 89 | { 90 | "metadata": {}, 91 | "cell_type": "markdown", 92 | "source": "N.B. `\"adam\"` is another popular optimizer if you want to try it instead of `\"adagrad\"`.\n\nLet's train the neural network and plot it!\n\n### In the cell below replace:\n#### 1. `` with `train_X`\n#### 2. `` with `train_Y`\n#### 3. `` with `25`\n#### then __run the code__." 
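# Editor's aside (a sketch of the arithmetic): you can sanity-check the parameter counts
# reported by model %>% summary above. A dense layer has (inputs + 1 bias) * units weights:
(3 + 1) * 10    # layer 1: 3 features -> 10 units = 40
(10 + 1) * 10   # layer 2: 10 units  -> 10 units  = 110
(10 + 1) * 3    # output:  10 units  -> 3 breeds  = 33
40 + 110 + 33   # total trainable parameters      = 183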
93 | }, 94 | { 95 | "metadata": { 96 | "trusted": true 97 | }, 98 | "cell_type": "code", 99 | "source": "# Run this box to plot our fit and print out how it performed on the training set\nhistory <- model %>% fit(\n ###\n # REPLACE WITH train_X and WITH train_Y\n ###\n x = ,\n y = ,\n ###\n shuffle = T,\n ###\n # REPLACE WITH 25\n ###\n epochs = ,\n ###\n batch_size = 2,\n validation_split = 0.2\n)\n\nplot(history)\n\n# This tells us how the model performed on the training set\nhistory", 100 | "execution_count": null, 101 | "outputs": [] 102 | }, 103 | { 104 | "metadata": {}, 105 | "cell_type": "markdown", 106 | "source": "Note that the original training set `train_X` and `train_Y` with 160 observations has been split up again during the training process, where 112 of 160 samples were used for training, and 48 samples were used for validation, as per the output from `history`.\n\nStep 6\n---\n\nNow that our model is trained and ready, let's see how it performs on our test data, `test_X` and `test_Y`!\n\nIt's important to test a model on data that it has never seen before, to make sure it doesn't overfit. Now let's evaluate it against the test set.\n\n**Run the box below**" 107 | }, 108 | { 109 | "metadata": { 110 | "trusted": true 111 | }, 112 | "cell_type": "code", 113 | "source": "# Run this box\nperf <- model %>% evaluate(test_X, test_Y)\nprint(perf)", 114 | "execution_count": null, 115 | "outputs": [] 116 | }, 117 | { 118 | "metadata": {}, 119 | "cell_type": "markdown", 120 | "source": "It seems to be very accurate (acc = 95%) with the random seed that we set!\n\nLet's see how the model predicts something completely new and unclassified.\n\n**Come up with a brand new sample of the format `[age, weight, height]` to test the model with, then run the two code blocks.**" 121 | }, 122 | { 123 | "metadata": { 124 | "trusted": true 125 | }, 126 | "cell_type": "code", 127 | "source": "###\n# CHANGE age, weight, AND height TO NEW VALUES \n###\nnew_dog <- data.frame(age = 5, weight = 4, height = 8)\n###\n\nstr(dog_data)\n\n# Age vs weight\nggplot() +\ngeom_point(data = dog_data, aes(x = age, y = weight, colour = as.factor(breed))) +\ngeom_point(data = new_dog, aes(x = age, y = weight), shape = \"+\", size = 10) +\nlabs(x = \"Age\", y = \"Weight\", colour = \"Breed\")", 128 | "execution_count": null, 129 | "outputs": [] 130 | }, 131 | { 132 | "metadata": { 133 | "trusted": true 134 | }, 135 | "cell_type": "code", 136 | "source": "# Run this code block to plot the relationship between age, height, and breed\n# Age vs height\nggplot() +\ngeom_point(data = dog_data, aes(x = age, y = height, colour = as.factor(breed))) +\ngeom_point(data = new_dog, aes(x = age, y = height), shape = \"+\", size = 10) +\nlabs(x = \"Age\", y = \"Height\", colour = \"Breed\")", 137 | "execution_count": null, 138 | "outputs": [] 139 | }, 140 | { 141 | "metadata": {}, 142 | "cell_type": "markdown", 143 | "source": "Now let's see what breed of dog the model says it is!\n\n**Run the code below**" 144 | }, 145 | { 146 | "metadata": { 147 | "trusted": true 148 | }, 149 | "cell_type": "code", 150 | "source": "# Run this code to run the model\n\nprint(\"Probabilities of classes:\")\npredict_proba(model, as.matrix(new_dog))\n\nprint(\"Predicted class:\")\npredict_classes(model, as.matrix(new_dog))", 151 | "execution_count": null, 152 | "outputs": [] 153 | }, 154 | { 155 | "metadata": {}, 156 | "cell_type": "markdown", 157 | "source": "The final number tells us which class it thinks it is.\n\nConclusion\n---\n\nWe've built a 
simple neural network to help us predict dog breeds. In the next exercise, we'll look into neural networks with a bit more depth, and at the factors that influence how well they learn.\n\nIf you want to play around with this neural network and a new data set, just remember to set your input and output sizes correctly." 158 | } 159 | ], 160 | "metadata": { 161 | "kernelspec": { 162 | "name": "r", 163 | "display_name": "R", 164 | "language": "R" 165 | }, 166 | "language_info": { 167 | "mimetype": "text/x-r-source", 168 | "name": "R", 169 | "pygments_lexer": "r", 170 | "version": "3.4.1", 171 | "file_extension": ".r", 172 | "codemirror_mode": "r" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 2 177 | } -------------------------------------------------------------------------------- /09. Neural Networks Advanced - R.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "Exercise 9 - Advanced Neural Networks\n===\n\nThere are many factors that influence how well a neural network might perform. AI practitioners tend to play around with the structure of the hidden layers, the activation function, the optimisation function, and the number of epochs (training cycles).\n\nIn this exercise, we will look at how changing these parameters impacts the accuracy and performance of our network.\n\nLet's start by loading the libraries required for this session.\n\n**Run the code below**" 7 | }, 8 | { 9 | "metadata": { 10 | "trusted": true 11 | }, 12 | "cell_type": "code", 13 | "source": "# Run this code box to load the packages we need\n# It might take a few minutes...\n\nsuppressMessages(install.packages(\"tidyverse\"))\nsuppressMessages(library(\"tidyverse\"))\n\nsuppressMessages(install.packages(\"keras\"))\nsuppressMessages(library(\"keras\"))\nsuppressMessages(install_keras())\n\noptions(repr.plot.width = 7, repr.plot.height = 5)", 14 | "execution_count": null, 15 | "outputs": [] 16 | }, 17 | { 18 | "metadata": {}, 19 | "cell_type": "markdown", 20 | "source": "Step 1\n---\n\nWe will use the same dog data set as in Exercise 8, building on what we learnt before and trying different parameters for a network to try and improve performance.\n\nLet's open up our data set and create training and test sets.\n\n### In the cell below replace:\n#### 1. `` with `1:3`\n#### 2. `` with `4`\n#### 3. `` with `1:3`\n#### 4. `` with `4`\n#### then __run the code__." 21 | }, 22 | { 23 | "metadata": { 24 | "trusted": true 25 | }, 26 | "cell_type": "code", 27 | "source": "# Run this box to set up our training and test datasets\n\n# Load the dog data\ndog_data <- read.csv(\"Data/dog_data.csv\")\n\n# Check structure\nstr(dog_data)\nhead(dog_data)\n\n# Take the first 160 observations, separate the features from the labels, and assign them to the training set\n###\n# REPLACE WITH 1:3 AND WITH 4\n###\ntrain_X <- as.matrix(dog_data[1:160, ])\nraw_train_Y <- as.matrix(dog_data[1:160, ])\n###\n\n# Take the last 40 observations, separate the features from the labels, and assign them to the test set\n###\n# REPLACE WITH 1:3 AND WITH 4\n###\ntest_X <- as.matrix(dog_data[161:200, ])\nraw_test_Y <- as.matrix(dog_data[161:200, ])\n###", 28 | "execution_count": null, 29 | "outputs": [] 30 | }, 31 | { 32 | "metadata": {}, 33 | "cell_type": "markdown", 34 | "source": "And just like the last exercise, we will transform the raw labels into one-hot vectors.\n\n### In the cell below replace:\n#### 1. 
`` with `to_categorical`\n#### 2. `` with `3`\n#### 3. `` with `to_categorical`\n#### 4. `` with `3`\n#### then __run the code__." 35 | }, 36 | { 37 | "metadata": { 38 | "trusted": true 39 | }, 40 | "cell_type": "code", 41 | "source": "# Set the testing and training labels as categories using one-hot vectors\n###\n# REPLACE WITH to_categorical AND WITH 3\n###\ntrain_Y <- (raw_train_Y, num_classes = )\ntest_Y <- (raw_test_Y, num_classes = )\n###\n\nhead(train_Y)", 42 | "execution_count": null, 43 | "outputs": [] 44 | }, 45 | { 46 | "metadata": {}, 47 | "cell_type": "markdown", 48 | "source": "Done!\n\nStep 2\n---\n\nThe code block below contains a custom function `train_network` to help us quickly change the training factors of our neural network. We will use this function throughout the remainder of this exercise.\n\nThe `train_network` function allows us to change:\n\n* the size and/or number of layers;\n* the activation function the layers use;\n* the optimizer of the model;\n* the number of training cycles for the model (`epochs`).\n\n**Run the code below**" 49 | }, 50 | { 51 | "metadata": { 52 | "trusted": true 53 | }, 54 | "cell_type": "code", 55 | "source": "# Run this box to prepare functions for later\n\n# Define our custom function `train_network` with four arguments\ntrain_network <- function(structure, activation, optimizer, epochs){\n suppressMessages(use_session_with_seed(1))\n model = keras_model_sequential()\n \n model %>%\n layer_dense(units = structure[2], activation = activation, input_shape = structure[1]) %>%\n layer_dense(units = structure[3], activation = activation) %>% \n layer_dense(units = structure[4], activation = \"softmax\")\n \n model %>% \n compile(loss = \"categorical_crossentropy\", optimizer = optimizer, metrics = c(\"accuracy\"))\n \n history = model %>% \n fit(x = train_X, y = train_Y, shuffle = T, epochs = epochs, batch_size = 5, \n validation_split = 0.3)\n \n history_df <- as.data.frame(history) \n acc <<- history_df[nrow(history_df), 2]\n print(\"Accuracy based on training set...\")\n print(history_df[nrow(history_df), 2])\n \n perf <- model %>% evaluate(test_X, test_Y)\n print(\"Accuracy based on test set...\")\n print(perf$acc)\n testacc <<- perf$acc\n \n plot(history)\n}", 56 | "execution_count": null, 57 | "outputs": [] 58 | }, 59 | { 60 | "metadata": {}, 61 | "cell_type": "markdown", 62 | "source": "Let's recreate the neural network from Exercise 8 to use as our benchmark, but we will change it to have two hidden layers.\n\n### In the cell below replace:\n#### 1. `` with `\"relu\"`\n#### 2. `` with `\"adagrad\"`\n#### 3. `` with `30`\n#### then __run the code__."
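As an aside: once the benchmark cell below has been completed and run, the same helper can be called with any other configuration. A hypothetical example call (illustration only; the values are chosen arbitrarily, not taken from the exercise):

```r
# Sketch: trying a smaller network with a different activation and optimiser
train_network(structure = c(3, 4, 4, 3), activation = "tanh",
              optimizer = "rmsprop", epochs = 20)
```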
63 | }, 64 | { 65 | "metadata": { 66 | "trusted": true 67 | }, 68 | "cell_type": "code", 69 | "source": "# Run this code to train the network\n\n# Create variables for each of the inputs to our custom function\nsample_structure <- c(3, 10, 10, 3)\n###\n# REPLACE WITH \"relu\" (INCLUDING THE QUOTATION MARKS!)\n###\nsample_activation <- \n###\n\n###\n# REPLACE WITH \"adagrad\" (INCLUDING THE QUOTATION MARKS!)\n###\noptimizer <- \n###\n\n###\n# REPLACE WITH 30\n###\nsample_epochs <- \n###\n\n# Run our custom function specifying our arguments in correct order: structure, activation, optimizer, epochs\ntrain_network(sample_structure, sample_activation, optimizer, sample_epochs)", 70 | "execution_count": null, 71 | "outputs": [] 72 | }, 73 | { 74 | "metadata": {}, 75 | "cell_type": "markdown", 76 | "source": "Step 3\n---\n\nNow, let's start playing with the structure of our neural network, in particular the size of our hidden layers. We can easily do this by changing the input to the first argument of our `train_network` function, `structure`.\n\nHere we will test the size of our two hidden layers, testing values 1 through to 10. For simplicity, we will make the size of the two hidden layers the same, e.g. when we test a layer size of 5, the structure of our neural network will be `[3, 5, 5, 3]`, and when we test a layer size of 9, our neural network structure will be `[3, 9, 9, 3]`. Note that both the input and output layers of our network must remain as size 3, as our data have 3 input features and 3 output classes (breeds).\n\n**In the code below:** \n**1. Run the first box to alter the structure of the network** \n**2. Run the second box to plot the results**" 77 | }, 78 | { 79 | "metadata": { 80 | "trusted": true 81 | }, 82 | "cell_type": "code", 83 | "source": "# Run this code box to alter the structure of the network\n\n# Initialise empty lists to store results\ntrain_acc <- c()\ntest_acc <- c()\n\n# Change the input to our first argument of our `train_network` function\nfor(i in 1:10){\n NN_structure <- c(3, i, i, 3)\n print(\"TESTING THE FOLLOWING HIDDEN LAYER SIZE...\")\n print(i)\n train_network(NN_structure, sample_activation, optimizer, sample_epochs)\n train_acc[i] <- acc\n test_acc[i] <- testacc\n}", 84 | "execution_count": null, 85 | "outputs": [] 86 | }, 87 | { 88 | "metadata": { 89 | "trusted": true 90 | }, 91 | "cell_type": "code", 92 | "source": "# Run this box to plot the results\n\n# Reshape the results for plotting\ntrain_results <- data.frame(dataType = rep(\"Training\", 10), acc = train_acc, nLayers = seq(1, 10, 1), stringsAsFactors = FALSE)\ntest_results <- data.frame(dataType = rep(\"Test\", 10), acc = test_acc, nLayers = seq(1, 10, 1), stringsAsFactors = FALSE)\n\nhiddenLayerDf <- train_results %>% mutate(dataType = 'Training') %>%\n bind_rows(test_results %>%\n mutate(dataType = 'Test'))\n\nggplot(hiddenLayerDf,aes(y = acc,x = nLayers,color = dataType)) + \n geom_line() +\n labs(title = \"\", x = \"Size of hidden layers\", y = \"Accuracy\", colour = \"Data type\") +\nscale_x_discrete(limits = seq(1, 10, 1))", 93 | "execution_count": null, 94 | "outputs": [] 95 | }, 96 | { 97 | "metadata": {}, 98 | "cell_type": "markdown", 99 | "source": "So, experimenting with different sizes of hidden layers can dramatically improve your results.\n\nStep 4\n---\n\nNow we'll look at how different **activation functions** impact the performance of neural networks. 
To do this, we need to change the second argument to our custom function `train_network`, the `activation` argument.\n\nThere are many different activation functions to try, so let's store them all as a vector and try them all!\n\n#### Replace `` with `activation_functions[i]` and run the code." 100 | }, 101 | { 102 | "metadata": { 103 | "scrolled": false, 104 | "trusted": true 105 | }, 106 | "cell_type": "code", 107 | "source": "# Run this box to run the network with different activation functions\n\n# Initialise empty lists to store results\ntrain_acc <- c()\ntest_acc <- c()\n\n# Create a vector listing all the activation functions we wish to test\nactivation_functions <- c(\"elu\", \"hard_sigmoid\", \"linear\", \"relu\", \"selu\", \"sigmoid\", \n \"softplus\", \"softsign\", \"tanh\")\n\n# # Uncomment the code below to play with the structure, optimizer, and epochs\n# sample_structure <- c(3, ?, ?, 3) # e.g. c(3, 4, 4, 3)\n# optimizer <- \"?\" # e.g. \"adagrad\"\n# sample_epochs <- ? # e.g. 20\n\n# Test all the different activation functions and save results\nfor(i in 1:length(activation_functions)){\n print(\"Evaluating model with hidden layer activation function... \")\n print(activation_functions[i])\n###\n# REPLACE WITH activation_functions[i]\n### \n train_network(sample_structure, , optimizer, sample_epochs)\n### \n train_acc[i] <- acc\n test_acc[i] <- testacc\n}\n\nprint(\"Finished!\")", 108 | "execution_count": null, 109 | "outputs": [] 110 | }, 111 | { 112 | "metadata": {}, 113 | "cell_type": "markdown", 114 | "source": "#### Now run the code below to plot the results." 115 | }, 116 | { 117 | "metadata": { 118 | "trusted": true 119 | }, 120 | "cell_type": "code", 121 | "source": "# Run this box to plot the result\n\n# Reshape the results for plotting\ntrain_results <- data.frame(dataType = \"Train\", actFuncName = activation_functions, funcAcc = train_acc,\n stringsAsFactors = FALSE)\ntest_results <- data.frame(dataType = \"Test\", actFuncName = activation_functions, funcAcc = test_acc,\n stringsAsFactors = FALSE)\n\nresults <- bind_rows(train_results, test_results) %>%\nmutate(dataType = as.factor(dataType))\n\n# Create line plot: activation function vs. accuracy coloured by data type\nresults %>%\nggplot(aes(actFuncName, funcAcc, group = dataType, colour = dataType)) +\ngeom_line() +\nlabs(title = \"\", x = \"Activation function\", y = \"Function accuracy\", colour = \"Data type\") +\ntheme(plot.title = element_text(hjust = 0.5))", 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "metadata": {}, 127 | "cell_type": "markdown", 128 | "source": "There's quite a lot of variance there. It's always good to quickly test different activation functions first.\n\nStep 5\n---\n\nThe __optimisation function__ is the next major parameter of the network architecture. It changes how the network is trained, so it can have a __very large impact on training time and end performance__.\n\n#### Replace `` with `optimizer_functions[i]` and run the cell." 
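A quick aside before running the next cell: in the R interface to keras, an optimiser can usually be given either as a string or as a constructor function with tunable arguments. A hedged sketch of the two styles (the learning-rate value here is arbitrary, and the argument name may differ between keras versions):

```r
# Sketch: two ways of specifying an optimiser when compiling a model
model %>% compile(loss = "categorical_crossentropy",
                  optimizer = "rmsprop",                      # by name
                  metrics = c("accuracy"))

model %>% compile(loss = "categorical_crossentropy",
                  optimizer = optimizer_rmsprop(lr = 0.001),  # by constructor
                  metrics = c("accuracy"))
```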
129 | }, 130 | { 131 | "metadata": { 132 | "trusted": true 133 | }, 134 | "cell_type": "code", 135 | "source": "# Run this box to try different optimization functions\n\n# Initialise empty lists to store results\ntrain_acc <- c()\ntest_acc <- c()\n\n# Create a vector listing all the optimization functions we wish to test\noptimizer_functions <- c(\"adadelta\", \"adagrad\", \"adam\", \"adamax\",\n \"nadam\", \"rmsprop\", \"sgd\")\nNN_structure <- c(3, 9, 9, 3)\nsample_activation <- \"relu\"\n\n# Uncomment the code below to play with the structure, activation, and epochs\n# NN_structure <- c(3, ?, ?, 3) # e.g. c(3, 4, 4, 3)\n# sample_activation <- ? # e.g. \"tanh\"\n# sample_epochs <- ? # e.g. 20\n\n# Test all the different optimization functions and save results\nfor(i in 1:length(optimizer_functions)){\n print(\"Evaluating model with optimization function... \")\n print(optimizer_functions[i])\n###\n# REPLACE WITH optimizer_functions[i]\n### \n train_network(NN_structure, sample_activation, , sample_epochs)\n### \n train_acc[i] <- acc\n test_acc[i] <- testacc\n}\n\ntrain_acc\ntest_acc", 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "metadata": {}, 141 | "cell_type": "markdown", 142 | "source": "#### Now run the code below to plot the results." 143 | }, 144 | { 145 | "metadata": { 146 | "trusted": true 147 | }, 148 | "cell_type": "code", 149 | "source": "# Run this box to plot the results\n\n# Reshape the results to create plot\ntrain_results <- data.frame(dataType = \"Train\", optFuncName = optimizer_functions, funcAcc = train_acc,\n stringsAsFactors = FALSE)\ntest_results <- data.frame(dataType = \"Test\", optFuncName = optimizer_functions, funcAcc = test_acc,\n stringsAsFactors = FALSE)\n\nresults <- bind_rows(train_results, test_results) %>%\nmutate(dataType = as.factor(dataType))\n\n# Create line plot: optimization function vs. accuracy coloured by data type\nresults %>%\nggplot(aes(optFuncName, funcAcc, group = dataType, colour = dataType)) +\ngeom_line() +\nlabs(title = \"Performance of training and test sets using different optimizer functions\",\n x = \"Optimization function\", y = \"Function accuracy\", colour = \"Data type\") +\ntheme(plot.title = element_text(hjust = 0.5))", 150 | "execution_count": null, 151 | "outputs": [] 152 | }, 153 | { 154 | "metadata": {}, 155 | "cell_type": "markdown", 156 | "source": "Step 6\n---\n\nNow let's test the number of training cycles for the model, i.e. `epochs`, the final argument in our custom function.\n\n**In the code below, change the epochs below to any positive whole number and press Run. 
Try this with several different numbers.**" 157 | }, 158 | { 159 | "metadata": { 160 | "trusted": true 161 | }, 162 | "cell_type": "code", 163 | "source": "###\n# CHANGE 15 TO ANY POSITIVE INTEGER\n###\nepochs <- 15\n\ntrain_network(sample_structure, sample_activation, optimizer, epochs)", 164 | "execution_count": null, 165 | "outputs": [] 166 | }, 167 | { 168 | "metadata": {}, 169 | "cell_type": "markdown", 170 | "source": "You will notice a trend: the higher the number of epochs/training cycles, the greater the accuracy of the model.\n\nStep 7\n---\n\nLet's combine what we've seen above and try to create a neural network that performs better than what we made in Exercise 7, where we used the structure `[3, 4, 2, 3]`, the activation function `relu`, and the optimiser `sgd` (stochastic gradient descent).\n\n**Follow the instructions in the code below**" 171 | }, 172 | { 173 | "metadata": { 174 | "trusted": true 175 | }, 176 | "cell_type": "code", 177 | "source": "###\n# Run this box to train once more with a good selection of options\n# Then change the configurations as you like and run again to see how the network performs\n###\n\nsample_structure <- c(3, 9, 9, 3)\nsample_activation <- \"selu\"\noptimizer <- \"adam\"\nsample_epochs <- 10\n\ntrain_network(sample_structure, sample_activation, optimizer, sample_epochs)", 178 | "execution_count": null, 179 | "outputs": [] 180 | }, 181 | { 182 | "metadata": {}, 183 | "cell_type": "markdown", 184 | "source": "How does it look? Were we able to beat the other network? Try out a number of different configurations to see how they perform!\n\nConclusion\n---\n\nWe've compared how different neural network architecture parameters influence accuracy, and we've tried to combine them in such a way that we maximise this performance." 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "name": "r", 190 | "display_name": "R", 191 | "language": "R" 192 | }, 193 | "language_info": { 194 | "mimetype": "text/x-r-source", 195 | "name": "R", 196 | "pygments_lexer": "r", 197 | "version": "3.4.1", 198 | "file_extension": ".r", 199 | "codemirror_mode": "r" 200 | } 201 | }, 202 | "nbformat": 4, 203 | "nbformat_minor": 2 204 | } -------------------------------------------------------------------------------- /10. Convolutional Neural Networks - R.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "Convolutional Neural Networks\n======\n\nConvolutional neural networks (CNNs) are a class of deep neural networks, most commonly used in computer vision applications.\n\nConvolutional refers to the network pre-processing the data for you - traditionally this pre-processing was performed by data scientists. 
The neural network can learn how to do pre-processing *itself* by applying filters for things such as edge detection.\n\nThis exercise uses keras and the stringr library, which we need to load before we begin.\n\n**Run the code below to start loading the required libraries for this exercise.**" 7 | }, 8 | { 9 | "metadata": { 10 | "trusted": true 11 | }, 12 | "cell_type": "code", 13 | "source": "# Run this box to load libraries\n\n# Load libraries\nsuppressMessages(install.packages(\"keras\"))\nsuppressMessages(install.packages(\"stringr\"))\nsuppressMessages(library(keras))\nsuppressMessages(install_keras())\nsuppressMessages(library(stringr))", 14 | "execution_count": null, 15 | "outputs": [] 16 | }, 17 | { 18 | "metadata": {}, 19 | "cell_type": "markdown", 20 | "source": "Step 1\n-----\n\nIn this exercise we will train a CNN to recognise handwritten digits, using the MNIST digit dataset.\n\nThis is a very common exercise and data set to learn from.\n\nLet's start by loading our dataset and setting up our training and test sets (keras will automatically assign a validation set from the training set for us).\n\n### In the cell below replace:\n#### 1. `` with `1:1000,,`\n#### 2. `` with `1:1000`\n#### 3. `` with `1001:1500,,`\n#### 4. `` with `1001:1500`\n#### then __run the code__." 21 | }, 22 | { 23 | "metadata": { 24 | "trusted": true 25 | }, 26 | "cell_type": "code", 27 | "source": "# Run this box to load the dataset and split it into training and test sets\n\n# Here we import the dataset.\nmnist <- dataset_mnist()\n\n\n# This stores our features and labels for both our training and test sets as local variables\n###\n# REPLACE WITH 1:1000,, AND WITH 1:1000\n###\nraw_x_train <- mnist$train$x[]\nraw_y_train <- mnist$train$y[]\n###\n\n###\n# REPLACE WITH 1001:1500,, AND WITH 1001:1500\n###\nraw_x_test <- mnist$test$x[]\nraw_y_test <- mnist$test$y[]\n###\n\n# This tells us the dimensions of our training set's features\ndim(raw_x_train)", 28 | "execution_count": null, 29 | "outputs": [] 30 | }, 31 | { 32 | "metadata": {}, 33 | "cell_type": "markdown", 34 | "source": "Expected output: \n`1000 28 28`\n\nSo we have 1,000 training samples.\n\n\nThe two 28's after the 1,000 tell us each sample is 28 pixels wide and 28 pixels high.\n\nEach pixel is really just a number from 0 to 255 - 0 being fully black, 255 being fully white - so the images are greyscale. 
When we graph the 28x28 numbers, we can see the image.\n\nStep 2\n============\n\nSo, let's have a look at one of our samples.\n\n**Run the code below**" 35 | }, 36 | { 37 | "metadata": { 38 | "trusted": true 39 | }, 40 | "cell_type": "code", 41 | "source": "# Run this box to look at one of our images\nim <- raw_x_train[1,,]\nim <- t(apply(im, 2, rev)) \nimage(1:28, 1:28, im, col=gray((0:255)/255), xaxt='n', main=paste(raw_y_train[1]))", 42 | "execution_count": null, 43 | "outputs": [] 44 | }, 45 | { 46 | "metadata": {}, 47 | "cell_type": "markdown", 48 | "source": "Our first training image is `5`.\n\nNext, let's check out our test set.\n\n**Run the code below**" 49 | }, 50 | { 51 | "metadata": { 52 | "trusted": true 53 | }, 54 | "cell_type": "code", 55 | "source": "# Run this to see the dimensions of our test set.\ndim(raw_x_test)", 56 | "execution_count": null, 57 | "outputs": [] 58 | }, 59 | { 60 | "metadata": {}, 61 | "cell_type": "markdown", 62 | "source": "Expected output: \n`500 28 28`\n\nAnd we have 500 test images!\n\nLet's take a look at the first image in the test set.\n\n**Run the code below**" 63 | }, 64 | { 65 | "metadata": { 66 | "trusted": true 67 | }, 68 | "cell_type": "code", 69 | "source": "# Run this to look at the first image in the test set\nim <- raw_x_test[1,,]\nim <- t(apply(im, 2, rev)) \nimage(1:28, 1:28, im, col=gray((0:255)/255), xaxt='n', main=paste(raw_y_test[1]))", 70 | "execution_count": null, 71 | "outputs": [] 72 | }, 73 | { 74 | "metadata": {}, 75 | "cell_type": "markdown", 76 | "source": "You should see a 9 above. Looking good - next we will prepare our data for another neural network.\n\nStep 3\n---\n\nThe neural network will use the 28x28 values of each image to predict what each image represents.\n\nWe need to reshape our data to get it working well with our neural network. \n\n**Run the code below**" 77 | }, 78 | { 79 | "metadata": { 80 | "trusted": true 81 | }, 82 | "cell_type": "code", 83 | "source": "# Read then run this code\n\n# First off, let's reshape our X sets so that they fit the convolutional layers.\nx_train <- array_reshape(raw_x_train, c(nrow(raw_x_train), 28, 28, 1))\nx_test <- array_reshape(raw_x_test, c(nrow(raw_x_test), 28, 28, 1))\n\n# Next up - feature scaling.\n# We scale the values so they are between 0 and 1, instead of 0 and 255.\nx_train <- x_train / 255\nx_test <- x_test / 255\n\n# Print the label associated with the first element in the training data set\nprint(raw_y_train[1])", 84 | "execution_count": null, 85 | "outputs": [] 86 | }, 87 | { 88 | "metadata": {}, 89 | "cell_type": "markdown", 90 | "source": "Expected output: \n`5`\n\nThe label is a number - the number we see when we view the image.\n\nWe need to represent this number as a category by using a one-hot vector, rather than an integer (a number). This is the same as if we were still trying to predict the breed of a dog.\n\nKeras can convert these numeric labels into one-hot vectors easily with the `to_categorical` function.\n\n#### Replace `` with `to_categorical` and run the code." 
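Once the next few cells have been run, it can be worth sanity-checking the prepared arrays before building the network. A small optional sketch (assuming the reshaping, scaling, and one-hot encoding cells have all been executed):

```r
# Sketch: confirm the prepared data has the expected shape and range
dim(x_train)    # expected: 1000 28 28 1
range(x_train)  # expected: 0 1, after dividing by 255
dim(y_train)    # expected: 1000 10, once the labels are one-hot encoded
```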
91 | }, 92 | { 93 | "metadata": { 94 | "trusted": true 95 | }, 96 | "cell_type": "code", 97 | "source": "# The 10 means that there are 10 different categories - 0 to 9\n###\n# REPLACE THE BELOW WITH to_categorical\n###\ny_train <- (raw_y_train, 10)\ny_test <- (raw_y_test, 10)\n###\n\n# Print the label for the first element\nprint(y_train[1,])", 98 | "execution_count": null, 99 | "outputs": [] 100 | }, 101 | { 102 | "metadata": {}, 103 | "cell_type": "markdown", 104 | "source": "Expected output: \n`[1] 0 0 0 0 0 1 0 0 0 0`\n\nStep 4\n-----\n\nAll ready! Time to build another neural network.\n\nWe need to add in convolutional layers. We have 2D images, so we want 2D layers. We will also use a few additional techniques which you can read about in the code comments.\n\n### In the cell below replace:\n#### 1. `` with `28`\n#### 2. `` with `28`\n#### 3. `` with `1`\n#### 4. `` with `10`\n\n#### and then __run the code__." 105 | }, 106 | { 107 | "metadata": { 108 | "trusted": true 109 | }, 110 | "cell_type": "code", 111 | "source": "suppressMessages(use_session_with_seed(1))\n###\n# REPLACE THE WITH 28 AND WITH 28 AND WITH 1\n###\ninput_shape <- c(, , )\n###\n\n###\n# REPLACE THE WITH 10\n###\nnum_classes <- \n###", 112 | "execution_count": null, 113 | "outputs": [] 114 | }, 115 | { 116 | "metadata": {}, 117 | "cell_type": "markdown", 118 | "source": "Time to set up our model.\n\n### In the cell below replace:\n#### 1. `` with `layer_conv_2d `\n#### 2. `` with `layer_conv_2d`\n#### 3. `` with `layer_max_pooling_2d`\n#### 4. `` with `layer_dropout`\n#### 5. `` with `layer_flatten()`\n#### 6. `` with `layer_dropout`\n\n#### and then __run the code__." 119 | }, 120 | { 121 | "metadata": { 122 | "trusted": true 123 | }, 124 | "cell_type": "code", 125 | "source": "# This box sets up a new convolutional neural network and prints a summary \n\nuse_session_with_seed(1)\nset.seed(1)\n\nmodel <- keras_model_sequential() %>%\n# Here we start with the convolutional layers\n###\n# REPLACE THE TWO 's BELOW WITH layer_conv_2d\n###\n (filters = 28, kernel_size = c(3,3), activation = 'relu',\n input_shape = input_shape) %>% \n (filters = 28, kernel_size = c(3,3), activation = 'relu') %>%\n###\n\n# Pooling layers help speed up training time and make the features the network detects more robust.\n# They act by downsampling the data - reducing the data size and complexity.\n###\n# REPLACE WITH layer_max_pooling_2d\n###\n (pool_size = c(2, 2)) %>%\n###\n\n# Dropout is a technique to help prevent overfitting\n# It makes nodes 'dropout' - turning them off randomly.\n###\n# REPLACE WITH layer_dropout\n###\n (rate = 0.125) %>% \n###\n\n# Next the data is flattened to a vector\n###\n# REPLACE WITH layer_flatten()\n###\n %>% \n###\n\n# Dense layers perform classification - we have extracted the features with the convolutional pre-processing\n layer_dense(units = 64, activation = 'relu') %>% \n###\n# REPLACE WITH layer_dropout\n###\n (rate = 0.25) %>% \n###\n\n# Next is our output layer\n# Softmax outputs the probability for each category\n layer_dense(units = num_classes, activation = 'softmax')\n\n\n# Let's print out the structure of our model\nsummary(model)", 126 | "execution_count": null, 127 | "outputs": [] 128 | }, 129 | { 130 | "metadata": { 131 | "trusted": true 132 | }, 133 | "cell_type": "code", 134 | "source": "# Run this cell!\n# Time to compile the model, ready for training\n\nmodel %>% compile(\n loss = 'categorical_crossentropy',\n optimizer = 'Adamax',\n metrics = c('accuracy')\n)", 135 | "execution_count": null, 136 | 
"outputs": [] 137 | }, 138 | { 139 | "metadata": {}, 140 | "cell_type": "markdown", 141 | "source": "Step 5\n============\n\nTime to train our model!\n\nIf it's taking a while you can lower the number of epochs. If you want to leave it running in the background and see how accurate you can get, you can increase the number of epochs.\n\n### In the cell below replace:\n#### 1. `` with `25`\n#### 2. `` with `0.2`\n\n#### and then __run the code__." 142 | }, 143 | { 144 | "metadata": { 145 | "trusted": true 146 | }, 147 | "cell_type": "code", 148 | "source": "# Run this code to train the convolutional neural network and print out its accuracy\n\nhistory <- model %>% fit(\n x_train, y_train, \n###\n# REPLACE WITH 25 AND WITH 0.2\n### \n epochs = , batch_size = 32, \n validation_split = \n### \n)\n\n# Make a graph of loss and accuracy\nplot(history)\n\n# Let's take a look at the loss and accuracy on the test set\nmodel %>% evaluate(x_test, y_test)\n\npredictions <- model %>% predict_classes(x_test)\nscores <- model %>% evaluate(\n x_test, y_test, verbose = 0\n)\n\n# Output metrics\ncat('Test loss:', scores[[1]], '\\n')\ncat('Test accuracy:', scores[[2]], '\\n')", 149 | "execution_count": null, 150 | "outputs": [] 151 | }, 152 | { 153 | "metadata": {}, 154 | "cell_type": "markdown", 155 | "source": "Step 6\n============\n\nLet's take a look at an actual prediction, and what the image in the test set looks like.\n\n**Run the code below**" 156 | }, 157 | { 158 | "metadata": { 159 | "trusted": true 160 | }, 161 | "cell_type": "code", 162 | "source": "# Run this box to print how the convolutional neural network predicts the label for an image\nprint(\"prediction:\")\nprint(predictions[1])\nprint(\"Test image:\")\nim <- x_test[1,,,]\nim <- t(apply(im, 2, rev)) \nimage(1:28, 1:28, im, col=gray((0:255)/255), xaxt='n')", 163 | "execution_count": null, 164 | "outputs": [] 165 | }, 166 | { 167 | "metadata": {}, 168 | "cell_type": "markdown", 169 | "source": "How is the prediction? Does it look right?\n\nConclusion\n------\n\nCongratulations! We've built a convolutional neural network that is able to recognise handwritten digits with very high accuracy.\n\nCNN's are very complex - you're not expected to understand everything (or most things) we covered here. They take a lot of time and practice to properly understand each aspect of them.\n\nHere we used: \n* __Feature scaling__ - reducing the range of the values. This helps improve training time.\n* __Convolutional layers__ - network layers that pre-process the data for us. These apply filters to extract features for the neural network to analyze.\n* __Pooling layers__ - part of the Convolutional layers. They apply filters to downsample the data - extracting features.\n* __Dropout__ - a regularization technique to help prevent overfitting.\n* __Dense layers__ - neural network layers which perform classification on the features extracted by the convolutional layers and downsampled by the pooling layers.\n* __Softmax__ - an activation function which outputs the probability for each category." 
170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "name": "r", 175 | "display_name": "R", 176 | "language": "R" 177 | }, 178 | "language_info": { 179 | "mimetype": "text/x-r-source", 180 | "name": "R", 181 | "pygments_lexer": "r", 182 | "version": "3.4.1", 183 | "file_extension": ".r", 184 | "codemirror_mode": "r" 185 | } 186 | }, 187 | "nbformat": 4, 188 | "nbformat_minor": 2 189 | } -------------------------------------------------------------------------------- /11. Recurrent Neural Networks - R.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "collapsed": true 6 | }, 7 | "cell_type": "markdown", 8 | "source": "Recurrent Neural Networks\n===\n\nA recurrent neural network (RNN) is a class of neural network that excels when your data can be treated as a sequence - such as text, music, speech recognition, connected handwriting, or data over a time period. \n\nRNNs can analyse or predict a word based on the previous words in a sentence - they allow a connection between previous information and current information.\n\nThis exercise looks at implementing an LSTM RNN to generate new characters after learning from a large sample of text. LSTMs are a special type of RNN which dramatically improves the model’s ability to connect previous data to current data where there is a long gap.\n\nWe will train an RNN model using a novel written by H. G. Wells - The Time Machine." 9 | }, 10 | { 11 | "metadata": {}, 12 | "cell_type": "markdown", 13 | "source": "Step 1\n------\n\nLet's start by loading our libraries and looking at our text file. This might take a few minutes." 14 | }, 15 | { 16 | "metadata": { 17 | "trusted": true 18 | }, 19 | "cell_type": "code", 20 | "source": "# Run this!\n\nsuppressMessages(install.packages(\"keras\"))\nsuppressMessages(install.packages(\"tokenizers\"))\nsuppressMessages(install.packages(\"stringr\"))\nsuppressMessages(library(keras))\nsuppressMessages(library(readr))\nsuppressMessages(library(stringr))\nsuppressMessages(library(purrr))\nsuppressMessages(library(tokenizers))\nsuppressMessages(install_keras())", 21 | "execution_count": null, 22 | "outputs": [] 23 | }, 24 | { 25 | "metadata": { 26 | "trusted": true 27 | }, 28 | "cell_type": "code", 29 | "source": "path <- file.path(\"Data/time-edit.txt\")\n# Let's have a look at the text\nread_lines(path)", 30 | "execution_count": null, 31 | "outputs": [] 32 | }, 33 | { 34 | "metadata": {}, 35 | "cell_type": "markdown", 36 | "source": "Expected output: \n```The Time Traveller (for so it will be convenient to speak of him) was expounding a recondite matter to us. His pale grey eyes shone and twinkled, and his usually pale face was flushed and animated.\ntext length: 174201 characters\nunique characters: 39```\n\nStep 2\n-----\n\nNext we'll divide the text into sequences of 35 characters.\n\nThen, for each sequence, we'll make a training example - the character that follows the sequence will be the correct output.\n\n### In the cell below replace:\n#### 1. `` with `35`\n#### 2. `` with `path`\n#### then __run the code__." 
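To make the sequence/target idea concrete before running the real cell, here is a toy sketch in plain R (illustration only; the actual exercise uses `maxlen = 35` and steps of 6 rather than the tiny values here):

```r
# Sketch: cut a string into fixed-length sequences, each paired with
# the character that follows it (the prediction target)
text <- strsplit("the time machine", "")[[1]]
maxlen <- 5
for (i in seq(1, length(text) - maxlen - 1, by = 3)) {
  cat(paste(text[i:(i + maxlen - 1)], collapse = ""), "->",
      text[i + maxlen], "\n")
}
# e.g. the first pair printed is "the t" -> "i"
```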
37 | }, 38 | { 39 | "metadata": { 40 | "trusted": true 41 | }, 42 | "cell_type": "code", 43 | "source": "###\n# REPLACE WITH 35\n###\nmaxlen <- \n###\n\n# This makes all the characters lower case, and separates the individual characters from whole words.\n\n###\n# REPLACE WITH path\n###\ntext <- read_lines() %>%\n###\n str_to_lower() %>%\n str_c(collapse = \"\\n\") %>%\n tokenize_characters(strip_non_alphanum = FALSE, simplify = TRUE)\n\nprint(sprintf(\"Total length: %d\", length(text)))\n\nchars <- text %>%\n unique() %>%\n sort()\n\nprint(sprintf(\"Total chars: %d\", length(chars)))", 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "metadata": {}, 49 | "cell_type": "markdown", 50 | "source": "Expected output: \n`\"Total length: 174666\"` \n`\"Total chars: 29\"`\n\n#### Replace the 3 ``'s with `maxlen`" 51 | }, 52 | { 53 | "metadata": { 54 | "trusted": true 55 | }, 56 | "cell_type": "code", 57 | "source": "###\n# REPLACE ALL THE 's WITH maxlen\n###\ndataset <- map(\n seq(1, length(text) - - 1, by = 6), \n ~list(sentence = text[.x:(.x + - 1)], next_char = text[.x + ])\n )\n###\n\ndataset <- transpose(dataset)\n\nx <- array(0, dim = c(length(dataset$sentence), maxlen, length(chars)))\ny <- array(0, dim = c(length(dataset$sentence), length(chars)))\n\nfor(i in 1:length(dataset$sentence)){\n \n x[i,,] <- sapply(chars, function(x){\n as.integer(x == dataset$sentence[[i]])\n })\n \n y[i,] <- as.integer(chars == dataset$next_char[[i]])\n \n}", 58 | "execution_count": null, 59 | "outputs": [] 60 | }, 61 | { 62 | "metadata": {}, 63 | "cell_type": "markdown", 64 | "source": "Step 3\n------\n\nLet's build our model, using a single LSTM layer of 64 units. We'll keep the model simple for now, so that training does not take too long.\n\n#### Replace the `` with `64` and run the cell." 65 | }, 66 | { 67 | "metadata": { 68 | "trusted": true 69 | }, 70 | "cell_type": "code", 71 | "source": "model <- keras_model_sequential()\n###\n# REPLACE WITH 64\n###\nmodel %>%\n layer_lstm(, input_shape = c(maxlen, length(chars))) %>%\n###\n layer_dense(length(chars)) %>%\n layer_activation(\"softmax\")\n\nmodel %>% compile(\n loss = \"categorical_crossentropy\", \n optimizer = \"Adam\"\n)", 72 | "execution_count": null, 73 | "outputs": [] 74 | }, 75 | { 76 | "metadata": {}, 77 | "cell_type": "markdown", 78 | "source": "We'll just get a few helper functions ready; run the cell below to prepare them." 
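One of the helpers below samples the next character using a "temperature" parameter. As a quick standalone illustration of what temperature does (a sketch in plain R, not needed for the exercise):

```r
# Sketch: lower temperature sharpens a distribution, higher flattens it
p <- c(0.7, 0.2, 0.1)
reweight <- function(p, temperature) {
  q <- exp(log(p) / temperature)
  q / sum(q)
}
reweight(p, 0.5)  # sharper: most mass on the likeliest character
reweight(p, 2.0)  # flatter: more diverse samples
```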
79 | }, 80 | { 81 | "metadata": { 82 | "trusted": true 83 | }, 84 | "cell_type": "code", 85 | "source": "# Run this cell!\n\nsample_mod <- function(preds, temperature = 1){\n preds <- log(preds)/temperature\n exp_preds <- exp(preds)\n preds <- exp_preds/sum(exp(preds))\n \n rmultinom(1, 1, preds) %>% \n as.integer() %>%\n which.max()\n}\n\non_epoch_end <- function(epoch, logs) {\n \n cat(sprintf(\"epoch: %02d ---------------\\n\\n\", epoch))\n \n diversity <- 0.5\n generated <- \"\"\n \n cat(sprintf(\"diversity: %f ---------------\\n\\n\", diversity))\n \n start_index <- sample(1:(length(text) - maxlen), size = 1)\n sentence <- text[start_index:(start_index + maxlen - 1)]\n \n for(i in 1:400){\n \n x <- sapply(chars, function(x){\n as.integer(x == sentence)\n })\n x <- array_reshape(x, c(1, dim(x)))\n \n preds <- predict(model, x)\n next_index <- sample_mod(preds, diversity)\n next_char <- chars[next_index]\n \n generated <- str_c(generated, next_char, collapse = \"\")\n sentence <- c(sentence[-1], next_char)\n \n }\n \n cat(generated)\n cat(\"\\n\\n\")\n \n \n}", 86 | "execution_count": null, 87 | "outputs": [] 88 | }, 89 | { 90 | "metadata": {}, 91 | "cell_type": "markdown", 92 | "source": "Ready to go. The next cell will train the model.\n\nTraining RNNs on low compute takes a long time. We'll only build a small one for now. If you want to leave this model training for longer, change the number of epochs to a larger number.\n\n#### Replace the `` with `3` and run the cell." 93 | }, 94 | { 95 | "metadata": { 96 | "trusted": true 97 | }, 98 | "cell_type": "code", 99 | "source": "# This will take a little while...\nprint_callback <- callback_lambda(on_epoch_end = on_epoch_end)\n\nhistory <- model %>% fit(\n x, y,\n batch_size = 1,\n###\n# REPLACE WITH 3\n###\n epochs = ,\n###\n callbacks = print_callback\n)", 100 | "execution_count": null, 101 | "outputs": [] 102 | }, 103 | { 104 | "metadata": {}, 105 | "cell_type": "markdown", 106 | "source": "The output won't appear to be very good. But then, this dataset is small, and we have trained it only for a short time using a rather small RNN. Feel free to increase the number of epochs and leave it training for a long time if you want to see better results.\n\nWe could improve our model by:\n* Having a larger training set.\n* Increasing the number of LSTM units.\n* Training it for longer.\n* Experimenting with different activation functions, optimization functions, etc.\n\n\nConclusion\n--------\n\nWe have trained an RNN that learns to predict characters based on a text sequence. We have trained a lightweight model from scratch." 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "name": "r", 112 | "display_name": "R", 113 | "language": "R" 114 | }, 115 | "language_info": { 116 | "mimetype": "text/x-r-source", 117 | "name": "R", 118 | "pygments_lexer": "r", 119 | "version": "3.4.1", 120 | "file_extension": ".r", 121 | "codemirror_mode": "r" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 2 126 | } -------------------------------------------------------------------------------- /12. 
Clustering - R.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "Clustering\n===\n\nWhen a data set doesn’t have labels, we can use unsupervised learning to find structure in the data, which in turn allows us to discover patterns or groups.\n\nCluster analysis is a method of finding groups, known as **clusters**, in datasets. As the datasets are unlabelled, cluster analysis aims to group similar samples based on their input features.\n\n**K-means clustering** separates samples into `k` clusters, assigning each sample to the cluster whose mean (average) is nearest. So if we state that `k = 5`, k-means clustering will divide the samples into 5 clusters based on the means of the clusters.\n\nStep 1\n---\n\nIn this exercise, we will use k-means clustering to analyse a few different datasets.\n\nFirst, we need to load the required packages for this session.\n\n**Run the code below**" 7 | }, 8 | { 9 | "metadata": { 10 | "trusted": true 11 | }, 12 | "cell_type": "code", 13 | "source": "# Run this box to load the required packages\n\n# Load the required libraries for this session\nsuppressMessages(install.packages(\"tidyverse\"))\nsuppressMessages(library(\"tidyverse\"))\nsuppressMessages(install.packages(\"clusterGeneration\"))\nsuppressMessages(library(\"clusterGeneration\"))\nsuppressMessages(install.packages(\"kernlab\"))\nsuppressMessages(library(\"kernlab\"))\nsuppressMessages(install.packages(\"mlbench\"))\nsuppressMessages(library(\"mlbench\"))", 14 | "execution_count": null, 15 | "outputs": [] 16 | }, 17 | { 18 | "metadata": {}, 19 | "cell_type": "markdown", 20 | "source": "Now let's create a dataset with a known number of clusters to demonstrate how k-means clustering would handle the data.\n\nBelow, we will change the `numClust` argument within the `genRandomClust` function to generate a random data set with 3 clusters.\n\n#### Replace `` with `3` and run the code." 21 | }, 22 | { 23 | "metadata": { 24 | "trusted": true 25 | }, 26 | "cell_type": "code", 27 | "source": "# Set the seed to be able to reproduce the same random cluster data\nset.seed(365)\n\n# Generate random data set with 3 clusters\n###\n# REPLACE WITH 3\n###\nclust_three <- genRandomClust(numClust = , sepVal = 0.1, numReplicate = 1, clustszind = 1, clustSizeEq = 175, \n outputDatFlag = FALSE, outputLogFlag = FALSE, outputEmpirical = FALSE, \n outputInfo = FALSE)\n###\n\n# Save x and y values to a data frame\nclust_three <- as.data.frame(clust_three$datList$test_1) %>% \nrename(., x = x1, y = x2)\n\n# Create scatter plot\nggplot(clust_three, aes(x, y)) +\ngeom_point(alpha = 0.75) +\nggtitle(\"Data set n = 3 clusters\") +\ntheme(plot.title = element_text(hjust = 0.5))", 28 | "execution_count": null, 29 | "outputs": [] 30 | }, 31 | { 32 | "metadata": {}, 33 | "cell_type": "markdown", 34 | "source": "Alright, we just made a dataset with 3 clusters and graphed it.\n\nLet's see how k-means performs on this dataset, already knowing we have 3 clusters.\n\n### In the cell below replace:\n#### 1. `` with `clust_three`\n#### 2. `` with `3`\n#### then __run the code__." 
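As a short aside before the exercise cell: if you would like to see what `kmeans` itself returns, here is a tiny standalone sketch using base R only (made-up toy data, not the exercise data):

```r
# Sketch: k-means on a toy matrix with two obvious groups
set.seed(1)
toy <- rbind(matrix(rnorm(20, mean = 0), ncol = 2),
             matrix(rnorm(20, mean = 5), ncol = 2))
fit <- kmeans(toy, centers = 2)
fit$cluster  # which cluster each row was assigned to
fit$centers  # the two cluster means
```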
35 | }, 36 | { 37 | "metadata": { 38 | "trusted": true 39 | }, 40 | "cell_type": "code", 41 | "source": "###\n# REPLACE WITH clust_three AND WITH 3\n###\nclust3_kmeans <- kmeans(x = , centers = )\n###\n\nclust3 <- as.data.frame(clust3_kmeans$cluster) %>% \nrename(., Cluster_kmeans = `clust3_kmeans$cluster`) %>% \nmutate(., Cluster_kmeans = as.factor(Cluster_kmeans)) %>%\nbind_cols(., clust_three)\n\n# Check output\n# str(clust3)\nhead(clust3)\n\n# Plot the results\nclust3 %>% \nggplot(aes(x, y, colour = Cluster_kmeans)) +\ngeom_point(alpha = 0.75) +\nlabs(title = \"Data set n = 3 clusters: k-means clustering analysis\", colour = \"Cluster\\ngroup\") +\ntheme(plot.title = element_text(hjust = 0.5))", 42 | "execution_count": null, 43 | "outputs": [] 44 | }, 45 | { 46 | "metadata": {}, 47 | "cell_type": "markdown", 48 | "source": "K-means clustering performs rather well, by the looks of it!\n\nBut we knew that our data set had three clusters; sometimes it might not be so clear..." 49 | }, 50 | { 51 | "metadata": {}, 52 | "cell_type": "markdown", 53 | "source": "## Step 2\n\nLet's generate another dataset in which it may be a little less obvious how many clusters it contains.\n\nBelow we will generate a random data set with `4` clusters and change the `sepVal` argument to reduce the separation between the clusters.\n\n### In the cell below replace:\n#### 1. `` with `4`\n#### 2. `` with `-0.01`\n#### then __run the code__." 54 | }, 55 | { 56 | "metadata": { 57 | "trusted": true 58 | }, 59 | "cell_type": "code", 60 | "source": "# Set seed to reproduce this code\nset.seed(365)\n\n# Generate random data set with 4 clusters\n###\n# REPLACE WITH 4 AND WITH -0.01\n###\nfour_clust <- genRandomClust(numClust = , sepVal = , numReplicate = 1, clustszind = 1, clustSizeEq = 175,\n outputDatFlag = FALSE, outputLogFlag = FALSE, outputEmpirical = FALSE, \n outputInfo = FALSE)\n###\n\n# Obtain cluster data x and y values\nclust4_data <- as.data.frame(four_clust$datList$test_1) %>% \nrename(., x = x1, y = x2)\n\n# Create scatter plot\nggplot(clust4_data, aes(x, y)) +\ngeom_point(alpha = 0.75) +\nggtitle(\"Data set n = 4 clusters\") +\ntheme(plot.title = element_text(hjust = 0.5))", 61 | "execution_count": null, 62 | "outputs": [] 63 | }, 64 | { 65 | "metadata": {}, 66 | "cell_type": "markdown", 67 | "source": "In instances where we do not know how many classes to expect, we can run k-means clustering multiple times with different *k* values to see how the data is partitioned. Let's try that now.\n\nThe following code block creates a custom function named `cluster_kvalue`. This function performs k-means clustering, saves the cluster membership, then creates a scatter plot of the data coloured by the cluster membership. The `cluster_kvalue` function contains two arguments:\n\n- `data_input` numeric matrix/data frame\n- `kvalue` number of clusters to partition into (k)\n\nYou do not need to edit the following code block. 
However, you will need to call this custom function later!\n\n**Run the code below to prepare the function for later use**" 68 | }, 69 | { 70 | "metadata": { 71 | "trusted": true 72 | }, 73 | "cell_type": "code", 74 | "source": "# Run this block to prepare the function for later\n\n# But don't edit it!\n\n# Create own function to run k-means clustering, save cluster membership, then plot results\ncluster_kvalue <- function(data_input, kvalue) {\n clust_kmeans <- kmeans(x = data_input, centers = kvalue) #, algorithm = \"Hartigan-Wong\"\n as.data.frame(clust_kmeans$cluster) %>% \n rename(., Cluster_kmeans = `clust_kmeans$cluster`) %>% \n mutate(., Cluster_kmeans = as.factor(Cluster_kmeans)) %>%\n bind_cols(., data_input) %>% \n ggplot(aes(x, y, colour = Cluster_kmeans)) +\n geom_point(alpha = 0.75) +\n ggtitle(paste(\"Cluster analysis using k-means clustering: k = \", kvalue)) +\n theme(plot.title = element_text(hjust = 0.5)) +\n labs(colour = \"Cluster\\ngroup\")\n}", 75 | "execution_count": null, 76 | "outputs": [] 77 | }, 78 | { 79 | "metadata": {}, 80 | "cell_type": "markdown", 81 | "source": "Now let's run our custom function on `clust4_data`, changing the number of clusters (`kvalue`) each time. This will tell us how k-means performs with different set numbers of clusters.\n\n#### Below, replace the ``'s as directed." 82 | }, 83 | { 84 | "metadata": { 85 | "trusted": true 86 | }, 87 | "cell_type": "code", 88 | "source": "# Run this box to test k = 2\ncluster_kvalue(clust4_data, 2)", 89 | "execution_count": null, 90 | "outputs": [] 91 | }, 92 | { 93 | "metadata": { 94 | "trusted": true 95 | }, 96 | "cell_type": "code", 97 | "source": "###\n# REPLACE WITH 3\n###\ncluster_kvalue(clust4_data, )\n###", 98 | "execution_count": null, 99 | "outputs": [] 100 | }, 101 | { 102 | "metadata": { 103 | "trusted": true 104 | }, 105 | "cell_type": "code", 106 | "source": "###\n# REPLACE WITH 4\n###\ncluster_kvalue(clust4_data, )\n###", 107 | "execution_count": null, 108 | "outputs": [] 109 | }, 110 | { 111 | "metadata": { 112 | "trusted": true 113 | }, 114 | "cell_type": "code", 115 | "source": "###\n# REPLACE WITH 5\n###\ncluster_kvalue(clust4_data, )\n###", 116 | "execution_count": null, 117 | "outputs": [] 118 | }, 119 | { 120 | "metadata": { 121 | "trusted": true 122 | }, 123 | "cell_type": "code", 124 | "source": "###\n# REPLACE WITH 6\n###\ncluster_kvalue(clust4_data, )\n###", 125 | "execution_count": null, 126 | "outputs": [] 127 | }, 128 | { 129 | "metadata": {}, 130 | "cell_type": "markdown", 131 | "source": "Which value of *k* do you think best splits the data?" 132 | }, 133 | { 134 | "metadata": {}, 135 | "cell_type": "markdown", 136 | "source": "## Step 3\n\nK-means clustering performs well enough on clustered data like that, but let's try it out on a data set that is not so linear.\n\nLet's create a data set that contains two rings of data. \n\nWe need to change the arguments in the `ggplot` code to plot the data.\n\n### In the cell below replace:\n#### 1. `` with `ring_data`\n#### 2. `` with `x, y`\n\n#### and then __run the code__." 
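The ring data in the next cell is built by dividing each random 2-D point by its Euclidean norm, which projects it onto the unit circle. A one-line illustration of that idea (plain R, illustration only):

```r
# Sketch: normalising a point by its Euclidean norm puts it on the unit circle
p <- c(3, 4)
p / sqrt(sum(p^2))  # 0.6 0.8, and 0.6^2 + 0.8^2 = 1
```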
137 | }, 138 | { 139 | "metadata": { 140 | "trusted": true 141 | }, 142 | "cell_type": "code", 143 | "source": "x <- matrix(rnorm(500), ncol = 2)\n# Formula for Euclidean norm\nring1_data <- x/sqrt(rowSums(x^2))\nring2_data <- ring1_data/2\n\nring1_data <- as.data.frame(ring1_data)\nring2_data <- as.data.frame(ring2_data)\n\n# Check structure\nstr(ring1_data)\nstr(ring2_data)\n\nring_data <- bind_rows(ring1_data, ring2_data) %>% \nrename(x = V1, y = V2)\nstr(ring_data)\n\n###\n# REPLACE WITH ring_data AND WITH x, y\n###\nggplot(data = , aes()) +\n###\ngeom_point(alpha = 0.5) +\nggtitle(\"Two ring data set\") +\ntheme(plot.title = element_text(hjust = 0.5))", 144 | "execution_count": null, 145 | "outputs": [] 146 | }, 147 | { 148 | "metadata": {}, 149 | "cell_type": "markdown", 150 | "source": "We can clearly distinguish two \"clusters\", that is, the two rings of datapoints.\n\nLet's see how k-means handles a dataset like this. We can use our previous custom function to perform k-means clustering on `ring_data` and plot the results.\n\n#### Replace the `` with `ring_data, 2` then run the code." 151 | }, 152 | { 153 | "metadata": { 154 | "trusted": true 155 | }, 156 | "cell_type": "code", 157 | "source": "###\n# REPLACE WITH ring_data, 2\n###\n(ring_kmeans <- cluster_kvalue())\n###", 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "metadata": {}, 163 | "cell_type": "markdown", 164 | "source": "K-means clustering clearly has difficulty solving this. As we are currently using it, there is no way for k-means clustering to place two means to label this data set correctly." 165 | }, 166 | { 167 | "metadata": {}, 168 | "cell_type": "markdown", 169 | "source": "Step 4\n---\n\nWe can try to run k-means clustering another way. Let's add another feature to our two ring data set: the distance of each point away from the centre.\n\nLet's see if k-means is able to classify the two data clusters with this new feature.\n\nWe will change the arguments in the `ggplot` call to plot the ring data in 2D. This will be coloured by the new feature, `z`.\n\n### In the cell below replace:\n#### 1. `` with `x, y`\n#### 2. `` with `z`\n\n#### and then __run the code__." 170 | }, 171 | { 172 | "metadata": { 173 | "trusted": true 174 | }, 175 | "cell_type": "code", 176 | "source": "# Calculate distance from centre for each data point\nring_data_z <- ring_data %>% \nmutate(z = 4 * sqrt(x^2 + y^2))\n\nhead(ring_data_z)\ntail(ring_data_z)\n\n# Plot in 2D first\nring_data_z %>%\n\n###\n# REPLACE WITH x, y and WITH z\n###\nggplot(aes( , colour = )) +\n###\ngeom_point(alpha = 0.75) +\nlabs(title = \"Two ring data coloured by distance from centre\") +\ntheme(plot.title = element_text(hjust = 0.5))", 177 | "execution_count": null, 178 | "outputs": [] 179 | }, 180 | { 181 | "metadata": {}, 182 | "cell_type": "markdown", 183 | "source": "Now let's plot all three features `x, y, z` coloured by the feature `z` in 3D using the `plotly` package.\n\n#### Run the code below." 184 | }, 185 | { 186 | "metadata": { 187 | "trusted": true 188 | }, 189 | "cell_type": "code", 190 | "source": "plot_ly(ring_data_z, x = ~x, y = ~y, z = ~z, color = ~z) %>% \nadd_markers(opacity = 0.25)", 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "metadata": {}, 196 | "cell_type": "markdown", 197 | "source": "How does k-means clustering deal with our ring dataset now that it has 3 features, and 2 clusters?\n\n### In the cell below replace:\n#### 1. `` with `ring_data_z`\n#### 2. 
`` with `2`\n\n#### and then __run the code__." 198 | }, 199 | { 200 | "metadata": { 201 | "trusted": true 202 | }, 203 | "cell_type": "code", 204 | "source": "###\n# REPLACE WITH ring_data_z AND WITH 2\n###\ncluster_kvalue(, )\n###", 205 | "execution_count": null, 206 | "outputs": [] 207 | }, 208 | { 209 | "metadata": {}, 210 | "cell_type": "markdown", 211 | "source": "Looks good! When we add a third feature `z` to our two ring data set, k-means clustering can better discern the cluster membership.\n\nStep 5\n---\n\nSome data cannot be manipulated like that. Let's have a look at a different type of data distribution, spirals.\n\nWe will create a data set in the shape of spirals using the function `mlbench.spirals` from the package `mlbench`.\n\n#### Replace `` with `mlbench.spirals` and run the code." 212 | }, 213 | { 214 | "metadata": { 215 | "trusted": true 216 | }, 217 | "cell_type": "code", 218 | "source": "# Set the seed to reproduce the random data set\nset.seed(123)\n\n###\n# REPLACE WITH mlbench.spirals\n###\nspiral_data <- (n = 500, cycles = 1, sd = 0.025)\n###\n\n# Save `spiral_data` to a data frame to allow plotting\nspiral_data <- data.frame(x = spiral_data$x[, 1], y = spiral_data$x[, 2], classes = spiral_data$classes)\n\n# Create scatter plot of the data\nspiral_data %>% \nggplot(aes(x, y)) +\ngeom_point(alpha = 0.75) +\nlabs(title = \"Spiral data set\", colour = \"Spiral\\nnumber\") +\ntheme(plot.title = element_text(hjust = 0.5))", 219 | "execution_count": null, 220 | "outputs": [] 221 | }, 222 | { 223 | "metadata": {}, 224 | "cell_type": "markdown", 225 | "source": "Let's try running k-means clustering on `spiral_data` using our custom function.\n\n**In the code below, add the appropriate arguments to `cluster_kvalue` and press Run.**" 226 | }, 227 | { 228 | "metadata": { 229 | "trusted": true 230 | }, 231 | "cell_type": "code", 232 | "source": "###\n# REPLACE WITH spiral_data AND WITH 2\n###\ncluster_kvalue(, )\n###", 233 | "execution_count": null, 234 | "outputs": [] 235 | }, 236 | { 237 | "metadata": {}, 238 | "cell_type": "markdown", 239 | "source": "Again, k-means clustering is facing a similar issue to the one it faced with the two ring data. But k-means clustering is just one method for clustering; other clustering methods may be more suitable to partition spiral data appropriately.\n\nStep 6\n---\n\n**Spectral clustering** is a clustering method that aims to cluster data that is in some way connected, so that samples in the same group are similar, and samples in different groups are dissimilar to each other.\n\nWe will run spectral analysis using the `specc` function. We will set the number of centers to 2, since we expect two groups where the samples in each group belong to a different spiral. Our dataset is the `spiral_data` we have been using previously.\n\n#### Replace `` with `specc` and run the code." 240 | }, 241 | { 242 | "metadata": { 243 | "trusted": true 244 | }, 245 | "cell_type": "code", 246 | "source": "###\n# REPLACE WITH specc\n###\nspiral_data_specc <- ?(as.matrix(select(spiral_data, -classes)), centers = 2)\n###\nspiral_data %>% \nmutate(Specc = spiral_data_specc@.Data) %>%\nggplot(aes(x, y, colour = as.factor(Specc))) +\ngeom_point(alpha = 0.75) +\nlabs(title = \"Spectral clustering of spiral data\", colour = \"Spectral\\ncluster\") +\ntheme(plot.title = element_text(hjust = 0.5))", 247 | "execution_count": null, 248 | "outputs": [] 249 | }, 250 | { 251 | "metadata": {}, 252 | "cell_type": "markdown", 253 | "source": "Excellent! 
250 | { 251 | "metadata": {}, 252 | "cell_type": "markdown", 253 | "source": "Excellent! Spectral clustering works for the spiral data.\n\nLet's see how spectral clustering performs on our previous data set, the two ring data, using just the two features, the x and y co-ordinates.\n\n### In the cell below replace:\n#### 1. the blank inside `as.matrix()` with `ring_data`\n#### 2. the blank after `centers =` with `2`\n#### then __run the code__." 254 | }, 255 | { 256 | "metadata": { 257 | "trusted": true 258 | }, 259 | "cell_type": "code", 260 | "source": "# Use the spectral clustering algorithm on ring_data\nhead(ring_data)\nclass(ring_data)\n\n###\n# REPLACE THE FIRST BLANK WITH ring_data AND THE SECOND BLANK WITH 2\n###\nring_data_specc <- specc(as.matrix(), centers = )\n###\n\nring_data %>% \nmutate(Specc = ring_data_specc@.Data) %>%\nggplot(aes(x, y, colour = as.factor(Specc))) +\ngeom_point(alpha = 0.75) +\nlabs(title = \"Spectral clustering of two ring data\", colour = \"Spectral\\ncluster\") +\ntheme(plot.title = element_text(hjust = 0.5))", 261 | "execution_count": null, 262 | "outputs": [] 263 | }, 264 | { 265 | "metadata": {}, 266 | "cell_type": "markdown", 267 | "source": "Does spectral clustering classify the two ring data into the correct clusters?\n\n## Conclusion\n\nWe have learnt two important clustering methods, *k-means clustering* and *spectral clustering*, and used them on a variety of data sets. Remember, one clustering method might be more appropriate for a data set than another, especially straight out of the box; sometimes a transformation of the data, like the distance feature we added to the ring data, is what allows a clustering method to work." 268 | } 269 | ], 270 | "metadata": { 271 | "kernelspec": { 272 | "name": "r", 273 | "display_name": "R", 274 | "language": "R" 275 | }, 276 | "language_info": { 277 | "mimetype": "text/x-r-source", 278 | "name": "R", 279 | "pygments_lexer": "r", 280 | "version": "3.4.1", 281 | "file_extension": ".r", 282 | "codemirror_mode": "r" 283 | } 284 | }, 285 | "nbformat": 4, 286 | "nbformat_minor": 2 287 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /Data/chocolate data multiple linear regression.txt: -------------------------------------------------------------------------------- 1 | weight cocoa_percent cost customer_happiness 2 | 247 0.11 0.25 29 3 | 192 0.82 10.44 29 4 | 106 0.01 0 6 5 | 78 0.04 0.01 4 6 | 213 0.39 2.56 30 7 | 188 0.05 0.04 19 8 | 190 0.38 2.23 28 9 | 154 0.45 2.48 24 10 | 79 0.38 0.89 13 11 | 165 0.36 1.68 24 12 | 175 0.84 9.77 28 13 | 206 0.79 10.41 30 14 | 227 0.26 1.23 30 15 | 204 0.96 15.12 27 16 | 148 0.17 0.34 17 17 | 60 0.8 3.07 22 18 | 200 0.86 11.9 28 19 | 148 0.63 4.64 26 20 | 225 0.19 0.68 28 21 | 61 0.02 0 0 22 | 174 0.85 10.01 23 23 | 93 0.72 3.82 18 24 | 184 0.85 10.72 24 25 | 243 0.95 17.49 23 26 | 234 0.81 12.37 27 27 | 92 0.4 1.16 11 28 | 239 0.43 3.58 29 29 | 101 0.66 3.49 18 30 | 142 0.32 1.16 16 31 | 151 0.45 2.42 20 32 | 157 0.4 2.05 19 33 | 219 0.21 0.8 23 34 | 109 0.23 0.47 9 35 | 80 0.89 5.02 21 36 | 173 0.68 6.48 24 37 | 118 0.23 0.5 11 38 | 162 0.53 3.6 22 39 | 162 0.08 0.09 12 40 | 222 0.22 0.87 24 41 | 194 0.46 3.27 25 42 | 61 0.18 0.16 1 43 | 157 0.55 3.74 21 44 | 101 0.85 5.85 21 45 | 55 0.84 3.08 18 46 | 213 0.35 2.13 26 47 | 244 0.08 0.13 24 48 | 239 0.05 0.05 22 49 | 119 0.68 4.44 20 50 | 162 0.29 1.1 18 51 | 89 0.24 0.42 7 52 | 131 0.37 1.41 16 53 | 156 0.61 4.62 22 54 | 108 0.73 4.64 20 55 | 95 1 7.57 22 56 | 147 0.76 6.72 22 57 | 127 0.15 0.23 9 58 | 149 0.46 2.54 20 59 | 103 0.94 7.33 21 60 | 121 0.76 5.65 21 61 | 193 0.88 11.85 24 62 | 117 0.85 6.79 21 63 | 203 0.59 5.62 26 64 | 110 0.73 4.73 20 65 | 135 0.99 10.56 22 66 | 90 0.54 2.1 15 67 | 209 0.52 4.53 27 68 | 194 0.55 4.78 26 69 | 73 0.95 5.22 21 70 | 132 0.59 3.72 20 71 | 166 0.04 0.02 11 72 | 214 0.58 5.8 27 73 | 122 0.64 4.05 19 74 | 64 0.19 0.18 2 75 | 205 0.62 6.3 26 76 | 55 0.17 0.13 0 77 | 208 0.31 1.61 22 78 | 216 0.4 2.7 24 79 | 149 0.41 1.96 16 80 | 128 0.73 5.53 19 81 | 73 1 5.81 20 82 | 61 0.29 0.42 3 83 | 159 0.19 0.47 13 84 | 166 0.24 0.76 15 85 | 178 0.95 12.98 21 86 | 116 0.02 0 1 87 | 74 0.32 0.62 5 88 | 133 0.94 9.42 20 89 | 208 0.57 5.41 25 90 | 137 0.34 1.23 13 91 | 107 0.02 0 0 92 | 203 0.17 0.46 6 93 | 182 0.45 2.98 9 94 | 189 0.23 0.83 6 95 | 149 0.21 0.53 0 96 | 243 0.46 4.17 10 97 | 204 0.73 8.77 7 98 | 224 0.05 0.04 0 99 | 170 0.52 3.67 1 100 | 216 0.66 7.46 5 101 | 233 0.07 0.09 0 -------------------------------------------------------------------------------- /Data/chocolate data.txt: -------------------------------------------------------------------------------- 1 | weight cocoa_percent sugar_percent milk_percent customer_happiness 2 | 185 65 11 24 47 3 | 247 44 34 22 55 4 | 133 33 21 47 35 5 | 145 30 38 32 34 6 | 110 22 70 7 40 7 | 134 25 38 37 40 8 | 196 18 34 48 41 9 | 118 45 38 17 38 10 | 235 45 12 43 50 11 | 107 8 2 90 25 12 | 106 10 72 18 22 13 | 151 42 20 38 47 14 | 144 18 36 46 26 15 | 160 39 40 21 47 16 | 59 17 47 37 22 17 | 221 39 28 33 46 18 | 81 57 32 10 48 19 | 135 19 36 45 27 20 | 85 43 20 37 28 21 | 246 35 43 22 52 22 | 150 9 49 42 31 23 | 124 49 32 20 43 24 | 216 11 12 76 49 25 | 227 15 26 60 43 26 | 105 13 61 26 27 27 | 27 61 9 30 30 28 | 225 48 2 51 58 29 | 50 43 29 28 23 30 | 156 53 44 4 
40 31 | 155 76 24 0 54 32 | 67 12 44 44 21 33 | 41 26 53 21 34 34 | 204 46 31 22 61 35 | 211 35 41 23 58 36 | 208 14 57 29 47 37 | 162 43 41 15 39 38 | 15 40 13 47 32 39 | 133 23 28 49 28 40 | 76 36 28 37 40 41 | 18 76 13 11 30 42 | 131 61 23 16 39 43 | 231 34 14 52 49 44 | 66 9 48 43 15 45 | 242 41 42 17 47 46 | 79 0 54 46 17 47 | 51 15 44 41 24 48 | 90 26 72 2 24 49 | 1 3 33 65 7 50 | 67 19 44 37 16 51 | 176 50 21 29 41 52 | 46 64 24 12 47 53 | 81 24 53 24 19 54 | 250 32 40 28 44 55 | 240 48 8 44 56 56 | 175 30 38 32 35 57 | 11 11 72 17 11 58 | 58 29 64 7 26 59 | 234 48 11 41 52 60 | 95 11 57 32 32 61 | 97 31 54 15 23 62 | 32 79 3 18 35 63 | 234 41 25 34 51 64 | 237 71 20 9 70 65 | 116 47 38 15 35 66 | 31 12 75 13 13 67 | 83 2 33 65 16 68 | 192 30 68 2 41 69 | 1 24 29 47 10 70 | 65 42 16 42 37 71 | 22 43 32 25 31 72 | 140 44 5 51 51 73 | 212 38 55 6 62 74 | 67 13 72 15 14 75 | 112 24 59 18 26 76 | 234 42 41 17 56 77 | 233 21 28 51 60 78 | 2 28 10 62 12 79 | 81 37 41 22 35 80 | 222 90 5 5 60 81 | 223 39 21 40 54 82 | 181 9 40 51 52 83 | 34 46 28 26 22 84 | 132 19 45 36 29 85 | 103 39 23 38 41 86 | 45 35 36 29 18 87 | 40 13 52 35 22 88 | 216 12 73 15 41 89 | 107 41 23 36 32 90 | 42 47 33 20 31 91 | 165 44 30 26 52 92 | 33 53 3 45 34 93 | 162 5 56 39 36 94 | 249 4 14 82 52 95 | 144 79 3 18 45 96 | 215 3 20 78 41 97 | 194 0 12 88 29 98 | 95 34 20 46 26 99 | 44 29 40 31 19 100 | 218 46 25 28 49 101 | 169 88 1 12 57 102 | -------------------------------------------------------------------------------- /Data/dog_data.csv: -------------------------------------------------------------------------------- 1 | age,weight,height,breed 2 | 9.47E+00,6.20E+00,6.80E+00,1 3 | 7.97E+00,8.63E+00,8.92E+00,0 4 | 9.51E+00,6.40E+00,5.78E+00,1 5 | 8.96E+00,8.82E+00,6.28E+00,2 6 | 8.37E+00,3.89E+00,5.62E+00,1 7 | 9.46E+00,9.56E+00,5.77E+00,2 8 | 1.04E+01,1.10E+01,7.78E+00,0 9 | 9.08E+00,7.10E+00,5.79E+00,1 10 | 9.53E+00,9.29E+00,5.03E+00,2 11 | 8.57E+00,5.09E+00,4.05E+00,1 12 | 8.77E+00,6.17E+00,4.89E+00,1 13 | 8.63E+00,3.92E+00,4.62E+00,1 14 | 7.67E+00,7.54E+00,4.90E+00,2 15 | 8.25E+00,7.94E+00,4.75E+00,2 16 | 8.54E+00,8.73E+00,9.10E+00,0 17 | 9.22E+00,8.47E+00,8.51E+00,0 18 | 8.28E+00,8.86E+00,5.12E+00,2 19 | 8.52E+00,4.98E+00,4.74E+00,1 20 | 9.82E+00,8.66E+00,8.92E+00,0 21 | 9.71E+00,5.82E+00,4.57E+00,1 22 | 8.34E+00,5.68E+00,5.49E+00,1 23 | 8.51E+00,8.01E+00,9.13E+00,2 24 | 7.19E+00,7.98E+00,2.78E+00,2 25 | 8.53E+00,9.33E+00,5.32E+00,2 26 | 9.49E+00,9.29E+00,9.14E+00,0 27 | 1.05E+01,1.06E+01,6.68E+00,2 28 | 8.90E+00,7.16E+00,1.01E+01,0 29 | 9.06E+00,4.01E+00,5.37E+00,1 30 | 9.76E+00,6.54E+00,5.01E+00,1 31 | 8.42E+00,3.38E+00,4.16E+00,1 32 | 8.65E+00,2.78E+00,5.80E+00,1 33 | 9.12E+00,5.26E+00,5.87E+00,1 34 | 9.97E+00,1.05E+01,6.36E+00,2 35 | 9.02E+00,8.54E+00,9.15E+00,0 36 | 8.34E+00,6.47E+00,8.31E+00,0 37 | 8.98E+00,5.44E+00,4.89E+00,1 38 | 8.55E+00,8.07E+00,1.02E+01,0 39 | 9.54E+00,1.03E+01,7.62E+00,0 40 | 8.38E+00,4.50E+00,3.79E+00,1 41 | 8.11E+00,8.52E+00,8.57E+00,0 42 | 8.01E+00,8.40E+00,1.07E+01,0 43 | 8.14E+00,7.33E+00,9.01E+00,0 44 | 7.72E+00,8.81E+00,8.34E+00,0 45 | 9.57E+00,8.51E+00,9.32E+00,0 46 | 9.52E+00,9.32E+00,5.19E+00,2 47 | 7.63E+00,7.79E+00,3.97E+00,2 48 | 8.30E+00,8.63E+00,7.75E+00,0 49 | 1.05E+01,1.04E+01,6.51E+00,2 50 | 1.02E+01,9.13E+00,8.06E+00,0 51 | 8.18E+00,4.05E+00,6.22E+00,1 52 | 8.27E+00,8.49E+00,5.44E+00,2 53 | 8.90E+00,9.29E+00,5.22E+00,2 54 | 9.58E+00,9.08E+00,4.58E+00,2 55 | 8.97E+00,4.78E+00,4.93E+00,1 56 | 1.07E+01,1.06E+01,8.65E+00,0 57 | 9.59E+00,5.10E+00,4.93E+00,1 58 | 
9.55E+00,6.27E+00,5.41E+00,1 59 | 8.16E+00,7.70E+00,1.11E+01,0 60 | 1.06E+01,1.01E+01,9.82E+00,0 61 | 9.31E+00,4.98E+00,5.40E+00,1 62 | 8.24E+00,7.84E+00,9.13E+00,0 63 | 9.67E+00,9.99E+00,4.78E+00,2 64 | 9.54E+00,1.08E+01,8.42E+00,0 65 | 9.19E+00,7.52E+00,1.04E+01,0 66 | 8.86E+00,8.69E+00,6.01E+00,2 67 | 7.96E+00,6.26E+00,5.51E+00,1 68 | 1.13E+01,1.06E+01,8.73E+00,0 69 | 8.71E+00,6.21E+00,6.66E+00,1 70 | 1.19E+01,1.14E+01,6.02E+00,2 71 | 8.77E+00,5.61E+00,5.22E+00,1 72 | 8.09E+00,8.37E+00,4.10E+00,2 73 | 8.10E+00,8.05E+00,8.51E+00,0 74 | 8.84E+00,9.21E+00,5.38E+00,2 75 | 8.12E+00,3.23E+00,2.21E+00,1 76 | 9.14E+00,8.98E+00,4.96E+00,2 77 | 1.20E+01,1.18E+01,7.09E+00,2 78 | 8.67E+00,8.22E+00,3.54E+00,2 79 | 9.04E+00,1.01E+01,9.34E+00,0 80 | 9.27E+00,8.58E+00,8.30E+00,0 81 | 1.07E+01,9.59E+00,9.66E+00,0 82 | 9.10E+00,9.51E+00,9.18E+00,0 83 | 8.30E+00,8.36E+00,9.00E+00,0 84 | 1.03E+01,1.17E+01,7.35E+00,0 85 | 8.27E+00,7.69E+00,3.44E+00,2 86 | 8.70E+00,8.60E+00,5.31E+00,2 87 | 8.10E+00,4.51E+00,4.19E+00,1 88 | 9.17E+00,8.35E+00,9.87E+00,0 89 | 8.64E+00,8.85E+00,6.20E+00,2 90 | 8.20E+00,8.37E+00,4.14E+00,2 91 | 6.90E+00,6.87E+00,2.67E+00,2 92 | 7.30E+00,7.36E+00,2.74E+00,2 93 | 1.01E+01,6.19E+00,5.54E+00,1 94 | 9.67E+00,1.02E+01,6.13E+00,2 95 | 8.28E+00,8.56E+00,1.02E+01,0 96 | 9.93E+00,9.29E+00,8.54E+00,0 97 | 1.04E+01,1.05E+01,6.94E+00,2 98 | 8.06E+00,7.52E+00,1.04E+01,0 99 | 1.03E+01,5.82E+00,4.62E+00,1 100 | 7.68E+00,8.26E+00,1.01E+01,0 101 | 9.71E+00,9.50E+00,5.56E+00,2 102 | 9.82E+00,4.99E+00,5.14E+00,1 103 | 8.98E+00,9.47E+00,4.22E+00,2 104 | 7.62E+00,4.42E+00,5.05E+00,1 105 | 7.54E+00,8.87E+00,8.99E+00,0 106 | 8.65E+00,6.18E+00,4.55E+00,1 107 | 7.83E+00,7.86E+00,4.99E+00,2 108 | 8.17E+00,5.29E+00,5.17E+00,1 109 | 9.13E+00,9.19E+00,1.05E+01,0 110 | 8.96E+00,4.09E+00,3.45E+00,1 111 | 1.00E+01,1.03E+01,6.87E+00,2 112 | 8.78E+00,4.21E+00,3.11E+00,1 113 | 7.12E+00,7.93E+00,1.01E+01,0 114 | 8.67E+00,9.00E+00,9.38E+00,0 115 | 9.40E+00,6.34E+00,5.17E+00,1 116 | 8.53E+00,4.10E+00,3.87E+00,1 117 | 1.06E+01,1.08E+01,6.48E+00,2 118 | 8.76E+00,8.48E+00,4.46E+00,2 119 | 9.51E+00,9.40E+00,5.03E+00,2 120 | 9.18E+00,4.23E+00,5.77E+00,1 121 | 1.03E+01,9.66E+00,6.60E+00,2 122 | 8.96E+00,8.75E+00,4.96E+00,2 123 | 8.43E+00,8.11E+00,8.75E+00,0 124 | 8.37E+00,8.74E+00,5.70E+00,2 125 | 9.06E+00,5.17E+00,4.72E+00,1 126 | 1.11E+01,1.11E+01,6.91E+00,2 127 | 7.32E+00,7.54E+00,9.31E+00,0 128 | 8.92E+00,9.39E+00,5.48E+00,2 129 | 8.82E+00,4.71E+00,5.26E+00,1 130 | 7.77E+00,7.99E+00,3.44E+00,2 131 | 8.10E+00,8.37E+00,9.02E+00,0 132 | 8.48E+00,8.78E+00,4.96E+00,2 133 | 8.29E+00,8.05E+00,1.06E+01,0 134 | 9.72E+00,6.69E+00,4.87E+00,1 135 | 1.02E+01,6.34E+00,4.95E+00,1 136 | 9.60E+00,7.27E+00,6.92E+00,1 137 | 8.50E+00,7.83E+00,8.60E+00,0 138 | 8.95E+00,8.89E+00,9.44E+00,0 139 | 8.45E+00,9.00E+00,8.75E+00,0 140 | 1.12E+01,1.18E+01,7.13E+00,0 141 | 8.37E+00,2.08E+00,4.73E+00,1 142 | 9.30E+00,6.46E+00,5.95E+00,1 143 | 8.96E+00,9.07E+00,5.08E+00,2 144 | 9.51E+00,9.77E+00,9.19E+00,0 145 | 8.84E+00,9.42E+00,6.13E+00,2 146 | 1.06E+01,9.96E+00,5.77E+00,2 147 | 9.16E+00,8.96E+00,6.41E+00,2 148 | 8.17E+00,4.25E+00,4.59E+00,1 149 | 7.92E+00,8.90E+00,9.10E+00,0 150 | 8.91E+00,3.64E+00,5.13E+00,1 151 | 8.98E+00,5.03E+00,4.29E+00,1 152 | 9.15E+00,4.39E+00,5.62E+00,1 153 | 9.40E+00,9.52E+00,4.84E+00,2 154 | 7.99E+00,7.93E+00,3.98E+00,2 155 | 9.24E+00,9.99E+00,8.83E+00,0 156 | 9.87E+00,1.05E+01,7.99E+00,0 157 | 9.30E+00,3.72E+00,4.02E+00,1 158 | 9.39E+00,5.08E+00,4.51E+00,1 159 | 9.02E+00,8.60E+00,8.35E+00,0 160 | 
9.36E+00,6.89E+00,6.13E+00,1 161 | 9.23E+00,1.03E+01,1.01E+01,0 162 | 8.96E+00,4.15E+00,4.88E+00,1 163 | 8.64E+00,8.22E+00,9.16E+00,0 164 | 1.08E+01,1.06E+01,6.74E+00,2 165 | 1.05E+01,1.02E+01,9.06E+00,0 166 | 8.10E+00,9.20E+00,5.62E+00,2 167 | 8.77E+00,8.67E+00,4.10E+00,2 168 | 8.63E+00,8.09E+00,4.86E+00,2 169 | 9.20E+00,9.53E+00,6.55E+00,2 170 | 7.68E+00,7.59E+00,9.77E+00,0 171 | 1.09E+01,1.06E+01,6.38E+00,2 172 | 8.38E+00,8.69E+00,5.06E+00,2 173 | 8.30E+00,8.34E+00,9.32E+00,0 174 | 8.99E+00,4.19E+00,5.63E+00,1 175 | 8.74E+00,8.71E+00,5.55E+00,2 176 | 8.50E+00,3.55E+00,5.81E+00,1 177 | 9.87E+00,9.56E+00,7.98E+00,0 178 | 6.82E+00,6.46E+00,2.48E+00,2 179 | 8.70E+00,2.93E+00,4.36E+00,1 180 | 1.01E+01,6.75E+00,5.12E+00,1 181 | 1.18E+01,1.21E+01,8.03E+00,2 182 | 9.40E+00,5.21E+00,6.14E+00,1 183 | 7.70E+00,8.80E+00,9.61E+00,0 184 | 8.92E+00,5.24E+00,4.35E+00,1 185 | 9.59E+00,4.62E+00,6.07E+00,1 186 | 9.60E+00,5.42E+00,7.63E+00,1 187 | 9.53E+00,9.59E+00,5.81E+00,2 188 | 9.32E+00,9.32E+00,6.27E+00,2 189 | 1.02E+01,1.01E+01,8.69E+00,1 190 | 9.00E+00,9.68E+00,6.44E+00,2 191 | 1.21E+01,1.11E+01,6.95E+00,0 192 | 1.04E+01,4.64E+00,6.01E+00,1 193 | 7.50E+00,7.72E+00,9.99E+00,0 194 | 9.49E+00,4.28E+00,6.47E+00,1 195 | 8.95E+00,5.77E+00,6.43E+00,1 196 | 8.90E+00,4.67E+00,2.88E+00,1 197 | 1.02E+01,9.90E+00,5.02E+00,2 198 | 7.77E+00,3.96E+00,4.17E+00,1 199 | 9.38E+00,6.98E+00,5.89E+00,1 200 | 9.00E+00,9.06E+00,9.59E+00,0 201 | 8.26E+00,8.85E+00,1.02E+01,0 202 | -------------------------------------------------------------------------------- /Data/football data.txt: -------------------------------------------------------------------------------- 1 | average_goals_per_match won_competition 2 | 2.422870462 0 3 | 2.824477516 0 4 | 0.571688038 0 5 | 1.055027667 0 6 | 0.394192269 0 7 | 0.754099232 0 8 | 0.962959667 1 9 | 1.994727613 0 10 | 0.456755473 0 11 | 0.525435057 0 12 | 1.891407683 0 13 | 1.018292157 0 14 | 2.641061388 1 15 | 1.081919124 0 16 | 1.584087989 0 17 | 1.587817681 0 18 | 2.459575476 1 19 | 1.170237541 0 20 | 2.821653731 1 21 | 2.05399727 0 22 | 0.451411638 0 23 | 1.486331674 0 24 | 2.023428035 0 25 | 2.843421156 1 26 | 2.18239352 0 27 | 2.508448909 1 28 | 1.514173157 0 29 | 1.460308 0 30 | 1.779336362 0 31 | 2.258397839 0 32 | 2.607770127 1 33 | 0.09945028 0 34 | 2.35292296 1 35 | 2.732122873 1 36 | 2.8009988 1 37 | 1.375195574 0 38 | 0.971946125 0 39 | 2.558268873 1 40 | 2.565131087 1 41 | 1.32548955 0 42 | 2.334994306 0 43 | 0.277073998 0 44 | 0.261170366 0 45 | 2.486403854 1 46 | 2.969708759 1 47 | 1.156309517 0 48 | 2.84905351 1 49 | 2.43635455 1 50 | 0.754363317 0 51 | 2.742626634 1 52 | 1.224594963 0 53 | 0.430516008 0 54 | 0.887394082 0 55 | 0.208938758 0 56 | 1.520957714 0 57 | 1.163998189 0 58 | 2.81519393 1 59 | 2.866950623 1 60 | 0.35201577 0 61 | 1.00306916 0 62 | 0.089511949 0 63 | 2.540442771 1 64 | 0.547311147 0 65 | 1.79477098 0 66 | 1.48085737 0 67 | 2.31644846 1 68 | 0.763619073 0 69 | 0.143390622 0 70 | 0.193967012 0 71 | 0.381415979 0 72 | 2.632311728 1 73 | 1.470286787 0 74 | 0.498301326 0 75 | 0.819245999 0 76 | 1.869586655 0 77 | 2.132291437 0 78 | 0.145980681 0 79 | 1.253342554 0 80 | 1.647209079 0 81 | 0.926896356 0 82 | 2.282348031 1 83 | 0.012578271 0 84 | 0.455925289 0 85 | 2.680673352 1 86 | 1.985915665 0 87 | 0.085881894 0 88 | 2.689432655 1 89 | 2.508641 1 90 | 2.45187147 1 91 | 1.238382828 0 92 | 2.284923233 1 93 | 0.101472759 0 94 | 0.533641322 0 95 | 1.73402536 0 96 | 2.180275628 0 97 | 2.642701936 1 98 | 0.388674174 0 99 | 2.226984467 0 100 | 1.045053442 0 
101 | 2.216692543 0 -------------------------------------------------------------------------------- /Data/football_data.csv: -------------------------------------------------------------------------------- 1 | average_goals_per_match,won_competition 2 | 2.422870462,1 3 | 2.824477516,1 4 | 0.571688038,0 5 | 1.055027667,0 6 | 0.394192269,0 7 | 0.754099232,0 8 | 0.962959667,0 9 | 1.994727613,0 10 | 0.456755473,0 11 | 0.525435057,0 12 | 1.891407683,0 13 | 1.018292157,0 14 | 2.641061388,1 15 | 1.081919124,0 16 | 1.584087989,0 17 | 1.587817681,0 18 | 2.459575476,1 19 | 1.170237541,0 20 | 2.821653731,1 21 | 2.05399727,0 22 | 0.451411638,0 23 | 1.486331674,0 24 | 2.023428035,0 25 | 2.843421156,1 26 | 2.18239352,0 27 | 2.508448909,1 28 | 1.514173157,0 29 | 1.460308,0 30 | 1.779336362,0 31 | 2.258397839,0 32 | 2.607770127,1 33 | 0.09945028,0 34 | 2.35292296,1 35 | 2.732122873,1 36 | 2.8009988,1 37 | 1.375195574,0 38 | 0.971946125,0 39 | 2.558268873,1 40 | 2.565131087,1 41 | 1.32548955,0 42 | 2.334994306,0 43 | 0.277073998,0 44 | 0.261170366,0 45 | 2.486403854,1 46 | 2.969708759,1 47 | 1.156309517,0 48 | 2.84905351,1 49 | 2.43635455,1 50 | 0.754363317,0 51 | 2.742626634,1 52 | 1.224594963,0 53 | 0.430516008,0 54 | 0.887394082,0 55 | 0.208938758,0 56 | 1.520957714,0 57 | 1.163998189,0 58 | 2.81519393,1 59 | 2.866950623,1 60 | 0.35201577,0 61 | 1.00306916,0 62 | 0.089511949,0 63 | 2.540442771,1 64 | 0.547311147,0 65 | 1.79477098,0 66 | 1.48085737,0 67 | 2.31644846,1 68 | 0.763619073,0 69 | 0.143390622,0 70 | 0.193967012,0 71 | 0.381415979,0 72 | 2.632311728,1 73 | 1.470286787,0 74 | 0.498301326,0 75 | 0.819245999,0 76 | 1.869586655,0 77 | 2.132291437,0 78 | 0.145980681,0 79 | 1.253342554,0 80 | 1.647209079,0 81 | 0.926896356,0 82 | 2.282348031,1 83 | 0.012578271,0 84 | 0.455925289,0 85 | 2.680673352,1 86 | 1.985915665,0 87 | 0.085881894,0 88 | 2.689432655,1 89 | 2.508641,1 90 | 2.45187147,1 91 | 1.238382828,0 92 | 2.284923233,1 93 | 0.101472759,0 94 | 0.533641322,0 95 | 1.73402536,0 96 | 2.180275628,0 97 | 2.642701936,1 98 | 0.388674174,0 99 | 2.226984467,0 100 | 1.045053442,0 101 | 2.216692543,0 102 | -------------------------------------------------------------------------------- /Data/traffic_by_hour.csv: -------------------------------------------------------------------------------- 1 | 00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23 2 | 4.360655421386891106e+01,2.471415243168735998e+01,9.302910970866314244e+00,3.694417015169733531e+00,9.324994928878604483e+00,9.837653313714728398e+00,7.960156509601787000e+00,2.129209806855379128e+01,2.771412586402762201e+01,4.670921084052981342e+01,3.911199949015489352e+01,4.742874469954385575e+01,4.345939414008118007e+01,3.904657935639097843e+01,4.171486034093191364e+01,3.813035688692960434e+01,4.277975145346606212e+01,4.130417935632857507e+01,4.949913745519085495e+01,4.356621060502504861e+01,4.333981356792399708e+01,6.409661727874893700e+01,5.958220793355829414e+01,4.281970155221666374e+01 3 | 
4.458483461102591150e+01,1.960434764035231581e+01,9.480831719723372686e+00,1.347690534797838424e+01,1.446522395805932604e+01,6.014082826041636132e+00,2.267967131865366781e+01,1.819289802099414288e+01,2.878376226399169013e+01,4.011397231467096702e+01,4.614933381836575421e+01,4.375361104497339682e+01,4.531261797021947046e+01,3.465456885850920088e+01,5.136445722452378959e+01,3.581937927938652422e+01,5.324305644941320281e+01,4.991026677530656741e+01,4.521989462961121120e+01,5.200261896989397314e+01,5.681758056516863320e+01,6.135913191794563204e+01,5.028792564895855577e+01,4.038354412487803557e+01 4 | 3.320856146056010516e+01,2.958418081618512119e+01,2.720763330588727413e+01,1.124323327934973094e+01,1.222980541796953169e+01,5.072605297463047336e+00,6.111837866899005434e+00,2.617679230349706643e+01,3.524648307585961504e+01,3.822043200812598229e+01,3.090295064127423785e+01,5.046242188322992206e+01,4.186584869132914122e+01,4.362873596980531232e+01,3.773802942037548291e+01,4.210401295001258148e+01,5.464266677003922013e+01,4.965617376126637339e+01,3.477964093080311869e+01,4.530579139639098685e+01,4.181824635262458401e+01,6.114016284549821023e+01,6.144635319762286940e+01,5.881157571704308396e+01 5 | 3.502665485183174354e+01,2.036754951472195785e+01,2.144528489571132468e+01,7.449591532631174573e+00,2.232114991577189400e+00,8.104623220511466997e+00,9.095804670617658516e+00,1.949946259191193576e+01,3.768956690121524389e+01,3.390709349829828056e+01,3.101834889182442012e+01,4.337981435125371377e+01,4.033062481032064994e+01,4.179804092660128845e+01,3.235427385361062136e+01,3.611236563329332228e+01,5.382150804401013033e+01,3.586998953767924547e+01,4.183091027514376492e+01,4.692259473776898204e+01,4.267652627221272610e+01,6.013905435990915294e+01,6.163977217887768489e+01,4.467098795974052194e+01 6 | 4.016319422608915346e+01,1.993632753640963173e+01,1.806648016188032813e+01,1.210993983210695646e+01,1.087853935093216329e+01,9.766026740065232303e+00,1.950476123450128796e+01,1.031387494130530769e+01,2.850912829110281166e+01,3.080974586707072405e+01,3.632650856885130253e+01,4.589394114695814153e+01,3.151274296264309527e+01,3.723943695070546767e+01,3.750943082970208309e+01,5.441648446512536452e+01,3.680134250606158020e+01,4.921699104313118767e+01,4.392759526930144176e+01,4.065717511095378711e+01,4.435037140017104917e+01,5.190988563720762272e+01,6.167439508720916308e+01,4.672716979505646862e+01 7 | 4.916939064389787006e+01,2.445518800786128821e+01,1.239135992063409653e+01,1.070533695787615258e+01,6.511395138985639264e+00,2.178534480043876442e+01,1.925732145248098703e+01,2.327378212446163985e+01,2.966100596772361797e+01,3.460858234351567120e+01,3.867958481988834052e+01,4.825450197089693916e+01,4.458540354744464906e+01,3.356191487925229211e+01,3.939223788729094622e+01,5.470800707845620536e+01,4.804269775089670702e+01,3.668272237596980290e+01,4.784333915301860429e+01,4.587219623494701182e+01,4.163642201315494873e+01,5.404916943574897203e+01,5.370873094384637625e+01,5.547372381993804424e+01 -------------------------------------------------------------------------------- /Data/trees.csv: -------------------------------------------------------------------------------- 1 | leaf_width,leaf_length,trunk_girth,trunk_height,tree_type 2 | 5.13E+00,6.18E+00,8.26E+00,8.74E+00,0 3 | 7.49E+00,4.02E+00,8.07E+00,6.78E+00,0 4 | 9.22E+00,4.16E+00,5.46E+00,8.45E+00,1 5 | 6.98E+00,1.11E+01,6.96E+00,4.06E+00,2 6 | 3.46E+00,5.19E+00,8.72E+00,1.04E+01,0 7 | 4.55E+00,5.15E+00,9.01E+00,9.64E+00,0 8 | 4.95E+00,1.04E+01,6.33E+00,4.49E+00,2 9 | 
7.64E+00,2.58E+00,9.73E+00,7.75E+00,0 10 | 8.69E+00,4.35E+00,4.37E+00,8.82E+00,1 11 | 7.21E+00,3.62E+00,8.71E+00,7.43E+00,0 12 | 6.48E+00,1.15E+01,8.20E+00,3.85E+00,2 13 | 8.52E+00,3.67E+00,5.99E+00,9.70E+00,1 14 | 6.35E+00,8.18E+00,4.50E+00,6.14E+00,2 15 | 6.61E+00,5.29E+00,6.80E+00,6.07E+00,0 16 | 5.70E+00,5.08E+00,8.30E+00,8.10E+00,0 17 | 4.73E+00,7.88E+00,4.03E+00,5.10E+00,2 18 | 8.15E+00,5.08E+00,5.43E+00,1.05E+01,1 19 | 5.42E+00,8.67E+00,4.57E+00,4.57E+00,2 20 | 3.31E+00,5.46E+00,9.85E+00,1.04E+01,0 21 | 6.50E+00,4.39E+00,9.15E+00,8.29E+00,0 22 | 9.31E+00,5.18E+00,5.66E+00,8.93E+00,1 23 | 2.79E+00,9.93E+00,6.56E+00,3.12E+00,2 24 | 5.56E+00,9.99E+00,6.06E+00,4.56E+00,2 25 | 5.87E+00,6.92E+00,2.89E+00,6.25E+00,2 26 | 3.99E+00,6.54E+00,2.91E+00,6.36E+00,2 27 | 5.18E+00,9.02E+00,4.42E+00,5.62E+00,2 28 | 6.07E+00,3.81E+00,9.77E+00,9.44E+00,0 29 | 8.78E+00,4.35E+00,4.44E+00,8.85E+00,1 30 | 4.12E+00,7.14E+00,7.83E+00,8.16E+00,0 31 | 5.10E+00,5.86E+00,8.55E+00,8.37E+00,0 32 | 3.34E+00,9.27E+00,5.25E+00,3.98E+00,2 33 | 4.88E+00,9.19E+00,5.89E+00,4.93E+00,2 34 | 4.70E+00,6.18E+00,9.26E+00,8.09E+00,0 35 | 5.32E+00,9.59E+00,5.23E+00,4.85E+00,2 36 | 9.01E+00,7.84E+00,3.10E+00,9.07E+00,1 37 | 1.01E+01,5.94E+00,6.45E+00,8.39E+00,1 38 | 5.71E+00,6.16E+00,2.24E+00,8.06E+00,2 39 | 3.01E+00,5.78E+00,9.61E+00,1.06E+01,0 40 | 5.56E+00,4.63E+00,9.24E+00,9.44E+00,0 41 | 4.39E+00,7.97E+00,4.49E+00,5.15E+00,2 42 | 3.67E+00,5.17E+00,9.34E+00,9.81E+00,0 43 | 3.68E+00,8.62E+00,5.35E+00,4.48E+00,2 44 | 3.64E+00,6.14E+00,8.99E+00,9.09E+00,0 45 | 8.92E+00,5.78E+00,5.08E+00,9.39E+00,1 46 | 5.84E+00,4.81E+00,8.86E+00,8.05E+00,0 47 | 8.11E+00,9.31E+00,4.70E+00,5.62E+00,2 48 | 5.41E+00,5.81E+00,7.48E+00,7.84E+00,0 49 | 1.00E+01,8.15E+00,4.89E+00,8.69E+00,1 50 | 5.05E+00,8.51E+00,4.89E+00,5.52E+00,2 51 | 2.06E+00,6.52E+00,9.68E+00,1.04E+01,0 52 | 1.10E+01,4.56E+00,7.22E+00,6.97E+00,1 53 | 5.27E+00,4.25E+00,8.19E+00,9.16E+00,0 54 | 4.31E+00,5.11E+00,1.09E+01,1.06E+01,0 55 | 7.48E+00,5.17E+00,7.65E+00,6.13E+00,0 56 | 5.75E+00,4.78E+00,8.19E+00,8.02E+00,0 57 | 3.73E+00,9.85E+00,5.46E+00,4.45E+00,2 58 | 3.03E+00,1.05E+01,6.70E+00,2.78E+00,2 59 | 4.99E+00,9.67E+00,5.29E+00,4.81E+00,2 60 | 3.97E+00,6.25E+00,7.10E+00,8.14E+00,0 61 | 6.12E+00,9.76E+00,5.79E+00,4.45E+00,2 62 | 5.51E+00,4.12E+00,1.01E+01,9.38E+00,0 63 | 8.12E+00,5.01E+00,2.81E+00,9.33E+00,1 64 | 4.37E+00,6.64E+00,7.70E+00,7.72E+00,0 65 | 4.03E+00,7.25E+00,3.44E+00,5.95E+00,2 66 | 1.01E+01,5.55E+00,5.15E+00,7.42E+00,1 67 | 9.20E+00,7.27E+00,4.76E+00,9.44E+00,1 68 | 8.63E+00,5.95E+00,4.65E+00,9.68E+00,1 69 | 6.72E+00,5.02E+00,7.74E+00,7.16E+00,0 70 | 6.23E+00,4.27E+00,8.95E+00,7.94E+00,0 71 | 4.66E+00,5.47E+00,9.49E+00,9.13E+00,0 72 | 1.06E+01,7.05E+00,5.37E+00,7.60E+00,1 73 | 6.90E+00,3.11E+00,8.50E+00,7.86E+00,0 74 | 9.01E+00,3.80E+00,4.98E+00,8.48E+00,1 75 | 8.53E+00,4.52E+00,5.02E+00,9.58E+00,1 76 | 9.40E+00,5.34E+00,5.26E+00,8.64E+00,1 77 | 1.03E+01,3.09E+00,6.88E+00,7.04E+00,1 78 | 1.01E+01,5.69E+00,4.99E+00,7.61E+00,1 79 | 1.01E+01,5.40E+00,5.75E+00,7.94E+00,1 80 | 7.07E+00,3.58E+00,8.95E+00,7.79E+00,0 81 | 7.67E+00,5.31E+00,3.30E+00,1.03E+01,1 82 | 3.92E+00,7.08E+00,3.35E+00,5.54E+00,2 83 | 5.17E+00,9.49E+00,5.81E+00,4.65E+00,2 84 | 4.53E+00,5.58E+00,9.38E+00,9.35E+00,0 85 | 3.79E+00,5.93E+00,8.36E+00,8.99E+00,0 86 | 5.27E+00,8.53E+00,4.45E+00,5.00E+00,2 87 | 4.26E+00,5.84E+00,9.39E+00,9.95E+00,0 88 | 9.05E+00,7.22E+00,3.80E+00,9.33E+00,1 89 | 5.86E+00,9.61E+00,6.11E+00,5.93E+00,2 90 | 4.22E+00,5.76E+00,9.20E+00,1.00E+01,0 91 | 
9.61E+00,6.40E+00,4.47E+00,8.33E+00,1 92 | 6.20E+00,8.41E+00,4.77E+00,5.19E+00,2 93 | 5.94E+00,4.02E+00,8.40E+00,8.65E+00,0 94 | 7.70E+00,4.67E+00,3.81E+00,1.02E+01,1 95 | 5.00E+00,5.62E+00,7.93E+00,8.02E+00,0 96 | 5.46E+00,4.58E+00,1.08E+01,1.02E+01,0 97 | 5.55E+00,8.15E+00,3.50E+00,5.31E+00,0 98 | 5.92E+00,9.84E+00,6.16E+00,5.53E+00,2 99 | 4.87E+00,8.81E+00,4.87E+00,4.72E+00,2 100 | 6.61E+00,7.36E+00,2.62E+00,6.53E+00,2 101 | 2.83E+00,5.99E+00,8.54E+00,9.73E+00,0 102 | 4.18E+00,5.00E+00,8.57E+00,9.30E+00,0 103 | 2.54E+00,9.60E+00,6.12E+00,3.65E+00,2 104 | 5.28E+00,7.55E+00,2.97E+00,6.02E+00,2 105 | 5.08E+00,8.39E+00,3.93E+00,5.24E+00,2 106 | 5.92E+00,4.94E+00,7.95E+00,8.02E+00,0 107 | 5.84E+00,8.72E+00,4.61E+00,5.86E+00,2 108 | 5.32E+00,8.29E+00,4.14E+00,5.12E+00,2 109 | 5.88E+00,1.04E+01,6.83E+00,4.06E+00,2 110 | 6.59E+00,8.95E+00,4.38E+00,4.97E+00,2 111 | 1.02E+01,5.59E+00,5.44E+00,7.76E+00,1 112 | 5.58E+00,8.32E+00,4.88E+00,5.17E+00,2 113 | 5.49E+00,8.55E+00,4.29E+00,6.14E+00,2 114 | 5.08E+00,5.57E+00,7.69E+00,8.07E+00,0 115 | 5.19E+00,4.69E+00,9.93E+00,9.36E+00,0 116 | 8.47E+00,3.43E+00,4.63E+00,9.08E+00,1 117 | 7.81E+00,3.76E+00,3.88E+00,9.86E+00,1 118 | 8.01E+00,4.20E+00,3.12E+00,9.29E+00,1 119 | 9.27E+00,4.56E+00,5.31E+00,8.65E+00,1 120 | 5.45E+00,9.25E+00,5.36E+00,5.48E+00,2 121 | 4.64E+00,9.64E+00,5.61E+00,4.85E+00,2 122 | 9.69E+00,3.71E+00,6.06E+00,8.01E+00,1 123 | 5.85E+00,4.81E+00,8.57E+00,7.75E+00,0 124 | 5.83E+00,9.24E+00,4.72E+00,4.79E+00,2 125 | 5.46E+00,5.08E+00,8.39E+00,8.13E+00,0 126 | 3.57E+00,5.34E+00,1.04E+01,1.03E+01,0 127 | 6.85E+00,1.11E+00,4.48E+00,1.06E+01,1 128 | 1.01E+01,3.93E+00,7.03E+00,7.87E+00,1 129 | 8.21E+00,4.32E+00,5.20E+00,1.01E+01,1 130 | 9.67E+00,5.02E+00,7.91E+00,9.23E+00,1 131 | 1.08E+01,5.49E+00,7.08E+00,7.28E+00,1 132 | 1.01E+01,5.75E+00,6.46E+00,8.39E+00,1 133 | 8.80E+00,3.99E+00,4.87E+00,8.84E+00,0 134 | 9.31E+00,5.24E+00,6.48E+00,9.25E+00,1 135 | 5.56E+00,9.08E+00,5.01E+00,4.57E+00,2 136 | 5.65E+00,4.48E+00,9.20E+00,8.85E+00,0 137 | 8.75E+00,4.16E+00,4.94E+00,9.11E+00,1 138 | 1.04E+01,4.94E+00,6.81E+00,7.56E+00,1 139 | 8.67E+00,2.93E+00,6.75E+00,9.51E+00,1 140 | 8.44E+00,4.18E+00,4.45E+00,9.18E+00,1 141 | 9.27E+00,5.37E+00,6.06E+00,9.23E+00,1 142 | 1.04E+01,5.92E+00,6.91E+00,8.19E+00,1 143 | 4.66E+00,4.64E+00,1.03E+01,9.59E+00,0 144 | 7.63E+00,9.19E+00,4.31E+00,5.58E+00,2 145 | 1.07E+01,5.35E+00,7.17E+00,7.69E+00,1 146 | 7.84E+00,4.19E+00,4.32E+00,1.00E+01,1 147 | 7.35E+00,8.98E+00,4.12E+00,5.59E+00,2 148 | 9.02E+00,3.42E+00,6.66E+00,9.06E+00,1 149 | 9.15E+00,2.81E+00,6.53E+00,8.70E+00,1 150 | 9.13E+00,3.60E+00,5.20E+00,8.35E+00,1 151 | 4.04E+00,6.60E+00,1.88E+00,6.53E+00,2 152 | 4.67E+00,1.02E+01,5.92E+00,4.06E+00,2 153 | 5.47E+00,4.77E+00,9.05E+00,8.35E+00,0 154 | 6.95E+00,3.95E+00,1.91E+00,1.03E+01,1 155 | 4.93E+00,4.60E+00,9.49E+00,9.21E+00,0 156 | 9.51E+00,4.21E+00,5.65E+00,8.20E+00,1 157 | 6.60E+00,4.28E+00,8.30E+00,7.75E+00,0 158 | 4.93E+00,5.33E+00,9.11E+00,8.55E+00,0 159 | 5.96E+00,9.36E+00,5.49E+00,5.03E+00,2 160 | 7.09E+00,4.09E+00,8.47E+00,6.88E+00,0 161 | 9.85E+00,6.31E+00,5.49E+00,8.42E+00,1 162 | 4.87E+00,1.13E+01,7.62E+00,3.42E+00,2 163 | 8.54E+00,5.68E+00,4.73E+00,9.75E+00,1 164 | 7.36E+00,7.37E+00,1.78E+00,1.10E+01,1 165 | 4.17E+00,6.11E+00,8.44E+00,9.02E+00,0 166 | 7.49E+00,8.27E+00,3.76E+00,6.25E+00,2 167 | 4.23E+00,9.41E+00,5.83E+00,5.04E+00,2 168 | 6.46E+00,9.95E+00,5.59E+00,5.11E+00,2 169 | 8.68E+00,3.93E+00,6.34E+00,9.75E+00,1 170 | 5.00E+00,7.49E+00,3.10E+00,5.63E+00,2 171 | 7.85E+00,4.84E+00,4.77E+00,1.07E+01,1 172 
| 5.38E+00,9.24E+00,4.77E+00,4.48E+00,2 173 | 5.26E+00,5.68E+00,8.93E+00,8.84E+00,1 174 | 9.50E+00,7.24E+00,4.32E+00,8.80E+00,1 175 | 5.67E+00,8.60E+00,4.80E+00,5.45E+00,2 176 | 3.69E+00,9.97E+00,6.64E+00,4.05E+00,2 177 | 6.18E+00,3.63E+00,8.39E+00,8.83E+00,0 178 | 9.74E+00,5.23E+00,7.00E+00,9.07E+00,1 179 | 5.00E+00,5.10E+00,8.64E+00,9.12E+00,0 180 | 9.68E+00,6.63E+00,5.12E+00,8.85E+00,1 181 | 4.60E+00,4.18E+00,9.99E+00,9.71E+00,0 182 | 5.21E+00,5.61E+00,8.09E+00,8.01E+00,0 183 | 2.84E+00,5.86E+00,9.53E+00,1.07E+01,0 184 | 9.65E+00,6.07E+00,6.31E+00,9.11E+00,1 185 | 2.65E+00,6.69E+00,9.18E+00,1.07E+01,0 186 | 4.99E+00,1.04E+01,6.61E+00,3.95E+00,2 187 | 3.35E+00,1.02E+01,6.53E+00,3.33E+00,2 188 | 8.94E+00,4.19E+00,4.17E+00,8.34E+00,1 189 | 4.10E+00,4.38E+00,1.05E+01,1.10E+01,0 190 | 3.69E+00,7.75E+00,3.82E+00,5.14E+00,2 191 | 9.15E+00,4.52E+00,4.77E+00,8.58E+00,1 192 | 4.67E+00,4.25E+00,1.02E+01,9.46E+00,0 193 | 3.94E+00,9.39E+00,5.20E+00,4.43E+00,2 194 | 7.51E+00,7.45E+00,2.47E+00,6.13E+00,2 195 | 2.87E+00,7.84E+00,3.56E+00,4.77E+00,2 196 | 6.96E+00,1.05E+01,6.22E+00,4.69E+00,2 197 | 8.65E+00,5.83E+00,4.63E+00,9.64E+00,1 198 | 8.39E+00,2.96E+00,4.14E+00,8.56E+00,1 199 | 4.99E+00,4.47E+00,9.17E+00,9.66E+00,0 200 | 9.58E+00,4.53E+00,5.68E+00,8.29E+00,1 201 | 6.45E+00,3.04E+00,9.37E+00,8.89E+00,0 202 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Models/arthur-model-epoch-30.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MicrosoftDocs/ms-learn-ml-crash-course-R/385ee8b000e9586e9fbd4042d4dc6436d2329852/Models/arthur-model-epoch-30.hdf5 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML Crash Course R Programming Exercises 2 | 3 | Welcome! In this library we have the R programming exercises for the ML Crash Course. 4 | 5 | These notebooks contain the programming exercises used in the [ML Crash Course](https://docs.microsoft.com/learn/paths/ml-crash-course). 
While you could explore the examples without following the tutorial, it's strongly recommended that you follow along with the course on Microsoft Learn. 6 | 7 | ## Getting started 8 | 9 | If you aren't already completing the ML Crash Course, you can visit the [learning path](https://docs.microsoft.com/learn/paths/ml-crash-course). 10 | 11 | ### Setting up Azure Notebooks 12 | 13 | * Go to [Azure Notebooks projects](https://notebooks.azure.com/home/projects#). 14 | * Click on Upload GitHub Repo. 15 | * Click on the "GitHub repository" box and paste in ```MicrosoftDocs/ms-learn-ml-crash-course-R```. 16 | * Click the Import button. 17 | 18 | ## Troubleshooting 19 | 20 | Below are some common issues you may run into when completing the exercises in Azure Notebooks, along with their solutions. 21 | 22 | ### Links to the exercises don't work 23 | 24 | The links to the exercises on MS Learn assume you have set up your library and kept the library ID as 'ms-learn-ml-crash-course-R'; if you haven't, the links won't work. 25 | 26 | #### Solution 27 | 28 | * Click [here](https://notebooks.azure.com/home/libraries) to go to your libraries. 29 | * Right-click on your library and select __Settings__. 30 | * Change the Library ID back to ```ms-learn-ml-crash-course-R```. 31 | 32 | You can also just go to your library, right-click the programming exercise you wish to do, and click '__Run__'. 33 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | 42 | --------------------------------------------------------------------------------