├── Scratch Neural Network.ipynb
├── __init__.py
├── __pycache__
└── __init__.cpython-35.pyc
├── docs
├── Gemfile
├── License.md
├── Rakefile
├── ReadMe.md
├── _config.yml
├── _includes
│ ├── JB
│ │ ├── analytics
│ │ ├── analytics-providers
│ │ │ ├── getclicky
│ │ │ ├── google
│ │ │ └── mixpanel
│ │ ├── categories_list
│ │ ├── comments
│ │ ├── comments-providers
│ │ │ ├── disqus
│ │ │ ├── facebook
│ │ │ ├── intensedebate
│ │ │ └── livefyre
│ │ ├── liquid_raw
│ │ ├── pages_list
│ │ ├── posts_collate
│ │ ├── setup
│ │ ├── sharing
│ │ └── tags_list
│ ├── head.html
│ └── themes
│ │ └── twitter
│ │ ├── default.html
│ │ ├── page.html
│ │ ├── post.html
│ │ └── settings.yml
├── _layouts
│ ├── default.html
│ ├── page.html
│ └── post.html
├── _plugins
│ └── debug.rb
├── assets
│ ├── all_3neurons_lr_0.003_reg_0.0.gif
│ ├── all_50neurons_lr_0.003_reg_0.000001.gif
│ ├── all_50neurons_lr_0.003_reg_0.0001.gif
│ ├── chain_w1.png
│ ├── chain_w1_numbers.png
│ ├── chain_w1_numbers_final.png
│ ├── chain_w2.png
│ ├── chain_w2_detailed.png
│ ├── chain_w2_numbers.png
│ ├── code.png
│ ├── copy_values.png
│ ├── example.png
│ ├── forward.png
│ ├── h1.png
│ ├── h2.png
│ ├── initialized_network.png
│ ├── loss.png
│ ├── nonlinear_xor.png
│ ├── overview.png
│ ├── overview2.png
│ ├── themes
│ │ └── twitter
│ │ │ ├── bootstrap
│ │ │ ├── css
│ │ │ │ └── bootstrap.2.2.2.min.css
│ │ │ └── img
│ │ │ │ ├── glyphicons-halflings-white.png
│ │ │ │ └── glyphicons-halflings.png
│ │ │ └── css
│ │ │ ├── kbroman.css
│ │ │ └── style.css
│ ├── update_w1.png
│ ├── update_w2.png
│ ├── z1.png
│ └── z2.png
├── index.md
├── index_es.md
└── pages
│ ├── independent_site.md
│ ├── local_test.md
│ ├── nojekyll.md
│ ├── overview.md
│ ├── project_site.md
│ ├── resources.md
│ └── user_site.md
├── scratch_mlp.py
├── slides
└── 2017_Summer_School_LACCI.pdf
└── utils.py
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/__init__.py
--------------------------------------------------------------------------------
/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/docs/Gemfile:
--------------------------------------------------------------------------------
# Gem server that Bundler resolves the gems below from.
source 'https://rubygems.org'

gem 'github-pages'
2 |
--------------------------------------------------------------------------------
/docs/License.md:
--------------------------------------------------------------------------------
1 | To the extent possible under law,
2 | [Karl Broman](https://github.com/kbroman)
3 | has waived all copyright and related or neighboring rights to
4 | “[simple site](https://github.com/kbroman/simple_site)”.
5 | This work is published from the United States.
6 |
7 | [CC0 1.0 Universal (Public Domain Dedication)](https://creativecommons.org/publicdomain/zero/1.0/)
8 |
--------------------------------------------------------------------------------
/docs/Rakefile:
--------------------------------------------------------------------------------
1 | require "rubygems"
2 | require 'rake'
3 | require 'yaml'
4 | require 'time'
5 |
# Root directory that the configured paths below are joined onto.
SOURCE = "."

# Settings read by the rake tasks in this file.
CONFIG = {
  'version'               => "0.3.0",
  'themes'                => File.join(SOURCE, *%w[_includes themes]),
  'layouts'               => File.join(SOURCE, "_layouts"),
  'posts'                 => File.join(SOURCE, "_posts"),
  'post_ext'              => "md",
  'theme_package_version' => "0.1.0"
}
15 |
# Path configuration helper.
#
# Maps symbolic location names (the keys of Paths) to directories and joins
# them onto a root directory.
module JB
  class Path
    SOURCE = "."
    Paths = {
      :layouts => "_layouts",
      :themes => "_includes/themes",
      :theme_assets => "assets/themes",
      :theme_packages => "_theme_packages",
      :posts => "_posts"
    }

    # Returns the default root directory String.
    def self.base
      SOURCE
    end

    # Build a path relative to configured path settings.
    #
    # path - Symbol or String key into Paths.
    # opts - Hash of options (default: {}):
    #        :root - String root directory; SOURCE is used when absent or nil.
    #        :node - optional trailing component appended after the
    #                configured directory.
    #
    # The opts Hash is only read; the caller's Hash is never modified.
    #
    # Returns the joined path String.
    def self.build(path, opts = {})
      root = opts[:root] || SOURCE
      # Splitting on "/" discards the trailing empty segment that the
      # interpolation leaves behind when :node is not given.
      segments = "#{root}/#{Paths[path.to_sym]}/#{opts[:node]}".split("/")
      File.join(*segments)
    end

  end #Path
end #JB
42 |
# Usage: rake post title="A Title" [date="2012-02-09"] [tags=[tag1, tag2]]
#
# Creates "<date>-<slug>.<post_ext>" inside CONFIG['posts'] with YAML front
# matter. Asks for confirmation before overwriting an existing file.
#
# title - ENV String, Optional (default: "new-post").
# date  - ENV String, Optional; parsed with Time.parse (default: now).
# tags  - ENV String, Optional; written verbatim as the front-matter
#         "tags" value (default: "[]").
desc "Begin a new post in #{CONFIG['posts']}"
task :post do
  abort("rake aborted: '#{CONFIG['posts']}' directory not found.") unless FileTest.directory?(CONFIG['posts'])
  title = ENV["title"] || "new-post"
  tags = ENV["tags"] || "[]"
  slug = title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
  begin
    date = (ENV['date'] ? Time.parse(ENV['date']) : Time.now).strftime('%Y-%m-%d')
  rescue ArgumentError
    # Time.parse raises ArgumentError for unparseable or out-of-range input.
    # Rescuing only that keeps signals and SystemExit from being swallowed.
    puts "Error - date format must be YYYY-MM-DD, please check you typed it correctly!"
    exit(-1)
  end
  filename = File.join(CONFIG['posts'], "#{date}-#{slug}.#{CONFIG['post_ext']}")
  if File.exist?(filename)
    abort("rake aborted!") if ask("#{filename} already exists. Do you want to overwrite?", ['y', 'n']) == 'n'
  end

  puts "Creating new post: #{filename}"
  File.open(filename, 'w') do |post|
    post.puts "---"
    post.puts "layout: post"
    post.puts "title: \"#{title.gsub(/-/,' ')}\""
    post.puts 'description: ""'
    post.puts "category: "
    # Honour the tags= argument advertised in the usage line.
    post.puts "tags: #{tags}"
    post.puts "---"
    post.puts "{% include JB/setup %}"
  end
end # task :post
73 |
# Usage: rake page name="about.html"
# The name may include a sub-directory path.
# When the name has no file extension, an index.html is created at that path.
desc "Create a new page."
task :page do
  target = File.join(SOURCE, ENV["name"] || "new-page.md")
  target = File.join(target, "index.html") if File.extname(target) == ""

  # Page title: file name without extension, separators turned into spaces,
  # first letter of every word upper-cased.
  stem = File.basename(target, File.extname(target))
  heading = stem.gsub(/[\W\_]/, " ").gsub(/\b\w/) { |letter| letter.upcase }

  if File.exist?(target) && ask("#{target} already exists. Do you want to overwrite?", ['y', 'n']) == 'n'
    abort("rake aborted!")
  end

  mkdir_p File.dirname(target)
  puts "Creating new page: #{target}"
  front_matter = [
    "---",
    "layout: page",
    "title: \"#{heading}\"",
    'description: ""',
    "---",
    "{% include JB/setup %}"
  ]
  File.open(target, 'w') do |page|
    front_matter.each { |line| page.puts line }
  end
end # task :page
98 |
# Starts a local Jekyll server that rebuilds the site when files change.
# `jekyll --auto --server` was removed in Jekyll 1.0; `serve --watch` is the
# current spelling of the same behaviour.
desc "Launch preview environment"
task :preview do
  system "jekyll serve --watch"
end # task :preview
103 |
# Public: Alias - Keeps `rake switch_theme` working for backwards
# compatibility; it simply depends on theme:switch.
task :switch_theme => ["theme:switch"]
106 |
107 | namespace :theme do
108 |
# Public: Switch from one theme to another for your blog.
#
# name - String, Required. Name of the theme you want to switch to.
#        The theme must be installed into your JB framework.
#
# Examples
#
#   rake theme:switch name="the-program"
#
# Returns Success/failure messages.
desc "Switch between Jekyll-bootstrap themes."
task :switch do
  theme_name = ENV["name"].to_s
  theme_dir = File.join(CONFIG['themes'], theme_name)
  settings_path = File.join(theme_dir, "settings.yml")
  skipped = ["settings.yml"]

  abort("rake aborted: name cannot be blank") if theme_name.empty?
  abort("rake aborted: '#{theme_dir}' directory not found.") unless FileTest.directory?(theme_dir)
  abort("rake aborted: '#{CONFIG['layouts']}' directory not found.") unless FileTest.directory?(CONFIG['layouts'])

  Dir.glob("#{theme_dir}/*") do |source|
    layout_name = File.basename(source)
    next if skipped.include?(layout_name.downcase)
    puts "Generating '#{theme_name}' layout: #{layout_name}"

    File.open(File.join(CONFIG['layouts'], layout_name), 'w') do |page|
      page.puts "---"
      if File.basename(source, ".html").downcase == "default"
        # The default layout carries the theme's settings as its front matter.
        page.puts File.read(settings_path) if File.exist?(settings_path)
      else
        # Every other layout is rendered inside the default one.
        page.puts "layout: default"
      end
      page.puts "---"
      page.puts "{% include JB/setup %}"
      page.puts "{% include themes/#{theme_name}/#{layout_name} %}"
    end
  end

  puts "=> Theme successfully switched!"
  puts "=> Reload your web-page to check it out =)"
end # task :switch
152 |
# Public: Install a theme using the theme packager.
# Version 0.1.0 simple 1:1 file matching.
#
# git  - String, Optional path to the git repository of the theme to be installed.
# name - String, Optional name of the theme you want to install.
#        Passing name requires that the theme package already exist.
#
# Examples
#
#   rake theme:install git="https://github.com/jekyllbootstrap/theme-twitter.git"
#   rake theme:install name="cool-theme"
#
# Returns Success/failure messages.
desc "Install theme"
task :install do
  if ENV["git"]
    manifest = theme_from_git_url(ENV["git"])
    # to_s turns a missing "name" entry into "", which the blank check below
    # reports instead of failing with NoMethodError on nil.
    name = manifest["name"].to_s
  else
    name = ENV["name"].to_s.downcase
  end

  abort("rake aborted!\n=> ERROR: 'name' cannot be blank") if name.empty?

  packaged_theme_path = JB::Path.build(:theme_packages, :node => name)

  unless FileTest.directory?(packaged_theme_path)
    abort([
      "rake aborted!",
      "=> ERROR: '#{packaged_theme_path}' directory not found.",
      "=> Installable themes can be added via git. You can find some here: http://github.com/jekyllbootstrap",
      "=> To download+install run: `rake theme:install git='[PUBLIC-CLONE-URL]'`",
      "=> example : rake theme:install git='git@github.com:jekyllbootstrap/theme-the-program.git'",
      ""
    ].join("\n"))
  end

  # Called for its validation side effect: aborts when the package has no
  # usable manifest.yml.
  verify_manifest(packaged_theme_path)

  # Get relative paths to packaged theme files.
  # Exclude directories as they'll be recursively created. Exclude meta-data files.
  packaged_theme_files = []
  FileUtils.cd(packaged_theme_path) do
    packaged_theme_files = Dir.glob("**/*.*").reject do |f|
      FileTest.directory?(f) || f =~ /^(manifest|readme|packager)/i
    end
  end

  # Mirror each file into the framework, prompting before overwriting.
  packaged_theme_files.each do |filename|
    file_install_path = File.join(JB::Path.base, filename)
    if File.exist?(file_install_path) &&
       ask("#{file_install_path} already exists. Do you want to overwrite?", ['y', 'n']) == 'n'
      next
    end

    mkdir_p File.dirname(file_install_path)
    cp_r File.join(packaged_theme_path, filename), file_install_path
  end

  puts "=> #{name} theme has been installed!"
  puts "=> ---"
  if ask("=> Want to switch themes now?", ['y', 'n']) == 'y'
    # Argument-vector form: no shell parses the theme name, which may come
    # from the manifest of a cloned repository.
    system("rake", "switch_theme", "name=#{name}")
  end
end
215 |
# Public: Package a theme using the theme packager.
# The theme must be structured using valid JB API.
# Packaging is essentially the reverse of installing.
#
# name - String, Required name of the theme you want to package.
#
# Examples
#
#   rake theme:package name="twitter"
#
# Returns Success/failure messages.
desc "Package theme"
task :package do
  name = ENV["name"].to_s.downcase
  package_root = JB::Path.build(:theme_packages, :node => name)

  # Source directories to mirror, keyed by their JB::Path location name:
  # the theme's templates (_includes) first, then its assets.
  sources = {
    :themes => JB::Path.build(:themes, :node => name),
    :theme_assets => JB::Path.build(:theme_assets, :node => name)
  }

  abort("rake aborted: name cannot be blank") if name.empty?
  sources.each_value do |dir|
    abort("rake aborted: '#{dir}' directory not found.") unless FileTest.directory?(dir)
  end

  # Mirror each source directory under the package root.
  sources.each do |location, dir|
    destination = JB::Path.build(location, :root => package_root)
    mkdir_p destination
    cp_r dir, destination
  end

  ## Log packager version
  packager = { "packager" => { "version" => CONFIG["theme_package_version"].to_s } }
  File.open(JB::Path.build(:theme_packages, :node => "#{name}/packager.yml"), "w") do |file|
    file.puts packager.to_yaml
  end

  puts "=> '#{name}' theme is packaged and available at: #{package_root}"
end
255 |
256 | end # end namespace :theme
257 |
# Internal: Download and process a theme from a git url.
# The theme's name is only known once its manifest has been read, so the
# repository is cloned into a temporary folder and renamed afterwards.
#
# url - String, Required url to git repository.
#
# Aborts when the clone fails, when the manifest does not provide a usable
# theme name, or when the user declines to replace an existing package.
#
# Returns theme manifest hash
def theme_from_git_url(url)
  tmp_path = JB::Path.build(:theme_packages, :node => "_tmp")
  # Argument-vector form: the url reaches git as one argument, with no shell
  # parsing it.
  abort("rake aborted: system call to git clone failed") unless system("git", "clone", url, tmp_path)
  manifest = verify_manifest(tmp_path)

  # The name becomes a directory under the packages folder that may be
  # removed below. A blank name or one containing path components would make
  # that directory the packages folder itself or something outside it.
  theme_name = manifest["name"].to_s
  if theme_name.empty? || theme_name =~ %r{[/\\]} || %w[. ..].include?(theme_name)
    remove_dir(tmp_path)
    abort("rake aborted: manifest.yml must define a valid theme 'name'")
  end

  new_path = JB::Path.build(:theme_packages, :node => theme_name)
  if File.exist?(new_path) && ask("=> #{new_path} theme package already exists. Override?", ['y', 'n']) == 'n'
    remove_dir(tmp_path)
    abort("rake aborted: '#{theme_name}' already exists as theme package.")
  end

  remove_dir(new_path) if File.exist?(new_path)
  mv(tmp_path, new_path)
  manifest
end
279 |
# Internal: Process theme package manifest file.
#
# theme_path - String, Required. File path to theme package.
#
# Aborts (SystemExit) when manifest.yml is missing, cannot be parsed, or does
# not contain a mapping.
#
# Returns theme manifest hash
def verify_manifest(theme_path)
  manifest_path = File.join(theme_path, "manifest.yml")
  # Check before opening: opening a missing file would raise Errno::ENOENT
  # and the message below would never be shown.
  abort("rake aborted: repo must contain valid manifest.yml") unless File.file?(manifest_path)

  begin
    # NOTE(review): the manifest may come from a freshly cloned repository.
    # On Psych versions before 4, YAML.load can instantiate arbitrary
    # objects; consider YAML.safe_load if all supported Rubies provide it.
    # The block form closes the file even when parsing raises.
    manifest = File.open(manifest_path) { |file| YAML.load(file) }
  rescue Psych::SyntaxError
    abort("rake aborted: repo must contain valid manifest.yml")
  end

  # An empty or scalar document is not a usable manifest.
  abort("rake aborted: repo must contain valid manifest.yml") unless manifest.is_a?(Hash)
  manifest
end
293 |
# Internal: Prompt the user via get_stdin.
#
# message       - String shown to the user.
# valid_options - Array of accepted answers, or nil/false to accept anything.
#                 When given, the options are appended to the message
#                 (e.g. "[y/n]") and the prompt repeats until the answer is
#                 one of them.
#
# Returns the answer.
def ask(message, valid_options)
  return get_stdin(message) unless valid_options

  prompt = "#{message} #{valid_options.to_s.gsub(/"/, '').gsub(/, /,'/')} "
  reply = nil
  reply = get_stdin(prompt) until valid_options.include?(reply)
  reply
end
302 |
# Internal: Print a prompt and read one line from STDIN.
#
# message - String printed (without a trailing newline) before reading.
#
# Aborts when STDIN is at end-of-file (for example a non-interactive run):
# STDIN.gets returns nil there, and calling chomp on it would raise
# NoMethodError.
#
# Returns the line read, without its trailing newline.
def get_stdin(message)
  print message
  line = STDIN.gets
  abort("\nrake aborted: no input available on STDIN") if line.nil?
  line.chomp
end
307 |
# Load every custom rake script found in the _rake directory.
Dir.glob('_rake/*.rake').each { |script| load script }
310 |
--------------------------------------------------------------------------------
/docs/ReadMe.md:
--------------------------------------------------------------------------------
1 |
2 | Steps to run the code:
3 | - git clone https://github.com/omar-florez/scratch_mlp/
4 | - python scratch_mlp/scratch_mlp.py
5 |
6 | >A **neural network** is a clever arrangement of linear and non-linear modules. When we choose and connect them wisely,
7 | we have a powerful tool to approximate any mathematical function. For example one that **separates classes with a non-linear
8 | decision boundary**.
9 |
10 | A topic that is not always explained in depth, despite its intuitive and modular nature, is the
11 | **backpropagation technique** responsible for updating trainable parameters. Let’s build a neural network from scratch
12 | to see the internal functioning of a neural network using **LEGO pieces as a modular analogy**, one brick at a time.
13 |
14 | Code implementing this can be found in this repository: [https://github.com/omar-florez/scratch_mlp](https://github.com/omar-florez/scratch_mlp)
15 |
16 | ## Neural Networks as a Composition of Pieces
17 |
18 | 
19 |
20 | The above figure depicts some of the Math used for training a neural network. We will make sense of this during this article.
21 | The reader may find it interesting that a neural network is a stack of modules with different purposes:
22 |
23 | - **Input X** feeds a neural network with raw data, which is stored in a matrix in which observations are rows and dimensions are columns
24 | - **Weights W1** map input X to the first hidden layer h1. Weights W1 then work as a linear kernel
25 | - A **Sigmoid function** prevents numbers in the hidden layer from falling out of range by scaling them to 0-1. The result is an **array of
26 | neural activations** h1 = Sigmoid(XW1)
27 |
28 | At this point these operations only compute a **general linear system**, which doesn’t have the capacity to model non-linear interactions.
29 | This changes when we stack one more layer, adding depth to this modular structure. The deeper the network, the more subtle non-linear
30 | interactions we can learn and more complex problems we can solve, which may explain in part the rise of deep neural models.
31 |
32 | ## Why should I read this?
33 |
34 | >If you understand the internal parts of a neural network, you will quickly know **what to change first** when things don't work
35 | and define a strategy to **test invariants** and **expected behaviors** that you know are part of the algorithm. This will also
36 | be helpful when you want to **create new capabilities that are not currently implemented in the ML library** you are using.
37 |
38 | **Because debugging machine learning models is a complex task**. In my experience, mathematical models don't
39 | work as expected on the first try. They may give you low accuracy on new data, take a long time to train or use too much memory,
40 | return a large number of false negatives or NaN predictions, etc. Let me show some cases where knowing how the algorithm works
41 | can come in handy:
42 |
43 | - If it **takes too long to train**, it may be a good idea to increase the size of the minibatch to reduce the variance
44 | in the observations and thus help the algorithm converge
45 | - If you observe **NaN predictions**, the algorithm may have received large gradients that produce numerical overflow. Think of
46 | this as consecutive matrix multiplications that explode after many iterations. Decreasing the learning rate will have the
47 | effect of scaling down these values. Reducing the number of layers will decrease the number of multiplications. And clipping
48 | gradients will control this problem explicitly
49 |
50 | ## Concrete Example: Learning the XOR Function
51 |
52 | >Let's open the blackbox. We will build now a neural network from scratch that learns the **XOR function**.
53 | The choice of this **non-linear function** is by no means random chance. Without backpropagation it would be hard to learn
54 | to separate classes with a **straight line**.
55 |
56 | To illustrate this important concept, note below how a straight line cannot
57 | separate 0s and 1s, the outputs of the XOR function. **Real life problems are also non-linearly separable**.
58 |
59 | 
60 |
61 | The topology of the network is simple:
62 | - **Input X** is a two dimensional vector
63 | - **Weights W1** is a 2x3 matrix with randomly initialized values
64 | - **Hidden layer h1** consists of three neurons. Each neuron receives as input a weighted sum of observations, this is the inner product
65 | highlighted in green in the below figure: **z1 = [x1, x2][w1, w2]**
66 | - **Weights W2** is a 3x2 matrix with randomly initialized values and
67 | - **Output layer h2** consists of two neurons since the XOR function returns either 0 (y1=[0,1]) or 1 (y2 = [1,0])
68 |
69 | More visually:
70 |
71 | 
72 |
73 | Let's now train the model. In our simple example the trainable parameters are weights, but be aware that current
74 | research is exploring more types of parameters to be optimized. For example shortcuts between layers, regularized distributions, topologies,
75 | residual, learning rates, etc.
76 |
77 | **Backpropagation** is a method to update the weights towards the direction (**gradient**) that minimizes a predefined error metric known as **Loss function**
78 | given a batch of labeled observations. This algorithm has been repeatedly rediscovered and is a special case of a more general technique called
79 | [automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation) in reverse accumulation mode.
80 |
81 | ### Network Initialization
82 |
83 | >Let's **initialize the network weights** with random numbers.
84 |
85 | {:width="1300px"}
86 |
87 | ### Forward Step:
88 |
89 | >The goal of this step is to **forward propagate** the input X to each layer of the network until computing a vector in
90 | the output layer h2.
91 |
92 | This is how it happens:
93 | - Linearly map input data X using weights W1 as a kernel:
94 |
95 |
96 | {:width="500px"}
97 |
98 | - Scale this weighted sum z1 with a Sigmoid function to get values of the first hidden layer h1. **Note that the original
99 | 2D vector is now mapped to a 3D space**.
100 |
101 |
102 | {:width="400px"}
103 |
104 | - A similar process takes place for the second layer h2. Let's compute first the **weighted sum** z2 of the
105 | first hidden layer, which is now input data.
106 |
107 |
108 | {:width="500px"}
109 |
110 | - And then compute their Sigmoid activation function. This vector [0.37166596 0.45414264] represents the **probabilities**
111 | (each value lies between 0 and 1), or **predicted vector**, computed by the network given input X.
112 |
113 | {:width="300px"}
114 |
115 | ### Computing the Total Loss
116 |
117 | >Also known as "actual minus predicted", the goal of the loss function is to **quantify the distance between the predicted
118 | vector h2 and the actual label provided by humans y**.
119 |
120 | Note that the Loss function contains a **regularization component** that penalizes large weight values as in a Ridge
121 | regression. In other words, large squared weights values will increase the Loss function, **an error metric we indeed want to minimize**.
122 |
123 | {:width="500px"}
124 |
125 | ### Backward step:
126 | >The goal of this step is to **update the weights of the neural network** in a direction that minimizes its Loss function.
127 | As we will see, this is a **recursive algorithm**, which can reuse gradients previously computed and heavily relies on
128 | **differentiable functions**. Since these updates reduce the loss function, a network ‘learns’ to approximate the label
129 | of observations with known classes. A property called **generalization**.
130 |
131 | This step goes in the **reverse order** of the forward step. It computes first the partial derivative of the loss function
132 | with respect to the weights of the output layer (dLoss/dW2) and then the hidden layer (dLoss/dW1). Let's explain
133 | in detail each one.
134 |
135 | #### dLoss/dW2:
136 |
137 | The chain rule says that we can decompose the computation of gradients of a neural network into **differentiable pieces**:
138 |
139 | {:width="500px"}
140 |
141 | As a memory helper, these are the **function definitions** used above and their **first derivatives**:
142 |
143 | | Function | First derivative |
144 | |------------------------------------------------------------ |------------------------------------------------------------|
145 | |Loss = 1/2 (y-h2)^2 | dLoss/dh2 = -(y-h2) |
146 | |h2 = Sigmoid(z2) | dh2/dz2 = h2(1-h2) |
147 | |z2 = h1W2 | dz2/dW2 = h1 |
148 | |z2 = h1W2 | dz2/dh1 = W2 |
149 |
150 |
151 | More visually, we aim to update the weights W2 (in blue) in the below figure. In order to do that, we need to compute
152 | three **partial derivatives along the chain**.
153 |
154 | {:width="500px"}
155 |
156 | Plugging values into these partial derivatives allows us to compute gradients with respect to weights W2 as follows.
157 |
158 | {:width="600px"}
159 |
160 | The result is a 3x2 matrix dLoss/dW2, which will update the original W2 values in a direction that minimizes the Loss function.
161 |
162 | {:width="700px"}
163 |
164 | #### dLoss/dW1:
165 |
166 | Computing the **chain rule** for updating the weights of the first hidden layer W1 exhibits the possibility of **reusing existing
167 | computations**.
168 |
169 | {:width="500px"}
170 |
171 | More visually, the **path from the output layer to the weights W1** touches partial derivatives already computed in **later
172 | layers**.
173 |
174 | {:width="500px"}
175 |
176 | For example, partial derivatives dLoss/dh2 and dh2/dz2 have been already computed as a dependency for learning weights
177 | of the output layer dLoss/dW2 in the previous section.
178 |
179 | {:width="700px"}
180 |
181 | Placing all derivatives together, we can execute the **chain rule** again to update the weights of the hidden layer W1:
182 |
183 | {:width="700px"}
184 |
185 | Finally, we assign the new values of the weights and have completed one training iteration of the network.
186 |
187 | {:width="150px"}
188 |
189 | ### Implementation
190 |
191 | Let's translate the above mathematical equations to code only using [Numpy](http://www.numpy.org/) as our **linear algebra engine**.
192 | Neural networks are trained in a loop in which each iteration presents already **calibrated input data** to the network.
193 | In this small example, let's just consider the entire dataset in each iteration. The computations of **Forward step**,
194 | **Loss**, and **Backwards step** lead to good generalization since we update the **trainable parameters** (matrices w1 and
195 | w2 in the code) with their corresponding **gradients** (matrices dL_dw1 and dL_dw2) in every cycle.
196 | Code is stored in this repository: [https://github.com/omar-florez/scratch_mlp](https://github.com/omar-florez/scratch_mlp)
197 |
198 | 
199 |
200 | ### Let's Run This!
201 |
202 | See below **some neural networks** trained to approximate the **XOR function** over many iterations.
203 |
204 | **Left plot:** Accuracy. **Central plot:** Learned decision boundary. **Right plot:** Loss function.
205 |
206 | First let's see how a neural network with **3 neurons** in the hidden layer has small capacity. This model learns to separate 2 classes
207 | with a **simple decision boundary** that starts being a straight line but then shows a non-linear behavior.
208 | The loss function in the right plot nicely gets low as training continues.
209 |
210 | 
211 |
212 | Having **50 neurons** in the hidden layer notably increases the model's power to learn more **complex decision boundaries**.
213 | This could not only produce more accurate results, but also lead to **exploding gradients**, a notable problem when training neural networks.
214 | This happens when very large gradients multiply weights during backpropagation and thus generate large updated weights.
215 | This is the reason why the **Loss value suddenly increases** during the last steps of the training (step > 90).
216 | The **regularization component** of the Loss function computes the **squared values** of weights that are already very large (sum(W^2)/2N).
217 |
218 | 
219 |
220 | This problem can be avoided by **reducing the learning rate** as you can see below. Or by implementing a policy that reduces
221 | the learning rate over time. Or by enforcing a stronger regularization, maybe L1 instead of L2.
222 | **Exploding** and **vanishing gradients** are interesting phenomena and we will devote an entire analysis to them later.
223 |
224 | 
225 |
226 |
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | # This is the default format.
2 | # For more see: https://github.com/mojombo/jekyll/wiki/Permalinks
3 | permalink: /:categories/:year/:month/:day/:title
4 |
5 | exclude: [".rvmrc", ".rbenv-version", "ReadMe.md", "Rakefile", "changelog.md", "License.md"]
6 | highlighter: rouge
7 |
8 | # Themes are encouraged to use these universal variables
9 | # so be sure to set them if your theme uses them.
10 | #
11 | title : Omar U. Florez
12 | author :
13 | name : Omar U. Florez
14 | email : omar.florez@aggiemail.usu.edu
15 | github : omar-florez
16 | twitter : OmarUFlorez
17 | feedburner : nil
18 |
19 | # NOTE: If replacing this next line with your own URL, you likely want "https://" not "http://"
20 | production_url : https://omar-florez.github.io/scratch_mlp
21 |
22 | # Tell Github to use the kramdown markdown interpreter
23 | # (see https://help.github.com/articles/migrating-your-pages-site-from-maruku)
24 | markdown: kramdown
25 |
26 | # All Jekyll-Bootstrap specific configurations are namespaced into this hash
27 | #
28 | JB :
29 | version : 0.3.0
30 |
31 | # All links will be namespaced by BASE_PATH if defined.
32 | # Links in your website should always be prefixed with {{BASE_PATH}}
33 | # however this value will be dynamically changed depending on your deployment situation.
34 | #
35 | # CNAME (http://yourcustomdomain.com)
36 | # DO NOT SET BASE_PATH
37 | # (urls will be prefixed with "/" and work relatively)
38 | #
39 | # GitHub Pages (http://username.github.io)
40 | # DO NOT SET BASE_PATH
41 | # (urls will be prefixed with "/" and work relatively)
42 | #
43 | # GitHub Project Pages (http://username.github.io/project-name)
44 | #
45 | # A GitHub Project site exists in the `gh-pages` branch of one of your repositories.
46 | # REQUIRED! Set BASE_PATH to: http://username.github.io/project-name
47 | #
48 | # CAUTION:
49 | # - When in Localhost, your site will run from root "/" regardless of BASE_PATH
50 | # - Only the following values are falsy: ["", null, false]
51 | # - When setting BASE_PATH it must be a valid url.
52 | # This means always setting the protocol (http|https) or prefixing with "/"
53 | #
54 | # NOTE: If replacing this next line with your own URL, you likely want "https://" not "http://"
55 | BASE_PATH : https://omar-florez.github.io/scratch_mlp
56 |
57 | # By default, the asset_path is automatically defined relative to BASE_PATH plus the enabled theme.
58 | # ex: [BASE_PATH]/assets/themes/[THEME-NAME]
59 | #
60 | # Override this by defining an absolute path to assets here.
61 | # ex:
62 | # http://s3.amazonaws.com/yoursite/themes/watermelon
63 | # /assets
64 | #
65 | # ASSET_PATH : http://kbroman.org/simple_site/assets/themes/twitter
66 |
67 | # These paths are to the main pages Jekyll-Bootstrap ships with.
68 | # Some JB helpers refer to these paths; change them here if needed.
69 | #
70 | archive_path: nil
71 | categories_path : nil
72 | tags_path : nil
73 | atom_path : nil
74 | rss_path : nil
75 |
76 | # Settings for comments helper
77 | # Set 'provider' to the comment provider you want to use.
78 | # Set 'provider' to false to turn commenting off globally.
79 | #
80 | comments :
81 | provider : false
82 |
83 | # Settings for analytics helper
84 | # Set 'provider' to the analytics provider you want to use.
85 | # Set 'provider' to false to turn analytics off globally.
86 | #
87 | analytics :
88 | provider : false
89 |
90 | # Settings for sharing helper.
91 | # Sharing is for things like tweet, plusone, like, reddit buttons etc.
92 | # Set 'provider' to the sharing provider you want to use.
93 | # Set 'provider' to false to turn sharing off globally.
94 | #
95 | sharing :
96 | provider : true
97 |
98 | # Settings for all other include helpers can be defined by creating
99 | # a hash with key named for the given helper. ex:
100 | #
101 | # pages_list :
102 | # provider : "custom"
103 | #
104 | # Setting any helper's provider to 'custom' will bypass the helper code
105 | # and include your custom code. Your custom file must be defined at:
106 | # ./_includes/custom/[HELPER]
107 | # where [HELPER] is the name of the helper you are overriding.
108 |
109 | theme: jekyll-theme-leap-day
--------------------------------------------------------------------------------
/docs/_includes/JB/analytics:
--------------------------------------------------------------------------------
1 | {% if site.safe and site.JB.analytics.provider and page.JB.analytics != false %}
2 |
3 | {% case site.JB.analytics.provider %}
4 | {% when "google" %}
5 | {% include JB/analytics-providers/google %}
6 | {% when "getclicky" %}
7 | {% include JB/analytics-providers/getclicky %}
8 | {% when "mixpanel" %}
9 | {% include JB/analytics-providers/mixpanel %}
10 | {% when "custom" %}
11 | {% include custom/analytics %}
12 | {% endcase %}
13 |
14 | {% endif %}
--------------------------------------------------------------------------------
/docs/_includes/JB/analytics-providers/getclicky:
--------------------------------------------------------------------------------
1 |
12 |
13 |
--------------------------------------------------------------------------------
/docs/_includes/JB/analytics-providers/google:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/_includes/JB/analytics-providers/mixpanel:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/_includes/JB/categories_list:
--------------------------------------------------------------------------------
1 | {% comment %}{% endcomment %}
19 |
20 | {% if site.JB.categories_list.provider == "custom" %}
21 | {% include custom/categories_list %}
22 | {% else %}
23 | {% if categories_list.first[0] == null %}
24 | {% for category in categories_list %}
25 |
{{text | replace:"|.", "{" | replace:".|", "}" | replace:">", ">" | replace:"<", "<" }}
31 | {% endif %}
32 | {% assign text = nil %}
--------------------------------------------------------------------------------
/docs/_includes/JB/pages_list:
--------------------------------------------------------------------------------
1 | {% comment %}{% endcomment %}
22 |
23 | {% if site.JB.pages_list.provider == "custom" %}
24 | {% include custom/pages_list %}
25 | {% else %}
26 | {% for node in pages_list %}
27 | {% if node.title != null %}
28 | {% if group == null or group == node.group %}
29 | {% if page.url == node.url %}
30 | #{obj.class}\n#{obj.pretty_inspect}" 33 | end 34 | 35 | end # DebugFilter 36 | end # Jekyll 37 | 38 | Liquid::Template.register_filter(Jekyll::DebugFilter) -------------------------------------------------------------------------------- /docs/assets/all_3neurons_lr_0.003_reg_0.0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/all_3neurons_lr_0.003_reg_0.0.gif -------------------------------------------------------------------------------- /docs/assets/all_50neurons_lr_0.003_reg_0.000001.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/all_50neurons_lr_0.003_reg_0.000001.gif -------------------------------------------------------------------------------- /docs/assets/all_50neurons_lr_0.003_reg_0.0001.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/all_50neurons_lr_0.003_reg_0.0001.gif -------------------------------------------------------------------------------- /docs/assets/chain_w1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/chain_w1.png -------------------------------------------------------------------------------- /docs/assets/chain_w1_numbers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/chain_w1_numbers.png -------------------------------------------------------------------------------- 
/docs/assets/chain_w1_numbers_final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/chain_w1_numbers_final.png -------------------------------------------------------------------------------- /docs/assets/chain_w2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/chain_w2.png -------------------------------------------------------------------------------- /docs/assets/chain_w2_detailed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/chain_w2_detailed.png -------------------------------------------------------------------------------- /docs/assets/chain_w2_numbers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/chain_w2_numbers.png -------------------------------------------------------------------------------- /docs/assets/code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/code.png -------------------------------------------------------------------------------- /docs/assets/copy_values.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/copy_values.png -------------------------------------------------------------------------------- /docs/assets/example.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/example.png -------------------------------------------------------------------------------- /docs/assets/forward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/forward.png -------------------------------------------------------------------------------- /docs/assets/h1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/h1.png -------------------------------------------------------------------------------- /docs/assets/h2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/h2.png -------------------------------------------------------------------------------- /docs/assets/initialized_network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/initialized_network.png -------------------------------------------------------------------------------- /docs/assets/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/loss.png -------------------------------------------------------------------------------- /docs/assets/nonlinear_xor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/nonlinear_xor.png -------------------------------------------------------------------------------- /docs/assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/overview.png -------------------------------------------------------------------------------- /docs/assets/overview2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/overview2.png -------------------------------------------------------------------------------- /docs/assets/themes/twitter/bootstrap/img/glyphicons-halflings-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/themes/twitter/bootstrap/img/glyphicons-halflings-white.png -------------------------------------------------------------------------------- /docs/assets/themes/twitter/bootstrap/img/glyphicons-halflings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/themes/twitter/bootstrap/img/glyphicons-halflings.png -------------------------------------------------------------------------------- /docs/assets/themes/twitter/css/kbroman.css: -------------------------------------------------------------------------------- 1 | code { 2 | padding: 0; 3 | font-size: 90%; 4 | color: black; 5 | background-color: white; 6 | border: 0px solid white; 7 | } 8 | 9 | a code { 10 | color: #08c; 11 | } 
-------------------------------------------------------------------------------- /docs/assets/themes/twitter/css/style.css: -------------------------------------------------------------------------------- 1 | /* Custom container */ 2 | .container-narrow { 3 | margin: 0 auto; 4 | max-width: 700px; } 5 | 6 | .container-narrow > hr { 7 | margin: 30px 0; } 8 | 9 | .navbar .nav { 10 | float: right; } 11 | 12 | /* posts index */ 13 | .post > h3.title { 14 | position: relative; 15 | padding-top: 10px; } 16 | 17 | .post > h3.title span.date { 18 | position: absolute; 19 | right: 0; 20 | font-size: 0.9em; } 21 | 22 | .post > .more { 23 | margin: 10px 0; 24 | text-align: left; } 25 | 26 | /* post-full*/ 27 | .post-full .date { 28 | margin-bottom: 20px; 29 | font-weight: bold; } 30 | 31 | /* tag_box */ 32 | .tag_box { 33 | list-style: none; 34 | margin: 0; 35 | overflow: hidden; } 36 | 37 | .tag_box li { 38 | line-height: 28px; } 39 | 40 | .tag_box li i { 41 | opacity: 0.9; } 42 | 43 | .tag_box.inline li { 44 | float: left; } 45 | 46 | .tag_box a { 47 | padding: 3px 6px; 48 | margin: 2px; 49 | background: #eee; 50 | color: #555; 51 | border-radius: 3px; 52 | text-decoration: none; 53 | border: 1px dashed #cccccc; } 54 | 55 | .tag_box a span { 56 | vertical-align: super; 57 | font-size: 0.8em; } 58 | 59 | .tag_box a:hover { 60 | background-color: #e5e5e5; } 61 | 62 | .tag_box a.active { 63 | background: #57A957; 64 | border: 1px solid #4c964d; 65 | color: #FFF; } -------------------------------------------------------------------------------- /docs/assets/update_w1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/update_w1.png -------------------------------------------------------------------------------- /docs/assets/update_w2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/update_w2.png -------------------------------------------------------------------------------- /docs/assets/z1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/z1.png -------------------------------------------------------------------------------- /docs/assets/z2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omar-florez/scratch_mlp/133c565e7e386b9852aa5f89c99273078594e7a7/docs/assets/z2.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "One LEGO at a time: Explaining the Math of How Neural Networks Learn" 4 | tagline: 5 | description: Tutorial on back-propagation 6 | --- 7 | 8 | >A **neural network** is a clever arrangement of linear and non-linear modules. When we choose and connect them wisely, 9 | we have a powerful tool to approximate any mathematical function. For example one that **separates classes with a non-linear 10 | decision boundary**. 11 | 12 | A topic that is not always explained in depth, despite its intuitive and modular nature, is the 13 | **backpropagation technique** responsible for updating trainable parameters. Let’s build a neural network from scratch 14 | to see the internal functioning of a neural network using **LEGO pieces as a modular analogy**, one brick at a time. 
15 | 16 | Code implementing this can be found in this repository: [https://github.com/omar-florez/scratch_mlp](https://github.com/omar-florez/scratch_mlp) 17 | 18 | ## Neural Networks as a Composition of Pieces 19 | 20 |  21 | 22 | The above figure depicts some of the Math used for training a neural network. We will make sense of this throughout this article. 23 | The reader may find it interesting that a neural network is a stack of modules with different purposes: 24 | 25 | - **Input X** feeds a neural network with raw data, which is stored in a matrix in which observations are rows and dimensions are columns 26 | - **Weights W1** map input X to the first hidden layer h1. Weights W1 then work as a linear kernel 27 | - A **Sigmoid function** prevents numbers in the hidden layer from falling out of range by scaling them to 0-1. The result is an **array of 28 | neural activations** h1 = Sigmoid(WX) 29 | 30 | At this point these operations only compute a **general linear system**, which doesn’t have the capacity to model non-linear interactions. 31 | This changes when we stack one more layer, adding depth to this modular structure. The deeper the network, the more subtle non-linear 32 | interactions we can learn and more complex problems we can solve, which may explain in part the rise of deep neural models. 33 | 34 | ## Why should I read this? 35 | 36 | >If you understand the internal parts of a neural network, you will quickly know **what to change first** when things don't work 37 | and define a strategy to **test invariants** and **expected behaviors** that you know are part of the algorithm. This will also 38 | be helpful when you want to **create new capabilities that are not currently implemented in the ML library** you are using. 39 | 40 | **Because debugging machine learning models is a complex task**. From experience, mathematical models don't 41 | work as expected on the first try. 
They may give you low accuracy for new data, take a long time to train or use too much memory, 42 | return a large number of false negatives or NaN predictions, etc. Let me show some cases when knowing how the algorithm works 43 | can come in handy: 44 | 45 | - If it **takes so much time to train**, it may be a good idea to increase the size of a minibatch to reduce the variance 46 | in the observations and thus to help the algorithm to converge 47 | - If you observe **NaN predictions**, the algorithm may have received large gradients producing numerical overflow. Think of 48 | this as consecutive matrix multiplications that explode after many iterations. Decreasing the learning rate will have the 49 | effect of scaling down these values. Reducing the number of layers will decrease the number of multiplications. And clipping 50 | gradients will control this problem explicitly 51 | 52 | ## Concrete Example: Learning the XOR Function 53 | 54 | >Let's open the blackbox. We will now build a neural network from scratch that learns the **XOR function**. 55 | The choice of this **non-linear function** is by no means random chance. Without backpropagation it would be hard to learn 56 | to separate classes with a **straight line**. 57 | 58 | To illustrate this important concept, note below how a straight line cannot 59 | separate 0s and 1s, the outputs of the XOR function. **Real life problems are also non-linearly separable**. 60 | 61 |  62 | 63 | The topology of the network is simple: 64 | - **Input X** is a two dimensional vector 65 | - **Weights W1** is a 2x3 matrix with randomly initialized values 66 | - **Hidden layer h1** consists of three neurons. 
Each neuron receives as input a weighted sum of observations; this is the inner product 67 | highlighted in green in the below figure: **z1 = [x1, x2][w1, w2]** 68 | - **Weights W2** is a 3x2 matrix with randomly initialized values and 69 | - **Output layer h2** consists of two neurons since the XOR function returns either 0 (y1=[0,1]) or 1 (y2 = [1,0]) 70 | 71 | More visually: 72 | 73 |  74 | 75 | Let's now train the model. In our simple example the trainable parameters are weights, but be aware that current 76 | research is exploring more types of parameters to be optimized. For example shortcuts between layers, regularized distributions, topologies, 77 | residual, learning rates, etc. 78 | 79 | **Backpropagation** is a method to update the weights towards the direction (**gradient**) that minimizes a predefined error metric known as the **Loss function** 80 | given a batch of labeled observations. This algorithm has been repeatedly rediscovered and is a special case of a more general technique called 81 | [automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation) in reverse accumulation mode. 82 | 83 | ### Network Initialization 84 | 85 | >Let's **initialize the network weights** with random numbers. 86 | 87 | {:width="1300px"} 88 | 89 | ### Forward Step: 90 | 91 | >The goal of this step is to **forward propagate** the input X to each layer of the network until computing a vector in 92 | the output layer h2. 93 | 94 | This is how it happens: 95 | - Linearly map input data X using weights W1 as a kernel: 96 | 97 | 98 | {:width="500px"} 99 | 100 | - Scale this weighted sum z1 with a Sigmoid function to get values of the first hidden layer h1. **Note that the original 101 | 2D vector is now mapped to a 3D space**. 102 | 103 | 104 | {:width="400px"} 105 | 106 | - A similar process takes place for the second layer h2. Let's first compute the **weighted sum** z2 of the 107 | first hidden layer, which is now input data. 
108 | 109 | 110 | {:width="500px"} 111 | 112 | - And then compute its Sigmoid activation. This vector [0.37166596 0.45414264] represents the **probability** 113 | or **predicted vector** computed by the network given input X. 114 | 115 | {:width="300px"} 116 | 117 | ### Computing the Total Loss 118 | 119 | >Also known as "actual minus predicted", the goal of the loss function is to **quantify the distance between the predicted 120 | vector h2 and the actual label y provided by humans**. 121 | 122 | Note that the Loss function contains a **regularization component** that penalizes large weight values as in a Ridge 123 | regression. In other words, large squared weight values will increase the Loss function, **an error metric we indeed want to minimize**. 124 | 125 | {:width="500px"} 126 | 127 | ### Backward step: 128 | >The goal of this step is to **update the weights of the neural network** in a direction that minimizes its Loss function. 129 | As we will see, this is a **recursive algorithm**, which can reuse gradients previously computed and heavily relies on 130 | **differentiable functions**. Since these updates reduce the loss function, a network ‘learns’ to approximate the label 131 | of observations with known classes. A property called **generalization**. 132 | 133 | This step goes in the **reverse order** of the forward step. It first computes the partial derivative of the loss function 134 | with respect to the weights of the output layer (dLoss/dW2) and then the hidden layer (dLoss/dW1). Let's explain 135 | each one in detail. 
136 | 137 | #### dLoss/dW2: 138 | 139 | The chain rule says that we can decompose the computation of gradients of a neural network into **differentiable pieces**: 140 | 141 | {:width="500px"} 142 | 143 | As a memory helper, these are the **function definitions** used above and their **first derivatives**: 144 | 145 | | Function | First derivative | 146 | |------------------------------------------------------------ |------------------------------------------------------------| 147 | |Loss = (y-h2)^2 | dLoss/dh2 = -(y-h2) | 148 | |h2 = Sigmoid(z2) | dh2/dz2 = h2(1-h2) | 149 | |z2 = h1W2 | dz2/dW2 = h1 | 150 | |z2 = h1W2 | dz2/dh1 = W2 | 151 | 152 | 153 | More visually, we aim to update the weights W2 (in blue) in the below figure. In order to do that, we need to compute 154 | three **partial derivatives along the chain**. 155 | 156 | {:width="500px"} 157 | 158 | Plugging values into these partial derivatives allows us to compute gradients with respect to weights W2 as follows. 159 | 160 | {:width="600px"} 161 | 162 | The result is a 3x2 matrix dLoss/dW2, which will update the original W2 values in a direction that minimizes the Loss function. 163 | 164 | {:width="700px"} 165 | 166 | #### dLoss/dW1: 167 | 168 | Computing the **chain rule** for updating the weights of the first hidden layer W1 exhibits the possibility of **reusing existing 169 | computations**. 170 | 171 | {:width="500px"} 172 | 173 | More visually, the **path from the output layer to the weights W1** touches partial derivatives already computed in **later 174 | layers**. 175 | 176 | {:width="500px"} 177 | 178 | For example, partial derivatives dLoss/dh2 and dh2/dz2 have already been computed as a dependency for learning weights 179 | of the output layer dLoss/dW2 in the previous section. 
180 | 181 | {:width="700px"} 182 | 183 | Placing all derivatives together, we can execute the **chain rule** again to update the weights of the hidden layer W1: 184 | 185 | {:width="700px"} 186 | 187 | Finally, we assign the new values of the weights and have completed one iteration of training the network. 188 | 189 | {:width="150px"} 190 | 191 | ### Implementation 192 | 193 | Let's translate the above mathematical equations to code only using [Numpy](http://www.numpy.org/) as our **linear algebra engine**. 194 | Neural networks are trained in a loop in which each iteration presents already **calibrated input data** to the network. 195 | In this small example, let's just consider the entire dataset in each iteration. The computations of the **Forward step**, 196 | **Loss**, and **Backward step** lead to good generalization since we update the **trainable parameters** (matrices w1 and 197 | w2 in the code) with their corresponding **gradients** (matrices dL_dw1 and dL_dw2) in every cycle. 198 | Code is stored in this repository: [https://github.com/omar-florez/scratch_mlp](https://github.com/omar-florez/scratch_mlp) 199 | 200 |  201 | 202 | ### Let's Run This! 203 | 204 | See below **some neural networks** trained to approximate the **XOR function** over many iterations. 205 | 206 | **Left plot:** Accuracy. **Central plot:** Learned decision boundary. **Right plot:** Loss function. 207 | 208 | First let's see how a neural network with **3 neurons** in the hidden layer has small capacity. This model learns to separate 2 classes 209 | with a **simple decision boundary** that starts as a straight line but then shows non-linear behavior. 210 | The loss function in the right plot steadily decreases as training continues. 211 | 212 |  213 | 214 | Having **50 neurons** in the hidden layer notably increases the model's power to learn more **complex decision boundaries**. 
215 | This can not only produce more accurate results, but also **exploding gradients**, a notable problem when training neural networks. 216 | This happens when very large gradients multiply weights during backpropagation and thus generate large updated weights. 217 | This is the reason why the **Loss value suddenly increases** during the last steps of the training (step > 90). 218 | The **regularization component** of the Loss function computes the **squared values** of weights that are already very large (sum(W^2)/2N). 219 | 220 |  221 | 222 | This problem can be avoided by **reducing the learning rate** as you can see below. Or by implementing a policy that reduces 223 | the learning rate over time. Or by enforcing a stronger regularization, maybe L1 instead of L2. 224 | **Exploding** and **vanishing gradients** are interesting phenomena and we will devote an entire analysis to them later. 225 | 226 |  227 | 228 | -------------------------------------------------------------------------------- /docs/index_es.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "Un LEGO a la vez: Explicando la Matemática de cómo las Redes Neuronales Aprenden" 4 | tagline: 5 | description: Tutorial de retro-alimentación 6 | --- 7 | 8 | >Una **red neuronal** es una composición inteligente de módulos lineales y no lineales. Cuando los escogemos sabiamente, tenemos una herramienta muy poderosa para optimizar cualquier función matemática. Por ejemplo una que **separe clases con un límite de decisión no lineal**. 9 | 10 | Un tópico que no es siempre explicado en detalle, a pesar de su naturaleza intuitiva y modular, es el **algoritmo de retro-alimentación** (backpropagation algorithm), 11 | responsable de actualizar parámetros entrenables en la red. Construyamos una red neuronal desde cero para ver el funcionamiento interno de una red neuronal usando **piezas de LEGO como una analogía**, un bloque a la vez. 
12 | 13 | Código implementando estos conceptos pueden ser encontrados en el siguiente repositorio: [https://github.com/omar-florez/scratch_mlp](https://github.com/omar-florez/scratch_mlp) 14 | 15 | ## Las Redes Neuronales como una Composición de Piezas 16 | 17 |  18 | 19 | La figura de arriba muestra algo de la matemática usada para entrenar una red neuronal. Haremos sentido de esto durante el articulo. 20 | El lector puede encontrar interesante que una red neuronal es una pila de módulos con diferentes propósitos: 21 | 22 | - **Entrada X** alimenta la red neuronal con datos sin procesar, la cual se almacena en una matriz en la cual las observaciones con filas y las dimensiones son columnas 23 | - **Pesos W1** proyectan entrada X a la primera capa escondida h1. Pesos W1 trabajan entonces como un kernel lineal 24 | - Una **función Sigmoid** que previene los números de la capa escondida de salir del rango 0-1. El resultado es un **array activaciones neuronales** h1 = Sigmoid(WX) 25 | 26 | Hasta este punto estas operaciones solo calculan un **sistema general lineal**, el cual no tiene la capacidad de modelar interacciones no lineales. 27 | Esto cambia cuando ponemos otro elemento en el pila, añadiendo profundidad a la estructura modular. Mientras más profunda sea la red, más interacciones no-lineales podremos aprender y problemas mas complejos podremos resolver, lo cual puede explicar en parte la popularidad de redes neuronales. 28 | 29 | ## Porque debería leer esto? 30 | 31 | >Si uno entiende las partes internas de una red neuronal, es mas fácil saber **que cambiar primero** cuando el algoritmo no funcione como es esperado y permite definir una estrategia para **probar invariantes** and **comportamientos esperados** que uno saben son parte del algoritmo. Esto también es útil cuando el lector quiere **crear nuevos algoritmos que actualmente no están implementados en la librería de Machine Learning de preferencia**. 
32 | 33 | **Porque hacer debugging de modelos de aprendizaje de máquina es una tarea compleja**. Por experiencia, los modelos matemáticos no funcionan como se espera al primer intento. A veces estos pueden darte una exactitud baja para datos nuevos, tomar mucho tiempo de entrenamiento o mucha memoria RAM, devolver una gran cantidad de falsos negativos o valores NaN (Not a Number), etc. Déjame mostrarte algunos casos donde saber cómo funciona el algoritmo puede ser útil: 34 | 35 | - Si **toma mucho tiempo para entrenar**, es quizás una buena idea incrementar el tamaño del mini-batch o array de observaciones que alimentan a la red neuronal para reducir la varianza en las observaciones y así ayudar al algoritmo a converger 36 | - Si se observan **valores NaN**, el algoritmo ha recibido gradientes con valores muy altos produciendo desborde numérico. Piensa esto como una secuencia de multiplicaciones de matrices que explotan después de varias iteraciones. Reducir la velocidad de aprendizaje tendrá el efecto de escalar estos valores. Reducir el número de capas reducirá el número de multiplicaciones. Y poner una cota superior a los gradientes (clipping gradients) controlará este problema explícitamente 37 | 38 | ## Un Ejemplo Concreto: Aprendiendo la Función XOR 39 | 40 | >Abramos la caja negra. Construiremos a continuación una red neuronal desde cero que aprende la **función XOR**. 41 | La elección de esta **función no lineal** no es por casualidad. Sin backpropagation sería difícil aprender a separar clases con una **línea recta**. 42 | 43 | Para ilustrar este importante concepto, note a continuación cómo una línea recta no puede separar 0s y 1s, las salidas de la función XOR. **Los problemas reales también son linealmente no separables**. 
44 | 45 |  46 | 47 | La topología de la red es simple: 48 | - **Entrada X** es un vector de dos dimensiones 49 | - **Pesos W1** son una matriz de 2x3 dimensiones con valores inicializados de forma aleatoria 50 | - **Capa escondida h1** consiste de 3 neuronas. Cada neurona recibe como entrada la suma de sus observaciones escaladas por sus pesos; este es el producto punto resaltado en verde en la figura de abajo: **z1 = [x1, x2][w1, w2]** 51 | - **Pesos W2** son una matriz de 3x2 con valores inicializados de forma aleatoria y 52 | - **Capa de salida h2** consiste de 2 neuronas ya que la función XOR retorna 0 (y1=[0,1]) o 1 (y2 = [1,0]) 53 | 54 | Más visualmente: 55 | 56 |  57 | 58 | Entrenemos ahora el modelo. En nuestro ejemplo los valores entrenables son los pesos, pero tenga en cuenta que la investigación actual está explorando nuevos tipos de parámetros a ser optimizados. Por ejemplo, atajos entre capas, distribuciones estables en las capas, topologías, velocidades de aprendizaje, etc. 59 | 60 | **Backpropagation** es un método para actualizar los pesos en la dirección (**gradiente**) que minimiza una métrica de error predefinida conocida como **función Loss** 61 | dado un conjunto de observaciones etiquetadas. Este algoritmo ha sido repetidamente redescubierto y es un caso especial de una técnica más general llamada [diferenciación automática](https://en.wikipedia.org/wiki/Automatic_differentiation) en modo acumulativo reverso. 62 | 63 | ### Inicialización de la Red 64 | 65 | >Inicialicemos **los pesos de la red** con valores aleatorios. 66 | 67 | {:width="1300px"} 68 | 69 | ### Propagación hacia Adelante: 70 | 71 | >El objetivo de este paso es **propagar hacia delante** la entrada X a cada capa de la red hasta calcular un vector en la capa de salida h2. 
72 | 73 | Es así como sucede: 74 | - Se proyecta linealmente la entrada X usando pesos W1 a manera de kernel: 75 | 76 | 77 | {:width="500px"} 78 | 79 | - Se escala esta suma z1 con una función Sigmoid para obtener valores de la primera capa escondida. **Note que el vector original de 2D ha sido proyectado ahora a 3D**. 80 | 81 | {:width="400px"} 82 | 83 | - Un proceso similar tiene lugar para la segunda capa h2. Calculemos primero la **suma** z2 de la primera capa escondida, la cual es ahora un vector de entrada. 84 | 85 | {:width="500px"} 86 | 87 | - Y luego calculemos su activación Sigmoid. Este vector [0.37166596 0.45414264] representa la **probabilidad** 88 | o **vector predicho** calculado por la red dados los datos de entrada X. 89 | 90 | {:width="300px"} 91 | 92 | ### Calculando el Error Total 93 | 94 | >También conocido como "valor real menos predicho", el objetivo de la función Loss es **cuantificar la distancia entre el vector predicho h2 y la etiqueta real proveída por un ser humano, y**. 95 | 96 | Note que la función Loss contiene un **componente de regularización** que penaliza valores de los pesos muy altos a manera de una regresión L2. En otras palabras, grandes valores cuadrados de los pesos incrementarán la función Loss, **una métrica de error que en realidad queremos reducir**. 97 | 98 | {:width="500px"} 99 | 100 | ### Propagación hacia Atrás: 101 | >El objetivo de este paso es **actualizar los pesos de la red neuronal** en una dirección que minimiza la función Loss. 102 | Como veremos más adelante, este es un **algoritmo recursivo**, el cual reutiliza gradientes previamente calculadas y se basa plenamente en 103 | **funciones diferenciables**. Ya que estas actualizaciones reducen la función Loss, una red ‘aprende’ a aproximar las etiquetas de nuevas observaciones. Una propiedad llamada **generalización**. 104 | 105 | Este paso va en **orden inverso** al de la propagación hacia adelante. 
Este calcula la primera derivada de la función Loss con respecto a los pesos de la red neuronal de la capa de salida (dLoss/dW2) y luego los de la capa escondida (dLoss/dW1). Expliquemos en detalle cada uno. 106 | 107 | #### dLoss/dW2: 108 | 109 | La regla de la cadena dice que podemos descomponer el cálculo de gradientes de una red neuronal en **funciones diferenciables**: 110 | 111 | {:width="500px"} 112 | 113 | Aquí están las **definiciones de funciones** usadas arriba y sus **primeras derivadas**: 114 | 115 | | Función | Primera derivada | 116 | |------------------------------------------------------------ |------------------------------------------------------------| 117 | |Loss = (y-h2)^2 / 2 | dLoss/dh2 = -(y-h2) | 118 | |h2 = Sigmoid(z2) | dh2/dz2 = h2(1-h2) | 119 | |z2 = h1W2 | dz2/dW2 = h1 | 120 | |z2 = h1W2 | dz2/dh1 = W2 | 121 | 122 | 123 | Más visualmente, queremos actualizar los pesos W2 (en azul) en la figura de abajo. Para eso necesitamos calcular tres **derivadas parciales a lo largo de la cadena**. 124 | 125 | {:width="500px"} 126 | 127 | Insertar esos valores en esas derivadas parciales nos permite calcular gradientes con respecto a los pesos W2 como sigue. 128 | 129 | {:width="600px"} 130 | 131 | El resultado es una matriz de 3x2 llamada dLoss/dW2, la cual actualizará los valores originales de W2 en una dirección que minimiza la función Loss. 132 | 133 | {:width="700px"} 134 | 135 | #### dLoss/dW1: 136 | 137 | Calcular la **regla de la cadena** para actualizar los pesos de la primera capa escondida W1 exhibe la posibilidad de **reutilizar cálculos existentes**. 138 | 139 | {:width="500px"} 140 | 141 | Más visualmente, el **camino desde la capa de salida hasta los pesos W1** toca derivadas parciales ya calculadas en capas superiores. 142 | 143 | {:width="500px"} 144 | 145 | Por ejemplo, las derivadas parciales dLoss/dh2 y dh2/dz2 han sido ya calculadas como una dependencia para aprender los pesos de la capa de salida dLoss/dW2 en la sección anterior. 
146 | 147 | {:width="700px"} 148 | 149 | Ubicando todas las derivadas juntas, podemos ejecutar la **regla de la cadena** de nuevo para actualizar los pesos de la capa escondida W1: 150 | 151 | {:width="700px"} 152 | 153 | Finalmente, asignamos los nuevos valores de los pesos y ¡hemos completado una iteración del entrenamiento de la red neuronal! 154 | 155 | {:width="150px"} 156 | 157 | ### Implementación 158 | 159 | Traduzcamos las ecuaciones matemáticas de arriba en código solamente utilizando [Numpy](http://www.numpy.org/) como nuestro **motor de álgebra lineal**. 160 | Las redes neuronales son entrenadas en un loop en el cual cada iteración presenta **datos de entrada ya calibrados** a la red. 161 | En este pequeño ejemplo, consideremos todo el dataset en cada iteración. Los cálculos del paso de **Propagación hacia adelante**, 162 | **Loss**, y **Propagación hacia atrás** conducen a obtener una buena generalización ya que actualizaremos los **parámetros entrenables** (matrices W1 y W2 en el código) con sus correspondientes **gradientes** (matrices dL_dw1 y dL_dw2) en cada ciclo. 163 | El código está almacenado en este repositorio: [https://github.com/omar-florez/scratch_mlp](https://github.com/omar-florez/scratch_mlp) 164 | 165 |  166 | 167 | ### ¡Ejecutemos Esto! 168 | 169 | Mire abajo **algunas redes neuronales** entrenadas para aproximar la **función XOR** en múltiples iteraciones. 170 | 171 | **Izquierda:** Exactitud. **Centro:** Borde de decisión aprendido. **Derecha:** Función Loss. 172 | 173 | Primero veamos cómo una red neuronal con **3 neuronas** en la capa escondida tiene una pequeña capacidad. Este modelo aprende a separar dos clases con un **simple borde de decisión** que empieza como una línea recta, pero luego muestra un comportamiento no lineal. 174 | La función Loss en la derecha suavemente se reduce mientras el proceso de aprendizaje ocurre. 
175 | 176 |  177 | 178 | Tener **50 neuronas** en la capa escondida incrementa notablemente el poder del modelo para aprender **bordes de decisión más complejos**. 179 | Esto podría no solo producir resultados más exactos, sino también **explotar las gradientes**, un problema notable cuando se entrenan redes neuronales. 180 | Esto sucede cuando gradientes muy grandes multiplican pesos durante la propagación hacia atrás y así generan pesos actualizados muy grandes. 181 | Esta es la razón por la que **valores de la función Loss repentinamente se incrementan** durante los últimos pasos del entrenamiento (step > 90). 182 | El **componente de regularización** de la función Loss calcula los **valores cuadrados** de los pesos que ya tienen valores muy altos (sum(W^2)/2N). 183 | 184 |  185 | 186 | Este problema puede ser evitado **reduciendo la velocidad de aprendizaje** como puede ver abajo. O implementando una política que reduzca la velocidad de aprendizaje con el tiempo. O imponiendo una regularización más fuerte, quizás L1 en vez de L2. 187 | Gradientes que **explotan** y se **desvanecen** son interesantes fenómenos y haremos un análisis detallado de eso más adelante. 188 | 189 |  190 | 191 | 192 | -------------------------------------------------------------------------------- /docs/pages/independent_site.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Making an independent website 4 | description: How to make an independent website with GitHub Pages. 5 | --- 6 | 7 | This is what to do if you just want a website. (This page is a bit 8 | long, but it's really not that much work.) 9 | 10 | ### First things 11 | 12 | Start by cloning 13 | [the repository for the present site](https://github.com/kbroman/simple_site). (Or, 14 | alternatively, fork it and then clone your own version.) 
15 | 16 | git clone https://github.com/kbroman/simple_site 17 | 18 | Then change the name of that directory to something meaningful. 19 | 20 | mv simple_site something_meaningful 21 | 22 | (Of course, don't use `something_meaningful` but rather 23 | _something meaningful_.) 24 | 25 | Now change into that directory and remove the `.git` directory 26 | (because you don't want the history of _my_ repository). 27 | 28 | cd something_meaningful 29 | rm -r .git 30 | 31 | Now make it a git repository again. 32 | 33 | git init 34 | 35 | ### Things not to change 36 | 37 | You'll need to keep the following files and directories largely unchanged. 38 | 39 | Rakefile 40 | _includes 41 | _layouts 42 | _plugins 43 | assets/themes 44 | 45 | We _will_ change one file within `_includes/`; see below. 46 | 47 | ### Edit the `_config.yml` file 48 | 49 | The 50 | [`_config.yml`](https://github.com/kbroman/simple_site/blob/gh-pages/_config.yml) 51 | file contains a bunch of configuration information. You'll want to 52 | edit this file to replace my information with your information. 53 | 54 | Perhaps edit the 55 | [line with `exclude:`](https://github.com/kbroman/simple_site/blob/gh-pages/_config.yml#L5) 56 | if you've named `License.md` and/or `ReadMe.md` differently. (I've 57 | edited this line a bit, here.) 58 | 59 | exclude: [..., "ReadMe.md", "Rakefile", "License.md"] 60 | 61 | Edit the 62 | [lines about the site name and author](https://github.com/kbroman/simple_site/blob/gh-pages/_config.yml#L11-L17). 63 | 64 | title : simple site 65 | author : 66 | name : Karl Broman 67 | email : kbroman@gmail.com 68 | github : kbroman 69 | twitter : kwbroman 70 | feedburner : nil 71 | 72 | Edit the 73 | [`production_url` line](https://github.com/kbroman/simple_site/blob/gh-pages/_config.yml#L19) 74 | by replacing `kbroman` with _your_ github user name, and replace 75 | `simple_site` with the name that your repository will have on github 76 | (`something_meaningful`?). 
77 | 78 | production_url : https://kbroman.github.io/simple_site 79 | 80 | Note that the `https` (vs `http`) is important here; see 81 | “[Securing your github pages site with https](https://help.github.com/articles/securing-your-github-pages-site-with-https/).” 82 | (I need to use `http` because my site uses the custom domain 83 | `kbroman.org`, but you likely need `https`.) 84 | 85 | Replace the 86 | [`BASE_PATH` line](https://github.com/kbroman/simple_site/blob/gh-pages/_config.yml#L52) 87 | with the same url. 88 | 89 | BASE_PATH : https://kbroman.github.io/simple_site 90 | 91 | There's also an 92 | [`ASSET_PATH` line](https://github.com/kbroman/simple_site/blob/gh-pages/_config.yml#L62), 93 | but you can leave that commented-out (with the `#` symbol at the beginning). 94 | 95 | Note that for the `BASE_PATH`, I actually have 96 | `http://kbroman.org/` in place of `https://kbroman.github.io/`. I set up 97 | a 98 | [custom domain](https://help.github.com/articles/setting-up-a-custom-domain-with-github-pages), 99 | which involved a series of emails with a DNS provider. I 100 | don't totally understand how it works, and I'm not _entirely_ sure 101 | that I've done it right. But if you want to have a custom domain, take 102 | a look at 103 | [that GitHub help page](https://help.github.com/articles/setting-up-a-custom-domain-with-github-pages). 104 | 105 | ### Edit `_includes/themes/twitter/default.html` 106 | 107 | The 108 | [`_includes/themes/twitter/default.html`](https://github.com/kbroman/simple_site/blob/gh-pages/_includes/themes/twitter/default.html) 109 | file defines how a basic page will look on your site. In particular, 110 | it contains a bit of html code for a footer, if you want one. 111 | 112 | Find the 113 | [footer for my site](https://github.com/kbroman/simple_site/blob/gh-pages/_includes/themes/twitter/default.html#L47-L50) 114 | and remove it or edit it to suit. This is the only bit of html you'll 115 | have to deal with. 116 | 117 | 118 | ... 
119 | Karl Broman 120 | 121 | 122 | ### Edit or remove the Markdown files 123 | 124 | Edit the 125 | [`index.md`](https://raw.githubusercontent.com/kbroman/simple_site/gh-pages/index.md) 126 | file, which will become the main page for your site. 127 | 128 | First, edit the initial chunk with a different title and tagline. Feel 129 | free to just delete the tagline. 130 | 131 | --- 132 | layout: page 133 | title: simple site 134 | tagline: Easy websites with GitHub Pages 135 | --- 136 | 137 | Now edit (or, for now, just remove) the rest of the file. 138 | 139 | Now go into the [`pages/`](https://github.com/kbroman/simple_site/blob/gh-pages/pages) directory and remove or rename and modify 140 | all of the Markdown files in there. 141 | 142 | Note that when you link to any of these Markdown-based pages, you'll 143 | want to use a `.html` extension rather than `.md`. For example, look 144 | at the 145 | [main page](https://raw.githubusercontent.com/kbroman/simple_site/gh-pages/index.md) 146 | for this site; the links in the bullet points for the various pages 147 | look like this: 148 | 149 | - [Overview](pages/overview.html) 150 | - [Making an independent website](pages/independent_site.html) 151 | - [Making a personal site](pages/user_site.html) 152 | - [Making a site for a project](pages/project_site.html) 153 | - [Making a jekyll-free site](pages/nojekyll.html) 154 | - [Testing your site locally](pages/local_test.html) 155 | - [Resources](pages/resources.html) 156 | 157 | ### Commit all of these changes. 158 | 159 | At the start, we'd removed the `.git/` subdirectory (with the history 160 | of _my_ repository) and then used `git init` to make it a new git 161 | repository. 162 | 163 | Now you want to add and commit all of the files, as modified. 164 | 165 | git add . 166 | git commit -m "Initial commit" 167 | 168 | Then change the name of the master branch to `gh-pages`. 
169 | 170 | git branch -m master gh-pages 171 | 172 | ### Push everything to GitHub 173 | 174 | Now go back to GitHub and create a new repository, called something 175 | meaningful. (I'll again pretend that it's explicitly 176 | `something_meaningful`.) 177 | 178 | Then go back to the command line and push your repository to 179 | [GitHub](https://github.com). 180 | 181 | git remote add origin git@github.com:username/something_meaningful 182 | 183 | Replace `username` with your GitHub user name and 184 | `something_meaningful` with the name of your repository. And you might 185 | want to use the `https://` construction instead, if you're not using ssh. 186 | 187 | git remote add origin https://github.com/username/something_meaningful 188 | 189 | Finally, push everything to GitHub. 190 | 191 | git push -u origin gh-pages 192 | 193 | Note that we're using `gh-pages` and not `master` here, as we want 194 | this stuff in a `gh-pages` branch. 195 | 196 | ### Check whether it worked 197 | 198 | Go to `https://username.github.io/something_meaningful` and cross your 199 | fingers that it worked. (Really, _I_ should be crossing my fingers.) 200 | 201 | ### Up next 202 | 203 | Now go to [making a personal site](user_site.html). 204 | -------------------------------------------------------------------------------- /docs/pages/local_test.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Testing your site locally 4 | description: How to test your GitHub Pages site locally. 5 | --- 6 | 7 | To test your site locally, you'll need 8 | 9 | - [ruby](https://www.ruby-lang.org/en/) 10 | - the [github-pages](https://github.com/github/pages-gem) gem 11 | 12 | ### Installing ruby 13 | 14 | There are 15 | [lots of different ways to install ruby](https://www.ruby-lang.org/en/installation/). 16 | 17 | 18 | In Mac OS X, older versions of ruby will already be installed. 
But I 19 | use the [Ruby Version Manager (RVM)](https://rvm.io/) to have a more 20 | recent version. You could also use [Homebrew](https://brew.sh/). 21 | 22 | In Windows, use [RubyInstaller](https://rubyinstaller.org/). (In most 23 | of this tutorial, I've assumed you're using a Mac or some flavor of 24 | Unix. It's possible that none of this was usable for Windows 25 | folks. Sorry!) 26 | 27 | 28 | ### Installing the github-pages gem 29 | 30 | Run the following command: 31 | 32 | gem install github-pages 33 | 34 | This will install the `github-pages` gem and all dependencies 35 | (including [jekyll](https://jekyllrb.com/)). 36 | 37 | Later, to update the gem, type: 38 | 39 | gem update github-pages 40 | 41 | 42 | ### Testing your site locally 43 | 44 | To construct and test your site locally, go into the directory and 45 | type 46 | 47 | jekyll build 48 | 49 | This will create (or modify) a `_site/` directory, containing 50 | everything from `assets/`, and then the `index.md` and all 51 | `pages/*.md` files, converted to html. (So there'll be 52 | `_site/index.html` and the various `_site/pages/*.html`.) 53 | 54 | Type the following in order to “serve” the site. 55 | This will first run `build`, and so it does _not_ need to be 56 | preceded by `jekyll build`. 57 | 58 | jekyll serve 59 | 60 | Now open your browser and go to