├── .gitattributes ├── .gitignore ├── 1- Neural Networks and Deep Learning ├── Images │ ├── 02.png │ ├── 03.png │ ├── 04.png │ ├── 05.png │ ├── 06.png │ ├── 07.png │ ├── 08.png │ ├── 09.png │ ├── 10.png │ ├── 11.png │ └── Others │ │ ├── 01.jpg │ │ ├── 02.png │ │ └── 03.png └── Readme.md ├── 2- Improving Deep Neural Networks ├── Images │ ├── 01-_Bias_-_Variance.png │ ├── 02-_Early_stopping.png │ ├── 03-_Numerical_approximation_of_gradients.png │ ├── 04-_batch_vs_mini_batch_cost.png │ ├── 05-_exponentially_weighted_averages_intuitions.png │ ├── 06-_RMSprop.png │ ├── 07-_softmax.png │ ├── Nasdaq1_small.png │ └── bn.png └── Readme.md ├── 3- Structuring Machine Learning Projects ├── Images │ └── 01-_Why_human-level_performance.png └── Readme.md ├── 4- Convolutional Neural Networks ├── Images │ ├── 01.png │ ├── 02.png │ ├── 03.png │ ├── 04.png │ ├── 05.png │ ├── 06.png │ ├── 07.png │ ├── 08.png │ ├── 09.png │ ├── 10.png │ ├── 11.png │ ├── 12.png │ ├── 13.png │ ├── 14.png │ ├── 15.png │ ├── 16.png │ ├── 17.png │ ├── 18.png │ ├── 19.png │ ├── 20.png │ ├── 21.png │ ├── 22.png │ ├── 23.png │ ├── 24.png │ ├── 25.png │ ├── 26.png │ ├── 27.png │ ├── 28.png │ ├── 29.png │ ├── 30.png │ ├── 31.png │ ├── 32.png │ ├── 33.png │ ├── 34.png │ ├── 35.png │ ├── 36.png │ ├── 37.png │ ├── 38.png │ ├── 39.png │ ├── 40.png │ ├── 41.png │ ├── 42.png │ ├── 43.png │ ├── 44.png │ ├── Classification.jpg │ ├── ClassificationLoc.jpg │ ├── InstanceSegmentation.png │ ├── ObjectDetection.png │ ├── SemanticSegmentation.png │ ├── inception_block1a.png │ ├── receptiveField.png │ └── resNet.jpg └── Readme.md ├── 5- Sequence Models ├── Images │ ├── 01.png │ ├── 02.png │ ├── 03.png │ ├── 04.png │ ├── 05.png │ ├── 06.png │ ├── 07.png │ ├── 08.png │ ├── 09.jpg │ ├── 10.png │ ├── 11.png │ ├── 12.png │ ├── 12_different_types_of_rnn.jpg │ ├── 13.png │ ├── 14.png │ ├── 15.png │ ├── 16.png │ ├── 17.png │ ├── 18.png │ ├── 19.png │ ├── 20.png │ ├── 21.png │ ├── 22.png │ ├── 23.png │ ├── 24.png │ ├── 25.png │ ├── 26.png │ ├── 27.png │ ├── 28.png │ ├── 29.png │ ├── 30.png │ ├── 31.png │ ├── 32.png │ ├── 33.png │ ├── 34.png │ ├── 35.png │ ├── 36.png │ ├── 37.png │ ├── 38.png │ ├── 39.png │ ├── 40.png │ ├── 41.png │ ├── 42.png │ ├── 43.png │ ├── 44.png │ ├── 45.png │ ├── 46.png │ ├── 47.png │ ├── 48.png │ ├── 49.png │ ├── 50.png │ ├── 51.png │ ├── 52.png │ ├── 53.png │ ├── 54.png │ ├── 55.png │ ├── 56.png │ ├── 57.png │ ├── 58.png │ ├── 59.png │ ├── 60.png │ ├── 61.png │ ├── 62.png │ ├── 63.png │ ├── 64.png │ ├── 65.png │ ├── 66.png │ ├── 67.png │ ├── 68.png │ ├── 69.png │ ├── 70.png │ ├── 71.png │ ├── 72.png │ ├── 73.png │ ├── 74.png │ ├── 75.png │ ├── 76.png │ ├── 77.png │ ├── 78.png │ ├── 79.png │ ├── 80.png │ ├── 81.png │ ├── 83.png │ ├── 84.png │ └── 85.png └── Readme.md ├── Certificate.png ├── LICENSE ├── Notebooks headers.md ├── Readme.md └── download.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 
| .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | .venv/ 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Images/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/1- Neural Networks and Deep Learning/Images/02.png -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Images/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/1- Neural Networks and Deep Learning/Images/03.png -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Images/04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/1- Neural Networks and Deep Learning/Images/04.png -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Images/05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/1- Neural Networks and Deep Learning/Images/05.png -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Images/06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/1- Neural Networks and Deep Learning/Images/06.png -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Images/07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/1- Neural Networks and Deep Learning/Images/07.png -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Images/08.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/1- Neural Networks and Deep Learning/Images/08.png -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Images/09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/1- Neural Networks and Deep Learning/Images/09.png -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Images/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/1- Neural Networks and Deep Learning/Images/10.png -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Images/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/1- Neural Networks and Deep Learning/Images/11.png -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Images/Others/01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/1- Neural Networks and Deep Learning/Images/Others/01.jpg -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Images/Others/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/1- Neural Networks and Deep Learning/Images/Others/02.png -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Images/Others/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/1- Neural Networks and Deep Learning/Images/Others/03.png -------------------------------------------------------------------------------- /1- Neural Networks and Deep Learning/Readme.md: -------------------------------------------------------------------------------- 1 | # Neural Networks and Deep Learning 2 | 3 | This is the first course of the deep learning specialization at [Coursera](https://www.coursera.org/specializations/deep-learning) which is moderated by [DeepLearning.ai](http://deeplearning.ai/). The course is taught by Andrew Ng. 
4 | 5 | ## Table of contents 6 | 7 | * [Neural Networks and Deep Learning](#neural-networks-and-deep-learning) 8 | * [Table of contents](#table-of-contents) 9 | * [Course summary](#course-summary) 10 | * [Introduction to deep learning](#introduction-to-deep-learning) 11 | * [What is a (Neural Network) NN?](#what-is-a-neural-network-nn) 12 | * [Supervised learning with neural networks](#supervised-learning-with-neural-networks) 13 | * [Why is deep learning taking off?](#why-is-deep-learning-taking-off) 14 | * [Neural Networks Basics](#neural-networks-basics) 15 | * [Binary classification](#binary-classification) 16 | * [Logistic regression](#logistic-regression) 17 | * [Logistic regression cost function](#logistic-regression-cost-function) 18 | * [Gradient Descent](#gradient-descent) 19 | * [Derivatives](#derivatives) 20 | * [More Derivatives examples](#more-derivatives-examples) 21 | * [Computation graph](#computation-graph) 22 | * [Derivatives with a Computation Graph](#derivatives-with-a-computation-graph) 23 | * [Logistic Regression Gradient Descent](#logistic-regression-gradient-descent) 24 | * [Gradient Descent on m Examples](#gradient-descent-on-m-examples) 25 | * [Vectorization](#vectorization) 26 | * [Vectorizing Logistic Regression](#vectorizing-logistic-regression) 27 | * [Notes on Python and NumPy](#notes-on-python-and-numpy) 28 | * [General Notes](#general-notes) 29 | * [Shallow neural networks](#shallow-neural-networks) 30 | * [Neural Networks Overview](#neural-networks-overview) 31 | * [Neural Network Representation](#neural-network-representation) 32 | * [Computing a Neural Network's Output](#computing-a-neural-networks-output) 33 | * [Vectorizing across multiple examples](#vectorizing-across-multiple-examples) 34 | * [Activation functions](#activation-functions) 35 | * [Why do you need non-linear activation functions?](#why-do-you-need-non-linear-activation-functions) 36 | * [Derivatives of activation functions](#derivatives-of-activation-functions) 37 | * [Gradient descent for Neural Networks](#gradient-descent-for-neural-networks) 38 | * [Random Initialization](#random-initialization) 39 | * [Deep Neural Networks](#deep-neural-networks) 40 | * [Deep L-layer neural network](#deep-l-layer-neural-network) 41 | * [Forward Propagation in a Deep Network](#forward-propagation-in-a-deep-network) 42 | * [Getting your matrix dimensions right](#getting-your-matrix-dimensions-right) 43 | * [Why deep representations?](#why-deep-representations) 44 | * [Building blocks of deep neural networks](#building-blocks-of-deep-neural-networks) 45 | * [Forward and Backward Propagation](#forward-and-backward-propagation) 46 | * [Parameters vs Hyperparameters](#parameters-vs-hyperparameters) 47 | * [What does this have to do with the brain](#what-does-this-have-to-do-with-the-brain) 48 | * [Extra: Ian Goodfellow interview](#extra-ian-goodfellow-interview) 49 | 50 | ## Course summary 51 | 52 | Here are the course summary as its given on the course [link](https://www.coursera.org/learn/neural-networks-deep-learning): 53 | 54 | > If you want to break into cutting-edge AI, this course will help you do so. Deep learning engineers are highly sought after, and mastering deep learning will give you numerous new career opportunities. Deep learning is also a new "superpower" that will let you build AI systems that just weren't possible a few years ago. 55 | > 56 | > In this course, you will learn the foundations of deep learning. 
When you finish this class, you will: 57 | > - Understand the major technology trends driving Deep Learning 58 | > - Be able to build, train and apply fully connected deep neural networks 59 | > - Know how to implement efficient (vectorized) neural networks 60 | > - Understand the key parameters in a neural network's architecture 61 | > 62 | > This course also teaches you how Deep Learning actually works, rather than presenting only a cursory or surface-level description. So after completing it, you will be able to apply deep learning to a your own applications. If you are looking for a job in AI, after this course you will also be able to answer basic interview questions. 63 | 64 | 65 | 66 | ## Introduction to deep learning 67 | 68 | > Be able to explain the major trends driving the rise of deep learning, and understand where and how it is applied today. 69 | 70 | ### What is a (Neural Network) NN? 71 | 72 | - Single neuron == linear regression without applying activation(perceptron) 73 | - Basically a single neuron will calculate weighted sum of input(W.T*X) and then we can set a threshold to predict output in a perceptron. If weighted sum of input cross the threshold, perceptron fires and if not then perceptron doesn't predict. 74 | - Perceptron can take real values input or boolean values. 75 | - Actually, when w⋅x+b=0 the perceptron outputs 0. 76 | - Disadvantage of perceptron is that it only output binary values and if we try to give small change in weight and bais then perceptron can flip the output. We need some system which can modify the output slightly according to small change in weight and bias. Here comes sigmoid function in picture. 77 | - If we change perceptron with a sigmoid function, then we can make slight change in output. 78 | - e.g. output in perceptron = 0, you slightly changed weight and bias, output becomes = 1 but actual output is 0.7. In case of sigmoid, output1 = 0, slight change in weight and bias, output = 0.7. 79 | - If we apply sigmoid activation function then Single neuron will act as Logistic Regression. 80 | - we can understand difference between perceptron and sigmoid function by looking at sigmoid function graph. 81 | 82 | - Simple NN graph: 83 | - ![](Images/Others/01.jpg) 84 | - Image taken from [tutorialspoint.com](http://www.tutorialspoint.com/) 85 | - RELU stands for rectified linear unit is the most popular activation function right now that makes deep NNs train faster now. 86 | - Hidden layers predicts connection between inputs automatically, thats what deep learning is good at. 87 | - Deep NN consists of more hidden layers (Deeper layers) 88 | - ![](Images/Others/02.png) 89 | - Image taken from [opennn.net](http://www.opennn.net/) 90 | - Each Input will be connected to the hidden layer and the NN will decide the connections. 91 | - Supervised learning means we have the (X,Y) and we need to get the function that maps X to Y. 92 | 93 | ### Supervised learning with neural networks 94 | 95 | - Different types of neural networks for supervised learning which includes: 96 | - CNN or convolutional neural networks (Useful in computer vision) 97 | - RNN or Recurrent neural networks (Useful in Speech recognition or NLP) 98 | - Standard NN (Useful for Structured data) 99 | - Hybrid/custom NN or a Collection of NNs types 100 | - Structured data is like the databases and tables. 101 | - Unstructured data is like images, video, audio, and text. 102 | - Structured data gives more money because companies relies on prediction on its big data. 
103 | 104 | ### Why is deep learning taking off? 105 | 106 | - Deep learning is taking off for 3 reasons: 107 | 1. Data: 108 | - Using this image we can conclude: 109 | - ![](Images/11.png) 110 | - For small data NN can perform as Linear regression or SVM (Support vector machine) 111 | - For big data a small NN is better that SVM 112 | - For big data a big NN is better that a medium NN is better that small NN. 113 | - Hopefully we have a lot of data because the world is using the computer a little bit more 114 | - Mobiles 115 | - IOT (Internet of things) 116 | 2. Computation: 117 | - GPUs. 118 | - Powerful CPUs. 119 | - Distributed computing. 120 | - ASICs 121 | 3. Algorithm: 122 | 1. Creative algorithms has appeared that changed the way NN works. 123 | - For example using RELU function is so much better than using SIGMOID function in training a NN because it helps with the vanishing gradient problem. 124 | 125 | ​ 126 | 127 | ## Neural Networks Basics 128 | 129 | > Learn to set up a machine learning problem with a neural network mindset. Learn to use vectorization to speed up your models. 130 | 131 | ### Binary classification 132 | 133 | - Mainly he is talking about how to do a logistic regression to make a binary classifier. 134 | - ![log](Images/Others/03.png) 135 | - Image taken from [3.bp.blogspot.com](http://3.bp.blogspot.com) 136 | - He talked about an example of knowing if the current image contains a cat or not. 137 | - Here are some notations: 138 | - `M is the number of training vectors` 139 | - `Nx is the size of the input vector` 140 | - `Ny is the size of the output vector` 141 | - `X(1) is the first input vector` 142 | - `Y(1) is the first output vector` 143 | - `X = [x(1) x(2).. x(M)]` 144 | - `Y = (y(1) y(2).. y(M))` 145 | - We will use python in this course. 146 | - In NumPy we can make matrices and make operations on them in a fast and reliable time. 147 | 148 | ### Logistic regression 149 | 150 | - Algorithm is used for classification algorithm of 2 classes. 151 | - Equations: 152 | - Simple equation: `y = wx + b` 153 | - If x is a vector: `y = w(transpose)x + b` 154 | - If we need y to be in between 0 and 1 (probability): `y = sigmoid(w(transpose)x + b)` 155 | - In some notations this might be used: `y = sigmoid(w(transpose)x)` 156 | - While `b` is `w0` of `w` and we add `x0 = 1`. but we won't use this notation in the course (Andrew said that the first notation is better). 157 | - In binary classification `Y` has to be between `0` and `1`. 158 | - In the last equation `w` is a vector of `Nx` and `b` is a real number 159 | 160 | ### Logistic regression cost function 161 | 162 | - First loss function would be the square root error: `L(y',y) = 1/2 (y' - y)^2` 163 | - But we won't use this notation because it leads us to optimization problem which is non convex, means it contains local optimum points. 164 | - This is the function that we will use: `L(y',y) = - (y*log(y') + (1-y)*log(1-y'))` 165 | - To explain the last function lets see: 166 | - if `y = 1` ==> `L(y',1) = -log(y')` ==> we want `y'` to be the largest ==> `y`' biggest value is 1 167 | - if `y = 0` ==> `L(y',0) = -log(1-y')` ==> we want `1-y'` to be the largest ==> `y'` to be smaller as possible because it can only has 1 value. 168 | - Then the Cost function will be: `J(w,b) = (1/m) * Sum(L(y'[i],y[i]))` 169 | - The loss function computes the error for a single training example; the cost function is the average of the loss functions of the entire training set. 
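- As a small illustrative sketch (mine, not from the course), the cost above could be computed with NumPy, assuming predictions `A` and labels `Y` are row vectors of shape `(1, m)`:

```python
import numpy as np

def compute_cost(A, Y):
    """Cross-entropy cost J for predictions A and labels Y, both shaped (1, m)."""
    m = Y.shape[1]
    # L(y', y) = -(y*log(y') + (1-y)*log(1-y')), then average over the m examples
    losses = -(Y * np.log(A) + (1 - Y) * np.log(1 - A))
    return np.sum(losses) / m
```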
170 | 171 | ### Gradient Descent 172 | 173 | - We want to predict `w` and `b` that minimize the cost function. 174 | - Our cost function is convex. 175 | - First we initialize `w` and `b` to 0,0 or initialize them to a random value in the convex function and then try to improve the values the reach minimum value. 176 | - In Logistic regression people always use 0,0 instead of random. 177 | - The gradient decent algorithm repeats: `w = w - alpha * dw` 178 | where alpha is the learning rate and `dw` is the derivative of `w` (Change to `w`) 179 | The derivative is also the slope of `w` 180 | - Looks like greedy algorithms. the derivative give us the direction to improve our parameters. 181 | 182 | 183 | - The actual equations we will implement: 184 | - `w = w - alpha * d(J(w,b) / dw)` (how much the function slopes in the w direction) 185 | - `b = b - alpha * d(J(w,b) / db)` (how much the function slopes in the d direction) 186 | 187 | ### Derivatives 188 | 189 | - We will talk about some of required calculus. 190 | - You don't need to be a calculus geek to master deep learning but you'll need some skills from it. 191 | - Derivative of a linear line is its slope. 192 | - ex. `f(a) = 3a` `d(f(a))/d(a) = 3` 193 | - if `a = 2` then `f(a) = 6` 194 | - if we move a a little bit `a = 2.001` then `f(a) = 6.003` means that we multiplied the derivative (Slope) to the moved area and added it to the last result. 195 | 196 | ### More Derivatives examples 197 | 198 | - `f(a) = a^2` ==> `d(f(a))/d(a) = 2a` 199 | - `a = 2` ==> `f(a) = 4` 200 | - `a = 2.0001` ==> `f(a) = 4.0004` approx. 201 | - `f(a) = a^3` ==> `d(f(a))/d(a) = 3a^2` 202 | - `f(a) = log(a)` ==> `d(f(a))/d(a) = 1/a` 203 | - To conclude, Derivative is the slope and slope is different in different points in the function thats why the derivative is a function. 204 | 205 | ### Computation graph 206 | 207 | - Its a graph that organizes the computation from left to right. 208 | - ![](Images/02.png) 209 | 210 | ### Derivatives with a Computation Graph 211 | 212 | - Calculus chain rule says: 213 | If `x -> y -> z` (x effect y and y effects z) 214 | Then `d(z)/d(x) = d(z)/d(y) * d(y)/d(x)` 215 | - The video illustrates a big example. 216 | - ![](Images/03.png) 217 | - We compute the derivatives on a graph from right to left and it will be a lot more easier. 218 | - `dvar` means the derivatives of a final output variable with respect to various intermediate quantities. 219 | 220 | ### Logistic Regression Gradient Descent 221 | 222 | - In the video he discussed the derivatives of gradient decent example for one sample with two features `x1` and `x2`. 223 | - ![](Images/04.png) 224 | 225 | ### Gradient Descent on m Examples 226 | 227 | - Lets say we have these variables: 228 | 229 | ``` 230 | X1 Feature 231 | X2 Feature 232 | W1 Weight of the first feature. 233 | W2 Weight of the second feature. 234 | B Logistic Regression parameter. 235 | M Number of training examples 236 | Y(i) Expected output of i 237 | ``` 238 | 239 | - So we have: 240 | ![](Images/09.png) 241 | 242 | - Then from right to left we will calculate derivations compared to the result: 243 | 244 | ``` 245 | d(a) = d(l)/d(a) = -(y/a) + ((1-y)/(1-a)) 246 | d(z) = d(l)/d(z) = a - y 247 | d(W1) = X1 * d(z) 248 | d(W2) = X2 * d(z) 249 | d(B) = d(z) 250 | ``` 251 | 252 | - From the above we can conclude the logistic regression pseudo code: 253 | 254 | ``` 255 | J = 0; dw1 = 0; dw2 =0; db = 0; # Devs. 
256 | w1 = 0; w2 = 0; b=0; # Weights 257 | for i = 1 to m 258 | # Forward pass 259 | z(i) = W1*x1(i) + W2*x2(i) + b 260 | a(i) = Sigmoid(z(i)) 261 | J += (Y(i)*log(a(i)) + (1-Y(i))*log(1-a(i))) 262 | 263 | # Backward pass 264 | dz(i) = a(i) - Y(i) 265 | dw1 += dz(i) * x1(i) 266 | dw2 += dz(i) * x2(i) 267 | db += dz(i) 268 | J /= m 269 | dw1/= m 270 | dw2/= m 271 | db/= m 272 | 273 | # Gradient descent 274 | w1 = w1 - alpha * dw1 275 | w2 = w2 - alpha * dw2 276 | b = b - alpha * db 277 | ``` 278 | 279 | - The above code should run for some iterations to minimize error. 280 | 281 | - So there will be two inner loops to implement the logistic regression. 282 | 283 | - Vectorization is so important on deep learning to reduce loops. In the last code we can make the whole loop in one step using vectorization! 284 | 285 | ### Vectorization 286 | 287 | - Deep learning shines when the dataset are big. However for loops will make you wait a lot for a result. Thats why we need vectorization to get rid of some of our for loops. 288 | - NumPy library (dot) function is using vectorization by default. 289 | - The vectorization can be done on CPU or GPU thought the SIMD operation. But its faster on GPU. 290 | - Whenever possible avoid for loops. 291 | - Most of the NumPy library methods are vectorized version. 292 | 293 | ### Vectorizing Logistic Regression 294 | 295 | - We will implement Logistic Regression using one for loop then without any for loop. 296 | - As an input we have a matrix `X` and its `[Nx, m]` and a matrix `Y` and its `[Ny, m]`. 297 | - We will then compute at instance `[z1,z2...zm] = W' * X + [b,b,...b]`. This can be written in python as: 298 | 299 | Z = np.dot(W.T,X) + b # Vectorization, then broadcasting, Z shape is (1, m) 300 | A = 1 / 1 + np.exp(-Z) # Vectorization, A shape is (1, m) 301 | 302 | - Vectorizing Logistic Regression's Gradient Output: 303 | 304 | dz = A - Y # Vectorization, dz shape is (1, m) 305 | dw = np.dot(X, dz.T) / m # Vectorization, dw shape is (Nx, 1) 306 | db = dz.sum() / m # Vectorization, dz shape is (1, 1) 307 | 308 | ### Notes on Python and NumPy 309 | 310 | - In NumPy, `obj.sum(axis = 0)` sums the columns while `obj.sum(axis = 1)` sums the rows. 311 | - In NumPy, `obj.reshape(1,4)` changes the shape of the matrix by broadcasting the values. 312 | - Reshape is cheap in calculations so put it everywhere you're not sure about the calculations. 313 | - Broadcasting works when you do a matrix operation with matrices that doesn't match for the operation, in this case NumPy automatically makes the shapes ready for the operation by broadcasting the values. 314 | - In general principle of broadcasting. If you have an (m,n) matrix and you add(+) or subtract(-) or multiply(*) or divide(/) with a (1,n) matrix, then this will copy it m times into an (m,n) matrix. The same with if you use those operations with a (m , 1) matrix, then this will copy it n times into (m, n) matrix. And then apply the addition, subtraction, and multiplication of division element wise. 315 | - Some tricks to eliminate all the strange bugs in the code: 316 | - If you didn't specify the shape of a vector, it will take a shape of `(m,)` and the transpose operation won't work. You have to reshape it to `(m, 1)` 317 | - Try to not use the rank one matrix in ANN 318 | - Don't hesitate to use `assert(a.shape == (5,1))` to check if your matrix shape is the required one. 319 | - If you've found a rank one matrix try to run reshape on it. 
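- A tiny sketch of that rank one pitfall (my own example, assuming NumPy is imported as `np`):

```python
import numpy as np

a = np.random.randn(5)       # rank one array, shape (5,) - better to avoid this
print(a.T.shape)             # still (5,): transposing a rank one array does nothing

a = a.reshape(5, 1)          # explicit column vector, shape (5, 1)
assert a.shape == (5, 1)     # cheap check that catches shape bugs early
print(np.dot(a, a.T).shape)  # (5, 5) outer product now behaves as expected
```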
320 | - Jupyter / IPython notebooks are so useful library in python that makes it easy to integrate code and document at the same time. It runs in the browser and doesn't need an IDE to run. 321 | - To open Jupyter Notebook, open the command line and call: `jupyter-notebook` It should be installed to work. 322 | - To Compute the derivative of Sigmoid: 323 | 324 | ``` 325 | s = sigmoid(x) 326 | ds = s * (1 - s) # derivative using calculus 327 | ``` 328 | 329 | - To make an image of `(width,height,depth)` be a vector, use this: 330 | 331 | ``` 332 | v = image.reshape(image.shape[0]*image.shape[1]*image.shape[2],1) #reshapes the image. 333 | ``` 334 | 335 | - Gradient descent converges faster after normalization of the input matrices. 336 | 337 | ### General Notes 338 | 339 | - The main steps for building a Neural Network are: 340 | - Define the model structure (such as number of input features and outputs) 341 | - Initialize the model's parameters. 342 | - Loop. 343 | - Calculate current loss (forward propagation) 344 | - Calculate current gradient (backward propagation) 345 | - Update parameters (gradient descent) 346 | - Preprocessing the dataset is important. 347 | - Tuning the learning rate (which is an example of a "hyperparameter") can make a big difference to the algorithm. 348 | - [kaggle.com](kaggle.com) is a good place for datasets and competitions. 349 | - [Pieter Abbeel](https://www2.eecs.berkeley.edu/Faculty/Homepages/abbeel.html) is one of the best in deep reinforcement learning. 350 | 351 | 352 | ## Shallow neural networks 353 | 354 | > Learn to build a neural network with one hidden layer, using forward propagation and backpropagation. 355 | 356 | ### Neural Networks Overview 357 | 358 | - In logistic regression we had: 359 | 360 | ``` 361 | X1 \ 362 | X2 ==> z = XW + B ==> a = Sigmoid(z) ==> l(a,Y) 363 | X3 / 364 | ``` 365 | 366 | - In neural networks with one layer we will have: 367 | 368 | ``` 369 | X1 \ 370 | X2 => z1 = XW1 + B1 => a1 = Sigmoid(z1) => z2 = a1W2 + B2 => a2 = Sigmoid(z2) => l(a2,Y) 371 | X3 / 372 | ``` 373 | 374 | 375 | - `X` is the input vector `(X1, X2, X3)`, and `Y` is the output variable `(1x1)` 376 | - NN is stack of logistic regression objects. 377 | 378 | ### Neural Network Representation 379 | 380 | - We will define the neural networks that has one hidden layer. 381 | - NN contains of input layers, hidden layers, output layers. 382 | - Hidden layer means we cant see that layers in the training set. 383 | - `a0 = x` (the input layer) 384 | - `a1` will represent the activation of the hidden neurons. 385 | - `a2` will represent the output layer. 386 | - We are talking about 2 layers NN. The input layer isn't counted. 
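- As a hedged sketch (my own helper, not from the lecture), the parameters of such a 2 layers NN could be held in a dictionary, using the shapes detailed in the next section:

```python
import numpy as np

def initialize_parameters(n_x, n_h):
    """Hypothetical helper: parameters of a 2-layer NN with n_x inputs, n_h hidden units, 1 output."""
    return {
        "W1": np.random.randn(n_h, n_x) * 0.01,  # (noOfHiddenNeurons, Nx)
        "b1": np.zeros((n_h, 1)),                # (noOfHiddenNeurons, 1)
        "W2": np.random.randn(1, n_h) * 0.01,    # (1, noOfHiddenNeurons)
        "b2": np.zeros((1, 1)),                  # (1, 1)
    }
```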
387 | 388 | ### Computing a Neural Network's Output 389 | 390 | - Equations of Hidden layers: 391 | - ![](Images/05.png) 392 | - Here are some informations about the last image: 393 | - `noOfHiddenNeurons = 4` 394 | - `Nx = 3` 395 | - Shapes of the variables: 396 | - `W1` is the matrix of the first hidden layer, it has a shape of `(noOfHiddenNeurons,nx)` 397 | - `b1` is the matrix of the first hidden layer, it has a shape of `(noOfHiddenNeurons,1)` 398 | - `z1` is the result of the equation `z1 = W1*X + b`, it has a shape of `(noOfHiddenNeurons,1)` 399 | - `a1` is the result of the equation `a1 = sigmoid(z1)`, it has a shape of `(noOfHiddenNeurons,1)` 400 | - `W2` is the matrix of the second hidden layer, it has a shape of `(1,noOfHiddenNeurons)` 401 | - `b2` is the matrix of the second hidden layer, it has a shape of `(1,1)` 402 | - `z2` is the result of the equation `z2 = W2*a1 + b`, it has a shape of `(1,1)` 403 | - `a2` is the result of the equation `a2 = sigmoid(z2)`, it has a shape of `(1,1)` 404 | 405 | ### Vectorizing across multiple examples 406 | 407 | - Pseudo code for forward propagation for the 2 layers NN: 408 | 409 | ``` 410 | for i = 1 to m 411 | z[1, i] = W1*x[i] + b1 # shape of z[1, i] is (noOfHiddenNeurons,1) 412 | a[1, i] = sigmoid(z[1, i]) # shape of a[1, i] is (noOfHiddenNeurons,1) 413 | z[2, i] = W2*a[1, i] + b2 # shape of z[2, i] is (1,1) 414 | a[2, i] = sigmoid(z[2, i]) # shape of a[2, i] is (1,1) 415 | ``` 416 | 417 | - Lets say we have `X` on shape `(Nx,m)`. So the new pseudo code: 418 | 419 | ``` 420 | Z1 = W1X + b1 # shape of Z1 (noOfHiddenNeurons,m) 421 | A1 = sigmoid(Z1) # shape of A1 (noOfHiddenNeurons,m) 422 | Z2 = W2A1 + b2 # shape of Z2 is (1,m) 423 | A2 = sigmoid(Z2) # shape of A2 is (1,m) 424 | ``` 425 | 426 | - If you notice always m is the number of columns. 427 | - In the last example we can call `X` = `A0`. So the previous step can be rewritten as: 428 | 429 | ``` 430 | Z1 = W1A0 + b1 # shape of Z1 (noOfHiddenNeurons,m) 431 | A1 = sigmoid(Z1) # shape of A1 (noOfHiddenNeurons,m) 432 | Z2 = W2A1 + b2 # shape of Z2 is (1,m) 433 | A2 = sigmoid(Z2) # shape of A2 is (1,m) 434 | ``` 435 | 436 | ### Activation functions 437 | 438 | - So far we are using sigmoid, but in some cases other functions can be a lot better. 439 | - Sigmoid can lead us to gradient decent problem where the updates are so low. 440 | - Sigmoid activation function range is [0,1] 441 | `A = 1 / (1 + np.exp(-z)) # Where z is the input matrix` 442 | - Tanh activation function range is [-1,1] (Shifted version of sigmoid function) 443 | - In NumPy we can implement Tanh using one of these methods: 444 | `A = (np.exp(z) - np.exp(-z)) / (np.exp(z) + np.exp(-z)) # Where z is the input matrix` 445 | 446 | Or 447 | `A = np.tanh(z) # Where z is the input matrix` 448 | - It turns out that the tanh activation usually works better than sigmoid activation function for hidden units because the mean of its output is closer to zero, and so it centers the data better for the next layer. 449 | - Sigmoid or Tanh function disadvantage is that if the input is too small or too high, the slope will be near zero which will cause us the gradient decent problem. 450 | - One of the popular activation functions that solved the slow gradient decent is the RELU function. 
451 | `RELU = max(0,z) # so if z is negative the slope is 0 and if z is positive the slope remains linear.` 452 | - So here is some basic rule for choosing activation functions, if your classification is between 0 and 1, use the output activation as sigmoid and the others as RELU. 453 | - Leaky RELU activation function different of RELU is that if the input is negative the slope will be so small. It works as RELU but most people uses RELU. 454 | `Leaky_RELU = max(0.01z,z) #the 0.01 can be a parameter for your algorithm.` 455 | - In NN you will decide a lot of choices like: 456 | - No of hidden layers. 457 | - No of neurons in each hidden layer. 458 | - Learning rate. (The most important parameter) 459 | - Activation functions. 460 | - And others.. 461 | - It turns out there are no guide lines for that. You should try all activation functions for example. 462 | 463 | ### Why do you need non-linear activation functions? 464 | 465 | - If we removed the activation function from our algorithm that can be called linear activation function. 466 | - Linear activation function will output linear activations 467 | - Whatever hidden layers you add, the activation will be always linear like logistic regression (So its useless in a lot of complex problems) 468 | - You might use linear activation function in one place - in the output layer if the output is real numbers (regression problem). But even in this case if the output value is non-negative you could use RELU instead. 469 | 470 | ### Derivatives of activation functions 471 | 472 | - Derivation of Sigmoid activation function: 473 | 474 | ``` 475 | g(z) = 1 / (1 + np.exp(-z)) 476 | g'(z) = (1 / (1 + np.exp(-z))) * (1 - (1 / (1 + np.exp(-z)))) 477 | g'(z) = g(z) * (1 - g(z)) 478 | ``` 479 | 480 | - Derivation of Tanh activation function: 481 | 482 | ``` 483 | g(z) = (e^z - e^-z) / (e^z + e^-z) 484 | g'(z) = 1 - np.tanh(z)^2 = 1 - g(z)^2 485 | ``` 486 | 487 | - Derivation of RELU activation function: 488 | 489 | ``` 490 | g(z) = np.maximum(0,z) 491 | g'(z) = { 0 if z < 0 492 | 1 if z >= 0 } 493 | ``` 494 | 495 | - Derivation of leaky RELU activation function: 496 | 497 | ``` 498 | g(z) = np.maximum(0.01 * z, z) 499 | g'(z) = { 0.01 if z < 0 500 | 1 if z >= 0 } 501 | ``` 502 | 503 | ### Gradient descent for Neural Networks 504 | - In this section we will have the full back propagation of the neural network (Just the equations with no explanations). 
505 | - Gradient descent algorithm: 506 | - NN parameters: 507 | - `n[0] = Nx` 508 | - `n[1] = NoOfHiddenNeurons` 509 | - `n[2] = NoOfOutputNeurons = 1` 510 | - `W1` shape is `(n[1],n[0])` 511 | - `b1` shape is `(n[1],1)` 512 | - `W2` shape is `(n[2],n[1])` 513 | - `b2` shape is `(n[2],1)` 514 | - Cost function `I = I(W1, b1, W2, b2) = (1/m) * Sum(L(Y,A2))` 515 | - Then Gradient descent: 516 | 517 | ``` 518 | Repeat: 519 | Compute predictions (y'[i], i = 0,...m) 520 | Get derivatives: dW1, db1, dW2, db2 521 | Update: W1 = W1 - LearningRate * dW1 522 | b1 = b1 - LearningRate * db1 523 | W2 = W2 - LearningRate * dW2 524 | b2 = b2 - LearningRate * db2 525 | ``` 526 | 527 | - Forward propagation: 528 | 529 | ``` 530 | Z1 = W1A0 + b1 # A0 is X 531 | A1 = g1(Z1) 532 | Z2 = W2A1 + b2 533 | A2 = Sigmoid(Z2) # Sigmoid because the output is between 0 and 1 534 | ``` 535 | 536 | - Backpropagation (derivations): 537 | ``` 538 | dZ2 = A2 - Y # derivative of cost function we used * derivative of the sigmoid function 539 | dW2 = (dZ2 * A1.T) / m 540 | db2 = Sum(dZ2) / m 541 | dZ1 = (W2.T * dZ2) * g'1(Z1) # element wise product (*) 542 | dW1 = (dZ1 * A0.T) / m # A0 = X 543 | db1 = Sum(dZ1) / m 544 | # Hint there are transposes with multiplication because to keep dimensions correct 545 | ``` 546 | - How we derived the 6 equations of the backpropagation: 547 | ![](Images/06.png) 548 | 549 | ### Random Initialization 550 | 551 | - In logistic regression it wasn't important to initialize the weights randomly, while in NN we have to initialize them randomly. 552 | 553 | - If we initialize all the weights with zeros in NN it won't work (initializing bias with zero is OK): 554 | - all hidden units will be completely identical (symmetric) - compute exactly the same function 555 | - on each gradient descent iteration all the hidden units will always update the same 556 | 557 | - To solve this we initialize the W's with a small random numbers: 558 | 559 | ``` 560 | W1 = np.random.randn((2,2)) * 0.01 # 0.01 to make it small enough 561 | b1 = np.zeros((2,1)) # its ok to have b as zero, it won't get us to the symmetry breaking problem 562 | ``` 563 | 564 | - We need small values because in sigmoid (or tanh), for example, if the weight is too large you are more likely to end up even at the very start of training with very large values of Z. Which causes your tanh or your sigmoid activation function to be saturated, thus slowing down learning. If you don't have any sigmoid or tanh activation functions throughout your neural network, this is less of an issue. 565 | 566 | - Constant 0.01 is alright for 1 hidden layer networks, but if the NN is deep this number can be changed but it will always be a small number. 567 | 568 | ## Deep Neural Networks 569 | 570 | > Understand the key computations underlying deep learning, use them to build and train deep neural networks, and apply it to computer vision. 571 | 572 | ### Deep L-layer neural network 573 | 574 | - Shallow NN is a NN with one or two layers. 575 | - Deep NN is a NN with three or more layers. 576 | - We will use the notation `L` to denote the number of layers in a NN. 577 | - `n[l]` is the number of neurons in a specific layer `l`. 578 | - `n[0]` denotes the number of neurons input layer. `n[L]` denotes the number of neurons in output layer. 579 | - `g[l]` is the activation function. 580 | - `a[l] = g[l](z[l])` 581 | - `w[l]` weights is used for `z[l]` 582 | - `x = a[0]`, `a[l] = y'` 583 | - These were the notation we will use for deep neural network. 
584 | - So we have: 585 | - A vector `n` of shape `(1, NoOfLayers+1)` 586 | - A vector `g` of shape `(1, NoOfLayers)` 587 | - A list of different shapes `w` based on the number of neurons on the previous and the current layer. 588 | - A list of different shapes `b` based on the number of neurons on the current layer. 589 | 590 | ### Forward Propagation in a Deep Network 591 | 592 | - Forward propagation general rule for one input: 593 | 594 | ``` 595 | z[l] = W[l]a[l-1] + b[l] 596 | a[l] = g[l](a[l]) 597 | ``` 598 | 599 | - Forward propagation general rule for `m` inputs: 600 | 601 | ``` 602 | Z[l] = W[l]A[l-1] + B[l] 603 | A[l] = g[l](A[l]) 604 | ``` 605 | 606 | - We can't compute the whole layers forward propagation without a for loop so its OK to have a for loop here. 607 | - The dimensions of the matrices are so important you need to figure it out. 608 | 609 | ### Getting your matrix dimensions right 610 | 611 | - The best way to debug your matrices dimensions is by a pencil and paper. 612 | - Dimension of `W` is `(n[l],n[l-1])` . Can be thought by right to left. 613 | - Dimension of `b` is `(n[l],1)` 614 | - `dw` has the same shape as `W`, while `db` is the same shape as `b` 615 | - Dimension of `Z[l],` `A[l]`, `dZ[l]`, and `dA[l]` is `(n[l],m)` 616 | 617 | ### Why deep representations? 618 | 619 | - Why deep NN works well, we will discuss this question in this section. 620 | - Deep NN makes relations with data from simpler to complex. In each layer it tries to make a relation with the previous layer. E.g.: 621 | - 1) Face recognition application: 622 | - Image ==> Edges ==> Face parts ==> Faces ==> desired face 623 | - 2) Audio recognition application: 624 | - Audio ==> Low level sound features like (sss,bb) ==> Phonemes ==> Words ==> Sentences 625 | - Neural Researchers think that deep neural networks "think" like brains (simple ==> complex) 626 | - Circuit theory and deep learning: 627 | - ![](Images/07.png) 628 | - When starting on an application don't start directly by dozens of hidden layers. Try the simplest solutions (e.g. Logistic Regression), then try the shallow neural network and so on. 629 | 630 | ### Building blocks of deep neural networks 631 | 632 | - Forward and back propagation for a layer l: 633 | - ![Untitled](Images/10.png) 634 | - Deep NN blocks: 635 | - ![](Images/08.png) 636 | 637 | ### Forward and Backward Propagation 638 | 639 | - Pseudo code for forward propagation for layer l: 640 | 641 | ``` 642 | Input A[l-1] 643 | Z[l] = W[l]A[l-1] + b[l] 644 | A[l] = g[l](Z[l]) 645 | Output A[l], cache(Z[l]) 646 | ``` 647 | 648 | - Pseudo code for back propagation for layer l: 649 | 650 | ``` 651 | Input da[l], Caches 652 | dZ[l] = dA[l] * g'[l](Z[l]) 653 | dW[l] = (dZ[l]A[l-1].T) / m 654 | db[l] = sum(dZ[l])/m # Dont forget axis=1, keepdims=True 655 | dA[l-1] = w[l].T * dZ[l] # The multiplication here are a dot product. 656 | Output dA[l-1], dW[l], db[l] 657 | ``` 658 | 659 | - If we have used our loss function then: 660 | 661 | ``` 662 | dA[L] = (-(y/a) + ((1-y)/(1-a))) 663 | ``` 664 | 665 | ### Parameters vs Hyperparameters 666 | 667 | - Main parameters of the NN is `W` and `b` 668 | - Hyper parameters (parameters that control the algorithm) are like: 669 | - Learning rate. 670 | - Number of iteration. 671 | - Number of hidden layers `L`. 672 | - Number of hidden units `n`. 673 | - Choice of activation functions. 674 | - You have to try values yourself of hyper parameters. 
675 | - In the earlier days of DL and ML learning rate was often called a parameter, but it really is (and now everybody call it) a hyperparameter. 676 | - On the next course we will see how to optimize hyperparameters. 677 | 678 | ### What does this have to do with the brain 679 | 680 | - The analogy that "It is like the brain" has become really an oversimplified explanation. 681 | - There is a very simplistic analogy between a single logistic unit and a single neuron in the brain. 682 | - No human today understand how a human brain neuron works. 683 | - No human today know exactly how many neurons on the brain. 684 | - Deep learning in Andrew's opinion is very good at learning very flexible, complex functions to learn X to Y mappings, to learn input-output mappings (supervised learning). 685 | - The field of computer vision has taken a bit more inspiration from the human brains then other disciplines that also apply deep learning. 686 | - NN is a small representation of how brain work. The most near model of human brain is in the computer vision (CNN) 687 | 688 | ## Extra: Ian Goodfellow interview 689 | 690 | - Ian is one of the world's most visible deep learning researchers. 691 | - Ian is mainly working with generative models. He is the creator of GANs. 692 | - We need to stabilize GANs. Stabilized GANs can become the best generative models. 693 | - Ian wrote the first textbook on the modern version of deep learning with Yoshua Bengio and Aaron Courville. 694 | - Ian worked with [OpenAI.com](https://openai.com/) and Google on ML and NN applications. 695 | - Ian tells all who wants to get into AI to get a Ph.D. or post your code on Github and the companies will find you. 696 | - Ian thinks that we need to start anticipating security problems with ML now and make sure that these algorithms are secure from the start instead of trying to patch it in retroactively years later. 697 | 698 | 699 | 700 | 701 | 702 |

703 |

704 | These Notes were made by [Mahmoud Badry](mailto:mma18@fayoum.edu.eg) @2017 705 | -------------------------------------------------------------------------------- /2- Improving Deep Neural Networks/Images/01-_Bias_-_Variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/2- Improving Deep Neural Networks/Images/01-_Bias_-_Variance.png -------------------------------------------------------------------------------- /2- Improving Deep Neural Networks/Images/02-_Early_stopping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/2- Improving Deep Neural Networks/Images/02-_Early_stopping.png -------------------------------------------------------------------------------- /2- Improving Deep Neural Networks/Images/03-_Numerical_approximation_of_gradients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/2- Improving Deep Neural Networks/Images/03-_Numerical_approximation_of_gradients.png -------------------------------------------------------------------------------- /2- Improving Deep Neural Networks/Images/04-_batch_vs_mini_batch_cost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/2- Improving Deep Neural Networks/Images/04-_batch_vs_mini_batch_cost.png -------------------------------------------------------------------------------- /2- Improving Deep Neural Networks/Images/05-_exponentially_weighted_averages_intuitions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/2- Improving Deep Neural Networks/Images/05-_exponentially_weighted_averages_intuitions.png -------------------------------------------------------------------------------- /2- Improving Deep Neural Networks/Images/06-_RMSprop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/2- Improving Deep Neural Networks/Images/06-_RMSprop.png -------------------------------------------------------------------------------- /2- Improving Deep Neural Networks/Images/07-_softmax.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/2- Improving Deep Neural Networks/Images/07-_softmax.png -------------------------------------------------------------------------------- /2- Improving Deep Neural Networks/Images/Nasdaq1_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/2- Improving Deep Neural Networks/Images/Nasdaq1_small.png -------------------------------------------------------------------------------- /2- Improving Deep Neural Networks/Images/bn.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/2- Improving Deep Neural Networks/Images/bn.png -------------------------------------------------------------------------------- /2- Improving Deep Neural Networks/Readme.md: -------------------------------------------------------------------------------- 1 | # Improving Deep Neural Networks: Hyperparameter tuning, Regularization and Optimization 2 | 3 | This is the second course of the deep learning specialization at [Coursera](https://www.coursera.org/specializations/deep-learning) which is moderated by [DeepLearning.ai](http://deeplearning.ai/). The course is taught by Andrew Ng. 4 | 5 | ## Table of contents 6 | 7 | * [Improving Deep Neural Networks: Hyperparameter tuning, Regularization and Optimization](#improving-deep-neural-networks-hyperparameter-tuning-regularization-and-optimization) 8 | * [Table of contents](#table-of-contents) 9 | * [Course summary](#course-summary) 10 | * [Practical aspects of Deep Learning](#practical-aspects-of-deep-learning) 11 | * [Train / Dev / Test sets](#train--dev--test-sets) 12 | * [Bias / Variance](#bias--variance) 13 | * [Basic Recipe for Machine Learning](#basic-recipe-for-machine-learning) 14 | * [Regularization](#regularization) 15 | * [Why regularization reduces overfitting?](#why-regularization-reduces-overfitting) 16 | * [Dropout Regularization](#dropout-regularization) 17 | * [Understanding Dropout](#understanding-dropout) 18 | * [Other regularization methods](#other-regularization-methods) 19 | * [Normalizing inputs](#normalizing-inputs) 20 | * [Vanishing / Exploding gradients](#vanishing--exploding-gradients) 21 | * [Weight Initialization for Deep Networks](#weight-initialization-for-deep-networks) 22 | * [Numerical approximation of gradients](#numerical-approximation-of-gradients) 23 | * [Gradient checking implementation notes](#gradient-checking-implementation-notes) 24 | * [Initialization summary](#initialization-summary) 25 | * [Regularization summary](#regularization-summary) 26 | * [Optimization algorithms](#optimization-algorithms) 27 | * [Mini-batch gradient descent](#mini-batch-gradient-descent) 28 | * [Understanding mini-batch gradient descent](#understanding-mini-batch-gradient-descent) 29 | * [Exponentially weighted averages](#exponentially-weighted-averages) 30 | * [Understanding exponentially weighted averages](#understanding-exponentially-weighted-averages) 31 | * [Bias correction in exponentially weighted averages](#bias-correction-in-exponentially-weighted-averages) 32 | * [Gradient descent with momentum](#gradient-descent-with-momentum) 33 | * [RMSprop](#rmsprop) 34 | * [Adam optimization algorithm](#adam-optimization-algorithm) 35 | * [Learning rate decay](#learning-rate-decay) 36 | * [The problem of local optima](#the-problem-of-local-optima) 37 | * [Hyperparameter tuning, Batch Normalization and Programming Frameworks](#hyperparameter-tuning-batch-normalization-and-programming-frameworks) 38 | * [Tuning process](#tuning-process) 39 | * [Using an appropriate scale to pick hyperparameters](#using-an-appropriate-scale-to-pick-hyperparameters) 40 | * [Hyperparameters tuning in practice: Pandas vs. 
Caviar](#hyperparameters-tuning-in-practice-pandas-vs-caviar) 41 | * [Normalizing activations in a network](#normalizing-activations-in-a-network) 42 | * [Fitting Batch Normalization into a neural network](#fitting-batch-normalization-into-a-neural-network) 43 | * [Why does Batch normalization work?](#why-does-batch-normalization-work) 44 | * [Batch normalization at test time](#batch-normalization-at-test-time) 45 | * [Softmax Regression](#softmax-regression) 46 | * [Training a Softmax classifier](#training-a-softmax-classifier) 47 | * [Deep learning frameworks](#deep-learning-frameworks) 48 | * [TensorFlow](#tensorflow) 49 | * [Extra Notes](#extra-notes) 50 | 51 | ## Course summary 52 | 53 | Here are the course summary as its given on the course [link](https://www.coursera.org/learn/deep-neural-network): 54 | 55 | > This course will teach you the "magic" of getting deep learning to work well. Rather than the deep learning process being a black box, you will understand what drives performance, and be able to more systematically get good results. You will also learn TensorFlow. 56 | > 57 | > After 3 weeks, you will: 58 | > - Understand industry best-practices for building deep learning applications. 59 | > - Be able to effectively use the common neural network "tricks", including initialization, L2 and dropout regularization, Batch normalization, gradient checking, 60 | > - Be able to implement and apply a variety of optimization algorithms, such as mini-batch gradient descent, Momentum, RMSprop and Adam, and check for their convergence. 61 | > - Understand new best-practices for the deep learning era of how to set up train/dev/test sets and analyze bias/variance 62 | > - Be able to implement a neural network in TensorFlow. 63 | > 64 | > This is the second course of the Deep Learning Specialization. 65 | 66 | 67 | 68 | ## Practical aspects of Deep Learning 69 | 70 | ### Train / Dev / Test sets 71 | 72 | - Its impossible to get all your hyperparameters right on a new application from the first time. 73 | - So the idea is you go through the loop: `Idea ==> Code ==> Experiment`. 74 | - You have to go through the loop many times to figure out your hyperparameters. 75 | - Your data will be split into three parts: 76 | - Training set. (Has to be the largest set) 77 | - Hold-out cross validation set / Development or "dev" set. 78 | - Testing set. 79 | - You will try to build a model upon training set then try to optimize hyperparameters on dev set as much as possible. Then after your model is ready you try and evaluate the testing set. 80 | - so the trend on the ratio of splitting the models: 81 | - If size of the dataset is 100 to 1000000 ==> 60/20/20 82 | - If size of the dataset is 1000000 to INF ==> 98/1/1 or 99.5/0.25/0.25 83 | - The trend now gives the training data the biggest sets. 84 | - Make sure the dev and test set are coming from the same distribution. 85 | - For example if cat training pictures is from the web and the dev/test pictures are from users cell phone they will mismatch. It is better to make sure that dev and test set are from the same distribution. 86 | - The dev set rule is to try them on some of the good models you've created. 87 | - Its OK to only have a dev set without a testing set. But a lot of people in this case call the dev set as the test set. A better terminology is to call it a dev set as its used in the development. 88 | 89 | ### Bias / Variance 90 | 91 | - Bias / Variance techniques are Easy to learn, but difficult to master. 
92 | - So here the explanation of Bias / Variance: 93 | - If your model is underfitting (logistic regression of non linear data) it has a "high bias" 94 | - If your model is overfitting then it has a "high variance" 95 | - Your model will be alright if you balance the Bias / Variance 96 | - For more: 97 | - ![](Images/01-_Bias_-_Variance.png) 98 | - Another idea to get the bias / variance if you don't have a 2D plotting mechanism: 99 | - High variance (overfitting) for example: 100 | - Training error: 1% 101 | - Dev error: 11% 102 | - high Bias (underfitting) for example: 103 | - Training error: 15% 104 | - Dev error: 14% 105 | - high Bias (underfitting) && High variance (overfitting) for example: 106 | - Training error: 15% 107 | - Test error: 30% 108 | - Best: 109 | - Training error: 0.5% 110 | - Test error: 1% 111 | - These Assumptions came from that human has 0% error. If the problem isn't like that you'll need to use human error as baseline. 112 | 113 | ### Basic Recipe for Machine Learning 114 | 115 | - If your algorithm has a high bias: 116 | - Try to make your NN bigger (size of hidden units, number of layers) 117 | - Try a different model that is suitable for your data. 118 | - Try to run it longer. 119 | - Different (advanced) optimization algorithms. 120 | - If your algorithm has a high variance: 121 | - More data. 122 | - Try regularization. 123 | - Try a different model that is suitable for your data. 124 | - You should try the previous two points until you have a low bias and low variance. 125 | - In the older days before deep learning, there was a "Bias/variance tradeoff". But because now you have more options/tools for solving the bias and variance problem its really helpful to use deep learning. 126 | - Training a bigger neural network never hurts. 127 | 128 | ### Regularization 129 | 130 | - Adding regularization to NN will help it reduce variance (overfitting) 131 | - L1 matrix norm: 132 | - `||W|| = Sum(|w[i,j]|) # sum of absolute values of all w` 133 | - L2 matrix norm because of arcane technical math reasons is called Frobenius norm: 134 | - `||W||^2 = Sum(|w[i,j]|^2) # sum of all w squared` 135 | - Also can be calculated as `||W||^2 = W.T * W if W is a vector` 136 | - Regularization for logistic regression: 137 | - The normal cost function that we want to minimize is: `J(w,b) = (1/m) * Sum(L(y(i),y'(i)))` 138 | - The L2 regularization version: `J(w,b) = (1/m) * Sum(L(y(i),y'(i))) + (lambda/2m) * Sum(|w[i]|^2)` 139 | - The L1 regularization version: `J(w,b) = (1/m) * Sum(L(y(i),y'(i))) + (lambda/2m) * Sum(|w[i]|)` 140 | - The L1 regularization version makes a lot of w values become zeros, which makes the model size smaller. 141 | - L2 regularization is being used much more often. 
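- A minimal sketch of that L2-regularized cost (my own illustration; `unregularized_cost` and the weight vector `w` are assumed inputs, and `lambd` stands in for `lambda`, which is a reserved word in Python):

```python
import numpy as np

def l2_regularized_cost(unregularized_cost, w, lambd, m):
    # J(w,b) = (1/m)*Sum(L(y(i),y'(i))) + (lambda/2m)*Sum(|w[i]|^2)
    return unregularized_cost + (lambd / (2 * m)) * np.sum(np.square(w))
```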
142 | - `lambda` here is the regularization parameter (a hyperparameter).
143 | - Regularization for NN:
144 |   - The normal cost function that we want to minimize is:
145 |     `J(W1,b1...,WL,bL) = (1/m) * Sum(L(y(i),y'(i)))`
146 |
147 |   - The L2 regularization version:
148 |     `J(w,b) = (1/m) * Sum(L(y(i),y'(i))) + (lambda/2m) * Sum(||W[l]||^2)`
149 |
150 |   - To compute the Frobenius norm you can think of stacking the matrix into one vector of shape `(mn,1)` and then applying `sqrt(w1^2 + w2^2 + ...)`
151 |
152 |   - Back propagation without regularization (the old way):
153 |     `dw[l] = (from back propagation)`
154 |
155 |   - With L2 regularization (the new way):
156 |     `dw[l] = (from back propagation) + (lambda/m) * w[l]`
157 |
158 |   - So plugging it into the weight update step:
159 |
160 |   - ```
161 |     w[l] = w[l] - learning_rate * dw[l]
162 |          = w[l] - learning_rate * ((from back propagation) + (lambda/m) * w[l])
163 |          = w[l] - (learning_rate*lambda/m) * w[l] - learning_rate * (from back propagation)
164 |          = (1 - (learning_rate*lambda)/m) * w[l] - learning_rate * (from back propagation)
165 |     ```
166 |
167 |   - In practice this penalizes large weights and effectively limits the freedom of your model.
168 |
169 |   - The factor `(1 - (learning_rate*lambda)/m)` multiplying `w[l]` causes the **weight to decay** in proportion to its size.
170 |
171 |
172 | ### Why regularization reduces overfitting?
173 |
174 | Here are some intuitions:
175 |   - Intuition 1:
176 |     - If `lambda` is too large - a lot of w's will be close to zero, which makes the NN simpler (you can think of it as behaving closer to logistic regression).
177 |     - If `lambda` is chosen well, it will just reduce the weights that make the neural network overfit.
178 |   - Intuition 2 (with _tanh_ activation function):
179 |     - If `lambda` is too large, w's will be small (close to zero), so z stays in the linear part of the _tanh_ activation function; we go from a non-linear activation to a _roughly_ linear one, which makes the NN a _roughly_ linear classifier.
180 |     - If `lambda` is chosen well, it will just make some of the _tanh_ activations _roughly_ linear, which prevents overfitting.
181 |
182 | _**Implementation tip**_: if you implement gradient descent, one way to debug it is to plot the cost function J as a function of the number of iterations and check that J decreases **monotonically** after every iteration of gradient descent with regularization. If you plot the old definition of J (without the regularization term) then you might not see it decrease monotonically.
183 |
184 |
185 | ### Dropout Regularization
186 |
187 | - In most cases Andrew Ng says that he uses L2 regularization.
188 | - The dropout regularization eliminates some neurons/weights on each iteration based on a probability.
189 | - The most common technique to implement dropout is called "Inverted dropout".
190 | - Code for Inverted dropout:
191 |
192 |   ```python
193 |   import numpy as np
194 |   keep_prob = 0.8   # 0 <= keep_prob <= 1; this code is only for layer 3 (activation a3)
195 |   # entries where the random number is less than 0.8 are kept: 80% stay, 20% are dropped
196 |   d3 = np.random.rand(a3.shape[0], a3.shape[1]) < keep_prob
197 |
198 |   a3 = np.multiply(a3, d3)  # shut down the dropped neurons
199 |
200 |   # scale a3 up so the expected value of the output doesn't change
201 |   # (this "inverted" step solves the scaling problem)
202 |   a3 = a3 / keep_prob
203 |   ```
204 | - The vector `d[l]` is used in both forward and back propagation and is the same for both, but it is resampled on each iteration (pass) and for each training example (see the back-propagation sketch below).
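- A minimal sketch of how the same dropout mask would be applied during back propagation (continuing the layer-3 example; `da3` is assumed to come from the backward pass of the layer above):

  ```python
  # Use the same mask d3 that was used in the forward pass,
  # so the gradient only flows through the neurons that were kept.
  da3 = np.multiply(da3, d3)   # shut down the same neurons as in forward propagation
  da3 = da3 / keep_prob        # scale the surviving gradients, mirroring the forward scaling
  ```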
205 | - At test time we don't use dropout. If you implement dropout at test time - it would add noise to predictions. 206 | 207 | ### Understanding Dropout 208 | 209 | - In the previous video, the intuition was that dropout randomly knocks out units in your network. So it's as if on every iteration you're working with a smaller NN, and so using a smaller NN seems like it should have a regularizing effect. 210 | - Another intuition: can't rely on any one feature, so have to spread out weights. 211 | - It's possible to show that dropout has a similar effect to L2 regularization. 212 | - Dropout can have different `keep_prob` per layer. 213 | - The input layer dropout has to be near 1 (or 1 - no dropout) because you don't want to eliminate a lot of features. 214 | - If you're more worried about some layers overfitting than others, you can set a lower `keep_prob` for some layers than others. The downside is, this gives you even more hyperparameters to search for using cross-validation. One other alternative might be to have some layers where you apply dropout and some layers where you don't apply dropout and then just have one hyperparameter, which is a `keep_prob` for the layers for which you do apply dropouts. 215 | - A lot of researchers are using dropout with Computer Vision (CV) because they have a very big input size and almost never have enough data, so overfitting is the usual problem. And dropout is a regularization technique to prevent overfitting. 216 | - A downside of dropout is that the cost function J is not well defined and it will be hard to debug (plot J by iteration). 217 | - To solve that you'll need to turn off dropout, set all the `keep_prob`s to 1, and then run the code and check that it monotonically decreases J and then turn on the dropouts again. 218 | 219 | ### Other regularization methods 220 | 221 | - **Data augmentation**: 222 | - For example in a computer vision data: 223 | - You can flip all your pictures horizontally this will give you m more data instances. 224 | - You could also apply a random position and rotation to an image to get more data. 225 | - For example in OCR, you can impose random rotations and distortions to digits/letters. 226 | - New data obtained using this technique isn't as good as the real independent data, but still can be used as a regularization technique. 227 | - **Early stopping**: 228 | - In this technique we plot the training set and the dev set cost together for each iteration. At some iteration the dev set cost will stop decreasing and will start increasing. 229 | - We will pick the point at which the training set error and dev set error are best (lowest training cost with lowest dev cost). 230 | - We will take these parameters as the best parameters. 231 | - ![](Images/02-_Early_stopping.png) 232 | - Andrew prefers to use L2 regularization instead of early stopping because this technique simultaneously tries to minimize the cost function and not to overfit which contradicts the orthogonalization approach (will be discussed further). 233 | - But its advantage is that you don't need to search a hyperparameter like in other regularization approaches (like `lambda` in L2 regularization). 234 | - **Model Ensembles**: 235 | - Algorithm: 236 | - Train multiple independent models. 237 | - At test time average their results. 238 | - It can get you extra 2% performance. 239 | - It reduces the generalization error. 240 | - You can use some snapshots of your NN at the training ensembles them and take the results. 
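- Putting the early-stopping idea above into code, a hedged sketch might look like this (the `train_one_epoch`, `compute_cost` and `copy_parameters` helpers and the patience value are assumptions for illustration):

  ```python
  best_dev_cost = float("inf")
  best_parameters = None
  patience, bad_epochs = 10, 0                  # stop after 10 epochs without dev improvement

  for epoch in range(max_epochs):
      parameters = train_one_epoch(parameters, X_train, Y_train)
      dev_cost = compute_cost(parameters, X_dev, Y_dev)
      if dev_cost < best_dev_cost:
          best_dev_cost = dev_cost
          best_parameters = copy_parameters(parameters)   # keep the best point so far
          bad_epochs = 0
      else:
          bad_epochs += 1
          if bad_epochs >= patience:            # dev cost stopped improving - stop training
              break
  ```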
241 | 242 | ### Normalizing inputs 243 | 244 | - If you normalize your inputs this will speed up the training process a lot. 245 | - Normalization are going on these steps: 246 | 1. Get the mean of the training set: `mean = (1/m) * sum(x(i))` 247 | 2. Subtract the mean from each input: `X = X - mean` 248 | - This makes your inputs centered around 0. 249 | 3. Get the variance of the training set: `variance = (1/m) * sum(x(i)^2)` 250 | 4. Normalize the variance. `X /= variance` 251 | - These steps should be applied to training, dev, and testing sets (but using mean and variance of the train set). 252 | - Why normalize? 253 | - If we don't normalize the inputs our cost function will be deep and its shape will be inconsistent (elongated) then optimizing it will take a long time. 254 | - But if we normalize it the opposite will occur. The shape of the cost function will be consistent (look more symmetric like circle in 2D example) and we can use a larger learning rate alpha - the optimization will be faster. 255 | 256 | ### Vanishing / Exploding gradients 257 | 258 | - The Vanishing / Exploding gradients occurs when your derivatives become very small or very big. 259 | - To understand the problem, suppose that we have a deep neural network with number of layers L, and all the activation functions are **linear** and each `b = 0` 260 | - Then: 261 | ``` 262 | Y' = W[L]W[L-1].....W[2]W[1]X 263 | ``` 264 | - Then, if we have 2 hidden units per layer and x1 = x2 = 1, we result in: 265 | 266 | ``` 267 | if W[l] = [1.5 0] 268 | [0 1.5] (l != L because of different dimensions in the output layer) 269 | Y' = W[L] [1.5 0]^(L-1) X = 1.5^L # which will be very large 270 | [0 1.5] 271 | ``` 272 | ``` 273 | if W[l] = [0.5 0] 274 | [0 0.5] 275 | Y' = W[L] [0.5 0]^(L-1) X = 0.5^L # which will be very small 276 | [0 0.5] 277 | ``` 278 | - The last example explains that the activations (and similarly derivatives) will be decreased/increased exponentially as a function of number of layers. 279 | - So If W > I (Identity matrix) the activation and gradients will explode. 280 | - And If W < I (Identity matrix) the activation and gradients will vanish. 281 | - Recently Microsoft trained 152 layers (ResNet)! which is a really big number. With such a deep neural network, if your activations or gradients increase or decrease exponentially as a function of L, then these values could get really big or really small. And this makes training difficult, especially if your gradients are exponentially smaller than L, then gradient descent will take tiny little steps. It will take a long time for gradient descent to learn anything. 282 | - There is a partial solution that doesn't completely solve this problem but it helps a lot - careful choice of how you initialize the weights (next video). 283 | 284 | ### Weight Initialization for Deep Networks 285 | 286 | - A partial solution to the Vanishing / Exploding gradients in NN is better or more careful choice of the random initialization of weights 287 | - In a single neuron (Perceptron model): `Z = w1x1 + w2x2 + ... + wnxn` 288 | - So if `n_x` is large we want `W`'s to be smaller to not explode the cost. 
289 |   - So it turns out that we want the variance of the `W`'s to be on the order of `1/n_x`.
290 | - So let's initialize `W` like this (works better with the `tanh` activation):
291 |   ```
292 |   np.random.randn(shape) * np.sqrt(1/n[l-1])
293 |   ```
294 |   or a variation of this (Bengio et al.):
295 |   ```
296 |   np.random.randn(shape) * np.sqrt(2/(n[l-1] + n[l]))
297 |   ```
298 | - Setting the term inside the sqrt to `2/n[l-1]` works better for `ReLU`:
299 |   ```
300 |   np.random.randn(shape) * np.sqrt(2/n[l-1])
301 |   ```
302 | - The number 1 or 2 in the numerator can also be treated as a hyperparameter to tune (but not the first one to start with).
303 | - This (ReLU + weight initialization with a controlled variance) is one of the best partial solutions to Vanishing / Exploding gradients; it helps gradients not to vanish/explode too quickly.
304 | - The ReLU variant is called "He Initialization" (published in a 2015 paper); the `tanh` variant is known as "Xavier Initialization".
305 |
306 | ### Numerical approximation of gradients
307 |
308 | - There is a technique called gradient checking which tells you if your implementation of backpropagation is correct.
309 | - There's a numerical way to calculate the derivative:
310 |   ![](Images/03-_Numerical_approximation_of_gradients.png)
311 | - Gradient checking approximates the gradients and is very helpful for finding errors in your backpropagation implementation, but it's slower than gradient descent (so use it only for debugging).
312 | - Implementation of this is very simple.
313 | - Gradient checking:
314 |   - First take `W[1],b[1],...,W[L],b[L]` and reshape into one big vector (`theta`)
315 |   - The cost function will be `J(theta)`
316 |   - Then take `dW[1],db[1],...,dW[L],db[L]` and reshape into one big vector (`d_theta`)
317 |   - **Algorithm**:
318 |     ```
319 |     eps = 10^-7   # small number
320 |     for i in range(len(theta)):
321 |       d_theta_approx[i] = (J(theta1,...,theta[i] + eps,...) - J(theta1,...,theta[i] - eps,...)) / (2*eps)
322 |     ```
323 |   - Finally we evaluate this formula `(||d_theta_approx - d_theta||) / (||d_theta_approx||+||d_theta||)` (`||` - Euclidean vector norm) and check (with eps = 10^-7):
324 |     - if it is < 10^-7 - great, very likely the backpropagation implementation is correct
325 |     - if around 10^-5 - can be OK, but need to inspect if there are no particularly big values in `d_theta_approx - d_theta` vector
326 |     - if it is >= 10^-3 - bad, probably there is a bug in the backpropagation implementation
327 |
328 | ### Gradient checking implementation notes
329 |
330 | - Don't use the gradient checking algorithm at training time because it's very slow.
331 | - Use gradient checking only for debugging.
332 | - If the algorithm fails grad check, look at its components to try to identify the bug.
333 | - Don't forget to add the regularization term to `J` if you are using regularization (e.g. `lambda/(2m) * sum(||W[l]||^2)` for L2).
334 | - Gradient checking doesn't work with dropout because J is not consistent.
335 |   - You can first turn off dropout (set `keep_prob = 1.0`), run gradient checking and then turn on dropout again.
336 | - Run gradient checking at random initialization and again after training the network for a while - there may be a bug which only shows up when the w's and b's become larger (further from 0) and can't be seen on the first iterations (when w's and b's are very small).
337 |
338 | ### Initialization summary
339 |
340 | - The weights W[l] should be initialized randomly to break symmetry
341 |
342 | - It is however okay to initialize the biases b[l] to zeros.
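- As a small sketch of the two points above, initializing all layers of an L-layer network with random (He) weights and zero biases might look like this (`layer_dims` holds `n[0..L]`; the function name is just for illustration):

  ```python
  import numpy as np

  def initialize_parameters_he(layer_dims):
      parameters = {}
      L = len(layer_dims) - 1                     # number of layers
      for l in range(1, L + 1):
          # random weights break symmetry; sqrt(2/n[l-1]) keeps the variance in check (He init)
          parameters["W" + str(l)] = (np.random.randn(layer_dims[l], layer_dims[l - 1])
                                      * np.sqrt(2 / layer_dims[l - 1]))
          parameters["b" + str(l)] = np.zeros((layer_dims[l], 1))   # zero biases are fine
      return parameters
  ```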
Symmetry is still broken so long as W[l] is initialized randomly 343 | 344 | - Different initializations lead to different results 345 | 346 | - Random initialization is used to break symmetry and make sure different hidden units can learn different things 347 | 348 | - Don't intialize to values that are too large 349 | 350 | - He initialization works well for networks with ReLU activations. 351 | 352 | ### Regularization summary 353 | 354 | #### 1. L2 Regularization 355 | **Observations**: 356 | - The value of λ is a hyperparameter that you can tune using a dev set. 357 | - L2 regularization makes your decision boundary smoother. If λ is too large, it is also possible to "oversmooth", resulting in a model with high bias. 358 | 359 | **What is L2-regularization actually doing?**: 360 | - L2-regularization relies on the assumption that a model with small weights is simpler than a model with large weights. Thus, by penalizing the square values of the weights in the cost function you drive all the weights to smaller values. It becomes too costly for the cost to have large weights! This leads to a smoother model in which the output changes more slowly as the input changes. 361 | 362 | **What you should remember:** 363 | Implications of L2-regularization on: 364 | - cost computation: 365 | - A regularization term is added to the cost 366 | - backpropagation function: 367 | - There are extra terms in the gradients with respect to weight matrices 368 | - weights: 369 | - weights end up smaller ("weight decay") - are pushed to smaller values. 370 | 371 | #### 2. Dropout 372 | **What you should remember about dropout:** 373 | - Dropout is a regularization technique. 374 | - You only use dropout during training. Don't use dropout (randomly eliminate nodes) during test time. 375 | - Apply dropout both during forward and backward propagation. 376 | - During training time, divide each dropout layer by keep_prob to keep the same expected value for the activations. For example, if `keep_prob` is 0.5, then we will on average shut down half the nodes, so the output will be scaled by 0.5 since only the remaining half are contributing to the solution. Dividing by 0.5 is equivalent to multiplying by 2. Hence, the output now has the same expected value. You can check that this works even when keep_prob is other values than 0.5. 377 | 378 | 379 | ## Optimization algorithms 380 | 381 | ### Mini-batch gradient descent 382 | 383 | - Training NN with a large data is slow. So to find an optimization algorithm that runs faster is a good idea. 384 | - Suppose we have `m = 50 million`. To train this data it will take a huge processing time for one step. 385 | - because 50 million won't fit in the memory at once we need other processing to make such a thing. 386 | - It turns out you can make a faster algorithm to make gradient descent process some of your items even before you finish the 50 million items. 387 | - Suppose we have split m to **mini batches** of size 1000. 388 | - `X{1} = 0 ... 1000` 389 | - `X{2} = 1001 ... 2000` 390 | - `...` 391 | - `X{bs} = ...` 392 | - We similarly split `X` & `Y`. 393 | - So the definition of mini batches ==> `t: X{t}, Y{t}` 394 | - In **Batch gradient descent** we run the gradient descent on the whole dataset. 395 | - While in **Mini-Batch gradient descent** we run the gradient descent on the mini datasets. 
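- Before the update loop below, the data has to be partitioned into mini-batches. A minimal sketch (shuffling the columns of `X (n_x, m)` and `Y (1, m)` together; the function name is an assumption, not course code):

  ```python
  import numpy as np

  def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
      np.random.seed(seed)
      m = X.shape[1]
      permutation = np.random.permutation(m)          # shuffle X and Y with the same order
      X_shuffled, Y_shuffled = X[:, permutation], Y[:, permutation]
      mini_batches = []
      for t in range(0, m, mini_batch_size):          # the last batch may be smaller
          mini_batches.append((X_shuffled[:, t:t + mini_batch_size],
                               Y_shuffled[:, t:t + mini_batch_size]))
      return mini_batches
  ```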
396 | - Mini-Batch algorithm pseudo code: 397 | ``` 398 | for t = 1:No_of_batches # this is called an epoch 399 | AL, caches = forward_prop(X{t}, Y{t}) 400 | cost = compute_cost(AL, Y{t}) 401 | grads = backward_prop(AL, caches) 402 | update_parameters(grads) 403 | ``` 404 | - The code inside an epoch should be vectorized. 405 | - Mini-batch gradient descent works much faster in the large datasets. 406 | 407 | ### Understanding mini-batch gradient descent 408 | 409 | - In mini-batch algorithm, the cost won't go down with each step as it does in batch algorithm. It could contain some ups and downs but generally it has to go down (unlike the batch gradient descent where cost function descreases on each iteration). 410 | ![](Images/04-_batch_vs_mini_batch_cost.png) 411 | - Mini-batch size: 412 | - (`mini batch size = m`) ==> Batch gradient descent 413 | - (`mini batch size = 1`) ==> Stochastic gradient descent (SGD) 414 | - (`mini batch size = between 1 and m`) ==> Mini-batch gradient descent 415 | - Batch gradient descent: 416 | - too long per iteration (epoch) 417 | - Stochastic gradient descent: 418 | - too noisy regarding cost minimization (can be reduced by using smaller learning rate) 419 | - won't ever converge (reach the minimum cost) 420 | - lose speedup from vectorization 421 | - Mini-batch gradient descent: 422 | 1. faster learning: 423 | - you have the vectorization advantage 424 | - make progress without waiting to process the entire training set 425 | 2. doesn't always exactly converge (oscelates in a very small region, but you can reduce learning rate) 426 | - Guidelines for choosing mini-batch size: 427 | 1. If small training set (< 2000 examples) - use batch gradient descent. 428 | 2. It has to be a power of 2 (because of the way computer memory is layed out and accessed, sometimes your code runs faster if your mini-batch size is a power of 2): 429 | `64, 128, 256, 512, 1024, ...` 430 | 3. Make sure that mini-batch fits in CPU/GPU memory. 431 | - Mini-batch size is a `hyperparameter`. 432 | 433 | ### Exponentially weighted averages 434 | 435 | - There are optimization algorithms that are better than **gradient descent**, but you should first learn about Exponentially weighted averages. 436 | - If we have data like the temperature of day through the year it could be like this: 437 | ``` 438 | t(1) = 40 439 | t(2) = 49 440 | t(3) = 45 441 | ... 442 | t(180) = 60 443 | ... 444 | ``` 445 | - This data is small in winter and big in summer. If we plot this data we will find it some noisy. 446 | - Now lets compute the Exponentially weighted averages: 447 | ``` 448 | V0 = 0 449 | V1 = 0.9 * V0 + 0.1 * t(1) = 4 # 0.9 and 0.1 are hyperparameters 450 | V2 = 0.9 * V1 + 0.1 * t(2) = 8.5 451 | V3 = 0.9 * V2 + 0.1 * t(3) = 12.15 452 | ... 453 | ``` 454 | - General equation 455 | ``` 456 | V(t) = beta * v(t-1) + (1-beta) * theta(t) 457 | ``` 458 | - If we plot this it will represent averages over `~ (1 / (1 - beta))` entries: 459 | - `beta = 0.9` will average last 10 entries 460 | - `beta = 0.98` will average last 50 entries 461 | - `beta = 0.5` will average last 2 entries 462 | - Best beta average for our case is between 0.9 and 0.98 463 | - **Intuition**: The reason why exponentially weighted averages are useful for further optimizing gradient descent algorithm is that it can give different weights to recent data points (`theta`) based on value of `beta`. If `beta` is high (around 0.9), it smoothens out the averages of skewed data points (oscillations w.r.t. Gradient descent terminology). 
So this reduces the oscillations in gradient descent and hence gives a faster and smoother path towards the minimum.
464 | - Another visual example:
465 |   ![](Images/Nasdaq1_small.png)
466 |   _(taken from [investopedia.com](https://www.investopedia.com/))_
467 |
468 | ### Understanding exponentially weighted averages
469 |
470 | - Intuitions:
471 |   ![](Images/05-_exponentially_weighted_averages_intuitions.png)
472 | - We can implement this algorithm with more accurate results using a moving window. But the code is more efficient and faster using the exponentially weighted averages algorithm.
473 | - The algorithm is very simple:
474 |   ```
475 |   v = 0
476 |   Repeat
477 |   {
478 |       Get theta(t)
479 |       v = beta * v + (1-beta) * theta(t)
480 |   }
481 |   ```
482 |
483 | ### Bias correction in exponentially weighted averages
484 |
485 | - The bias correction helps make the exponentially weighted averages more accurate.
486 | - Because `v(0) = 0`, the bias of the weighted averages is shifted and the accuracy suffers at the start.
487 | - To solve the bias issue we have to use this equation:
488 |   ```
489 |   v(t) = (beta * v(t-1) + (1-beta) * theta(t)) / (1 - beta^t)
490 |   ```
491 | - As t becomes larger, `(1 - beta^t)` gets close to `1` and the correction fades out.
492 |
493 | ### Gradient descent with momentum
494 |
495 | - The momentum algorithm almost always works faster than standard gradient descent.
496 | - The simple idea is to calculate the exponentially weighted averages of your gradients and then update your weights with those values.
497 | - Pseudo code:
498 |   ```
499 |   vdW = 0, vdb = 0
500 |   on iteration t:
501 |       # can be mini-batch or batch gradient descent
502 |       compute dW, db on current mini-batch
503 |
504 |       vdW = beta * vdW + (1 - beta) * dW
505 |       vdb = beta * vdb + (1 - beta) * db
506 |       W = W - learning_rate * vdW
507 |       b = b - learning_rate * vdb
508 |   ```
509 | - Momentum helps the cost function get to the minimum point in a faster and more consistent way.
510 | - `beta` is another `hyperparameter`. `beta = 0.9` is very common and works very well in most cases.
511 | - In practice people usually don't bother implementing **bias correction** here.
512 |
513 | ### RMSprop
514 |
515 | - Stands for **Root mean square prop**.
516 | - This algorithm speeds up gradient descent.
517 | - Pseudo code:
518 |   ```
519 |   sdW = 0, sdb = 0
520 |   on iteration t:
521 |       # can be mini-batch or batch gradient descent
522 |       compute dW, db on current mini-batch
523 |
524 |       sdW = (beta * sdW) + (1 - beta) * dW^2  # squaring is element-wise
525 |       sdb = (beta * sdb) + (1 - beta) * db^2  # squaring is element-wise
526 |       W = W - learning_rate * dW / sqrt(sdW)
527 |       b = b - learning_rate * db / sqrt(sdb)
528 |   ```
529 | - RMSprop will make the cost function move slower in the vertical direction and faster in the horizontal direction in the following example:
530 |   ![](Images/06-_RMSprop.png)
531 | - Ensure that `sdW` is not zero by adding a small value `epsilon` (e.g. `epsilon = 10^-8`) to the denominator:
532 |   `W = W - learning_rate * dW / (sqrt(sdW) + epsilon)`
533 | - With RMSprop you can increase your learning rate.
534 | - Developed by Geoffrey Hinton and first introduced in a [Coursera.org](https://www.coursera.org/) course rather than in a paper.
535 |
536 | ### Adam optimization algorithm
537 |
538 | - Stands for **Adaptive Moment Estimation**.
539 | - Adam and RMSprop are among the optimization algorithms that have worked very well across a lot of NN architectures.
540 | - Adam optimization simply puts RMSprop and momentum together!
541 | - Pseudo code:
542 |   ```
543 |   vdW = 0, vdb = 0
544 |   sdW = 0, sdb = 0
545 |   on iteration t:
546 |       # can be mini-batch or batch gradient descent
547 |       compute dW, db on current mini-batch
548 |
549 |       vdW = (beta1 * vdW) + (1 - beta1) * dW  # momentum
550 |       vdb = (beta1 * vdb) + (1 - beta1) * db  # momentum
551 |
552 |       sdW = (beta2 * sdW) + (1 - beta2) * dW^2  # RMSprop
553 |       sdb = (beta2 * sdb) + (1 - beta2) * db^2  # RMSprop
554 |
555 |       vdW = vdW / (1 - beta1^t)  # fixing bias
556 |       vdb = vdb / (1 - beta1^t)  # fixing bias
557 |
558 |       sdW = sdW / (1 - beta2^t)  # fixing bias
559 |       sdb = sdb / (1 - beta2^t)  # fixing bias
560 |
561 |       W = W - learning_rate * vdW / (sqrt(sdW) + epsilon)
562 |       b = b - learning_rate * vdb / (sqrt(sdb) + epsilon)
563 |   ```
564 | - Hyperparameters for Adam:
565 |   - Learning rate: needs to be tuned.
566 |   - `beta1`: parameter of the momentum term - `0.9` is recommended by default.
567 |   - `beta2`: parameter of the RMSprop term - `0.999` is recommended by default.
568 |   - `epsilon`: `10^-8` is recommended by default.
569 |
570 | ### Learning rate decay
571 |
572 | - Slowly reduce the learning rate over time.
573 | - As mentioned before, mini-batch gradient descent won't exactly reach the optimum point (converge). But by making the learning rate decay with iterations it will end up much closer to it, because the steps (and possible oscillations) near the optimum are smaller.
574 | - One common technique is: `learning_rate = (1 / (1 + decay_rate * epoch_num)) * learning_rate_0`
575 |   - an `epoch` here is one pass over all the data (not a single mini-batch).
576 | - Other learning rate decay methods (continuous):
577 |   - `learning_rate = (0.95 ^ epoch_num) * learning_rate_0`
578 |   - `learning_rate = (k / sqrt(epoch_num)) * learning_rate_0`
579 | - Some people perform learning rate decay discretely - repeatedly decrease it after some number of epochs.
580 | - Some people even change the learning rate manually.
581 | - `decay_rate` is another `hyperparameter`.
582 | - For Andrew Ng, learning rate decay is a lower-priority hyperparameter to tune.
583 |
584 | ### The problem of local optima
585 |
586 | - Classic local optima are unlikely to appear in a deep neural network because the parameter space is usually very high dimensional. For a point to be a local optimum it has to be a local optimum in every dimension, which is highly unlikely.
587 | - It's unlikely to get stuck in a bad local optimum in high dimensions; it is much more likely to reach a saddle point than a local optimum, which is not a problem.
588 | - Plateaus can make learning slow:
589 |   - A plateau is a region where the derivative is close to zero for a long time.
590 |   - This is where algorithms like momentum, RMSprop or Adam can help.
591 |
592 |
593 |
594 | ## Hyperparameter tuning, Batch Normalization and Programming Frameworks
595 |
596 | ### Tuning process
597 |
598 | - We need to tune our hyperparameters to get the best out of them.
599 | - Hyperparameter importance (according to Andrew Ng):
600 |   1. Learning rate.
601 |   2. Momentum beta.
602 |   3. Mini-batch size.
603 |   4. No. of hidden units.
604 |   5. No. of layers.
605 |   6. Learning rate decay.
606 |   7. Regularization lambda.
607 |   8. Activation functions.
608 |   9. Adam `beta1`, `beta2` & `epsilon`.
609 | - It's hard to decide which hyperparameter is the most important for a problem; it depends a lot on your problem.
610 | - One way to tune is to sample a grid with `N` hyperparameter settings and then try all the combinations on your problem.
611 | - It's better to try random values instead - don't use a grid (a minimal sampling sketch follows below).
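- A minimal sketch of sampling random hyperparameter settings instead of a grid (the specific ranges are assumptions for illustration; the log-scale trick is explained in the next section):

  ```python
  import numpy as np

  def sample_random_setting():
      learning_rate = 10 ** (-4 * np.random.rand())        # log-scale sample, roughly 0.0001 to 1
      beta = 1 - 10 ** (-2 * np.random.rand() - 1)         # momentum beta in [0.9, 0.999)
      hidden_units = np.random.randint(50, 101)            # a uniform scale is fine here
      mini_batch_size = 2 ** np.random.randint(6, 11)      # a power of 2: 64 ... 1024
      return learning_rate, beta, hidden_units, mini_batch_size

  settings = [sample_random_setting() for _ in range(25)]  # try 25 random settings
  ```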
612 | - You can use `Coarse to fine sampling scheme`: 613 | - When you find some hyperparameters values that give you a better performance - zoom into a smaller region around these values and sample more densely within this space. 614 | - These methods can be automated. 615 | 616 | ### Using an appropriate scale to pick hyperparameters 617 | 618 | - Let's say you have a specific range for a hyperparameter from "a" to "b". It's better to search for the right ones using the logarithmic scale rather then in linear scale: 619 | - Calculate: `a_log = log(a) # e.g. a = 0.0001 then a_log = -4` 620 | - Calculate: `b_log = log(b) # e.g. b = 1 then b_log = 0` 621 | - Then: 622 | ``` 623 | r = (a_log - b_log) * np.random.rand() + b_log 624 | # In the example the range would be from [-4, 0] because rand range [0,1) 625 | result = 10^r 626 | ``` 627 | It uniformly samples values in log scale from [a,b]. 628 | - If we want to use the last method on exploring on the "momentum beta": 629 | - Beta best range is from 0.9 to 0.999. 630 | - You should search for `1 - beta in range 0.001 to 0.1 (1 - 0.9 and 1 - 0.999)` and the use `a = 0.001` and `b = 0.1`. Then: 631 | ``` 632 | a_log = -3 633 | b_log = -1 634 | r = (a_log - b_log) * np.random.rand() + b_log 635 | beta = 1 - 10^r # because 1 - beta = 10^r 636 | ``` 637 | 638 | ### Hyperparameters tuning in practice: Pandas vs. Caviar 639 | 640 | - Intuitions about hyperparameter settings from one application area may or may not transfer to a different one. 641 | - If you don't have much computational resources you can use the "babysitting model": 642 | - Day 0 you might initialize your parameter as random and then start training. 643 | - Then you watch your learning curve gradually decrease over the day. 644 | - And each day you nudge your parameters a little during training. 645 | - Called panda approach. 646 | - If you have enough computational resources, you can run some models in parallel and at the end of the day(s) you check the results. 647 | - Called Caviar approach. 648 | 649 | ### Normalizing activations in a network 650 | 651 | - In the rise of deep learning, one of the most important ideas has been an algorithm called **batch normalization**, created by two researchers, Sergey Ioffe and Christian Szegedy. 652 | - Batch Normalization speeds up learning. 653 | - Before we normalized input by subtracting the mean and dividing by variance. This helped a lot for the shape of the cost function and for reaching the minimum point faster. 654 | - The question is: *for any hidden layer can we normalize `A[l]` to train `W[l+1]`, `b[l+1]` faster?* This is what batch normalization is about. 655 | - There are some debates in the deep learning literature about whether you should normalize values before the activation function `Z[l]` or after applying the activation function `A[l]`. In practice, normalizing `Z[l]` is done much more often and that is what Andrew Ng presents. 656 | - Algorithm: 657 | - Given `Z[l] = [z(1), ..., z(m)]`, i = 1 to m (for each input) 658 | - Compute `mean = 1/m * sum(z[i])` 659 | - Compute `variance = 1/m * sum((z[i] - mean)^2)` 660 | - Then `Z_norm[i] = (z[i] - mean) / np.sqrt(variance + epsilon)` (add `epsilon` for numerical stability if variance = 0) 661 | - Forcing the inputs to a distribution with zero mean and variance of 1. 662 | - Then `Z_tilde[i] = gamma * Z_norm[i] + beta` 663 | - To make inputs belong to other distribution (with other mean and variance). 664 | - gamma and beta are learnable parameters of the model. 
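- To make the algorithm above concrete, a minimal NumPy sketch of the batch-norm step for one layer (assuming `Z` has shape `(n[l], m)` and `gamma`, `beta` have shape `(n[l], 1)`):

  ```python
  import numpy as np

  def batchnorm_forward(Z, gamma, beta, epsilon=1e-8):
      mean = np.mean(Z, axis=1, keepdims=True)            # per-unit mean over the mini-batch
      variance = np.var(Z, axis=1, keepdims=True)         # per-unit variance over the mini-batch
      Z_norm = (Z - mean) / np.sqrt(variance + epsilon)   # zero mean, unit variance
      Z_tilde = gamma * Z_norm + beta                     # learnable shift to another distribution
      return Z_tilde
  ```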
665 |   - This lets the NN learn the distribution of the outputs that works best.
666 |   - _Note:_ if `gamma = sqrt(variance + epsilon)` and `beta = mean` then `Z_tilde[i] = z[i]`
667 |
668 | ### Fitting Batch Normalization into a neural network
669 |
670 | - Using batch norm in a 3-hidden-layer NN:
671 |   ![](Images/bn.png)
672 | - Our NN parameters will be:
673 |   - `W[1]`, `b[1]`, ..., `W[L]`, `b[L]`, `beta[1]`, `gamma[1]`, ..., `beta[L]`, `gamma[L]`
674 |   - `beta[1]`, `gamma[1]`, ..., `beta[L]`, `gamma[L]` are updated using any optimization algorithm (like GD, RMSprop, Adam)
675 | - If you are using a deep learning framework, you won't have to implement batch norm yourself:
676 |   - Ex. in TensorFlow you can use: `tf.nn.batch_normalization()`
677 | - Batch normalization is usually applied with mini-batches.
678 | - If we are using batch normalization, the parameters `b[1]`, ..., `b[L]` don't matter because they are eliminated by the mean-subtraction step, so:
679 |   ```
680 |   Z[l] = W[l]A[l-1] + b[l] => Z[l] = W[l]A[l-1]
681 |   Z_norm[l] = ...
682 |   Z_tilde[l] = gamma[l] * Z_norm[l] + beta[l]
683 |   ```
684 |   - Taking the mean of a constant `b[l]` will eliminate the `b[l]`
685 | - So if you are using batch normalization, you can remove `b[l]` or keep it always zero.
686 | - So the parameters will be `W[l]`, `beta[l]`, and `gamma[l]`.
687 | - Shapes:
688 |   - `Z[l] - (n[l], m)`
689 |   - `beta[l] - (n[l], 1)`
690 |   - `gamma[l] - (n[l], 1)`
691 |
692 | ### Why does Batch normalization work?
693 |
694 | - The first reason is the same reason we normalize X: it speeds up learning.
695 | - The second reason is that batch normalization reduces the problem of the inputs to later layers shifting around as the earlier weights change.
696 | - Batch normalization does some regularization:
697 |   - Each mini-batch is scaled by the mean/variance computed on that mini-batch.
698 |   - This adds some noise to the values `Z[l]` within that mini-batch. So, similar to dropout, it adds some noise to each hidden layer's activations.
699 |   - This has a slight regularization effect.
700 |   - Using a bigger mini-batch size reduces this noise and therefore the regularization effect.
701 |   - Don't rely on batch normalization as a regularizer. It's intended to normalize hidden units and activations and therefore speed up learning. For regularization use other regularization techniques (L2 or dropout).
702 |
703 | ### Batch normalization at test time
704 |
705 | - When we train a NN with Batch normalization, we compute the mean and the variance of each mini-batch.
706 | - In testing we might need to process examples one at a time, and the mean and the variance of one example won't make sense.
707 | - We have to compute estimated values of the mean and variance to use at test time.
708 | - We can use an exponentially weighted average across the mini-batches seen during training.
709 | - We then use these estimated values of the mean and variance at test time.
710 | - This method is also sometimes called a "running average".
711 | - In practice you will most often use a deep learning framework, and it will contain a default implementation of this.
712 |
713 | ### Softmax Regression
714 |
715 | - Every example we have used so far involved binary classification.
716 | - There is a generalization of logistic regression called Softmax regression that is used for multiclass classification.
717 | - For example if we are classifying by classes `dog`, `cat`, `baby chick` and `none of that` 718 | - Dog `class = 1` 719 | - Cat `class = 2` 720 | - Baby chick `class = 3` 721 | - None `class = 0` 722 | - To represent a dog vector `y = [0 1 0 0]` 723 | - To represent a cat vector `y = [0 0 1 0]` 724 | - To represent a baby chick vector `y = [0 0 0 1]` 725 | - To represent a none vector `y = [1 0 0 0]` 726 | - Notations: 727 | - `C = no. of classes` 728 | - Range of classes is `(0, ..., C-1)` 729 | - In output layer `Ny = C` 730 | - Each of C values in the output layer will contain a probability of the example to belong to each of the classes. 731 | - In the last layer we will have to activate the Softmax activation function instead of the sigmoid activation. 732 | - Softmax activation equations: 733 | ``` 734 | t = e^(Z[L]) # shape(C, m) 735 | A[L] = e^(Z[L]) / sum(t) # shape(C, m), sum(t) - sum of t's for each example (shape (1, m)) 736 | ``` 737 | 738 | ### Training a Softmax classifier 739 | 740 | - There's an activation which is called hard max, which gets 1 for the maximum value and zeros for the others. 741 | - If you are using NumPy, its `np.max` over the vertical axis. 742 | - The Softmax name came from softening the values and not harding them like hard max. 743 | - Softmax is a generalization of logistic activation function to `C` classes. If `C = 2` softmax reduces to logistic regression. 744 | - The loss function used with softmax: 745 | ``` 746 | L(y, y_hat) = - sum(y[j] * log(y_hat[j])) # j = 0 to C-1 747 | ``` 748 | - The cost function used with softmax: 749 | ``` 750 | J(w[1], b[1], ...) = - 1 / m * (sum(L(y[i], y_hat[i]))) # i = 0 to m 751 | ``` 752 | - Back propagation with softmax: 753 | ``` 754 | dZ[L] = Y_hat - Y 755 | ``` 756 | - The derivative of softmax is: 757 | ``` 758 | Y_hat * (1 - Y_hat) 759 | ``` 760 | - Example: 761 | ![](Images/07-_softmax.png) 762 | 763 | ### Deep learning frameworks 764 | 765 | - It's not practical to implement everything from scratch. Our numpy implementations were to know how NN works. 766 | - There are many good deep learning frameworks. 767 | - Deep learning is now in the phase of doing something with the frameworks and not from scratch to keep on going. 768 | - Here are some of the leading deep learning frameworks: 769 | - Caffe/ Caffe2 770 | - CNTK 771 | - DL4j 772 | - Keras 773 | - Lasagne 774 | - mxnet 775 | - PaddlePaddle 776 | - TensorFlow 777 | - Theano 778 | - Torch/Pytorch 779 | - These frameworks are getting better month by month. Comparison between them can be found [here](https://en.wikipedia.org/wiki/Comparison_of_deep_learning_software). 780 | - How to choose deep learning framework: 781 | - Ease of programming (development and deployment) 782 | - Running speed 783 | - Truly open (open source with good governance) 784 | - Programming frameworks can not only shorten your coding time but sometimes also perform optimizations that speed up your code. 785 | 786 | ### TensorFlow 787 | 788 | - In this section we will learn the basic structure of TensorFlow programs. 
789 | - Lets see how to implement a minimization function: 790 | - Example function: `J(w) = w^2 - 10w + 25` 791 | - The result should be `w = 5` as the function is `(w-5)^2 = 0` 792 | - Code v.1: 793 | ```python 794 | import numpy as np 795 | import tensorflow as tf 796 | 797 | 798 | w = tf.Variable(0, dtype=tf.float32) # creating a variable w 799 | cost = tf.add(tf.add(w**2, tf.multiply(-10.0, w)), 25.0) # can be written as this - cost = w**2 - 10*w + 25 800 | train = tf.train.GradientDescentOptimizer(0.01).minimize(cost) 801 | 802 | init = tf.global_variables_initializer() 803 | session = tf.Session() 804 | session.run(init) 805 | session.run(w) # Runs the definition of w, if you print this it will print zero 806 | session.run(train) 807 | 808 | print("W after one iteration:", session.run(w)) 809 | 810 | for i in range(1000): 811 | session.run(train) 812 | 813 | print("W after 1000 iterations:", session.run(w)) 814 | ``` 815 | - Code v.2 (we feed the inputs to the algorithm through coefficients): 816 | 817 | ```python 818 | import numpy as np 819 | import tensorflow as tf 820 | 821 | 822 | coefficients = np.array([[1.], [-10.], [25.]]) 823 | 824 | x = tf.placeholder(tf.float32, [3, 1]) 825 | w = tf.Variable(0, dtype=tf.float32) # Creating a variable w 826 | cost = x[0][0]*w**2 + x[1][0]*w + x[2][0] 827 | 828 | train = tf.train.GradientDescentOptimizer(0.01).minimize(cost) 829 | 830 | init = tf.global_variables_initializer() 831 | session = tf.Session() 832 | session.run(init) 833 | session.run(w) # Runs the definition of w, if you print this it will print zero 834 | session.run(train, feed_dict={x: coefficients}) 835 | 836 | print("W after one iteration:", session.run(w)) 837 | 838 | for i in range(1000): 839 | session.run(train, feed_dict={x: coefficients}) 840 | 841 | print("W after 1000 iterations:", session.run(w)) 842 | ``` 843 | - In TensorFlow you implement only the forward propagation and TensorFlow will do the backpropagation by itself. 844 | - In TensorFlow a placeholder is a variable you can assign a value to later. 845 | - If you are using a mini-batch training you should change the `feed_dict={x: coefficients}` to the current mini-batch data. 846 | - Almost all TensorFlow programs use this: 847 | ```python 848 | with tf.Session() as session: # better for cleaning up in case of error/exception 849 | session.run(init) 850 | session.run(w) 851 | ``` 852 | - In deep learning frameworks there are a lot of things that you can do with one line of code like changing the optimizer. 853 | _**Side notes:**_ 854 | - Writing and running programs in TensorFlow has the following steps: 855 | 1. Create Tensors (variables) that are not yet executed/evaluated. 856 | 2. Write operations between those Tensors. 857 | 3. Initialize your Tensors. 858 | 4. Create a Session. 859 | 5. Run the Session. This will run the operations you'd written above. 860 | - Instead of needing to write code to compute the cost function we know, we can use this line in TensorFlow : 861 | `tf.nn.sigmoid_cross_entropy_with_logits(logits = ..., labels = ...)` 862 | - To initialize weights in NN using TensorFlow use: 863 | ``` 864 | W1 = tf.get_variable("W1", [25,12288], initializer = tf.contrib.layers.xavier_initializer(seed = 1)) 865 | 866 | b1 = tf.get_variable("b1", [25,1], initializer = tf.zeros_initializer()) 867 | ``` 868 | - For 3-layer NN, it is important to note that the forward propagation stops at `Z3`. 
The reason is that in TensorFlow the last linear layer output is given as input to the function computing the loss. Therefore, you don't need `A3`! 869 | - To reset the graph use `tf.reset_default_graph()` 870 | 871 | ## Extra Notes 872 | 873 | - If you want a good papers in deep learning look at the ICLR proceedings (Or NIPS proceedings) and that will give you a really good view of the field. 874 | - Who is Yuanqing Lin? 875 | - Head of Baidu research. 876 | - First one to win ImageNet 877 | - Works in PaddlePaddle deep learning platform. 878 | 879 | 880 | 881 | 882 | 883 | 884 | 885 | 886 | 887 | 888 |

889 |

890 | These Notes were made by [Mahmoud Badry](mailto:mma18@fayoum.edu.eg) @2017 891 | -------------------------------------------------------------------------------- /3- Structuring Machine Learning Projects/Images/01-_Why_human-level_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/3- Structuring Machine Learning Projects/Images/01-_Why_human-level_performance.png -------------------------------------------------------------------------------- /3- Structuring Machine Learning Projects/Readme.md: -------------------------------------------------------------------------------- 1 | # Structuring Machine Learning Projects 2 | 3 | This is the third course of the deep learning specialization at [Coursera](https://www.coursera.org/specializations/deep-learning) which is moderated by [DeepLearning.ai](http://deeplearning.ai/). The course is taught by Andrew Ng. 4 | 5 | ## Table of contents 6 | 7 | * [Structuring Machine Learning Projects](#structuring-machine-learning-projects) 8 | * [Table of contents](#table-of-contents) 9 | * [Course summary](#course-summary) 10 | * [ML Strategy 1](#ml-strategy-1) 11 | * [Why ML Strategy](#why-ml-strategy) 12 | * [Orthogonalization](#orthogonalization) 13 | * [Single number evaluation metric](#single-number-evaluation-metric) 14 | * [Satisfying and Optimizing metric](#satisfying-and-optimizing-metric) 15 | * [Train/dev/test distributions](#traindevtest-distributions) 16 | * [Size of the dev and test sets](#size-of-the-dev-and-test-sets) 17 | * [When to change dev/test sets and metrics](#when-to-change-devtest-sets-and-metrics) 18 | * [Why human-level performance?](#why-human-level-performance) 19 | * [Avoidable bias](#avoidable-bias) 20 | * [Understanding human-level performance](#understanding-human-level-performance) 21 | * [Surpassing human-level performance](#surpassing-human-level-performance) 22 | * [Improving your model performance](#improving-your-model-performance) 23 | * [ML Strategy 2](#ml-strategy-2) 24 | * [Carrying out error analysis](#carrying-out-error-analysis) 25 | * [Cleaning up incorrectly labeled data](#cleaning-up-incorrectly-labeled-data) 26 | * [Build your first system quickly, then iterate](#build-your-first-system-quickly-then-iterate) 27 | * [Training and testing on different distributions](#training-and-testing-on-different-distributions) 28 | * [Bias and Variance with mismatched data distributions](#bias-and-variance-with-mismatched-data-distributions) 29 | * [Addressing data mismatch](#addressing-data-mismatch) 30 | * [Transfer learning](#transfer-learning) 31 | * [Multi-task learning](#multi-task-learning) 32 | * [What is end-to-end deep learning?](#what-is-end-to-end-deep-learning) 33 | * [Whether to use end-to-end deep learning](#whether-to-use-end-to-end-deep-learning) 34 | 35 | ## Course summary 36 | 37 | Here are the course summary as its given on the course [link](https://www.coursera.org/learn/machine-learning-projects): 38 | 39 | > You will learn how to build a successful machine learning project. If you aspire to be a technical leader in AI, and know how to set direction for your team's work, this course will show you how. 40 | > 41 | > Much of this content has never been taught elsewhere, and is drawn from my experience building and shipping many deep learning products. 
This course also has two "flight simulators" that let you practice decision-making as a machine learning project leader. This provides "industry experience" that you might otherwise get only after years of ML work experience. 42 | > 43 | > After 2 weeks, you will: 44 | > - Understand how to diagnose errors in a machine learning system, and 45 | > - Be able to prioritize the most promising directions for reducing error 46 | > - Understand complex ML settings, such as mismatched training/test sets, and comparing to and/or surpassing human-level performance 47 | > - Know how to apply end-to-end learning, transfer learning, and multi-task learning 48 | > 49 | > I've seen teams waste months or years through not understanding the principles taught in this course. I hope this two week course will save you months of time. 50 | > 51 | > This is a standalone course, and you can take this so long as you have basic machine learning knowledge. This is the third course in the Deep Learning Specialization. 52 | 53 | 54 | 55 | ## ML Strategy 1 56 | 57 | ### Why ML Strategy 58 | 59 | - You have a lot of ideas for how to improve the accuracy of your deep learning system: 60 | - Collect more data. 61 | - Collect more diverse training set. 62 | - Train algorithm longer with gradient descent. 63 | - Try different optimization algorithm (e.g. Adam). 64 | - Try bigger network. 65 | - Try smaller network. 66 | - Try dropout. 67 | - Add L2 regularization. 68 | - Change network architecture (activation functions, # of hidden units, etc.) 69 | - This course will give you some strategies to help analyze your problem to go in a direction that will help you get better results. 70 | 71 | ### Orthogonalization 72 | 73 | - Some deep learning developers know exactly what hyperparameter to tune in order to try to achieve one effect. This is a process we call orthogonalization. 74 | - In orthogonalization, you have some controls, but each control does a specific task and doesn't affect other controls. 75 | - For a supervised learning system to do well, you usually need to tune the knobs of your system to make sure that four things hold true - chain of assumptions in machine learning: 76 | 1. You'll have to fit training set well on cost function (near human level performance if possible). 77 | - If it's not achieved you could try bigger network, another optimization algorithm (like Adam)... 78 | 2. Fit dev set well on cost function. 79 | - If its not achieved you could try regularization, bigger training set... 80 | 3. Fit test set well on cost function. 81 | - If its not achieved you could try bigger dev. set... 82 | 4. Performs well in real world. 83 | - If its not achieved you could try change dev. set, change cost function... 84 | 85 | ### Single number evaluation metric 86 | 87 | - Its better and faster to set a single number evaluation metric for your project before you start it. 88 | - Difference between precision and recall (in cat classification example): 89 | - Suppose we run the classifier on 10 images which are 5 cats and 5 non-cats. The classifier identifies that there are 4 cats, but it identified 1 wrong cat. 
90 | - Confusion matrix: 91 | 92 | | | Predicted cat | Predicted non-cat | 93 | | -------------- | -------------- | ----------------- | 94 | | Actual cat | 3 | 2 | 95 | | Actual non-cat | 1 | 4 | 96 | - **Precision**: percentage of true cats in the recognized result: P = 3/(3 + 1) 97 | - **Recall**: percentage of true recognition cat of the all cat predictions: R = 3/(3 + 2) 98 | - **Accuracy**: (3+4)/10 99 | - Using a precision/recall for evaluation is good in a lot of cases, but separately they don't tell you which algothims is better. Ex: 100 | 101 | | Classifier | Precision | Recall | 102 | | ---------- | --------- | ------ | 103 | | A | 95% | 90% | 104 | | B | 98% | 85% | 105 | - A better thing is to combine precision and recall in one single (real) number evaluation metric. There a metric called `F1` score, which combines them 106 | - You can think of `F1` score as an average of precision and recall 107 | `F1 = 2 / ((1/P) + (1/R))` 108 | 109 | ### Satisfying and Optimizing metric 110 | 111 | - Its hard sometimes to get a single number evaluation metric. Ex: 112 | 113 | | Classifier | F1 | Running time | 114 | | ---------- | ---- | ------------ | 115 | | A | 90% | 80 ms | 116 | | B | 92% | 95 ms | 117 | | C | 92% | 1,500 ms | 118 | - So we can solve that by choosing a single optimizing metric and decide that other metrics are satisfying. Ex: 119 | ``` 120 | Maximize F1 # optimizing metric 121 | subject to running time < 100ms # satisficing metric 122 | ``` 123 | - So as a general rule: 124 | ``` 125 | Maximize 1 # optimizing metric (one optimizing metric) 126 | subject to N-1 # satisficing metric (N-1 satisficing metrics) 127 | ``` 128 | 129 | ### Train/dev/test distributions 130 | 131 | - Dev and test sets have to come from the same distribution. 132 | - Choose dev set and test set to reflect data you expect to get in the future and consider important to do well on. 133 | - Setting up the dev set, as well as the validation metric is really defining what target you want to aim at. 134 | 135 | ### Size of the dev and test sets 136 | 137 | - An old way of splitting the data was 70% training, 30% test or 60% training, 20% dev, 20% test. 138 | - The old way was valid for a number of examples ~ <100000 139 | - In the modern deep learning if you have a million or more examples a reasonable split would be 98% training, 1% dev, 1% test. 140 | 141 | ### When to change dev/test sets and metrics 142 | 143 | - Let's take an example. In a cat classification example we have these metric results: 144 | 145 | | Metric | Classification error | 146 | | ----------- | ------------------------------------------------------------ | 147 | | Algorithm A | 3% error (But a lot of porn images are treated as cat images here) | 148 | | Algorithm B | 5% error | 149 | - In the last example if we choose the best algorithm by metric it would be "A", but if the users decide it will be "B" 150 | - Thus in this case, we want and need to change our metric. 151 | - `OldMetric = (1/m) * sum(y_pred[i] != y[i] ,m)` 152 | - Where m is the number of Dev set items. 153 | - `NewMetric = (1/sum(w[i])) * sum(w[i] * (y_pred[i] != y[i]) ,m)` 154 | - where: 155 | - `w[i] = 1 if x[i] is not porn` 156 | - `w[i] = 10 if x[i] is porn` 157 | 158 | - This is actually an example of an orthogonalization where you should take a machine learning problem and break it into distinct steps: 159 | 160 | 1. Figure out how to define a metric that captures what you want to do - place the target. 161 | 2. 
Worry about how to actually do well on this metric - how to aim/shoot accurately at the target. 162 | 163 | - Conclusion: if doing well on your metric + dev/test set doesn't correspond to doing well in your application, change your metric and/or dev/test set. 164 | 165 | ### Why human-level performance? 166 | 167 | - We compare to human-level performance because of two main reasons: 168 | 1. Because of advances in deep learning, machine learning algorithms are suddenly working much better and so it has become much more feasible in a lot of application areas for machine learning algorithms to actually become competitive with human-level performance. 169 | 2. It turns out that the workflow of designing and building a machine learning system is much more efficient when you're trying to do something that humans can also do. 170 | - After an algorithm reaches the human level performance the progress and accuracy slow down. 171 | ![01- Why human-level performance](Images/01-_Why_human-level_performance.png) 172 | - You won't surpass an error that's called "Bayes optimal error". 173 | - There isn't much error range between human-level error and Bayes optimal error. 174 | - Humans are quite good at a lot of tasks. So as long as Machine learning is worse than humans, you can: 175 | - Get labeled data from humans. 176 | - Gain insight from manual error analysis: why did a person get it right? 177 | - Better analysis of bias/variance. 178 | 179 | ### Avoidable bias 180 | 181 | - Suppose that the cat classification algorithm gives these results: 182 | 183 | | Humans | 1% | 7.5% | 184 | | ------------------ | ---- | ---- | 185 | | **Training error** | 8% | 8% | 186 | | **Dev Error** | 10% | 10% | 187 | - In the left example, because the human level error is 1% then we have to focus on the **bias**. 188 | - In the right example, because the human level error is 7.5% then we have to focus on the **variance**. 189 | - The human-level error as a proxy (estimate) for Bayes optimal error. Bayes optimal error is always less (better), but human-level in most cases is not far from it. 190 | - You can't do better than Bayes error unless you are overfitting. 191 | - `Avoidable bias = Training error - Human (Bayes) error` 192 | - `Variance = Dev error - Training error` 193 | 194 | ### Understanding human-level performance 195 | 196 | - When choosing human-level performance, it has to be chosen in the terms of what you want to achieve with the system. 197 | - You might have multiple human-level performances based on the human experience. Then you choose the human-level performance (proxy for Bayes error) that is more suitable for the system you're trying to build. 198 | - Improving deep learning algorithms is harder once you reach a human-level performance. 199 | - Summary of bias/variance with human-level performance: 200 | 1. human-level error (proxy for Bayes error) 201 | - Calculate `avoidable bias = training error - human-level error` 202 | - If **avoidable bias** difference is the bigger, then it's *bias* problem and you should use a strategy for **bias** resolving. 203 | 2. training error 204 | - Calculate `variance = dev error - training error` 205 | - If **variance** difference is bigger, then you should use a strategy for **variance** resolving. 206 | 3. Dev error 207 | - So having an estimate of human-level performance gives you an estimate of Bayes error. 
And this allows you to more quickly make decisions as to whether you should focus on trying to reduce a bias or trying to reduce the variance of your algorithm. 208 | - These techniques will tend to work well until you surpass human-level performance, whereupon you might no longer have a good estimate of Bayes error that still helps you make this decision really clearly. 209 | 210 | ### Surpassing human-level performance 211 | 212 | - In some problems, deep learning has surpassed human-level performance. Like: 213 | - Online advertising. 214 | - Product recommendation. 215 | - Loan approval. 216 | - The last examples are not natural perception task, rather learning on structural data. Humans are far better in natural perception tasks like computer vision and speech recognition. 217 | - It's harder for machines to surpass human-level performance in natural perception task. But there are already some systems that achieved it. 218 | 219 | ### Improving your model performance 220 | 221 | - The two fundamental asssumptions of supervised learning: 222 | 1. You can fit the training set pretty well. This is roughly saying that you can achieve low **avoidable bias**. 223 | 2. The training set performance generalizes pretty well to the dev/test set. This is roughly saying that **variance** is not too bad. 224 | - To improve your deep learning supervised system follow these guidelines: 225 | 1. Look at the difference between human level error and the training error - **avoidable bias**. 226 | 2. Look at the difference between the dev/test set and training set error - **Variance**. 227 | 3. If **avoidable bias** is large you have these options: 228 | - Train bigger model. 229 | - Train longer/better optimization algorithm (like Momentum, RMSprop, Adam). 230 | - Find better NN architecture/hyperparameters search. 231 | 4. If **variance** is large you have these options: 232 | - Get more training data. 233 | - Regularization (L2, Dropout, data augmentation). 234 | - Find better NN architecture/hyperparameters search. 235 | 236 | 237 | 238 | ## ML Strategy 2 239 | 240 | ### Carrying out error analysis 241 | 242 | - Error analysis - process of manually examining mistakes that your algorithm is making. It can give you insights into what to do next. E.g.: 243 | - In the cat classification example, if you have 10% error on your dev set and you want to decrease the error. 244 | - You discovered that some of the mislabeled data are dog pictures that look like cats. Should you try to make your cat classifier do better on dogs (this could take some weeks)? 245 | - Error analysis approach: 246 | - Get 100 mislabeled dev set examples at random. 247 | - Count up how many are dogs. 248 | - if 5 of 100 are dogs then training your classifier to do better on dogs will decrease your error up to 9.5% (called ceiling), which can be too little. 249 | - if 50 of 100 are dogs then you could decrease your error up to 5%, which is reasonable and you should work on that. 250 | - Based on the last example, error analysis helps you to analyze the error before taking an action that could take lot of time with no need. 251 | - Sometimes, you can evaluate multiple error analysis ideas in parallel and choose the best idea. 
264 | ### Cleaning up incorrectly labeled data 265 | 266 | - DL algorithms are quite robust to random errors in the training set but less robust to systematic errors. It's still OK to go and fix these labels if you can. 267 | - If you want to check for mislabeled data in the dev/test set, add a "Mislabeled" column to the error analysis spreadsheet. Ex: 268 | 269 | | Image | Dog | Great cats | Blurry | Mislabeled | Comments | 270 | | ------------ | ------ | ---------- | ------- | ---------- | -------- | 271 | | 1 | ✓ | | | | | 272 | | 2 | ✓ | | ✓ | | | 273 | | 3 | | | | | | 274 | | 4 | | ✓ | | | | 275 | | .... | | | | | | 276 | | **% totals** | **8%** | **43%** | **61%** | **6%** | | 277 | - Then: 278 | - If the overall dev set error is 10% 279 | - then errors due to incorrect labels are 0.6% (6% of the 10%) 280 | - and errors due to other causes are 9.4%, 281 | - so you should focus on the 9.4% of errors rather than on the incorrect labels. 282 | - Consider these guidelines while correcting mislabeled dev/test examples: 283 | - Apply the same process to your dev and test sets to make sure they continue to come from the same distribution. 284 | - Consider examining examples your algorithm got right as well as ones it got wrong (not always done if you have reached a good accuracy). 285 | - Train and dev/test data may now come from slightly different distributions. 286 | - It's very important that the dev and test sets come from the same distribution, but it can be OK for the training set to come from a slightly different distribution. 287 | 288 | ### Build your first system quickly, then iterate 289 | 290 | - The steps to take in a deep learning project: 291 | - Set up the dev/test sets and the metric. 292 | - Build an initial system quickly. 293 | - Use bias/variance analysis & error analysis to prioritize next steps. 294 | 295 | ### Training and testing on different distributions 296 | 297 | - A lot of teams work with deep learning applications whose training sets come from a different distribution than the dev/test sets, because deep learning is hungry for data and you take it from wherever you can. 298 | - There are some strategies to follow when the training set distribution differs from the dev/test set distribution (a concrete split is sketched after this list): 299 | - Option one (not recommended): shuffle all the data together and randomly extract training and dev/test sets. 300 | - Advantage: all the sets now come from the same distribution. 301 | - Disadvantage: the (real-world) distribution that was in the dev/test sets will be underrepresented in the new dev/test sets, and that is usually not what you want to achieve. 302 | - Option two: take some of the examples from the dev/test (target) distribution and add them to the training set, keeping the dev/test sets purely from the distribution you care about. 303 | - Advantage: the distribution you care about is your target now. 304 | - Disadvantage: the training and dev/test distributions are now different, but you will get better performance in the long run. 305 |
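A minimal sketch of option two with hypothetical counts (the 200,000 web images and 10,000 target-distribution images below are made-up numbers for illustration):

```python
import random

random.seed(0)
web_data = [f"web_{i}" for i in range(200_000)]      # easy-to-collect data, different distribution
target_data = [f"app_{i}" for i in range(10_000)]    # data from the distribution we care about
random.shuffle(target_data)

train = web_data + target_data[:5_000]   # training set: all web data + half the target data
dev   = target_data[5_000:7_500]         # dev set: purely target distribution
test  = target_data[7_500:]              # test set: purely target distribution

print(len(train), len(dev), len(test))   # 205000 2500 2500
```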
306 | ### Bias and Variance with mismatched data distributions 307 | 308 | - Bias and variance analysis changes when the training set and the dev/test sets come from different distributions. 309 | - Example: in the cat classification task, suppose you've trained a model and obtained these errors: 310 | - Human error: 0% 311 | - Train error: 1% 312 | - Dev error: 10% 313 | - At first glance this looks like a variance problem, but because the distributions aren't the same you can't tell for sure: it could simply be that the training set was easy to fit while the dev set is harder. 314 | - To resolve this, we carve out a **train-dev set**: a random subset of the training data (so it has the same distribution as the training set) that we do *not* train on, and we get: 315 | - Human error: 0% 316 | - Train error: 1% 317 | - Train-dev error: 9% 318 | - Dev error: 10% 319 | - Now we are sure that this is a high variance problem. 320 | - Suppose we have a different situation: 321 | - Human error: 0% 322 | - Train error: 1% 323 | - Train-dev error: 1.5% 324 | - Dev error: 10% 325 | - In this case we have what is called a *data mismatch* problem. 326 | - Conclusions: 327 | 1. Human-level error (proxy for Bayes error) 328 | 2. Train error 329 | - Calculate `avoidable bias = training error - human-level error` 330 | - If this difference is big, then it's an **avoidable bias** problem and you should use a strategy for reducing **bias**. 331 | 3. Train-dev error 332 | - Calculate `variance = train-dev error - training error` 333 | - If this difference is big, then it's a high **variance** problem and you should use a strategy for reducing variance. 334 | 4. Dev error 335 | - Calculate `data mismatch = dev error - train-dev error` 336 | - If this difference is big, it's a **data mismatch** problem. 337 | 5. Test error 338 | - Calculate `degree of overfitting to dev set = test error - dev error` 339 | - If this difference is big (positive), then you probably need a bigger dev set (the dev and test sets come from the same distribution, so the only way to do much better on the dev set than on the test set is to have somehow overfit the dev set). 340 | - Unfortunately, there aren't many systematic ways to deal with data mismatch. There are some things to try in the next section. 341 | 342 | ### Addressing data mismatch 343 | 344 | - There aren't completely systematic solutions to this, but there are some things you could try: 345 | 1. Carry out manual error analysis to try to understand the differences between the training and dev/test sets. 346 | 2. Make the training data more similar to the dev/test sets, or collect more data similar to them. 347 | - If your goal is to make the training data more similar to your dev set, one technique you can use is **artificial data synthesis**, which can help you create more training data. 348 | - Combine some of your training data with something that converts it toward the dev/test set distribution. 349 | - Examples: 350 | 1. Combine clean audio with car noise to get examples of audio recorded in a car (a sketch follows this section). 351 | 2. Generate cars using 3D graphics in a car classification example. 352 | - Be cautious: you might accidentally be simulating data from only a tiny subset of the space of all possible examples, and your NN may overfit to the synthesized data (e.g. one particular car-noise recording or one particular style of 3D-rendered car). 353 |
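A minimal NumPy sketch of the audio-synthesis example, mixing clean clips with random segments of car noise; the variable names, the 0.3 noise gain, and the fake signals in the smoke test are illustrative assumptions, not part of the course material.

```python
import numpy as np

rng = np.random.default_rng(0)

def add_car_noise(clean_clip, noise_recordings, noise_gain=0.3):
    # Pick a noise recording and a random offset so we don't reuse the same short
    # noise segment for every clip (which the NN could otherwise overfit to).
    noise = noise_recordings[rng.integers(len(noise_recordings))]
    start = rng.integers(0, len(noise) - len(clean_clip) + 1)
    segment = noise[start:start + len(clean_clip)]
    return clean_clip + noise_gain * segment

# Tiny smoke test with fake 16 kHz signals: a 1 s "speech" tone and 10 s noise tracks.
clean = np.sin(np.linspace(0.0, 2 * np.pi * 440, 16_000))
noises = [rng.normal(0.0, 0.1, 160_000) for _ in range(3)]
print(add_car_noise(clean, noises).shape)  # (16000,)
```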
354 | ### Transfer learning 355 | 356 | - Transfer learning: take the knowledge a network has learned on a task A and apply it to another task B. 357 | - For example, if you have trained a cat classifier with a lot of data, you can reuse part of that trained NN to solve an x-ray classification problem. 358 | - To do transfer learning, delete the last layer of the NN along with its weights, then: 359 | 1. Option 1: if you have a small data set, keep all the other weights fixed. Add a new last layer (or layers), initialize the new weights, feed the new data to the NN, and learn only the new weights. 360 | 2. Option 2: if you have enough data, you can retrain all the weights. 361 | - Options 1 and 2 are called **fine-tuning**, and training on task A is called **pretraining**. 362 | - When transfer learning makes sense: 363 | - Tasks A and B have the same input X (e.g. image, audio). 364 | - You have a lot of data for task A, which you are transferring from, and relatively little data for task B, which you're transferring to. 365 | - Low-level features from task A could be helpful for learning task B. 366 | 367 | ### Multi-task learning 368 | 369 | - Whereas transfer learning is a sequential process (you learn from task A and then transfer to task B), in multi-task learning you start off training one neural network to do several tasks simultaneously, and hopefully each of these tasks helps all of the others. 370 | - Example: 371 | - You want to build an object recognition system that detects pedestrians, cars, stop signs, and traffic lights (an image can have multiple labels). 372 | - Then the shape of Y will be `(4,m)` because we have 4 classes and each one is a binary label. 373 | - Then 374 | `Cost = (1/m) * sum(sum(L(y_hat(i)_j, y(i)_j))), i = 1..m, j = 1..4`, where 375 | `L = - y(i)_j * log(y_hat(i)_j) - (1 - y(i)_j) * log(1 - y_hat(i)_j)` 376 | - In the last example you could have trained 4 neural networks separately, but if some of the earlier features can be shared between these different types of objects, then training one neural network to do four things usually gives better performance than training 4 completely separate networks for the four tasks. 377 | - Multi-task learning also works if Y isn't complete for some labels. For example: 378 | ``` 379 | Y = [1 ? 1 ...] 380 | [0 0 1 ...] 381 | [? 1 ? ...] 382 | ``` 383 | - In this case it still copes well with the missing labels; the loss function just changes to sum only over the entries that are actually labeled (a NumPy sketch of this masked loss follows this section): 384 | `Loss = (1/m) * sum(sum(L(y_hat(i)_j, y(i)_j) for all j which y(i)_j != ?))` 385 | - Multi-task learning makes sense when: 386 | 1. You are training on a set of tasks that could benefit from having shared lower-level features. 387 | 2. (Usually) the amount of data you have for each task is quite similar. 388 | 3. You can train a big enough network to do well on all the tasks. 389 | - If you can train a big enough NN, multi-task learning generally performs better than splitting the tasks across separate networks. 390 | - Today, transfer learning is used more often than multi-task learning. 391 |
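A minimal NumPy sketch of that masked loss, assuming the missing labels marked "?" above are encoded as `np.nan` (the toy 3x3 arrays are made up for illustration):

```python
import numpy as np

def multitask_loss(Y, Y_hat, eps=1e-12):
    """Cross-entropy summed over tasks, averaged over examples, skipping missing labels."""
    mask = ~np.isnan(Y)                    # True only where a label exists
    Y0 = np.where(mask, Y, 0.0)            # replace '?' so the log terms stay finite
    L = -(Y0 * np.log(Y_hat + eps) + (1 - Y0) * np.log(1 - Y_hat + eps))
    m = Y.shape[1]                         # Y has shape (num_tasks, m)
    return np.sum(L * mask) / m            # masked entries contribute nothing

Y = np.array([[1.0,    np.nan, 1.0],
              [0.0,    0.0,    1.0],
              [np.nan, 1.0,    np.nan]])   # 3 tasks, 3 examples, some labels missing
Y_hat = np.array([[0.9, 0.2, 0.8],
                  [0.1, 0.3, 0.7],
                  [0.5, 0.6, 0.4]])
print(multitask_loss(Y, Y_hat))
```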
392 | ### What is end-to-end deep learning? 393 | 394 | - Some systems have multiple stages to implement. An end-to-end deep learning system implements all of these stages with a single NN. 395 | - Example 1: 396 | - Speech recognition system: 397 | ``` 398 | Audio ---> Features --> Phonemes --> Words --> Transcript # non-end-to-end system 399 | Audio ---------------------------------------> Transcript # end-to-end deep learning system 400 | ``` 401 | - End-to-end deep learning gives the data more freedom; the network might not even use phonemes when training! 402 | - To build an end-to-end deep learning system that works well, we need a big dataset (more data than for the non-end-to-end system). With a small dataset, the ordinary multi-stage implementation can work just fine. 403 | - Example 2: 404 | - Face recognition system: 405 | ``` 406 | Image ---------------------> Face recognition # end-to-end deep learning system 407 | Image --> Face detection --> Face recognition # deep learning system - best approach for now 408 | ``` 409 | - In practice, the best approach is the second one for now. 410 | - The second implementation is a two-step approach where both steps are implemented using deep learning. 411 | - It works well because it's much harder to get a lot of images of whole people in front of a camera than it is to get cropped face images to compare. 412 | - In the last step of the second implementation, the NN takes two face images as input and outputs whether they belong to the same person. 413 | - Example 3: 414 | - Machine translation system: 415 | ``` 416 | English --> Text analysis --> ... --> French # non-end-to-end system 417 | English ----------------------------> French # end-to-end deep learning system - best approach 418 | ``` 419 | - Here the end-to-end deep learning system works better because we have enough data to build it. 420 | - Example 4: 421 | - Estimating a child's age from an x-ray image of a hand: 422 | ``` 423 | Image --> Bones --> Age # non-end-to-end system - best approach for now 424 | Image ------------> Age # end-to-end system 425 | ``` 426 | - In this example the non-end-to-end system works better because we don't have enough data to train the end-to-end system. 427 | 428 | ### Whether to use end-to-end deep learning 429 | 430 | - Pros of end-to-end deep learning: 431 | - Let the data speak. With a pure machine learning approach, the NN learning the mapping from X to Y may be better able to capture whatever statistics are in the data, rather than being forced to reflect human preconceptions. 432 | - Less hand-designing of components is needed. 433 | - Cons of end-to-end deep learning: 434 | - May need a large amount of data. 435 | - Excludes potentially useful hand-designed components (which help more when the dataset is small). 436 | - Applying end-to-end deep learning: 437 | - Key question: do you have sufficient data to learn a function of the **complexity** needed to map X to Y? 438 | - You can use ML/DL to learn some individual components of the pipeline instead of the whole thing. 439 | - When applying supervised learning, carefully choose which types of X-to-Y mappings to learn, depending on what tasks you can get data for. 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 |

448 |

449 | These Notes were made by [Mahmoud Badry](mailto:mma18@fayoum.edu.eg) @2017 450 | -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/01.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/02.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/03.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/04.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/05.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/06.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/07.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/08.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/09.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/10.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/10.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/11.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/12.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/13.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/14.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/15.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/16.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/17.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/18.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/19.png 
-------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/20.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/21.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/22.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/23.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/24.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/25.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/26.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/27.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/28.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/29.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/29.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/30.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/31.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/32.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/33.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/33.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/34.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/34.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/35.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/36.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/36.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/37.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/37.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/38.png -------------------------------------------------------------------------------- /4- Convolutional Neural 
Networks/Images/39.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/39.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/40.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/41.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/41.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/42.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/43.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/43.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/44.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/Classification.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/Classification.jpg -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/ClassificationLoc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/ClassificationLoc.jpg -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/InstanceSegmentation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/InstanceSegmentation.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/ObjectDetection.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/ObjectDetection.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/SemanticSegmentation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/SemanticSegmentation.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/inception_block1a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/inception_block1a.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/receptiveField.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/receptiveField.png -------------------------------------------------------------------------------- /4- Convolutional Neural Networks/Images/resNet.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/4- Convolutional Neural Networks/Images/resNet.jpg -------------------------------------------------------------------------------- /5- Sequence Models/Images/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/01.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/02.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/03.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/04.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/05.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/06.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/06.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/07.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/08.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/09.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/09.jpg -------------------------------------------------------------------------------- /5- Sequence Models/Images/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/10.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/11.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/12.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/12_different_types_of_rnn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/12_different_types_of_rnn.jpg -------------------------------------------------------------------------------- /5- Sequence Models/Images/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/13.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/14.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/15.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/15.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/16.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/17.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/18.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/19.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/20.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/21.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/22.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/23.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/24.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/25.png 
-------------------------------------------------------------------------------- /5- Sequence Models/Images/26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/26.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/27.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/28.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/29.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/30.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/31.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/32.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/33.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/33.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/34.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/34.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/35.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/36.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/36.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/37.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/37.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/38.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/39.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/39.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/40.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/41.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/41.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/42.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/43.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/43.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/44.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/45.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/45.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/46.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/46.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/47.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/47.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/48.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/49.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/49.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/50.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/51.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/51.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/52.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/52.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/53.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/53.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/54.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/54.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/55.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/56.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/56.png 
-------------------------------------------------------------------------------- /5- Sequence Models/Images/57.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/57.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/58.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/58.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/59.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/59.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/60.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/61.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/61.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/62.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/62.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/63.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/63.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/64.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/65.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/65.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/66.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/66.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/67.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/67.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/68.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/68.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/69.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/69.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/70.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/70.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/71.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/71.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/72.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/72.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/73.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/73.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/74.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/74.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/75.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/75.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/76.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/77.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/77.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/78.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/78.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/79.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/79.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/80.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/80.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/81.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/81.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/83.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/83.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/84.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/84.png -------------------------------------------------------------------------------- /5- Sequence Models/Images/85.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/5- Sequence Models/Images/85.png -------------------------------------------------------------------------------- /Certificate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/a0d167901c56559e2effa6e8137adff7384c0a95/Certificate.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 MBadry 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice 
shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Notebooks headers.md: -------------------------------------------------------------------------------- 1 | # Notebooks headers 2 | 3 | In this document, I present all of the notebook assignment headers from [deeplearning.ai](https://deeplearning.ai). It may help someone get an overview of the code contents of the courses, or quickly check which applications the courses have dealt with. 4 | 5 | ## Table of contents 6 | 7 | * [Notebooks headers](#notebooks-headers) 8 | * [Table of contents](#table-of-contents) 9 | * [Neural Networks and Deep Learning](#neural-networks-and-deep-learning) 10 | * [Python Basics with Numpy (optional assignment)](#python-basics-with-numpy-optional-assignment) 11 | * [Logistic Regression with a Neural Network mindset](#logistic-regression-with-a-neural-network-mindset) 12 | * [Planar data classification with one hidden layer](#planar-data-classification-with-one-hidden-layer) 13 | * [Building your Deep Neural Network: Step by Step](#building-your-deep-neural-network-step-by-step) 14 | * [Deep Neural Network for Image Classification: Application](#deep-neural-network-for-image-classification-application) 15 | * [Improving Deep Neural Networks](#improving-deep-neural-networks) 16 | * [Initialization](#initialization) 17 | * [Regularization](#regularization) 18 | * [TensorFlow Tutorial](#tensorflow-tutorial) 19 | * [Optimization Methods](#optimization-methods) 20 | * [Gradient Checking](#gradient-checking) 21 | * [Structuring Machine Learning Projects](#structuring-machine-learning-projects) 22 | * [Convolutional Neural Networks](#convolutional-neural-networks) 23 | * [Convolutional Neural Networks: Step by Step](#convolutional-neural-networks-step-by-step) 24 | * [Convolutional Neural Networks: Application](#convolutional-neural-networks-application) 25 | * [Keras tutorial - the Happy House](#keras-tutorial---the-happy-house) 26 | * [Residual Networks](#residual-networks) 27 | * [Deep Learning & Art: Neural Style Transfer](#deep-learning--art-neural-style-transfer) 28 | * [Autonomous driving - Car detection](#autonomous-driving---car-detection) 29 | * [Sequence Models](#sequence-models) 30 | * [Building a recurrent neural network - step by step](#building-a-recurrent-neural-network---step-by-step) 31 | * [Dinosaur Island -- Character-level language model](#dinosaur-island----character-level-language-model) 32 | * [Jazz improvisation with LSTM](#jazz-improvisation-with-lstm) 33 | * [Emojify](#emojify) 34 | * [Word Vector Representation](#word-vector-representation) 35 | * [Machine Translation (Neural Machine Translation)](#machine-translation-neural-machine-translation) 36 | * [Trigger word detection](#trigger-word-detection) 37 | 38 | 39 | ## Neural Networks and Deep Learning 40 | 41 | ### Python Basics with Numpy (optional assignment) 42 | 43 | Welcome to your first assignment. This exercise gives you a brief introduction to Python.
Even if you've used Python before, this will help familiarize you with functions we'll need. 44 | 45 | **Instructions:** 46 | - You will be using Python 3. 47 | - Avoid using for-loops and while-loops, unless you are explicitly told to do so. 48 | - Do not modify the (# GRADED FUNCTION [function name]) comment in some cells. Your work would not be graded if you change this. Each cell containing that comment should only contain one function. 49 | - After coding your function, run the cell right below it to check if your result is correct. 50 | 51 | **After this assignment you will:** 52 | - Be able to use iPython Notebooks 53 | - Be able to use numpy functions and numpy matrix/vector operations 54 | - Understand the concept of "broadcasting" 55 | - Be able to vectorize code 56 | 57 | Let's get started! 58 | 59 | ### Logistic Regression with a Neural Network mindset 60 | 61 | Welcome to your first (required) programming assignment! You will build a logistic regression classifier to recognize cats. This assignment will step you through how to do this with a Neural Network mindset, and so will also hone your intuitions about deep learning. 62 | 63 | **Instructions:** 64 | - Do not use loops (for/while) in your code, unless the instructions explicitly ask you to do so. 65 | 66 | **You will learn to:** 67 | - Build the general architecture of a learning algorithm, including: 68 | - Initializing parameters 69 | - Calculating the cost function and its gradient 70 | - Using an optimization algorithm (gradient descent) 71 | - Gather all three functions above into a main model function, in the right order. 72 | 73 | ### Planar data classification with one hidden layer 74 | 75 | Welcome to your week 3 programming assignment. It's time to build your first neural network, which will have a hidden layer. You will see a big difference between this model and the one you implemented using logistic regression. 76 | 77 | **You will learn how to:** 78 | - Implement a 2-class classification neural network with a single hidden layer 79 | - Use units with a non-linear activation function, such as tanh 80 | - Compute the cross entropy loss 81 | - Implement forward and backward propagation 82 | 83 | ### Building your Deep Neural Network: Step by Step 84 | 85 | Welcome to your week 4 assignment (part 1 of 2)! You have previously trained a 2-layer Neural Network (with a single hidden layer). This week, you will build a deep neural network, with as many layers as you want! 86 | 87 | - In this notebook, you will implement all the functions required to build a deep neural network. 88 | - In the next assignment, you will use these functions to build a deep neural network for image classification. 89 | 90 | **After this assignment you will be able to:** 91 | - Use non-linear units like ReLU to improve your model 92 | - Build a deeper neural network (with more than 1 hidden layer) 93 | - Implement an easy-to-use neural network class 94 | 95 | **Notation**: 96 | - Superscript [l] denotes a quantity associated with the `l.th` layer. 97 | - Example: a[L] is the `L.th` layer activation. W[L] and b[L] are the `L.th` (last) layer parameters. 98 | - Superscript (i) denotes a quantity associated with the `i.th` example. 99 | - Example: x(i) is the `i.th` training example. 100 | - Lowerscript i denotes the `i.th` entry of a vector. 101 | - Example: a[l]i denotes the `i.th` entry of the `l.th` layer's activations). 102 | 103 | Let's get started! 
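To make the notation above concrete, here is a minimal numpy sketch (not taken from the assignment itself) showing how the parameters W[l] and b[l] might be stored in a dictionary keyed by layer and used for one forward step. The layer sizes, the `relu` helper, and the dictionary key names are illustrative assumptions only.

```python
import numpy as np

def relu(z):
    # ReLU activation, used only for this illustration
    return np.maximum(0, z)

# assumed layer sizes: 2 inputs -> 4 hidden units -> 1 output
layer_dims = [2, 4, 1]

rng = np.random.default_rng(0)
parameters = {}
for l in range(1, len(layer_dims)):
    # W[l] has shape (n[l], n[l-1]); b[l] has shape (n[l], 1)
    parameters["W" + str(l)] = rng.standard_normal((layer_dims[l], layer_dims[l - 1])) * 0.01
    parameters["b" + str(l)] = np.zeros((layer_dims[l], 1))

# X stacks m training examples as columns, so x(i) is X[:, i]
X = rng.standard_normal((2, 5))

# one forward step for layer 1: a[1] = relu(W[1] x + b[1])
A1 = relu(parameters["W1"] @ X + parameters["b1"])
print(A1.shape)  # (4, 5): one activation vector a[1](i) per training example
```

Keeping W[l] and b[l] in a dictionary keyed by the layer index is one simple way to mirror the bracket notation, and it makes looping over an arbitrary number of layers straightforward.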
104 | 105 | ### Deep Neural Network for Image Classification: Application 106 | 107 | When you finish this, you will have finished the last programming assignment of Week 4, and also the last programming assignment of this course! 108 | 109 | You will use the functions you'd implemented in the previous assignment to build a deep network, and apply it to cat vs non-cat classification. Hopefully, you will see an improvement in accuracy relative to your previous logistic regression implementation. 110 | 111 | **After this assignment you will be able to:** 112 | - Build and apply a deep neural network to supervised learning. 113 | 114 | Let's get started! 115 | 116 | ## Improving Deep Neural Networks 117 | 118 | ### Initialization 119 | 120 | Welcome to the first assignment of "Improving Deep Neural Networks". 121 | 122 | Training your neural network requires specifying an initial value of the weights. A well chosen initialization method will help learning. 123 | 124 | If you completed the previous course of this specialization, you probably followed our instructions for weight initialization, and it has worked out so far. But how do you choose the initialization for a new neural network? In this notebook, you will see how different initializations lead to different results. 125 | 126 | A well chosen initialization can: 127 | - Speed up the convergence of gradient descent 128 | - Increase the odds of gradient descent converging to a lower training (and generalization) error 129 | 130 | To get started, run the following cell to load the packages and the planar dataset you will try to classify. 131 | 132 | ### Regularization 133 | 134 | Welcome to the second assignment of this week. Deep Learning models have so much flexibility and capacity that **overfitting can be a serious problem**, if the training dataset is not big enough. Sure it does well on the training set, but the learned network **doesn't generalize to new examples** that it has never seen! 135 | 136 | **You will learn to:** Use regularization in your deep learning models. 137 | 138 | Let's first import the packages you are going to use. 139 | 140 | ### TensorFlow Tutorial 141 | 142 | Welcome to this week's programming assignment. Until now, you've always used numpy to build neural networks. Now we will step you through a deep learning framework that will allow you to build neural networks more easily. Machine learning frameworks like TensorFlow, PaddlePaddle, Torch, Caffe, Keras, and many others can speed up your machine learning development significantly. All of these frameworks also have a lot of documentation, which you should feel free to read. In this assignment, you will learn to do the following in TensorFlow: 143 | 144 | - Initialize variables 145 | - Start your own session 146 | - Train algorithms 147 | - Implement a Neural Network 148 | 149 | Programming frameworks can not only shorten your coding time, but sometimes also perform optimizations that speed up your code. 150 | 151 | ### Optimization Methods 152 | 153 | Until now, you've always used Gradient Descent to update the parameters and minimize the cost. In this notebook, you will learn more advanced optimization methods that can speed up learning and perhaps even get you to a better final value for the cost function. Having a good optimization algorithm can be the difference between waiting days vs. just a few hours to get a good result. 154 | 155 | Gradient descent goes "downhill" on a cost function `J`.
Think of it as trying to do this: 156 | At each step of the training, you update your parameters following a certain direction to try to get to the lowest possible point. 157 | 158 | **Notations**: As usual, ∂J / ∂a = `da` for any variable `a`. 159 | 160 | To get started, run the following code to import the libraries you will need. 161 | 162 | ### Gradient Checking 163 | 164 | Welcome to the final assignment for this week! In this assignment you will learn to implement and use gradient checking. 165 | 166 | You are part of a team working to make mobile payments available globally, and are asked to build a deep learning model to detect fraud--whenever someone makes a payment, you want to see if the payment might be fraudulent, such as if the user's account has been taken over by a hacker. 167 | 168 | But backpropagation is quite challenging to implement, and sometimes has bugs. Because this is a mission-critical application, your company's CEO wants to be really certain that your implementation of backpropagation is correct. Your CEO says, "Give me a proof that your backpropagation is actually working!" To give this reassurance, you are going to use "gradient checking". 169 | 170 | Let's do it! 171 | 172 | ## Structuring Machine Learning Projects 173 | 174 | There were no code assignments in this course. 175 | 176 | ## Convolutional Neural Networks 177 | 178 | ### Convolutional Neural Networks: Step by Step 179 | 180 | Welcome to Course 4's first assignment! In this assignment, you will implement convolutional (CONV) and pooling (POOL) layers in numpy, including both forward propagation and (optionally) backward propagation. 181 | 182 | **Notation**: 183 | - Superscript [l] denotes an object of the `l.th` layer. 184 | - Example: a[4] is the `4.th` layer activation. W[5] and b[5] are the `5.th` layer parameters. 185 | 186 | 187 | - Superscript (i) denotes an object from the `i.th` example. 188 | - Example: x(i) is the `i.th` training example input. 189 | 190 | - Lowerscript i denotes the `i.th` entry of a vector. 191 | - Example: a[l]i denotes the `i.th` entry of the activations in layer `l`, assuming this is a fully connected (FC) layer. 192 | 193 | - `n_H`, `n_W` and `n_C` denote respectively the height, width and number of channels of a given layer. If you want to reference a specific layer `l`, you can also write n_H[l], n_W[l], n_C[l]. 194 | - `n_H_prev`, `n_W_prev` and `n_C_prev` denote respectively the height, width and number of channels of the previous layer. If referencing a specific layer `l`, this could also be denoted n_H[l-1], n_W[l-1], n_C[l-1]. 195 | 196 | We assume that you are already familiar with `numpy` and/or have completed the previous courses of the specialization. Let's get started! 197 | 198 | ### Convolutional Neural Networks: Application 199 | 200 | Welcome to Course 4's second assignment! In this notebook, you will: 201 | 202 | - Implement helper functions that you will use when implementing a TensorFlow model 203 | - Implement a fully functioning ConvNet using TensorFlow 204 | 205 | **After this assignment you will be able to:** 206 | 207 | - Build and train a ConvNet in TensorFlow for a classification problem 208 | 209 | We assume here that you are already familiar with TensorFlow. If you are not, please refer the *TensorFlow Tutorial* of the third week of Course 2 ("*Improving deep neural networks*"). 210 | 211 | ### Keras tutorial - the Happy House 212 | 213 | Welcome to the first assignment of week 2. In this assignment, you will: 214 | 1. 
Learn to use Keras, a high-level neural networks API (programming framework), written in Python and capable of running on top of several lower-level frameworks including TensorFlow and CNTK. 215 | 2. See how you can in a couple of hours build a deep learning algorithm. 216 | 217 | Why are we using Keras? Keras was developed to enable deep learning engineers to build and experiment with different models very quickly. Just as TensorFlow is a higher-level framework than Python, Keras is an even higher-level framework and provides additional abstractions. Being able to go from idea to result with the least possible delay is key to finding good models. However, Keras is more restrictive than the lower-level frameworks, so there are some very complex models that you can implement in TensorFlow but not (without more difficulty) in Keras. That being said, Keras will work fine for many common models. 218 | 219 | In this exercise, you'll work on the "Happy House" problem, which we'll explain below. Let's load the required packages and solve the problem of the Happy House! 220 | 221 | ### Residual Networks 222 | 223 | Welcome to the second assignment of this week! You will learn how to build very deep convolutional networks, using Residual Networks (ResNets). In theory, very deep networks can represent very complex functions; but in practice, they are hard to train. Residual Networks, introduced by [He et al.](https://arxiv.org/pdf/1512.03385.pdf), allow you to train much deeper networks than were previously practically feasible. 224 | 225 | **In this assignment, you will:** 226 | - Implement the basic building blocks of ResNets. 227 | - Put together these building blocks to implement and train a state-of-the-art neural network for image classification. 228 | 229 | This assignment will be done in Keras. 230 | 231 | Before jumping into the problem, let's run the cell below to load the required packages. 232 | 233 | ### Deep Learning & Art: Neural Style Transfer 234 | 235 | Welcome to the second assignment of this week. In this assignment, you will learn about Neural Style Transfer. This algorithm was created by Gatys et al. (2015) (https://arxiv.org/abs/1508.06576). 236 | 237 | **In this assignment, you will:** 238 | - Implement the neural style transfer algorithm 239 | - Generate novel artistic images using your algorithm 240 | 241 | Most of the algorithms you've studied optimize a cost function to get a set of parameter values. In Neural Style Transfer, you'll optimize a cost function to get pixel values! 242 | 243 | ### Autonomous driving - Car detection 244 | 245 | Welcome to your week 3 programming assignment. You will learn about object detection using the very powerful YOLO model. Many of the ideas in this notebook are described in the two YOLO papers: Redmon et al., 2016 (https://arxiv.org/abs/1506.02640) and Redmon and Farhadi, 2016 (https://arxiv.org/abs/1612.08242). 246 | 247 | **You will learn to**: 248 | - Use object detection on a car detection dataset 249 | - Deal with bounding boxes 250 | 251 | Run the following cell to load the packages and dependencies that are going to be useful for your journey! 252 | 253 | ## Sequence Models 254 | 255 | ### Building a recurrent neural network - step by step 256 | Welcome to Course 5's first assignment! In this assignment, you will implement your first Recurrent Neural Network in numpy. 257 | 258 | Recurrent Neural Networks (RNN) are very effective for Natural Language Processing and other sequence tasks because they have "memory". 
They can read inputs x`<t>` (such as words) one at a time, and remember some information/context through the hidden layer activations that get passed from one time-step to the next. This allows a uni-directional RNN to take information from the past to process later inputs. A bidirectional RNN can take context from both the past and the future. 259 | 260 | **Notation**: 261 | - Superscript [l] denotes an object associated with the `l.th` layer. 262 | - Example: a[4] is the `4.th` layer activation. W[5] and b[5] are the `5.th` layer parameters. 263 | 264 | - Superscript (i) denotes an object associated with the `i.th` example. 265 | - Example: x(i) is the `i.th` training example input. 266 | 267 | - Superscript `<t>` denotes an object at the `t.th` time-step. 268 | - Example: x`<t>` is the input x at the `t.th` time-step. x`<t>(i)` is the input at the `t.th` timestep of example `i`. 269 | 270 | - Lowerscript i denotes the `i.th` entry of a vector. 271 | - Example: a[l]i denotes the `i.th` entry of the activations in layer `l`. 272 | 273 | We assume that you are already familiar with `numpy` and/or have completed the previous courses of the specialization. Let's get started! 274 | 275 | ### Dinosaur Island -- Character-level language model 276 | Welcome to Dinosaurus Island! 65 million years ago, dinosaurs existed, and in this assignment they are back. You are in charge of a special task. Leading biology researchers are creating new breeds of dinosaurs and bringing them to life on earth, and your job is to give names to these dinosaurs. If a dinosaur does not like its name, it might go berserk, so choose wisely! 277 |
278 | Luckily you have learned some deep learning and you will use it to save the day. Your assistant has collected a list of all the dinosaur names they could find, and compiled them into this [dataset](dinos.txt). (Feel free to take a look by clicking the previous link.) To create new dinosaur names, you will build a character level language model to generate new names. Your algorithm will learn the different name patterns, and randomly generate new names. Hopefully this algorithm will keep you and your team safe from the dinosaurs' wrath! 279 | 280 | By completing this assignment you will learn: 281 | 282 | - How to store text data for processing using an RNN 283 | - How to synthesize data, by sampling predictions at each time step and passing it to the next RNN-cell unit 284 | - How to build a character-level text generation recurrent neural network 285 | - Why clipping the gradients is important 286 | 287 | We will begin by loading in some functions that we have provided for you in `rnn_utils`. Specifically, you have access to functions such as `rnn_forward` and `rnn_backward` which are equivalent to those you've implemented in the previous assignment. 288 | 289 | ### Jazz improvisation with LSTM 290 | Welcome to your final programming assignment of this week! In this notebook, you will implement a model that uses an LSTM to generate music. You will even be able to listen to your own music at the end of the assignment. 291 | 292 | **You will learn to:** 293 | - Apply an LSTM to music generation. 294 | - Generate your own jazz music with deep learning. 295 | 296 | Please run the following cell to load all the packages required in this assignment. This may take a few minutes. 297 | 298 | ### Emojify 299 | Welcome to the second assignment of Week 2. You are going to use word vector representations to build an Emojifier. 300 | 301 | Have you ever wanted to make your text messages more expressive? Your emojifier app will help you do that. So rather than writing "Congratulations on the promotion! Lets get coffee and talk. Love you!" the emojifier can automatically turn this into "Congratulations on the promotion! 👍 Lets get coffee and talk. ☕️ Love you! ❤️" 302 | 303 | You will implement a model which inputs a sentence (such as "Let's go see the baseball game tonight!") and finds the most appropriate emoji to be used with this sentence (⚾️). In many emoji interfaces, you need to remember that ❤️ is the "heart" symbol rather than the "love" symbol. But using word vectors, you'll see that even if your training set explicitly relates only a few words to a particular emoji, your algorithm will be able to generalize and associate words in the test set to the same emoji even if those words don't even appear in the training set. This allows you to build an accurate classifier mapping from sentences to emojis, even using a small training set. 304 | 305 | In this exercise, you'll start with a baseline model (Emojifier-V1) using word embeddings, then build a more sophisticated model (Emojifier-V2) that further incorporates an LSTM. 306 | 307 | Let's get started! Run the following cell to load the package you are going to use. 308 | 309 | ### Word Vector Representation 310 | Welcome to your first assignment of this week! 311 | 312 | Because word embeddings are very computationally expensive to train, most ML practitioners will load a pre-trained set of embeddings.
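As a preview of the cosine-similarity measure listed in the objectives below, here is a minimal numpy sketch. The three 3-dimensional vectors are made-up stand-ins for real pre-trained embeddings (for example, 50-dimensional GloVe vectors loaded from a file), so treat the specific numbers and words as assumptions for illustration only.

```python
import numpy as np

def cosine_similarity(u, v):
    # cos(theta) = (u . v) / (||u|| * ||v||)
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

# toy stand-ins for pre-trained word vectors (real ones have 50-300 dimensions)
word_to_vec = {
    "king":  np.array([0.9, 0.8, 0.1]),
    "queen": np.array([0.85, 0.82, 0.15]),
    "apple": np.array([0.1, 0.2, 0.95]),
}

print(cosine_similarity(word_to_vec["king"], word_to_vec["queen"]))  # close to 1
print(cosine_similarity(word_to_vec["king"], word_to_vec["apple"]))  # noticeably lower
```

Real pre-trained embeddings behave the same way: related words score close to 1, unrelated words score much lower, which is what lets a small labeled training set generalize to unseen but similar words.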
313 | 314 | **After this assignment you will be able to:** 315 | 316 | - Load pre-trained word vectors, and measure similarity using cosine similarity 317 | - Use word embeddings to solve word analogy problems such as Man is to Woman as King is to ______. 318 | - Modify word embeddings to reduce their gender bias 319 | 320 | Let's get started! Run the following cell to load the packages you will need. 321 | 322 | ### Machine Translation (Neural Machine Translation) 323 | Welcome to your first programming assignment for this week! 324 | 325 | You will build a Neural Machine Translation (NMT) model to translate human-readable dates ("25th of June, 2009") into machine-readable dates ("2009-06-25"). You will do this using an attention model, one of the most sophisticated sequence-to-sequence models. 326 | 327 | This notebook was produced together with NVIDIA's Deep Learning Institute. 328 | 329 | Let's load all the packages you will need for this assignment. 330 | 331 | ### Trigger word detection 332 | Welcome to the final programming assignment of this specialization! 333 | 334 | In this week's videos, you learned about applying deep learning to speech recognition. In this assignment, you will construct a speech dataset and implement an algorithm for trigger word detection (sometimes also called keyword detection, or wakeword detection). Trigger word detection is the technology that allows devices like Amazon Alexa, Google Home, Apple Siri, and Baidu DuerOS to wake up upon hearing a certain word. 335 | 336 | For this exercise, our trigger word will be "Activate." Every time it hears you say "activate," it will make a "chiming" sound. By the end of this assignment, you will be able to record a clip of yourself talking, and have the algorithm trigger a chime when it detects you saying "activate." 337 | 338 | After completing this assignment, perhaps you can also extend it to run on your laptop so that every time you say "activate" it starts up your favorite app, or turns on a network-connected lamp in your house, or triggers some other event? 339 | 340 | In this assignment you will learn to: 341 | - Structure a speech recognition project 342 | - Synthesize and process audio recordings to create train/dev datasets 343 | - Train a trigger word detection model and make predictions 344 | 345 | Let's get started! Run the following cell to load the package you are going to use. 346 | 347 | 348 |
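The dataset-synthesis step mentioned in the trigger word objectives above can be pictured with a few lines of numpy. Everything below is an assumption made only for illustration: the 1,375 output timesteps for a 10-second clip and the rule of labeling a fixed window of steps right after the inserted trigger word as 1 are stand-ins for whatever scheme the actual assignment uses.

```python
import numpy as np

T_Y = 1375          # assumed number of model output timesteps per 10-second clip
LABEL_WINDOW = 50   # assumed: mark this many steps as 1 right after the trigger word

def label_after_trigger(segment_end_step, t_y=T_Y, window=LABEL_WINDOW):
    """Return a (1, t_y) label vector that is 1 for `window` steps
    after the trigger word ends and 0 everywhere else."""
    y = np.zeros((1, t_y))
    start = segment_end_step + 1
    y[0, start:start + window] = 1
    return y

# e.g. a synthesized "activate" clip that ends at output step 700
y = label_after_trigger(700)
print(y.sum())          # 50.0: fifty positive timesteps
print(y[0, 695:705])    # zeros up to step 700, ones from step 701
```

The exact numbers are placeholders; the point is only that the label means "the trigger word has just finished," which is what lets the trained model fire a chime right after hearing "activate."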

349 |

350 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # DeepLearning.ai Courses Notes 2 | 3 | This repository contains my personal notes and summaries on the [DeepLearning.ai](https://deeplearning.ai) specialization courses. I've enjoyed every little bit of the course, and I hope you enjoy my notes too. 4 | 5 | [DeepLearning.ai](https://deeplearning.ai) contains five courses which can be taken on [Coursera](https://www.coursera.org/specializations/deep-learning). The five course titles are: 6 | 7 | 1. Neural Networks and Deep Learning. 8 | 2. Improving Deep Neural Networks: Hyperparameter tuning, Regularization and Optimization. 9 | 3. Structuring Machine Learning Projects. 10 | 4. Convolutional Neural Networks. 11 | 5. Sequence Models. 12 | 13 | This is by far the best course series on deep learning that I've taken. Enjoy! 14 | 15 | 16 | 17 | ## About This Specialization (From the official Deep Learning Specialization page) 18 | 19 | > If you want to break into AI, this Specialization will help you do so. Deep Learning is one of the most highly sought after skills in tech. We will help you become good at Deep Learning. 20 | > 21 | > In five courses, you will learn the foundations of Deep Learning, understand how to build neural networks, and learn how to lead successful machine learning projects. You will learn about Convolutional networks, RNNs, LSTM, Adam, Dropout, BatchNorm, Xavier/He initialization, and more. You will work on case studies from healthcare, autonomous driving, sign language reading, music generation, and natural language processing. You will master not only the theory, but also see how it is applied in industry. You will practice all these ideas in Python and in TensorFlow, which we will teach. 22 | > 23 | > You will also hear from many top leaders in Deep Learning, who will share with you their personal stories and give you career advice. 24 | > 25 | > AI is transforming multiple industries. After finishing this specialization, you will likely find creative ways to apply it to your work. 26 | > 27 | > We will help you master Deep Learning, understand how to apply it, and build a career in AI. 28 | 29 | 30 | 31 | ## Specialization Certificate 32 | 33 | At last, I've successfully completed the specialization and earned my [certificate](https://coursera.org/verify/specialization/DTTJC9Y5B8U6)! 34 | 35 | ![](Certificate.png) 36 | 37 | 38 | 39 | ## Similar Notes 40 | 41 | - Beautifully drawn notes by Tess Ferrandez: 42 | - https://www.slideshare.net/TessFerrandez/notes-from-coursera-deep-learning-courses-by-andrew-ng 43 | 44 | ## Reviews 45 | 46 | As [DeepLearning.ai](https://deeplearning.ai) is one of the most popular courses in the field of AI/ML/DL, there are some good reviews covering some or all of the specialization courses.
47 | 48 | The list of reviews includes: 49 | 50 | - [Ryan Shrott](https://towardsdatascience.com/@ryanshrott?source=post_header_lockup) Reviews: 51 | - [Deep Learning Specialization by Andrew Ng — 21 Lessons Learned](https://towardsdatascience.com/deep-learning-specialization-by-andrew-ng-21-lessons-learned-15ffaaef627c) 52 | - [Computer Vision by Andrew Ng — 11 Lessons Learned](https://towardsdatascience.com/computer-vision-by-andrew-ng-11-lessons-learned-7d05c18a6999) 53 | - [Arthur Chan](https://www.linkedin.com/in/arthchan2003/) Reviews: 54 | - [Review of Ng's deeplearning.ai Course 1: Neural Networks and Deep Learning](https://www.linkedin.com/pulse/review-ngs-deeplearningai-course-1-neural-networks-deep-arthur-chan/?lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_view_base_post_details%3BVLk6TK8sThiFt5gZF%2B25Ug%3D%3D) 55 | - [Review of Ng's deeplearning.ai Course 2: Improving Deep Neural Networks](https://www.linkedin.com/pulse/review-ngs-deeplearningai-course-2-improving-deep-neural-arthur-chan/?lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_view_base_post_details%3BVLk6TK8sThiFt5gZF%2B25Ug%3D%3D) 56 | - [Review of Ng's deeplearning.ai Course 3: Structuring Machine Learning Projects](https://www.linkedin.com/pulse/review-ngs-deeplearningai-course-3-structuring-machine-arthur-chan/?lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_view_base_post_details%3BVLk6TK8sThiFt5gZF%2B25Ug%3D%3D) 57 | - [Review of Ng's deeplearning.ai Course 4: Convolutional Neural Networks](https://www.linkedin.com/pulse/review-ngs-deeplearningai-course-4-convolutional-neural-arthur-chan/?lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_view_base_post_details%3BVLk6TK8sThiFt5gZF%2B25Ug%3D%3D) 58 | - [Thoughts after taking the Deeplearning.ai courses](https://towardsdatascience.com/thoughts-after-taking-the-deeplearning-ai-courses-8568f132153) 59 | - [Learning Deep Learning — fast.ai vs. deeplearning.ai](https://medium.com/@markryan_69718/learning-deep-learning-fast-ai-vs-deeplearning-ai-34f9c42cf701) 60 | 61 | 62 | 63 | A good Facebook group that discusses the courses is here: https://www.facebook.com/groups/DeepLearningAISpecialization/. 64 | 65 | Group description: 66 | 67 | > This group is for current, past or future students of Prof Andrew Ng's deeplearning.ai class in Coursera. The purpose is for students to get to know each other, ask questions, and share insights. However, remember the Coursera Honor Code - please do not post any solution in the forum! 68 | 69 | 70 | 71 | ## Next steps 72 | 73 | Taking the [fast.ai](http://www.fast.ai/) course series, as it focuses more on practical work. 74 | 75 | ## Acknowledgements 76 | 77 | Thanks to [VladKha](https://github.com/VladKha), [wangzhenhui1992](https://github.com/wangzhenhui1992), [jarpit96](https://github.com/jarpit96), and other contributors for helping me revise and fix mistakes in the notes. 78 | 79 |
80 | 81 |
82 | 83 |
84 | 85 |
86 | 87 | Mahmoud Badry @ 2018 88 | -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | ################################################ 2 | # File name: download.py # 3 | # Author: Mahmoud Badry # 4 | # Date created: 2/11/2018 # 5 | # Date last modified: 2/11/2018 # 6 | # Python Version: 3 # 7 | # Purpose: Download all notes in PDF format # 8 | # Requirements: pypandoc >= 1.4 # 9 | ################################################ 10 | import pypandoc 11 | 12 | 13 | def main(): 14 | home_link = "https://raw.githubusercontent.com/mbadry1/DeepLearning.ai-Summary/master/" 15 | marks_down_links = { 16 | "Deeplearning.ai summary Homepage": 17 | home_link + "Readme.md", 18 | "01- Neural Networks and Deep Learning": 19 | home_link + "1-%20Neural%20Networks%20and%20Deep%20Learning/Readme.md", 20 | "02- Improving Deep Neural Networks Hyperparameter tuning, Regularization and Optimization": 21 | home_link + "2-%20Improving%20Deep%20Neural%20Networks/Readme.md", 22 | "03- Structuring Machine Learning Projects": 23 | home_link + "3-%20Structuring%20Machine%20Learning%20Projects/Readme.md", 24 | "04- Convolutional Neural Networks": 25 | home_link + "4-%20Convolutional%20Neural%20Networks/Readme.md", 26 | "05- Sequence Models": 27 | home_link + "5-%20Sequence%20Models/Readme.md", 28 | } 29 | 30 | # Extracting pandoc version 31 | print("pandoc_version:", pypandoc.get_pandoc_version()) 32 | print("pandoc_path:", pypandoc.get_pandoc_path()) 33 | print("\n") 34 | 35 | # Starting downloading and converting 36 | for key, value in marks_down_links.items(): 37 | print("Converting", key) 38 | pypandoc.convert_file( 39 | value, 40 | 'pdf', 41 | extra_args=['--pdf-engine=xelatex', '-V', 'geometry:margin=1.5cm'], 42 | outputfile=(key + ".pdf") 43 | ) 44 | print("Converting", key, "completed") 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | --------------------------------------------------------------------------------
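One possible way to run `download.py` (a sketch, assuming `pypandoc` has been installed with `pip install pypandoc` and that `xelatex` is available for the chosen `--pdf-engine`) is to check for a pandoc binary first, since pypandoc only wraps it:

```python
# illustrative pre-flight check before running `python download.py`
import pypandoc
from pypandoc.pandoc_download import download_pandoc

try:
    pypandoc.get_pandoc_path()   # raises OSError if no pandoc binary is found
except OSError:
    download_pandoc()            # let pypandoc fetch its own copy of pandoc
```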