├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── code ├── activation_functions.py ├── autoencoder.py ├── cnn.py ├── decision_tree.py ├── gan.py ├── id3_decision_tree_simple.py ├── knn.py ├── layers.py ├── logistic_regression.py ├── logistic_regression_scipy.py ├── loss_functions.py ├── mlp.py ├── nn_matrix.py ├── nn_simple.py ├── optimizers.py ├── random_forest_classifier.py ├── rnn.py └── vae.py ├── docs ├── .vscode │ └── settings.json ├── Makefile ├── _static │ └── theme_overrides.css ├── activation_functions.rst ├── applications.rst ├── architectures.rst ├── backpropagation.rst ├── build.bat ├── calculus.rst ├── classification_algos.rst ├── clustering_algos.rst ├── conf.py ├── contribute.rst ├── datasets.rst ├── figures │ ├── SimpleDiagram3_neural_networks.sdxml │ ├── activation_function_table.tgn │ ├── calculus_symbol_table.tgn │ ├── forward_prop_matrix_dimensions_table.tgn │ ├── linear_regression_companies_sales.tgn │ ├── linearalgebra.tgn │ └── statistics_symbols_table.tgn ├── forwardpropagation.rst ├── generative_algos.rst ├── glossary.rst ├── gradient_descent.rst ├── images │ ├── autoencoder.png │ ├── autoencoder_2.png │ ├── autoencoder_architecture.png │ ├── backprop_3_equations.png │ ├── backprop_ff_equations.png │ ├── backprop_final_3_deriv_equations.png │ ├── backprop_visually.png │ ├── boosting-sequence-models.PNG │ ├── boosting_error_iteration.png │ ├── calculus_slope_intro.png │ ├── cnn.jpg │ ├── cnn_filter_output.png │ ├── cross_entropy.png │ ├── decision_tree.png │ ├── dropout.png │ ├── dropout_net.png │ ├── dynamic_resizing_neural_network_1_obs.png │ ├── dynamic_resizing_neural_network_4_obs.png │ ├── earlystopping.png │ ├── elu.png │ ├── elu_prime.png │ ├── fc_layer.png │ ├── gan.png │ ├── gradient_accumulation.png │ ├── gradient_descent.png │ ├── gradient_descent_demystified.png │ ├── grid_search_cross_validation.png │ ├── gru_structure.png │ ├── integral_as_change_in_antriderivative.png │ ├── integral_as_rectangular_strips.png │ ├── integral_definition.png │ ├── khan_academy_matrix_product.png │ ├── leakyrelu.png │ ├── leakyrelu_prime.png │ ├── learned_regression_line.png │ ├── linear.png │ ├── linear_prime.png │ ├── linear_regression_3d_plane_mlr.png │ ├── linear_regression_line_1.png │ ├── linear_regression_line_2.png │ ├── linear_regression_line_3.png │ ├── linear_regression_line_4.png │ ├── linear_regression_line_intro.png │ ├── linear_regression_training_cost.png │ ├── log_vs_neglog.gif │ ├── logistic_cost_function_joined.png │ ├── logistic_cost_function_vectorized.png │ ├── logistic_regression_binary_decision_boundary.png │ ├── logistic_regression_exam_scores_scatter.png │ ├── logistic_regression_final_decision_boundary.png │ ├── logistic_regression_loss_history.png │ ├── logistic_regression_scatter_w_decision_boundary.png │ ├── logistic_regression_sigmoid_w_threshold.png │ ├── lstm_structure.png │ ├── maxpool.png │ ├── memoization.png │ ├── mlp.jpg │ ├── multiple_regression_error_history.png │ ├── neural_network_matrix_weighted_input.png │ ├── neural_network_simple.png │ ├── neural_network_w_matrices.png │ ├── neuron.png │ ├── ng_cost_function_logistic.png │ ├── nn_with_matrices_displayed.png │ ├── optimizers.gif │ ├── regularization-dropout.PNG │ ├── relu.png │ ├── relu_prime.png │ ├── rnn.png │ ├── rnn_layer.png │ ├── sigmoid.png │ ├── sigmoid_prime.png │ ├── simple_nn_diagram_zo_zh_defined.png │ ├── slope_formula.png │ ├── svm.png │ ├── svm_linear.png │ ├── svm_nonlinear_1.png │ ├── svm_nonlinear_2.png │ ├── svm_nonlinear_3.png │ ├── 
tanh.png │ ├── tanh_prime.png │ ├── vae.png │ ├── vector_field.png │ ├── vectors_geometry.png │ └── y1andy2_logistic_function.png ├── index.rst ├── layers.rst ├── libraries.rst ├── linear_algebra.rst ├── linear_regression.rst ├── logistic_regression.rst ├── loss_functions.rst ├── math_notation.rst ├── nn_concepts.rst ├── optimizers.rst ├── other_content.rst ├── papers.rst ├── probability.rst ├── regression_algos.rst ├── regularization.rst ├── reinforcement_learning.rst ├── statistics.rst └── training.rst └── notebooks └── rnn.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *~ 3 | _build/ 4 | docs/_build/ 5 | venv/ 6 | .idea 7 | sphinxenv/ -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "restructuredtext.confPath": "${workspaceFolder}/docs" 3 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Brendan Fortuner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Glossary 2 | 3 | ## Looking for fellow maintainers! 4 | Apologies for my non-responsiveness. :( I've been heads down at Cruise, building ML infra for self-driving cars, and haven't reviewed this repo in forever. Looks like we're getting `54k monthly active users` now and I think the repo deserves more attention. Let me know if you would be interested in joining as a maintainer with privileges to merge PRs. 5 | 6 | [View The Glossary](http://ml-cheatsheet.readthedocs.io/en/latest/) 7 | 8 | ## How To Contribute 9 | 10 | 1. Clone Repo 11 | ``` 12 | git clone https://github.com/bfortuner/ml-glossary.git 13 | ``` 14 | 15 | 2. Install Dependencies 16 | ``` 17 | # Assumes you have the usual suspects installed: numpy, scipy, etc.. 18 | pip install sphinx sphinx-autobuild 19 | pip install sphinx_rtd_theme 20 | pip install recommonmark 21 | ``` 22 | If you are using Python 3, use: 23 | ``` 24 | pip3 install sphinx sphinx-autobuild 25 | pip3 install sphinx_rtd_theme 26 | pip3 install recommonmark 27 | ``` 28 | 3. 
Preview Changes 29 | 30 | If you are using make, build with: 31 | 32 | ``` 33 | cd ml-glossary 34 | cd docs 35 | make html 36 | ``` 37 | 38 | On Windows, use: 39 | 40 | ``` 41 | cd ml-glossary 42 | cd docs 43 | build.bat html 44 | ``` 45 | 46 | 47 | 4. Verify your changes by opening the `index.html` file in `_build/` 48 | 49 | 5. [Submit Pull Request](https://help.github.com/articles/creating-a-pull-request/) 50 | 51 | 52 | ### Short for time? 53 | 54 | Feel free to raise an [issue](https://github.com/bfortuner/ml-glossary/issues) to correct errors or contribute content without a pull request. 55 | 56 | 57 | ## Style Guide 58 | 59 | Each entry in the glossary MUST include the following at a minimum: 60 | 61 | 1. **Concise explanation** - as short as possible, but no shorter 62 | 2. **Citations** - Papers, Tutorials, etc. 63 | 64 | Excellent entries will also include: 65 | 66 | 1. **Visuals** - diagrams, charts, animations, images 67 | 2. **Code** - python/numpy snippets, classes, or functions (see the example snippet below) 68 | 3. **Equations** - Formatted with LaTeX 69 | 70 | The goal of the glossary is to present content in the most accessible way possible, with a heavy emphasis on visuals and interactive diagrams. That said, in the spirit of rapid prototyping, it's okay to submit a "rough draft" without visuals or code. We expect other readers will enhance your submission over time. 71 | 72 | 73 | ## Why RST and not Markdown? 74 | 75 | RST has more features. For large and complex documentation projects, it's the logical choice. 76 | 77 | * https://eli.thegreenplace.net/2017/restructuredtext-vs-markdown-for-technical-documentation/ 78 | 79 | 80 | ## Top Contributors 81 | 82 | We're big fans of [Distill](http://distill.pub/prize) and we like their idea of offering prizes for high-quality submissions. We don't have as much money as they do, but we'd still like to reward contributors in some way for contributing to the glossary. For instance, a cheatsheet cryptocurrency where tokens equal commits ;). Let us know if you have better ideas. In the end, this is an open-source project and we hope contributing to a repository of concise, accessible machine learning knowledge is enough incentive on its own! 
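
## Example Code Snippet

The Style Guide above asks for short python/numpy snippets. As a rough illustration (mirroring `code/activation_functions.py`), a glossary entry's code block might look like this:

```
import numpy as np

def sigmoid(z):
    # squash the weighted input z into the range (0, 1)
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_prime(z):
    # derivative of the sigmoid, used during backpropagation
    s = sigmoid(z)
    return s * (1 - s)

# usage: probabilities for a batch of weighted inputs
print(sigmoid(np.array([-2.0, 0.0, 2.0])))  # ~[0.12, 0.5, 0.88]
```

Keep snippets self-contained and runnable so readers can paste them straight into a notebook.
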
83 | 84 | 85 | ## Tips and Tricks 86 | 87 | * [Adding equations](http://www.sphinx-doc.org/en/stable/ext/math.html) 88 | * [Working with Jupyter Notebook](http://louistiao.me/posts/demos/ipython-notebook-demo/) 89 | * Quickstart with Jupyter notebook template 90 | * Graphs and charts 91 | * Importing images 92 | * Linking to code 93 | 94 | 95 | ## Resources 96 | 97 | * [Desmos Graphing Tool](https://www.desmos.com/calculator) 98 | * [3D Graphing Tool](https://www.geogebra.org/3d) 99 | * [How To Submit Pull Requests](https://help.github.com/articles/creating-a-pull-request/) 100 | * [RST Cheatsheet](http://docutils.sourceforge.net/docs/user/rst/quickref.html) 101 | * [Markdown Cheatsheet](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) 102 | * [Citation Generator](http://www.citationmachine.net) 103 | * [MathJax Cheatsheet](https://math.meta.stackexchange.com/questions/5020/mathjax-basic-tutorial-and-quick-reference) 104 | * [Embedding Math Equations](http://www.sphinx-doc.org/en/stable/ext/math.html) 105 | * [Sphinx Tutorial](https://pythonhosted.org/an_example_pypi_project/sphinx.html) 106 | * [Sphinx Docs](http://www.sphinx-doc.org/en/stable/markup/code.html) 107 | * [Sphinx Cheatsheet](http://openalea.gforge.inria.fr/doc/openalea/doc/_build/html/source/sphinx/rest_syntax.html) 108 | -------------------------------------------------------------------------------- /code/activation_functions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | ### Note ### 5 | 6 | # z is weighted input 7 | 8 | 9 | ### Functions ### 10 | 11 | def linear(z,m): 12 | return m*z 13 | 14 | def elu(z,alpha): 15 | return z if z >= 0 else alpha*(np.exp(z) - 1) 16 | 17 | def leakyrelu(z, alpha): 18 | return max(alpha * z, z) 19 | 20 | def relu(z): 21 | return max(0, z) 22 | 23 | def sigmoid(z): 24 | return 1.0 / (1 + np.exp(-z)) 25 | 26 | def tanh(z): 27 | return (np.exp(z) - np.exp(-z)) / (np.exp(z) + np.exp(-z)) 28 | 29 | 30 | 31 | 32 | ### Derivatives ### 33 | 34 | def linear_prime(z,m): 35 | return m 36 | 37 | def elu_prime(z,alpha): 38 | return 1 if z > 0 else alpha*np.exp(z) 39 | 40 | def leakyrelu_prime(z, alpha): 41 | return 1 if z > 0 else alpha 42 | 43 | def sigmoid_prime(z): 44 | return sigmoid(z) * (1-sigmoid(z)) 45 | 46 | def relu_prime(z): 47 | return 1 if z > 0 else 0 48 | 49 | def tanh_prime(z): 50 | return 1 - np.power(tanh(z), 2) 51 | 52 | -------------------------------------------------------------------------------- /code/autoencoder.py: -------------------------------------------------------------------------------- 1 | 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | 6 | class Autoencoder(nn.Module): 7 | def __init__(self, in_shape): 8 | super().__init__() 9 | c,h,w = in_shape 10 | self.encoder = nn.Sequential( 11 | nn.Linear(c*h*w, 128), 12 | nn.ReLU(), 13 | nn.Linear(128, 64), 14 | nn.ReLU(), 15 | nn.Linear(64, 12), 16 | nn.ReLU() 17 | ) 18 | self.decoder = nn.Sequential( 19 | nn.Linear(12, 64), 20 | nn.ReLU(), 21 | nn.Linear(64, 128), 22 | nn.ReLU(), 23 | nn.Linear(128, c*h*w), 24 | nn.Sigmoid() 25 | ) 26 | 27 | def forward(self, x): 28 | bs,c,h,w = x.size() 29 | x = x.view(bs, -1) 30 | x = self.encoder(x) 31 | x = self.decoder(x) 32 | x = x.view(bs, c, h, w) 33 | return x 34 | 35 | 36 | class ConvAutoencoder(nn.Module): 37 | def __init__(self, in_shape): 38 | super().__init__() 39 | c,h,w = in_shape 40 | self.encoder = nn.Sequential( 41 | nn.Conv2d(c, 16, kernel_size=3, 
stride=1, padding=1), # b, 16, 32, 32 42 | nn.ReLU(), 43 | nn.MaxPool2d(kernel_size=2, stride=2), # b, 16, 16, 16 44 | nn.Conv2d(16, 8, kernel_size=3, stride=1, padding=1), # b, 8, 16, 16 45 | nn.ReLU(), 46 | nn.MaxPool2d(kernel_size=2, stride=2) # b, 8, 8, 8 47 | ) 48 | self.decoder = nn.Sequential( 49 | nn.ConvTranspose2d(8, 16, kernel_size=3, stride=2, padding=0), # 16, 17, 17 50 | nn.ReLU(), 51 | nn.ConvTranspose2d(16, c, kernel_size=3, stride=2, padding=1), # 3, 33, 33 52 | CenterCrop(h, w), # 3, 32, 32 53 | nn.Sigmoid() 54 | ) 55 | 56 | def forward(self, x): 57 | x = self.encoder(x) 58 | x = self.decoder(x) 59 | return x 60 | 61 | 62 | def train(net, loader, loss_func, optimizer): 63 | net.train() 64 | for inputs, _ in loader: 65 | inputs = Variable(inputs) 66 | 67 | output = net(inputs) 68 | loss = loss_func(output, inputs) 69 | 70 | optimizer.zero_grad() 71 | loss.backward() 72 | optimizer.step() 73 | -------------------------------------------------------------------------------- /code/cnn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.autograd import Variable 3 | 4 | 5 | class CNN(nn.Module): 6 | def __init__(self, in_shape, n_classes): 7 | super().__init__() 8 | c, w, h = in_shape 9 | pool_layers = 2 10 | fc_h = int(h / 2**pool_layers) 11 | fc_w = int(w / 2**pool_layers) 12 | self.features = nn.Sequential( 13 | *conv_bn_relu(c, 16, kernel_size=1, stride=1, padding=0), 14 | *conv_bn_relu(16, 32, kernel_size=3, stride=1, padding=1), 15 | nn.MaxPool2d(kernel_size=2, stride=2), #size/2 16 | *conv_bn_relu(32, 64, kernel_size=3, stride=1, padding=1), 17 | nn.MaxPool2d(kernel_size=2, stride=2), #size/2 18 | ) 19 | self.classifier = nn.Sequential( 20 | *linear_bn_relu_drop(64 * fc_h * fc_w, 128, dropout=0.5), 21 | nn.Linear(128, n_classes), 22 | nn.Softmax(dim=1) 23 | ) 24 | 25 | def forward(self, x): 26 | x = self.features(x) 27 | x = x.view(x.size(0), -1) 28 | x = self.classifier(x) 29 | return x 30 | 31 | def conv_bn_relu(in_chans, out_chans, kernel_size=3, stride=1, 32 | padding=1, bias=False): 33 | return [ 34 | nn.Conv2d(in_chans, out_chans, kernel_size=kernel_size, 35 | stride=stride, padding=padding, bias=bias), 36 | nn.BatchNorm2d(out_chans), 37 | nn.ReLU(inplace=True), 38 | ] 39 | 40 | def linear_bn_relu_drop(in_chans, out_chans, dropout=0.5, bias=False): 41 | layers = [ 42 | nn.Linear(in_chans, out_chans, bias=bias), 43 | nn.BatchNorm1d(out_chans), 44 | nn.ReLU(inplace=True) 45 | ] 46 | if dropout > 0: 47 | layers.append(nn.Dropout(dropout)) 48 | return layers 49 | 50 | def train(net, loader, loss_func, optimizer): 51 | net.train() 52 | n_batches = len(loader) 53 | for inputs, targets in loader: 54 | inputs = Variable(inputs) 55 | targets = Variable(targets) 56 | 57 | output = net(inputs) 58 | loss = loss_func(output, targets) 59 | 60 | optimizer.zero_grad() 61 | loss.backward() 62 | optimizer.step() 63 | -------------------------------------------------------------------------------- /code/decision_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import stats 3 | from abc import ABCMeta 4 | from typing import List 5 | 6 | 7 | class TreeNode: 8 | def __init__(self, data_idx, depth, child_lst=[]): 9 | self.data_idx = data_idx 10 | self.depth = depth 11 | self.child = child_lst 12 | self.label = None 13 | self.split_col = None 14 | self.child_cate_order = None 15 | 16 | def set_attribute(self, split_col, 
child_cate_order=None): 17 | self.split_col = split_col 18 | self.child_cate_order = child_cate_order 19 | 20 | def set_label(self, label): 21 | self.label = label 22 | 23 | 24 | class DecisionTree(metaclass=ABCMeta): 25 | def __init__(self, max_depth, min_sample_leaf, min_split_criterion=1e-4, verbose=False): 26 | self.max_depth = max_depth 27 | self.min_sample_leaf = min_sample_leaf 28 | self.verbose = verbose 29 | self.min_split_criterion = min_split_criterion 30 | self.root = None 31 | self.data = None 32 | self.labels = None 33 | self.feature_num = None 34 | 35 | def fit(self, X, y): 36 | """ 37 | X: train data, dimensition [num_sample, num_feature] 38 | y: label, dimension [num_sample, ] 39 | """ 40 | self.data = X 41 | self.labels = y 42 | num_sample, num_feature = X.shape 43 | self.feature_num = num_feature 44 | data_idx = list(range(num_sample)) 45 | self.root = TreeNode(data_idx=data_idx, depth=0, child_lst=[]) 46 | queue = [self.root] 47 | while queue: 48 | node = queue.pop(0) 49 | if node.depth>self.max_depth or len(node.data_idx)==1: 50 | self.set_label(node) 51 | else: 52 | child_nodes = self.split_node(node) 53 | if not child_nodes: 54 | self.set_label(node) 55 | else: 56 | queue.extend(child_nodes) 57 | 58 | def predict(self, X): 59 | num_sample, num_feature = X.shape 60 | labels = [] 61 | for idx in range(num_sample): 62 | x = X[idx] 63 | node = self.root 64 | while node.child: 65 | node = self.get_nex_node(node, x) 66 | labels.append(node.label) 67 | return labels 68 | 69 | @classmethod 70 | def get_split_criterion(self, node, child_node_lst): 71 | pass 72 | 73 | def set_label(self, node): 74 | target_Y = self.labels[node.data_idx] 75 | target_label = stats.mode(target_Y).mode[0] 76 | node.set_label(label=target_label) 77 | 78 | @classmethod 79 | def split_node(self, node): 80 | pass 81 | 82 | @classmethod 83 | def get_nex_node(self, node, x): 84 | pass 85 | 86 | 87 | class ID3DecisionTree(DecisionTree): 88 | 89 | def split_node(self, node): 90 | child_node_lst = [] 91 | child_cate_order = [] 92 | informatin_gain = 0 93 | split_col = None 94 | for col_idx in range(self.feature_num): 95 | current_child_cate_order = list(np.unique(self.data[node.data_idx][:, col_idx])) 96 | current_child_node_lst = [] 97 | for col_value in current_child_cate_order: 98 | data_idx = np.intersect1d(node.data_idx, np.where(self.data[:, col_idx] == col_value)) 99 | current_child_node_lst.append( 100 | TreeNode( 101 | data_idx=data_idx, 102 | depth=node.depth+1 103 | ) 104 | ) 105 | current_gain = self.get_split_criterion(node, current_child_node_lst) 106 | if current_gain > informatin_gain: 107 | informatin_gain = current_gain 108 | child_node_lst = current_child_node_lst 109 | child_cate_order = current_child_cate_order 110 | split_col = col_idx 111 | if informatin_gain List[TreeNode]: 176 | child_node_lst = [] 177 | child_cate_order = None 178 | gini_index = float("inf") 179 | split_col = None 180 | for col_idx in range(self.feature_num): 181 | current_child_cate_order = list(np.unique(self.data[node.data_idx][:, col_idx])) 182 | current_child_cate_order.sort() 183 | for col_value in current_child_cate_order: 184 | left_data_idx = np.intersect1d(node.data_idx, np.where(self.data[:, col_idx] <= col_value)) 185 | right_data_idx = np.intersect1d(node.data_idx, np.where(self.data[:, col_idx] > col_value)) 186 | current_child_node_lst = [] 187 | if len(left_data_idx) != 0: 188 | left_tree = TreeNode( 189 | data_idx=left_data_idx, 190 | depth=node.depth+1, 191 | ) 192 | 
current_child_node_lst.append(left_tree) 193 | if len(right_data_idx) != 0: 194 | right_tree = TreeNode( 195 | data_idx=right_data_idx, 196 | depth=node.depth+1, 197 | ) 198 | current_child_node_lst.append(right_tree) 199 | current_gini_index = self.get_split_criterion(node, current_child_node_lst) 200 | if current_gini_index < gini_index: 201 | gini_index = current_gini_index 202 | child_node_lst = current_child_node_lst 203 | child_cate_order = col_value 204 | split_col = col_idx 205 | node.child = child_node_lst 206 | node.set_attribute(split_col=split_col, child_cate_order=child_cate_order) 207 | return child_node_lst 208 | 209 | def get_split_criterion(self, node, child_node_lst): 210 | total = len(node.data_idx) 211 | split_criterion = 0 212 | for child_node in child_node_lst: 213 | impurity = self.get_impurity(child_node.data_idx) 214 | split_criterion += len(child_node.data_idx) / float(total) * impurity 215 | return split_criterion 216 | 217 | def get_impurity(self, data_ids): 218 | target_y = self.labels[data_ids] 219 | total = len(target_y) 220 | if self.tree_type == "regression": 221 | res = 0 222 | mean_y = np.mean(target_y) 223 | for y in target_y: 224 | res += (y - mean_y) ** 2 / total 225 | elif self.tree_type == "classification": 226 | if self.split_criterion == "gini": 227 | res = 1 228 | unique_y = np.unique(target_y) 229 | for y in unique_y: 230 | num = len(np.where(target_y==y)[0]) 231 | res -= (num/float(total))**2 232 | elif self.split_criterion == "entropy": 233 | unique, count = np.unique(target_y, return_counts=True) 234 | res = 0 235 | for c in count: 236 | p = float(c) / total 237 | res -= p * np.log(p) 238 | return res 239 | 240 | def get_nex_node(self, node: TreeNode, x: np.array): 241 | col_value = x[node.split_col] 242 | if col_value> node.child_cate_order: 243 | index = 1 244 | else: 245 | index = 0 246 | return node.child[index] 247 | 248 | 249 | if __name__ == "__main__": 250 | # ID3: only categorical features 251 | from sklearn.model_selection import train_test_split 252 | from sklearn.metrics import classification_report, mean_squared_error 253 | from sklearn import datasets 254 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 255 | dataset = datasets.load_iris() 256 | 257 | 258 | # ############################# 259 | # ========== Config ========== 260 | # ############################# 261 | all_categorical_feature = True 262 | max_depth = 3 263 | min_sample_leaf = 4 264 | split_criterion = "entropy" 265 | # tree_type = "classification" 266 | tree_type = "regression" 267 | # ########################### 268 | 269 | # convert continuous feature to categorical features 270 | if all_categorical_feature: 271 | f = lambda x: int(x) 272 | func = np.vectorize(f) 273 | X = func(dataset.data) 274 | else: 275 | X = dataset.data 276 | 277 | Y = dataset.target 278 | X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.8) 279 | 280 | if tree_type == "classification": 281 | model = DecisionTreeClassifier(criterion=split_criterion, max_depth=max_depth, min_samples_leaf=min_sample_leaf) 282 | else: 283 | model = DecisionTreeRegressor(max_depth=max_depth, min_samples_leaf=min_sample_leaf) 284 | model.fit(X_train, y_train) 285 | y_pred = model.predict(X_test) 286 | if tree_type == "classification": 287 | print(classification_report(y_true=y_test, y_pred=y_pred)) 288 | else: 289 | print(mean_squared_error(y_test, y_pred)) 290 | # 291 | # model = ID3DecisionTree(max_depth=max_depth, min_sample_leaf=min_sample_leaf, verbose=True) 292 
| # model = C45DecisionTree(max_depth=max_depth, min_sample_leaf=min_sample_leaf, verbose=True) 293 | model = CART(max_depth=max_depth, min_sample_leaf=min_sample_leaf, verbose=True, tree_type=tree_type, split_criterion=split_criterion) 294 | model.fit(X_train, y_train) 295 | y_pred = model.predict(X_test) 296 | if tree_type == "classification": 297 | print(classification_report(y_true=y_test, y_pred=y_pred)) 298 | else: 299 | print(mean_squared_error(y_test, y_pred)) -------------------------------------------------------------------------------- /code/gan.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Generator(nn.Module): 5 | def __init__(self): 6 | super() 7 | self.net = nn.Sequential( 8 | nn.ConvTranspose2d( 200, 32 * 8, 4, 1, 0, bias=False), 9 | nn.BatchNorm2d(32 * 8), 10 | nn.ReLU(), 11 | nn.ConvTranspose2d(32 * 8, 32 * 4, 4, 2, 1, bias=False), 12 | nn.BatchNorm2d(32 * 4), 13 | nn.ReLU(), 14 | nn.ConvTranspose2d( 32 * 4, 32 * 2, 4, 2, 1, bias=False), 15 | nn.BatchNorm2d(32 * 2), 16 | nn.ReLU(), 17 | nn.ConvTranspose2d( 32 * 2, 32, 4, 2, 1, bias=False), 18 | nn.BatchNorm2d(32), 19 | nn.ReLU(), 20 | nn.ConvTranspose2d( 32, 1, 4, 2, 1, bias=False), 21 | nn.Tanh() 22 | ) 23 | def forward(self, tens): 24 | return self.net(tens) 25 | 26 | class Discriminator(nn.Module): 27 | def __init__(self): 28 | super() 29 | self.net = nn.Sequential( 30 | nn.Conv2d(1, 32, 4, 2, 1, bias=False), 31 | nn.LeakyReLU(0.2), 32 | nn.Conv2d(32, 32 * 2, 4, 2, 1, bias=False), 33 | nn.BatchNorm2d(32 * 2), 34 | nn.LeakyReLU(0.2), 35 | nn.Conv2d(32 * 2, 32 * 4, 4, 2, 1, bias=False), 36 | nn.BatchNorm2d(32 * 4), 37 | nn.LeakyReLU(0.2), 38 | # state size. (32*4) x 8 x 8 39 | nn.Conv2d(32 * 4, 32 * 8, 4, 2, 1, bias=False), 40 | nn.BatchNorm2d(32 * 8), 41 | nn.LeakyReLU(0.2), 42 | # state size. 
(32*8) x 4 x 4 43 | nn.Conv2d(32 * 8, 1, 4, 1, 0, bias=False), 44 | nn.Sigmoid() 45 | ) 46 | 47 | def forward(self, tens): 48 | return self.net(tens) 49 | 50 | def train(netD, netG, loader, loss_func, optimizerD, optimizerG, num_epochs): 51 | netD.train() 52 | netG.train() 53 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 54 | for epoch in range(num_epochs): 55 | for i, data in enumerate(loader, 0): 56 | netD.zero_grad() 57 | realtens = data[0].to(device) 58 | b_size = realtens.size(0) 59 | label = torch.full((b_size,), 1, dtype=torch.float, device=device) # gen labels 60 | output = netD(realtens) 61 | errD_real = loss_func(output, label) 62 | errD_real.backward() # backprop discriminator fake and real based on label 63 | noise = torch.randn(b_size, 200, 1, 1, device=device) 64 | fake = netG(noise) 65 | label.fill_(0) 66 | output = netD(fake.detach()).view(-1) 67 | errD_fake = loss_func(output, label) 68 | errD_fake.backward() # backprop discriminator fake and real based on label 69 | errD = errD_real + errD_fake # discriminator error 70 | optimizerD.step() 71 | netG.zero_grad() 72 | label.fill_(1) 73 | output = netD(fake).view(-1) 74 | errG = loss_func(output, label) # generator error 75 | errG.backward() 76 | optimizerG.step() 77 | -------------------------------------------------------------------------------- /code/id3_decision_tree_simple.py: -------------------------------------------------------------------------------- 1 | """Numpy Implementation of ID3 Decision Tree Classifier.""" 2 | import numpy as np 3 | from collections import Counter 4 | 5 | 6 | class id3_Classifier(): 7 | """ 8 | The ID3 classifier is based on information gain to split. 9 | 10 | Usage: 11 | model = id3_tree_classifier(least_children_num = 4, verbose=True) 12 | model.fit(X_train,y) 13 | model.predict(X_test) 14 | """ 15 | 16 | def __init__(self, least_children_num, verbose=True): 17 | """Constructor.""" 18 | self.least_children_num = least_children_num 19 | self.verbose = verbose 20 | 21 | def fit(self, tmp_x, tmp_y): 22 | """Fit function.""" 23 | def fit_tree(tmp_x, tmp_y): 24 | # Exit condition: 25 | if len(tmp_y) < self.least_children_num or len(np.unique(tmp_y)) == 1: 26 | 27 | if self.verbose: 28 | print('exit condition:') 29 | print('tmp_y:') 30 | print(tmp_y) 31 | 32 | mode_val = self._mode(tmp_y.flatten().tolist()) 33 | return([np.nan, mode_val, np.nan, np.nan]) 34 | 35 | # Otherwise Split: 36 | if self.verbose: 37 | print("start....subset Y len {}".format(len(tmp_y))) 38 | split_row, split_col = self._decide_split(tmp_x, tmp_y) 39 | if not split_row and not split_col: 40 | print('no better split...return mode') 41 | mode_val = self._mode(tmp_y.flatten().tolist()) 42 | return([np.nan, mode_val, np.nan, np.nan]) 43 | 44 | if self.verbose: 45 | print("split on:") 46 | print(split_row, split_col) 47 | split_vec = tmp_x[:, split_col] 48 | split_val = tmp_x[split_row, split_col] 49 | left_ind = np.where(split_vec < split_val)[0].tolist() 50 | right_ind = np.where(split_vec >= split_val)[0].tolist() 51 | left_dat, left_y = tmp_x[left_ind, :], tmp_y[left_ind, ] 52 | right_dat, right_y = tmp_x[right_ind, :], tmp_y[right_ind, ] 53 | 54 | left_tree = fit_tree(left_dat, left_y) 55 | right_tree = fit_tree(right_dat, right_y) 56 | 57 | if isinstance(left_tree, list): 58 | len_l_tree = 1 59 | else: 60 | len_l_tree = left_tree.shape[0] 61 | 62 | root = [split_col, split_val, 1, len_l_tree + 1] 63 | return(np.vstack([root, left_tree, right_tree])) 64 | tree = fit_tree(tmp_x, tmp_y) 65 | self.tree = tree 
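# How the fitted tree is stored: `self.tree` is a numpy array with one row per node,
# in the format [split_col, split_val, left_offset, right_offset]. Internal nodes use
# offset 1 for the left child and len(left_subtree) + 1 for the right child, both
# relative to the node's own row; leaves are stored as [np.nan, predicted_label,
# np.nan, np.nan], which is the exit condition `predict` checks while walking the array.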
66 | 67 | 68 | def _decide_split(self, x, y): 69 | """ 70 | Given subset of X,Y, 71 | search for the best splitting node based on: information gain. 72 | """ 73 | def _entropy(tmp_y): 74 | """Key Metrics of building a decision tree use Shannon Entropy.""" 75 | tmp_ent = 0 76 | for uni_y in np.unique(tmp_y): 77 | p = len(tmp_y[tmp_y == uni_y]) / len(tmp_y) 78 | tmp_ent -= (p * np.log2(p)) 79 | return tmp_ent 80 | 81 | m, n = x.shape 82 | best_gain = 0 83 | split_row, split_col = None, None 84 | 85 | previous_entropy = _entropy(y) 86 | for col in range(n): 87 | tmp_vec = x[:, col].ravel() 88 | for row in range(m): 89 | val = tmp_vec[row] 90 | # >= & < is the convention here: 91 | if val != np.max(tmp_vec) and val != np.min(tmp_vec): 92 | left_b = np.where(tmp_vec < val)[0].tolist() 93 | right_b = np.where(tmp_vec >= val)[0].tolist() 94 | 95 | new_ent = (len(y[left_b]) / len(y)) * _entropy(y[left_b]) + \ 96 | (len(y[right_b]) / len(y)) * _entropy(y[right_b]) 97 | info_gain = previous_entropy - new_ent 98 | 99 | if info_gain > best_gain: 100 | split_row, split_col = row, col 101 | best_gain = info_gain 102 | if self.verbose: 103 | print('better gain:{}'.format(best_gain)) 104 | print() 105 | return split_row, split_col 106 | 107 | def _mode(self, x_list): 108 | """Calculate the mode for splitting.""" 109 | return Counter(x_list).most_common(1)[0][0] 110 | 111 | def predict(self, tmp_test_array): 112 | """Wrap-up fun for prediction.""" 113 | def _query(tree, tmp_test_array): 114 | """Prediction for single example.""" 115 | assert len(tmp_test_array.shape) == 2, \ 116 | "Make sure your test data is 2d array" 117 | 118 | if isinstance(tree,list): 119 | start_node = tree # only the 1 row in data 120 | else: 121 | start_node = tree[0,:] # Iteratively hit first row 122 | 123 | test_feat, test_val, left_tree_jump, right_tree_jump = \ 124 | start_node[0], start_node[1], start_node[2], start_node[3] 125 | 126 | if np.isnan(test_feat) and np.isnan(left_tree_jump) and \ 127 | np.isnan(right_tree_jump): 128 | 129 | pred = test_val 130 | return pred 131 | 132 | if tmp_test_array[0, int(test_feat)] < test_val: 133 | # If <, go left branch: 134 | jump_loc = left_tree_jump 135 | pred = _query(tree[int(jump_loc):, ], tmp_test_array) 136 | 137 | else: 138 | # If >=, go right branch: 139 | jump_loc = right_tree_jump 140 | pred = _query(tree[int(jump_loc):, ], tmp_test_array) 141 | 142 | return pred 143 | 144 | assert len(tmp_test_array.shape) == 2, \ 145 | "Make sure test data is 2d-array" 146 | result = [] 147 | 148 | for i in range(tmp_test_array.shape[0]): 149 | inp = tmp_test_array[i, :].reshape(1, -1) 150 | result.append(_query(self.tree, inp)) 151 | return result 152 | -------------------------------------------------------------------------------- /code/knn.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from math import sqrt 3 | 4 | 5 | def euclidean_distance(point1, point2): 6 | distance = 0 7 | for i in range(len(point1)): 8 | distance +=(point1[i] - point2[i]) ** 2 9 | return sqrt(distance) 10 | 11 | 12 | def mean(labels): 13 | return sum(labels) / len(labels) 14 | 15 | 16 | def mode(labels): 17 | return Counter(labels).most_common(1)[0][0] 18 | 19 | 20 | def KNN(training_data, target, k, func): 21 | """ 22 | training_data: all training data point 23 | target: new point 24 | k: user-defined constant, number of closest training data 25 | func: functions used to get the the target label 26 | """ 27 | # Step one: calculate the 
Euclidean distance between the new point and all training data 28 | neighbors= [] 29 | for index, data in enumerate(training_data): 30 | # distance between the target data and the current example from the data. 31 | distance = euclidean_distance(data[:-1], target) 32 | neighbors.append((distance, index)) 33 | 34 | # Step two: pick the top-K closest training data 35 | sorted_neighbors = sorted(neighbors) 36 | k_nearest = sorted_neighbors[:k] 37 | 38 | # Get the labels of the selected K entries 39 | k_nearest_labels = [training_data[i][1] for distance, i in k_nearest] 40 | 41 | # Step three: For regression problem, take the average of the labels as the result; 42 | # for classification problem, take the most common label of these labels as the result. 43 | return k_nearest, func(k_nearest_labels) 44 | 45 | 46 | def main(): 47 | """ 48 | # Regression Data(Column 0 : Height(inch), Column 1: Weight(lb)) 49 | """ 50 | reg_data = [ 51 | [73.84, 241.89], 52 | [68.78, 162.31], 53 | [74.11, 212.74], 54 | [71.73, 220.04], 55 | [69.88, 206.34], 56 | [67.25, 152.21], 57 | [63.45, 156.39] 58 | ] 59 | 60 | target_data = [70] 61 | reg_k_nearest_neighbors, reg_prediction = KNN( 62 | reg_data, target_data, k=3, func=mean 63 | ) 64 | print(reg_prediction) 65 | ''' 66 | # Classification Data( Column 0: age, Column 1:like paragliding or not ) 67 | ''' 68 | clf_data = [ 69 | [26, 1], 70 | [20, 1], 71 | [22, 1], 72 | [19, 1], 73 | [28, 0], 74 | [33, 0], 75 | [30, 0], 76 | [50, 0], 77 | ] 78 | target_data2 = [32] 79 | clf_k_nearest_neighbors, clf_prediction = KNN( 80 | clf_data, target_data2, k=3, func=mode 81 | ) 82 | print(clf_prediction) 83 | 84 | 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /code/layers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from scipy.special import softmax 4 | from scipy.special import expit 5 | from typing import List 6 | 7 | 8 | def BatchNorm(): 9 | # From https://wiseodd.github.io/techblog/2016/07/04/batchnorm/ 10 | # TODO: Add doctring for variable names. Add momentum to init. 11 | def __init__(self): 12 | pass 13 | 14 | def forward(self, X, gamma, beta): 15 | mu = np.mean(X, axis=0) 16 | var = np.var(X, axis=0) 17 | 18 | X_norm = (X - mu) / np.sqrt(var + 1e-8) 19 | out = gamma * X_norm + beta 20 | 21 | cache = (X, X_norm, mu, var, gamma, beta) 22 | 23 | return out, cache, mu, var 24 | 25 | def backward(self, dout, cache): 26 | X, X_norm, mu, var, gamma, beta = cache 27 | 28 | N, D = X.shape 29 | 30 | X_mu = X - mu 31 | std_inv = 1. / np.sqrt(var + 1e-8) 32 | 33 | dX_norm = dout * gamma 34 | dvar = np.sum(dX_norm * X_mu, axis=0) * -.5 * std_inv**3 35 | dmu = np.sum(dX_norm * -std_inv, axis=0) + dvar * np.mean(-2. 
* X_mu, axis=0) 36 | 37 | dX = (dX_norm * std_inv) + (dvar * 2 * X_mu / N) + (dmu / N) 38 | dgamma = np.sum(dout * X_norm, axis=0) 39 | dbeta = np.sum(dout, axis=0) 40 | 41 | return dX, dgamma, dbeta 42 | 43 | 44 | class RNN: 45 | def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, batch_size=1) -> None: 46 | self.input_dim = input_dim 47 | self.hidden_dim = hidden_dim 48 | self.out_dim = output_dim 49 | self.batch_size = batch_size 50 | # initialization 51 | self.params = self._init_params() 52 | self.hidden_state = self._init_hidden_state() 53 | 54 | def _init_params(self) -> List[np.array]: 55 | scale = 0.01 56 | Waa = np.random.normal(scale=scale, size=[self.hidden_dim, self.hidden_dim]) 57 | Wax = np.random.normal(scale=scale, size=[self.hidden_dim, self.input_dim]) 58 | Wy = np.random.normal(scale=scale, size=[self.out_dim, self.hidden_dim]) 59 | ba = np.zeros(shape=[self.hidden_dim, 1]) 60 | by = np.zeros(shape=[self.out_dim, 1]) 61 | return [Waa, Wax, Wy, ba, by] 62 | 63 | def _init_hidden_state(self) -> np.array: 64 | return np.zeros(shape=[self.hidden_dim, self.batch_size]) 65 | 66 | def forward(self, input_vector: np.array) -> np.array: 67 | """ 68 | input_vector: 69 | dimension: [num_steps, self.input_dim, self.batch_size] 70 | out_vector: 71 | dimension: [num_steps, self.output_dim, self.batch_size] 72 | """ 73 | Waa, Wax, Wy, ba, by = self.params 74 | output_vector = [] 75 | for vector in input_vector: 76 | self.hidden_state = np.tanh( 77 | np.dot(Waa, self.hidden_state) + np.dot(Wax, vector) + ba 78 | ) 79 | y = softmax( 80 | np.dot(Wy, self.hidden_state) + by 81 | ) 82 | output_vector.append(y) 83 | return np.array(output_vector) 84 | 85 | 86 | class GRU: 87 | def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, batch_size=1) -> None: 88 | self.input_dim = input_dim 89 | self.hidden_dim = hidden_dim 90 | self.out_dim = output_dim 91 | self.batch_size = batch_size 92 | # initialization 93 | self.params = self._init_params() 94 | self.hidden_state = self._init_hidden_state() 95 | 96 | def _init_params(self) -> List[np.array]: 97 | scale = 0.01 98 | def param_single_layer(): 99 | w = np.random.normal(scale=scale, size=(self.hidden_dim, self.hidden_dim+input_dim)) 100 | b = np.zeros(shape=[self.hidden_dim, 1]) 101 | return w, b 102 | 103 | # reset, update gate 104 | Wr, br = param_single_layer() 105 | Wu, bu = param_single_layer() 106 | # output layer 107 | Wy = np.random.normal(scale=scale, size=[self.out_dim, self.hidden_dim]) 108 | by = np.zeros(shape=[self.out_dim, 1]) 109 | return [Wr, br, Wu, bu, Wy, by] 110 | 111 | def _init_hidden_state(self) -> np.array: 112 | return np.zeros(shape=[self.hidden_dim, self.batch_size]) 113 | 114 | def forward(self, input_vector: np.array) -> np.array: 115 | """ 116 | input_vector: 117 | dimension: [num_steps, self.input_dim, self.batch_size] 118 | out_vector: 119 | dimension: [num_steps, self.output_dim, self.batch_size] 120 | """ 121 | Wr, br, Wu, bu, Wy, by = self.params 122 | output_vector = [] 123 | for vector in input_vector: 124 | # expit in scipy is sigmoid function 125 | reset_gate = expit( 126 | np.dot(Wr, np.concatenate([self.hidden_state, vector], axis=0)) + br 127 | ) 128 | update_gate = expit( 129 | np.dot(Wu, np.concatenate([self.hidden_state, vector], axis=0)) + bu 130 | ) 131 | candidate_hidden = np.tanh( 132 | reset_gate * self.hidden_state 133 | ) 134 | self.hidden_state = update_gate * self.hidden_state + (1-update_gate) * candidate_hidden 135 | y = softmax( 136 | np.dot(Wy, 
self.hidden_state) + by 137 | ) 138 | output_vector.append(y) 139 | return np.array(output_vector) 140 | 141 | 142 | class LSTM: 143 | def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, batch_size=1) -> None: 144 | self.input_dim = input_dim 145 | self.hidden_dim = hidden_dim 146 | self.out_dim = output_dim 147 | self.batch_size = batch_size 148 | # initialization 149 | self.params = self._init_params() 150 | self.hidden_state = self._init_hidden_state() 151 | self.memory_state = self._init_hidden_state() 152 | 153 | def _init_params(self) -> List[np.array]: 154 | scale = 0.01 155 | def param_single_layer(): 156 | w = np.random.normal(scale=scale, size=(self.hidden_dim, self.hidden_dim+input_dim)) 157 | b = np.zeros(shape=[self.hidden_dim, 1]) 158 | return w, b 159 | 160 | # forget, input, output gate + candidate memory state 161 | Wf, bf = param_single_layer() 162 | Wi, bi = param_single_layer() 163 | Wo, bo = param_single_layer() 164 | Wc, bc = param_single_layer() 165 | # output layer 166 | Wy = np.random.normal(scale=scale, size=[self.out_dim, self.hidden_dim]) 167 | by = np.zeros(shape=[self.out_dim, 1]) 168 | return [Wf, bf, Wi, bi, Wo, bo, Wc, bc, Wy, by] 169 | 170 | def _init_hidden_state(self) -> np.array: 171 | return np.zeros(shape=[self.hidden_dim, self.batch_size]) 172 | 173 | def forward(self, input_vector: np.array) -> np.array: 174 | """ 175 | input_vector: 176 | dimension: [num_steps, self.input_dim, self.batch_size] 177 | out_vector: 178 | dimension: [num_steps, self.output_dim, self.batch_size] 179 | """ 180 | Wf, bf, Wi, bi, Wo, bo, Wc, bc, Wy, by = self.params 181 | output_vector = [] 182 | for vector in input_vector: 183 | # expit in scipy is sigmoid function 184 | foget_gate = expit( 185 | np.dot(Wf, np.concatenate([self.hidden_state, vector], axis=0)) + bf 186 | ) 187 | input_gate = expit( 188 | np.dot(Wi, np.concatenate([self.hidden_state, vector], axis=0)) + bi 189 | ) 190 | output_gate = expit( 191 | np.dot(Wo, np.concatenate([self.hidden_state, vector], axis=0)) + bo 192 | ) 193 | candidate_memory = np.tanh( 194 | np.dot(Wc, np.concatenate([self.hidden_state, vector], axis=0)) + bc 195 | ) 196 | self.memory_state = foget_gate * self.memory_state + input_gate * candidate_memory 197 | self.hidden_state = output_gate * np.tanh(self.memory_state) 198 | y = softmax( 199 | np.dot(Wy, self.hidden_state) + by 200 | ) 201 | output_vector.append(y) 202 | return np.array(output_vector) 203 | 204 | 205 | def Adagrad(data): 206 | pass 207 | 208 | 209 | def Adam(data): 210 | pass 211 | 212 | 213 | def LBFGS(data): 214 | pass 215 | 216 | 217 | def RMSProp(data): 218 | pass 219 | 220 | 221 | # def SGD(data, batch_size, lr): 222 | # N = len(data) 223 | # np.random.shuffle(data) 224 | # mini_batches = np.array([data[i:i+batch_size] 225 | # for i in range(0, N, batch_size)]) 226 | # for X,y in mini_batches: 227 | # backprop(X, y, lr) 228 | 229 | 230 | def SGD_Momentum(): 231 | pass 232 | 233 | 234 | if __name__ == "__main__": 235 | input_data = np.array([ 236 | [ 237 | [1, 3] 238 | , [2, 4] 239 | , [3, 6] 240 | ] 241 | , [ 242 | [4, 3] 243 | , [3, 4] 244 | , [1, 5] 245 | ] 246 | ]) 247 | batch_size = 2 248 | input_dim = 3 249 | output_dim = 4 250 | hidden_dim = 5 251 | time_step = 2 252 | # rnn = RNN(input_dim=input_dim, batch_size=batch_size, output_dim=output_dim, hidden_dim=hidden_dim) 253 | # output_vector = rnn.forward(input_vector=input_data) 254 | # print("RNN:") 255 | # print(f"Input data dimensions: {input_data.shape}") 256 | # print(f"Output data 
dimensions {output_vector.shape}") 257 | rnn = GRU(input_dim=input_dim, batch_size=batch_size, output_dim=output_dim, hidden_dim=hidden_dim) 258 | output_vector = rnn.forward(input_vector=input_data) 259 | print("LSTM:") 260 | print(f"Input data dimensions: {input_data.shape}") 261 | print(f"Output data dimensions {output_vector.shape}") -------------------------------------------------------------------------------- /code/logistic_regression.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy 3 | from activation_functions import sigmoid, sigmoid_prime 4 | 5 | 6 | def predict(features, weights): 7 | ''' 8 | Returns 1D array of probabilities 9 | that the class label == 1 10 | ''' 11 | z = np.dot(features, weights) 12 | return sigmoid(z) 13 | 14 | 15 | def cost_function(features, labels, weights): 16 | ''' 17 | Using Mean Absolute Error 18 | 19 | Features:(100,3) 20 | Labels: (100,1) 21 | Weights:(3,1) 22 | Returns 1D matrix of predictions 23 | Cost = (labels*log(predictions) + (1-labels)*log(1-predictions) ) / len(labels) 24 | ''' 25 | observations = len(labels) 26 | 27 | predictions = predict(features, weights) 28 | 29 | #Take the error when label=1 30 | class1_cost = -labels*np.log(predictions) 31 | 32 | #Take the error when label=0 33 | class2_cost = (1-labels)*np.log(1-predictions) 34 | 35 | #Take the sum of both costs 36 | cost = class1_cost - class2_cost 37 | 38 | #Take the average cost 39 | cost = cost.sum() / observations 40 | 41 | return cost 42 | 43 | 44 | def update_weights(features, labels, weights, lr): 45 | ''' 46 | Vectorized Gradient Descent 47 | 48 | Features:(200, 3) 49 | Labels: (200, 1) 50 | Weights:(3, 1) 51 | ''' 52 | N = len(features) 53 | 54 | #1 - Get Predictions 55 | predictions = predict(features, weights) 56 | 57 | #2 Transpose features from (200, 3) to (3, 200) 58 | # So we can multiply w the (200,1) cost matrix. 
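    #   (For sigmoid + cross-entropy the gradient simplifies to
    #    dJ/dw = (1/N) * features.T.dot(predictions - labels),
    #    which is exactly what the next two steps compute.)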
59 | # Returns a (3,1) matrix holding 3 partial derivatives -- 60 | # one for each feature -- representing the aggregate 61 | # slope of the cost function across all observations 62 | gradient = np.dot(features.T, predictions - labels) 63 | 64 | #3 Take the average cost derivative for each feature 65 | gradient /= N 66 | 67 | #4 - Multiply the gradient by our learning rate 68 | gradient *= lr 69 | 70 | #5 - Subtract from our weights to minimize cost 71 | weights -= gradient 72 | 73 | return weights 74 | 75 | 76 | def decision_boundary(prob): 77 | return 1 if prob >= .5 else 0 78 | 79 | 80 | def classify(predictions): 81 | ''' 82 | input - N element array of predictions between 0 and 1 83 | output - N element array of 0s (False) and 1s (True) 84 | ''' 85 | decision_boundary = np.vectorize(decision_boundary) 86 | return decision_boundary(predictions).flatten() 87 | 88 | 89 | def train(features, labels, weights, lr, iters): 90 | cost_history = [] 91 | 92 | for i in range(iters): 93 | weights = update_weights(features, labels, weights, lr) 94 | 95 | #Calculate error for auditing purposes 96 | cost = cost_function(features, labels, weights) 97 | cost_history.append(cost) 98 | 99 | # Log Progress 100 | if i % 1000 == 0: 101 | print "iter: "+str(i) + " cost: "+str(cost) 102 | 103 | return weights, cost_history 104 | 105 | 106 | def accuracy(predicted_labels, actual_labels): 107 | diff = predicted_labels - actual_labels 108 | return 1.0 - (float(np.count_nonzero(diff)) / len(diff)) 109 | 110 | 111 | def plot_decision_boundary(trues, falses): 112 | fig = plt.figure() 113 | ax = fig.add_subplot(111) 114 | 115 | no_of_preds = len(trues) + len(falses) 116 | 117 | ax.scatter([i for i in range(len(trues))], trues, s=25, c='b', marker="o", label='Trues') 118 | ax.scatter([i for i in range(len(falses))], falses, s=25, c='r', marker="s", label='Falses') 119 | 120 | plt.legend(loc='upper right'); 121 | ax.set_title("Decision Boundary") 122 | ax.set_xlabel('N/2') 123 | ax.set_ylabel('Predicted Probability') 124 | plt.axhline(.5, color='black') 125 | plt.show() 126 | -------------------------------------------------------------------------------- /code/logistic_regression_scipy.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | from sklearn.linear_model import LogisticRegression 3 | from sklearn.cross_validation import train_test_split 4 | 5 | # Normalize grades to values between 0 and 1 for more efficient computation 6 | normalized_range = sklearn.preprocessing.MinMaxScaler(feature_range=(-1,1)) 7 | 8 | # Extract Features + Labels 9 | labels.shape = (100,) #scikit expects this 10 | features = normalized_range.fit_transform(features) 11 | 12 | # Create Test/Train 13 | features_train,features_test,labels_train,labels_test = train_test_split(features,labels,test_size=0.4) 14 | 15 | # Scikit Logistic Regression 16 | scikit_log_reg = LogisticRegression() 17 | scikit_log_reg.fit(features_train,labels_train) 18 | 19 | #Score is Mean Accuracy 20 | scikit_score = clf.score(features_test,labels_test) 21 | print 'Scikit score: ', scikit_score 22 | 23 | #Our Mean Accuracy 24 | observations, features, labels, weights = run() 25 | probabilities = predict(features, weights).flatten() 26 | classifications = classifier(probabilities) 27 | our_acc = accuracy(classifications,labels.flatten()) 28 | print 'Our score: ',our_acc 29 | -------------------------------------------------------------------------------- /code/loss_functions.py: 
-------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | ### Note ### 5 | 6 | # yHat is prediction 7 | # y is the target (true label) 8 | 9 | 10 | ### Functions ### 11 | 12 | def CrossEntropy(yHat, y): 13 | if y == 1: 14 | return -log(yHat) 15 | else: 16 | return -log(1 - yHat) 17 | 18 | 19 | def Dice(yHat, y): 20 | total = np.sum(y, dim=1) + np.sum(yHat, dim=1) 21 | intersection = np.sum(y * yHat, dim=1) 22 | dice = (2.0 * intersection) / (total + 1e-7) 23 | return np.mean(dice) 24 | 25 | 26 | def Hinge(yHat, y): 27 | return np.max(0, y - (1-2*y)*yHat) 28 | 29 | 30 | def Huber(yHat, y, delta=1.): 31 | return np.where(np.abs(y-yHat) < delta,.5*(y-yHat)**2 , delta*(np.abs(y-yHat)-0.5*delta)) 32 | 33 | 34 | def KLDivergence(yHat, y): 35 | """ 36 | :param yHat: 37 | :param y: 38 | :return: KLDiv(yHat || y) 39 | """ 40 | return np.sum(yHat * np.log((yHat / y))) 41 | 42 | 43 | def L1(yHat, y): 44 | return np.sum(np.absolute(yHat - y)) / y.size 45 | 46 | def root_mean_square_error(y_hat: np.ndarray, y: np.ndarray) -> float: 47 | return np.sqrt(np.sum((y_hat - y)**2) / y.size) 48 | 49 | def L2(yHat, y): 50 | return np.sum((yHat - y)**2) 51 | 52 | 53 | def MLE(yHat, y): 54 | pass 55 | 56 | 57 | def MSE(yHat, y): 58 | return np.sum((yHat - y)**2) / y.size 59 | 60 | 61 | ### Derivatives ### 62 | 63 | def MSE_prime(yHat, y): 64 | return yHat - y 65 | -------------------------------------------------------------------------------- /code/mlp.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torch.autograd import Variable 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | # import torchvision module to handle image manipulation 8 | import torchvision 9 | import torchvision.transforms as transforms 10 | 11 | # calculate train time, writing train data to files etc. 
12 | import time 13 | import pandas as pd 14 | import json 15 | 16 | 17 | class MLP(nn.Module): 18 | def __init__(self): 19 | super(MLP,self).__init__() 20 | # define layers 21 | self.fc1 = nn.Linear(in_features=28*28, out_features=500) 22 | self.fc2 = nn.Linear(in_features=500, out_features=200) 23 | self.fc3 = nn.Linear(in_features=200, out_features=100) 24 | self.out = nn.Linear(in_features=100, out_features=10) 25 | 26 | 27 | def forward(self, t): 28 | # fc1 make input 1 dimentional 29 | t = t.view(-1,28*28) 30 | t = self.fc1(t) 31 | t = F.relu(t) 32 | # fc2 33 | t = self.fc2(t) 34 | t = F.relu(t) 35 | # fc3 36 | t = self.fc3(t) 37 | t = F.relu(t) 38 | # output 39 | t = self.out(t) 40 | return t 41 | 42 | def train(net, loader, loss_func, optimizer): 43 | net.train() 44 | n_batches = len(loader) 45 | for inputs, targets in loader: 46 | inputs = Variable(inputs) 47 | targets = Variable(targets) 48 | 49 | output = net(inputs) 50 | loss = loss_func(output, targets) 51 | 52 | optimizer.zero_grad() 53 | loss.backward() 54 | optimizer.step() 55 | # print statistics 56 | running_loss = loss.item() 57 | print('Training loss: %.3f' %( running_loss)) 58 | 59 | def main(): 60 | train_set = torchvision.datasets.FashionMNIST( 61 | root = './FMNIST', 62 | train = True, 63 | download = False, 64 | transform = transforms.Compose([ 65 | transforms.ToTensor() 66 | ]) 67 | ) 68 | mlp = MLP() 69 | loader = torch.utils.data.DataLoader(train_set, batch_size = 1000) 70 | optimizer = optim.Adam(mlp.parameters(), lr=0.01) 71 | loss_func=nn.CrossEntropyLoss() 72 | for i in range(0,15): 73 | train(mlp,loader,loss_func,optimizer) 74 | print("Finished Training") 75 | torch.save(mlp.state_dict(), "./mlpmodel.pt") 76 | test_set = torchvision.datasets.FashionMNIST( 77 | root = './FMNIST', 78 | train = False, 79 | download = False, 80 | transform = transforms.Compose([ 81 | transforms.ToTensor() 82 | ]) 83 | ) 84 | testloader = torch.utils.data.DataLoader(test_set, batch_size=4) 85 | correct = 0 86 | total = 0 87 | with torch.no_grad(): 88 | for data in testloader: 89 | images, labels = data 90 | outputs = mlp(images) 91 | _, predicted = torch.max(outputs.data, 1) 92 | total += labels.size(0) 93 | correct += (predicted == labels).sum().item() 94 | print('Accuracy of the network on the 10000 test images: %d %%' % ( 95 | 100 * correct / total)) 96 | 97 | 98 | main() 99 | -------------------------------------------------------------------------------- /code/nn_matrix.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | # Neural Network w Matrices 5 | 6 | INPUT_LAYER_SIZE = 1 7 | HIDDEN_LAYER_SIZE = 2 8 | OUTPUT_LAYER_SIZE = 2 9 | 10 | def init_weights(): 11 | Wh = np.random.randn(INPUT_LAYER_SIZE, HIDDEN_LAYER_SIZE) * \ 12 | np.sqrt(2.0/INPUT_LAYER_SIZE) 13 | Wo = np.random.randn(HIDDEN_LAYER_SIZE, OUTPUT_LAYER_SIZE) * \ 14 | np.sqrt(2.0/HIDDEN_LAYER_SIZE) 15 | 16 | 17 | def init_bias(): 18 | Bh = np.full((1, HIDDEN_LAYER_SIZE), 0.1) 19 | Bo = np.full((1, OUTPUT_LAYER_SIZE), 0.1) 20 | return Bh, Bo 21 | 22 | def relu(Z): 23 | return np.maximum(0, Z) 24 | 25 | def relu_prime(Z): 26 | ''' 27 | Z - weighted input matrix 28 | 29 | Returns gradient of Z where all 30 | negative values are set to 0 and 31 | all positive values set to 1 32 | ''' 33 | Z[Z < 0] = 0 34 | Z[Z > 0] = 1 35 | return Z 36 | 37 | def cost(yHat, y): 38 | cost = np.sum((yHat - y)**2) / 2.0 39 | return cost 40 | 41 | def cost_prime(yHat, y): 42 | return yHat - y 43 | 44 | def 
feed_forward(X): 45 | ''' 46 | X - input matrix 47 | Zh - hidden layer weighted input 48 | Zo - output layer weighted input 49 | H - hidden layer activation 50 | y - output layer 51 | yHat - output layer predictions 52 | ''' 53 | 54 | # Hidden layer 55 | Zh = np.dot(X, Wh) + Bh 56 | H = relu(Zh) 57 | 58 | # Output layer 59 | Zo = np.dot(H, Wo) + Bo 60 | yHat = relu(Zo) 61 | return yHat 62 | 63 | def backprop(X, y, lr): 64 | 65 | yHat = feed_forward(X) 66 | 67 | # Layer Error 68 | Eo = (yHat - y) * relu_prime(Zo) 69 | Eh = np.dot(Eo, Wo.T) * relu_prime(Zh) 70 | 71 | # Cost derivative for weights 72 | dWo = np.dot(H.T, Eo) 73 | dWh = np.dot(X.T, Eh) 74 | 75 | # Cost derivative for bias 76 | dBo = np.sum(Eo, axis=0, keepdims=True) 77 | dBh = np.sum(Eh, axis=0, keepdims=True) 78 | 79 | # Update weights 80 | Wo -= lr * dWo 81 | Wh -= lr * dWh 82 | 83 | # Update biases 84 | Bo -= lr * dBo 85 | Bh -= lr * dBh 86 | 87 | 88 | -------------------------------------------------------------------------------- /code/nn_simple.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | def relu(z): 5 | return max(0,z) 6 | 7 | def feed_forward(x, Wh, Wo): 8 | # Hidden layer 9 | Zh = x * Wh 10 | H = relu(Zh) 11 | 12 | # Output layer 13 | Zo = H * Wo 14 | output = relu(Zo) 15 | return output 16 | 17 | def relu_prime(z): 18 | if z > 0: 19 | return 1 20 | return 0 21 | 22 | def cost(yHat, y): 23 | return 0.5 * (yHat - y)**2 24 | 25 | def cost_prime(yHat, y): 26 | return yHat - y 27 | 28 | def backprop(x, y, Wh, Wo, lr): 29 | yHat = feed_forward(x, Wh, Wo) 30 | 31 | # Layer Error 32 | Eo = (yHat - y) * relu_prime(Zo) 33 | Eh = Eo * Wo * relu_prime(Zh) 34 | 35 | # Cost derivative for weights 36 | dWo = Eo * H 37 | dWh = Eh * x 38 | 39 | # Update weights 40 | Wh -= lr * dWh 41 | Wo -= lr * dWo 42 | -------------------------------------------------------------------------------- /code/optimizers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | 5 | def Adadelta(weights, sqrs, deltas, rho, batch_size): 6 | eps_stable = 1e-5 7 | for weight, sqr, delta in zip(weights, sqrs, deltas): 8 | g = weight.grad / batch_size 9 | sqr[:] = rho * sqr + (1. - rho) * nd.square(g) 10 | cur_delta = nd.sqrt(delta + eps_stable) / nd.sqrt(sqr + eps_stable) * g 11 | delta[:] = rho * delta + (1. - rho) * cur_delta * cur_delta 12 | # update weight in place. 
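        # (this snippet appears to assume MXNet-style arrays: `nd` would be mxnet.ndarray
        #  and each `weight` carries a populated `.grad`; cur_delta is the classic Adadelta
        #  step RMS[delta] / RMS[grad] * grad)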
13 | weight[:] -= cur_delta 14 | 15 | 16 | def Adagrad(data): # assumes `weights`, `lr`, `epsilon`, `num_iterations` and a `compute_gradients` helper are defined by the surrounding training code 17 | gradient_sums = np.zeros(weights.shape[0]) 18 | for t in range(num_iterations): 19 | gradients = compute_gradients(data, weights) 20 | gradient_sums += gradients ** 2 21 | gradient_update = gradients / (np.sqrt(gradient_sums + epsilon)) 22 | weights = weights - lr * gradient_update 23 | return weights 24 | 25 | 26 | def Adam(data): 27 | pass 28 | 29 | 30 | def LBFGS(data): 31 | pass 32 | 33 | 34 | def RMSProp(data): 35 | pass 36 | 37 | 38 | def SGD(data, batch_size, lr): 39 | N = len(data) 40 | np.random.shuffle(data) 41 | mini_batches = np.array([data[i:i+batch_size] 42 | for i in range(0, N, batch_size)]) 43 | for X,y in mini_batches: 44 | backprop(X, y, lr) 45 | 46 | 47 | def SGD_Momentum(): 48 | pass 49 | 50 | -------------------------------------------------------------------------------- /code/random_forest_classifier.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_breast_cancer 2 | import numpy as np 3 | from collections import Counter 4 | import multiprocessing as mp 5 | import scipy 6 | import time 7 | 8 | # Basic ID3 Tree 9 | class id3_tree(): 10 | 'Implementation of ID3 Decision Tree in Python, majorly in NumPy' 11 | def __init__(self,least_children_num,verbose=True): 12 | self.least_children_num = least_children_num 13 | self.verbose = verbose 14 | 15 | def fit(self,tmp_x,tmp_y): 16 | def fit_tree(tmp_x,tmp_y): 17 | # Exit Condition 0: 18 | # Exit Condition 1: 19 | if \ 20 | len(tmp_y) < self.least_children_num or len(np.unique(tmp_y))==1: 21 | 22 | if self.verbose: 23 | print('exit condition:') 24 | print('tmp_y:') 25 | print(tmp_y) 26 | 27 | mode_val = self.mode(tmp_y.flatten().tolist()) 28 | return([np.nan, mode_val, np.nan, np.nan]) # Leaf Node: format [feat,splitval,] 29 | 30 | # Otherwise Split: 31 | if self.verbose: 32 | print("start....subset Y len {}".format(len(tmp_y))) 33 | 34 | 35 | split_row,split_col = self.decide_split_data(tmp_x,tmp_y) 36 | 37 | if not split_row and not split_col: 38 | mode_val = self.mode(tmp_y.flatten().tolist()) 39 | return([np.nan, mode_val, np.nan, np.nan]) 40 | 41 | if self.verbose: 42 | print("split on:") 43 | print(split_row,split_col) 44 | 45 | split_vec = tmp_x[:,split_col] 46 | split_val = tmp_x[split_row,split_col] 47 | # Recursively Split to left and right branches: 48 | left_ind = np.where(split_vec<split_val)[0].tolist() 49 | right_ind = np.where(split_vec>=split_val)[0].tolist() 50 | left_dat,left_y = tmp_x[left_ind,:],tmp_y[left_ind,] 51 | right_dat,right_y = tmp_x[right_ind,:],tmp_y[right_ind,] 52 | 53 | left_tree = fit_tree(left_dat,left_y) 54 | right_tree = fit_tree(right_dat,right_y) 55 | 56 | if isinstance(left_tree, list): # If list, tree len 1 57 | len_l_tree = 1 58 | else: 59 | len_l_tree = left_tree.shape[0] # If array, tree len >1 60 | 61 | root = [split_col,split_val,1,len_l_tree+1] # Format [split_col, split_val, left_tree_relative_idx, right_tree_relative_idx] 62 | return(np.vstack([root,left_tree,right_tree])) 63 | 64 | tree = fit_tree(tmp_x,tmp_y) 65 | self.tree = tree 66 | 67 | def decide_split_data(self,x,y): 68 | 'Given subset of X,Y, search for the best splitting node based on: information gain' 69 | def entropy(tmp_y): 70 | 'Key Metrics of building a decision tree. 
Specifically Shannon Entropy' 71 | tmp_ent = 0 72 | for uni_y in np.unique(tmp_y): 73 | p = len(tmp_y[tmp_y==uni_y])/len(tmp_y) 74 | tmp_ent -= (p*np.log2(p)) 75 | return tmp_ent 76 | 77 | m,n = x.shape 78 | best_gain = 0 79 | split_row, split_col = None,None 80 | 81 | previous_entropy = entropy(y) 82 | for col in range(n): 83 | tmp_vec = x[:,col].ravel() 84 | 85 | for row in range(m): 86 | val = tmp_vec[row] 87 | # >= & < is my convention here: 88 | if val!=np.max(tmp_vec) and val!= np.min(tmp_vec): 89 | left_b = np.where(tmp_vec < val)[0].tolist() 90 | right_b = np.where(tmp_vec >= val)[0].tolist() 91 | 92 | # new entropy is the weighted average entropy from each of the subsets 93 | new_ent = \ 94 | (len(y[left_b])/len(y))*entropy(y[left_b]) + \ 95 | (len(y[right_b])/len(y))*entropy(y[right_b]) 96 | 97 | info_gain = previous_entropy - new_ent 98 | 99 | if info_gain > best_gain: 100 | split_row, split_col = row,col 101 | best_gain = info_gain 102 | if self.verbose: 103 | print('better gain:{}'.format(best_gain)) 104 | print() 105 | 106 | return split_row, split_col 107 | 108 | def mode(self, x_list): 109 | 'calculate the mode' 110 | return Counter(x_list).most_common(1)[0][0] 111 | 112 | def predict(self, tmp_test_array): 113 | 'Wrap-up function for prediction' 114 | def query(tree,tmp_test_array): 115 | 'Test for single example' 116 | assert len(tmp_test_array.shape) == 2, "Make sure your test data is 2d array" 117 | 118 | if isinstance(tree,list): 119 | start_node = tree # only the 1 row in data 120 | else: 121 | start_node = tree[0,:] # Iteratively hit first row 122 | test_feat,test_val,left_tree_jump,right_tree_jump = start_node[0],start_node[1],start_node[2],start_node[3] 123 | # Exit Condition: 124 | if np.isnan(test_feat) and np.isnan(left_tree_jump) and np.isnan(right_tree_jump): 125 | pred = test_val 126 | return pred 127 | #Test: 128 | if tmp_test_array[0,int(test_feat)] < test_val: 129 | # If <, go left branch: 130 | jump_loc = left_tree_jump 131 | pred = query(tree[int(jump_loc):,],tmp_test_array) 132 | else: 133 | # If >=, go right branch: 134 | jump_loc = right_tree_jump 135 | pred = query(tree[int(jump_loc):,],tmp_test_array) 136 | return pred 137 | assert len(tmp_test_array.shape) == 2, "Make sure your test data is 2d array" 138 | result = [] 139 | for i in range(tmp_test_array.shape[0]): 140 | inp = tmp_test_array[i,:].reshape(1,-1) 141 | result.append(query(self.tree,inp)) 142 | return result 143 | 144 | 145 | 146 | # RF using ID-3 tree: 147 | class RandomForestClassification(): 148 | """ 149 | Python implementation of random forest classifier 150 | using id3 as the base tree 151 | with parallel processing 152 | """ 153 | def __init__ ( 154 | self, 155 | n_tree, 156 | min_leaf_num, # to control overfit 157 | criteria = 'entropy', # currently only supports entropy 158 | max_features = 'auto', # 'auto' uses sqrt(number of features); otherwise treated as the proportion of features to sample 159 | n_workers = 1, 160 | verbose = True 161 | 162 | ): 163 | self.n_tree = n_tree 164 | self.min_leaf_num = min_leaf_num 165 | self.criteria = criteria 166 | self.max_features = max_features 167 | self.n_workers = n_workers 168 | self.verbose = verbose 169 | 170 | 171 | def fit_single(self,data): 172 | """ 173 | Single ID3 Tree Fitting 174 | """ 175 | X = data[0] 176 | y = data[1] 177 | tmp_X,tmp_y,feat_choose = self.random_find_feature(X,y) 178 | model = id3_tree(least_children_num = self.min_leaf_num,verbose=False) 179 | model.fit(tmp_X,tmp_y) 180 | return model,feat_choose 181 | 182 | def fit_rf(self,X,y): 183 | """ 184 | Forest 185 | """ 186 |
data = [X,y] 187 | with mp.Pool(self.n_workers) as p: 188 | model_list = p.map(self.fit_single,[data]*self.n_tree) 189 | 190 | self.model_list = model_list 191 | 192 | 193 | def predict_rf(self,X): 194 | """ 195 | Forest Prediction 196 | taking the vote of each tree 197 | """ 198 | result_list = [] 199 | for model_stuff in self.model_list: 200 | print('.') 201 | single_model,single_feat_choose = model_stuff 202 | 203 | res = single_model.predict(X[:,single_feat_choose]) 204 | result_list.append(res) 205 | 206 | return scipy.stats.mode(np.array(result_list),axis=0).mode.tolist()[0] # Take the vote 207 | 208 | 209 | def random_find_feature(self,X,y): 210 | """ 211 | Randomly select subset of features for each tree 212 | """ 213 | 214 | if self.max_features == 'auto': 215 | n_feat_dat = X.shape[1] 216 | n_feat_choose = int(round(np.sqrt(n_feat_dat))) 217 | else: 218 | n_feat_dat = X.shape[1] 219 | n_feat_choose = int(n_feat_dat*self.max_features) 220 | 221 | feat_choose = np.random.choice(range(n_feat_dat),size=n_feat_choose,replace=False).tolist() 222 | feat_choose = sorted(feat_choose) # Important to sort this in order otherwise will confuse the model 223 | print("feat_chosen:{}".format(feat_choose)) 224 | return X[:,feat_choose],y,feat_choose 225 | 226 | 227 | if __name__ == "__main__": 228 | # ID3: only categorical features 229 | from sklearn.model_selection import train_test_split 230 | from sklearn.metrics import classification_report 231 | from sklearn import datasets 232 | from sklearn.tree import DecisionTreeClassifier 233 | dataset = datasets.load_iris() 234 | all_categorical_feature = True 235 | 236 | # convert continuous feature to categorical features 237 | if all_categorical_feature: 238 | f = lambda x: int(x) 239 | func = np.vectorize(f) 240 | X = func(dataset.data) 241 | else: 242 | X = dataset.data 243 | 244 | Y = dataset.target 245 | X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.8) 246 | # config 247 | max_depth = 3 248 | min_sample_leaf = 4 249 | 250 | model = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, min_samples_leaf=min_sample_leaf) 251 | model.fit(X_train, y_train) 252 | y_pred = model.predict(X_test) 253 | print(classification_report(y_true=y_test, y_pred=y_pred)) 254 | # 255 | # model = ID3DecisionTree(max_depth=max_depth, min_sample_leaf=min_sample_leaf, verbose=True) 256 | # model = C45DecisionTree(max_depth=max_depth, min_sample_leaf=min_sample_leaf, verbose=True) 257 | model = RandomForestClassification( 258 | n_tree=5, 259 | min_leaf_num=min_sample_leaf, 260 | n_workers=5 261 | ) 262 | model.fit_rf(X_train, y_train) 263 | y_pred = model.predict_rf(X_test) 264 | print(classification_report(y_true=y_test, y_pred=y_pred)) 265 | 266 | 267 | -------------------------------------------------------------------------------- /code/rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | #from torch.autograd import Variable 4 | 5 | class RNN(nn.Module): 6 | def __init__(self, n_classes): 7 | super().__init__() 8 | self.hid_fc = nn.Linear(185, 128) 9 | self.out_fc = nn.Linear(185, n_classes) 10 | self.softmax = nn.LogSoftmax() 11 | 12 | def forward(self, inputs, hidden): 13 | inputs = inputs.view(1,-1) 14 | combined = torch.cat([inputs, hidden], dim=1) 15 | hid_out = self.hid_fc(combined) 16 | out = self.out_fc(combined) 17 | out = self.softmax(out) 18 | return out, hid_out 19 | 20 | def train(model, inputs, targets): 21 | for i in 
range(len(inputs)): 22 | target = Variable(targets[i]) 23 | name = inputs[i] 24 | hidden = Variable(torch.zeros(1,128)) 25 | model.zero_grad() 26 | 27 | for char in name: 28 | input_ = Variable(torch.FloatTensor(char)) 29 | pred, hidden = model(input_, hidden) 30 | 31 | loss = criterion(pred, target) 32 | loss.backward() 33 | 34 | for p in model.parameters(): 35 | p.data.add_(-.001, p.grad.data) 36 | -------------------------------------------------------------------------------- /code/vae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | 6 | class VAE(nn.Module): 7 | def __init__(self, in_shape, n_latent): 8 | super().__init__() 9 | self.in_shape = in_shape 10 | self.n_latent = n_latent 11 | c,h,w = in_shape 12 | self.z_dim = h//2**2 # receptive field downsampled 2 times 13 | self.encoder = nn.Sequential( 14 | nn.BatchNorm2d(c), 15 | nn.Conv2d(c, 32, kernel_size=4, stride=2, padding=1), # 32, 16, 16 16 | nn.BatchNorm2d(32), 17 | nn.LeakyReLU(), 18 | nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=1), # 32, 8, 8 19 | nn.BatchNorm2d(64), 20 | nn.LeakyReLU(), 21 | ) 22 | self.z_mean = nn.Linear(64 * self.z_dim**2, n_latent) 23 | self.z_var = nn.Linear(64 * self.z_dim**2, n_latent) 24 | self.z_develop = nn.Linear(n_latent, 64 * self.z_dim**2) 25 | self.decoder = nn.Sequential( 26 | nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=0), 27 | nn.BatchNorm2d(32), 28 | nn.ReLU(), 29 | nn.ConvTranspose2d(32, 1, kernel_size=3, stride=2, padding=1), 30 | CenterCrop(h,w), 31 | nn.Sigmoid() 32 | ) 33 | 34 | def sample_z(self, mean, logvar): 35 | stddev = torch.exp(0.5 * logvar) 36 | noise = Variable(torch.randn(stddev.size())) 37 | return (noise * stddev) + mean 38 | 39 | def encode(self, x): 40 | x = self.encoder(x) 41 | x = x.view(x.size(0), -1) 42 | mean = self.z_mean(x) 43 | var = self.z_var(x) 44 | return mean, var 45 | 46 | def decode(self, z): 47 | out = self.z_develop(z) 48 | out = out.view(z.size(0), 64, self.z_dim, self.z_dim) 49 | out = self.decoder(out) 50 | return out 51 | 52 | def forward(self, x): 53 | mean, logvar = self.encode(x) 54 | z = self.sample_z(mean, logvar) 55 | out = self.decode(z) 56 | return out, mean, logvar 57 | 58 | 59 | def vae_loss(output, input, mean, logvar, loss_func): 60 | recon_loss = loss_func(output, input) 61 | kl_loss = torch.mean(0.5 * torch.sum( 62 | torch.exp(logvar) + mean**2 - 1. - logvar, 1)) 63 | return recon_loss + kl_loss 64 | 65 | def train(model, loader, loss_func, optimizer): 66 | model.train() 67 | for inputs, _ in loader: 68 | inputs = Variable(inputs) 69 | 70 | output, mean, logvar = model(inputs) 71 | loss = vae_loss(output, inputs, mean, logvar, loss_func) 72 | 73 | optimizer.zero_grad() 74 | loss.backward() 75 | optimizer.step() 76 | -------------------------------------------------------------------------------- /docs/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "git.ignoreLimitWarning": true 3 | } -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = AIGlossary 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | livehtml: 22 | sphinx-autobuild . ../docs/_build/html 23 | -------------------------------------------------------------------------------- /docs/_static/theme_overrides.css: -------------------------------------------------------------------------------- 1 | /* override table width restrictions */ 2 | @media screen and (min-width: 767px) { 3 | 4 | .wy-table-responsive table td { 5 | /* !important prevents the common CSS stylesheets from overriding 6 | this as on RTD they are loaded after this stylesheet */ 7 | white-space: normal !important; 8 | } 9 | 10 | .wy-table-responsive { 11 | overflow: visible !important; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /docs/applications.rst: -------------------------------------------------------------------------------- 1 | .. _applications: 2 | 3 | ============ 4 | Applications 5 | ============ 6 | 7 | .. contents:: :local: 8 | 9 | 10 | Anomaly Detection 11 | ================= 12 | 13 | Be the first to `contribute! `__ 14 | 15 | 16 | 17 | Computer Vision 18 | =============== 19 | 20 | Classification 21 | -------------- 22 | 23 | Be the first to `contribute! `__ 24 | 25 | Object Detection 26 | ---------------- 27 | 28 | Be the first to `contribute! `__ 29 | 30 | Segmentation 31 | ------------ 32 | 33 | Be the first to `contribute! `__ 34 | 35 | 36 | 37 | Natural Language 38 | ================ 39 | 40 | Dialog Systems 41 | -------------- 42 | 43 | Be the first to `contribute! `__ 44 | 45 | Machine Translation 46 | ------------------- 47 | 48 | Be the first to `contribute! `__ 49 | 50 | Speech Recognition 51 | ------------------ 52 | 53 | Be the first to `contribute! `__ 54 | 55 | Text Summarization 56 | ------------------ 57 | 58 | Be the first to `contribute! `__ 59 | 60 | Question Answering 61 | ------------------ 62 | 63 | Be the first to `contribute! `__ 64 | 65 | 66 | 67 | Recommender Systems 68 | =================== 69 | 70 | Be the first to `contribute! `__ 71 | 72 | 73 | 74 | Time-Series 75 | =========== 76 | 77 | Be the first to `contribute! `__ 78 | 79 | 80 | .. rubric:: References 81 | 82 | .. [1] Example Reference 83 | -------------------------------------------------------------------------------- /docs/backpropagation.rst: -------------------------------------------------------------------------------- 1 | .. _backpropagation: 2 | 3 | =============== 4 | Backpropagation 5 | =============== 6 | 7 | .. contents:: :local: 8 | 9 | The goals of backpropagation are straightforward: adjust each weight in the network in proportion to how much it contributes to overall error. If we iteratively reduce each weight's error, eventually we’ll have a series of weights that produce good predictions. 10 | 11 | 12 | Chain rule refresher 13 | ==================== 14 | 15 | As seen above, foward propagation can be viewed as a long series of nested equations. 
If you think of feed forward this way, then backpropagation is merely an application of :ref:`chain_rule` to find the :ref:`derivative` of cost with respect to any variable in the nested equation. Given a forward propagation function: 16 | 17 | .. math:: 18 | 19 | f(x) = A(B(C(x))) 20 | 21 | A, B, and C are activation functions at different layers. Using the chain rule we easily calculate the derivative of :math:`f(x)` with respect to :math:`x`: 22 | 23 | .. math:: 24 | 25 | f'(x) = f'(A) \cdot A'(B) \cdot B'(C) \cdot C'(x) 26 | 27 | How about the derivative with respect to B? To find the derivative with respect to B you can pretend :math:`B(C(x))` is a constant, replace it with a placeholder variable B, and proceed to find the derivative normally with respect to B. 28 | 29 | .. math:: 30 | 31 | f'(B) = f'(A) \cdot A'(B) 32 | 33 | This simple technique extends to any variable within a function and allows us to precisely pinpoint the exact impact each variable has on the total output. 34 | 35 | 36 | 37 | Applying the chain rule 38 | ======================= 39 | 40 | Let's use the chain rule to calculate the derivative of cost with respect to any weight in the network. The chain rule will help us identify how much each weight contributes to our overall error and the direction to update each weight to reduce our error. Here are the equations we need to make a prediction and calculate total error, or cost: 41 | 42 | .. image:: images/backprop_ff_equations.png 43 | :align: center 44 | 45 | Given a network consisting of a single neuron, total cost could be calculated as: 46 | 47 | .. math:: 48 | 49 | Cost = C(R(Z(X W))) 50 | 51 | Using the chain rule we can easily find the derivative of Cost with respect to weight W. 52 | 53 | .. math:: 54 | 55 | C'(W) &= C'(R) \cdot R'(Z) \cdot Z'(W) \\ 56 | &= (\hat{y} -y) \cdot R'(Z) \cdot X 57 | 58 | Now that we have an equation to calculate the derivative of cost with respect to any weight, let's go back to our toy neural network example above 59 | 60 | .. image:: images/simple_nn_diagram_zo_zh_defined.png 61 | :align: center 62 | 63 | What is the derivative of cost with respect to :math:`W_o`? 64 | 65 | .. math:: 66 | 67 | C'(W_O) &= C'(\hat{y}) \cdot \hat{y}'(Z_O) \cdot Z_O'(W_O) \\ 68 | &= (\hat{y} - y) \cdot R'(Z_O) \cdot H 69 | 70 | And how about with respect to :math:`W_h`? To find out we just keep going further back in our function applying the chain rule recursively until we get to the function that has the Wh term. 71 | 72 | .. math:: 73 | 74 | C'(W_h) &= C'(\hat{y}) \cdot O'(Z_o) \cdot Z_o'(H) \cdot H'(Z_h) \cdot Z_h'(W_h) \\ 75 | &= (\hat{y} - y) \cdot R'(Z_o) \cdot W_o \cdot R'(Z_h) \cdot X 76 | 77 | And just for fun, what if our network had 10 hidden layers. What is the derivative of cost for the first weight :math:`w_1`? 78 | 79 | .. math:: 80 | 81 | C'(w_1) = \frac{dC}{d\hat{y}} \cdot \frac{d\hat{y}}{dZ_{11}} \cdot \frac{dZ_{11}}{dH_{10}} \cdot \\ \frac{dH_{10}}{dZ_{10}} \cdot \frac{dZ_{10}}{dH_9} \cdot \frac{dH_9}{dZ_9} \cdot \frac{dZ_9}{dH_8} \cdot \frac{dH_8}{dZ_8} \cdot \frac{dZ_8}{dH_7} \cdot \frac{dH_7}{dZ_7} \cdot \\ \frac{dZ_7}{dH_6} \cdot \frac{dH_6}{dZ_6} \cdot \frac{dZ_6}{dH_5} \cdot \frac{dH_5}{dZ_5} \cdot \frac{dZ_5}{dH_4} \cdot \frac{dH_4}{dZ_4} \cdot \frac{dZ_4}{dH_3} \cdot \\ \frac{dH_3}{dZ_3} \cdot \frac{dZ_3}{dH_2} \cdot \frac{dH_2}{dZ_2} \cdot \frac{dZ_2}{dH_1} \cdot \frac{dH_1}{dZ_1} \cdot \frac{dZ_1}{dW_1} 82 | 83 | See the pattern? 
The number of calculations required to compute cost derivatives increases as our network grows deeper. Notice also the redundancy in our derivative calculations. Each layer's cost derivative appends two new terms to the terms that have already been calculated by the layers above it. What if there was a way to save our work somehow and avoid these duplicate calculations? 84 | 85 | 86 | 87 | Saving work with memoization 88 | ============================ 89 | 90 | Memoization is a computer science term which simply means: don’t recompute the same thing over and over. In memoization we store previously computed results to avoid recalculating the same function. It's handy for speeding up recursive functions of which backpropagation is one. Notice the pattern in the derivative equations below. 91 | 92 | 93 | .. image:: images/memoization.png 94 | :align: center 95 | 96 | Each of these layers is recomputing the same derivatives! Instead of writing out long derivative equations for every weight, we can use memoization to save our work as we backprop error through the network. To do this, we define 3 equations (below), which together encapsulate all the calculations needed for backpropagation. The math is the same, but the equations provide a nice shorthand we can use to track which calculations we've already performed and save our work as we move backwards through the network. 97 | 98 | .. image:: images/backprop_3_equations.png 99 | :align: center 100 | 101 | We first calculate the output layer error and pass the result to the hidden layer before it. After calculating the hidden layer error, we pass its error value back to the previous hidden layer before it. And so on and so forth. As we move back through the network we apply the 3rd formula at every layer to calculate the derivative of cost with respect to that layer's weights. This resulting derivative tells us in which direction to adjust our weights to reduce overall cost. 102 | 103 | .. note:: 104 | 105 | The term *layer error* refers to the derivative of cost with respect to a layer's *input*. It answers the question: how does the cost function output change when the input to that layer changes? 106 | 107 | .. rubric:: Output layer error 108 | 109 | To calculate output layer error we need to find the derivative of cost with respect to the output layer input, :math:`Z_o`. It answers the question — how is the final layer's weighted input impacting overall error in the network? The derivative is then: 110 | 111 | .. math:: 112 | 113 | C'(Z_o) = (\hat{y} - y) \cdot R'(Z_o) 114 | 115 | To simplify notation, ML practitioners typically replace the :math:`(\hat{y}-y) * R'(Zo)` sequence with the term :math:`E_o`. So our formula for output layer error equals: 116 | 117 | .. math:: 118 | 119 | E_o = (\hat{y} - y) \cdot R'(Z_o) 120 | 121 | .. rubric:: Hidden layer error 122 | 123 | To calculate hidden layer error we need to find the derivative of cost with respect to the hidden layer input, :math:`Z_h`. 124 | 125 | .. math:: 126 | 127 | C'(Z_h) = (\hat{y} - y) \cdot R'(Z_o) \cdot W_o \cdot R'(Z_h) 128 | 129 | Next we can swap in the :math:`E_o` term above to avoid duplication and create a new simplified equation for hidden layer error: 130 | 131 | .. math:: 132 | 133 | E_h = E_o \cdot W_o \cdot R'(Z_h) 134 | 135 | This formula is at the core of backpropagation. We calculate the current layer's error, and pass the weighted error back to the previous layer, continuing the process until we arrive at our first hidden layer.
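To make these three steps concrete, here is a minimal NumPy sketch of one backward sweep through the toy two-layer network above. It assumes the forward-pass values (``X``, ``Zh``, ``H``, ``Zo``, ``yHat``), the weight matrices ``Wh`` and ``Wo``, a ``relu_prime`` helper, and a learning rate ``lr`` are already in scope, so it illustrates the equations rather than serving as a complete implementation.

.. code-block:: python

    # Layer errors: derivative of cost with respect to each layer's weighted input
    Eo = (yHat - y) * relu_prime(Zo)         # output layer error
    Eh = np.dot(Eo, Wo.T) * relu_prime(Zh)   # hidden layer error, reusing Eo

    # Cost derivative for each weight matrix: layer error times layer input
    dWo = np.dot(H.T, Eo)
    dWh = np.dot(X.T, Eh)

    # Gradient descent step on the weights
    Wo -= lr * dWo
    Wh -= lr * dWh
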
Along the way we update the weights using the derivative of cost with respect to each weight. 136 | 137 | .. rubric:: Derivative of cost with respect to any weight 138 | 139 | Let’s return to our formula for the derivative of cost with respect to the output layer weight :math:`W_o`.  140 | 141 | .. math:: 142 | 143 | C'(W_O) = (\hat{y} - y) \cdot R'(Z_O) \cdot H 144 | 145 | We know we can replace the first part with our equation for output layer error :math:`E_o`. H represents the hidden layer activation. 146 | 147 | .. math:: 148 | 149 | C'(W_o) = E_o \cdot H 150 | 151 | So to find the derivative of cost with respect to any weight in our network, we simply multiply the corresponding layer's error times its input (the previous layer's output). 152 | 153 | .. math:: 154 | 155 | C'(w) = CurrentLayerError \cdot CurrentLayerInput 156 | 157 | .. note:: 158 | 159 | *Input* refers to the activation from the previous layer, not the weighted input, Z. 160 | 161 | .. rubric:: Summary 162 | 163 | Here are the final 3 equations that together form the foundation of backpropagation. 164 | 165 | .. image:: images/backprop_final_3_deriv_equations.png 166 | :align: center 167 | 168 | Here is the process visualized using our toy neural network example above. 169 | 170 | .. image:: images/backprop_visually.png 171 | :align: center 172 | 173 | Code example 174 | ============ 175 | 176 | .. literalinclude:: ../code/nn_simple.py 177 | :language: python 178 | :lines: 17-41 179 | 180 | 181 | 182 | .. rubric:: References 183 | 184 | .. [1] Example 185 | -------------------------------------------------------------------------------- /docs/build.bat: -------------------------------------------------------------------------------- 1 | @echo OFF 2 | 3 | set SPHINXOPTS=" " 4 | set SPHINXBUILD=sphinx-build 5 | set SOURCEDIR=. 6 | set BUILDDIR=_build/html 7 | 8 | 9 | if "%1"=="" ( 10 | echo "Usage : build.bat html" 11 | ) else ( 12 | %SPHINXBUILD% -b "%1" %SOURCEDIR% %BUILDDIR% 13 | ) 14 | 15 | -------------------------------------------------------------------------------- /docs/clustering_algos.rst: -------------------------------------------------------------------------------- 1 | .. _clustering_algos: 2 | 3 | ===================== 4 | Clustering Algorithms 5 | ===================== 6 | 7 | 8 | Centroid 9 | ======== 10 | 11 | Be the first to `contribute! `__ 12 | 13 | Density 14 | ======= 15 | 16 | Be the first to `contribute! `__ 17 | 18 | Distribution 19 | ============ 20 | 21 | Be the first to `contribute! `__ 22 | 23 | Hierarchical 24 | ============ 25 | 26 | Be the first to `contribute! `__ 27 | 28 | K-Means 29 | ======== 30 | 31 | Be the first to `contribute! `__ 32 | 33 | Mean shift 34 | ========== 35 | 36 | Be the first to `contribute! `__ 37 | 38 | 39 | .. rubric:: References 40 | 41 | .. [1] https://en.wikipedia.org/wiki/Cluster_analysis 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # AI Glossary documentation build configuration file, created by 5 | # sphinx-quickstart on Tue Apr 11 17:53:13 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 
12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.mathjax', 35 | 'sphinx.ext.githubpages'] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # The suffix(es) of source filenames. 41 | # You can specify multiple suffix as a list of string: 42 | # 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = '.rst' 45 | 46 | # The master toctree document. 47 | master_doc = 'index' 48 | 49 | # General information about the project. 50 | project = 'ML Glossary' 51 | copyright = '2017' 52 | author = 'Team' 53 | 54 | # The version info for the project you're documenting, acts as replacement for 55 | # |version| and |release|, also used in various other places throughout the 56 | # built documents. 57 | # 58 | # The short X.Y version. 59 | version = '' 60 | # The full version, including alpha/beta/rc tags. 61 | release = '' 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This patterns also effect to html_static_path and html_extra_path 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 74 | 75 | 76 | # The name of the Pygments (syntax highlighting) style to use. 77 | pygments_style = 'sphinx' 78 | 79 | # If true, `todo` and `todoList` produce output, else they produce nothing. 80 | todo_include_todos = False 81 | 82 | 83 | # -- Options for HTML output ---------------------------------------------- 84 | 85 | # The theme to use for HTML and HTML Help pages. See the documentation for 86 | # a list of builtin themes. 87 | # 88 | import sphinx_rtd_theme 89 | 90 | html_theme = "sphinx_rtd_theme" #'alabaster' 91 | 92 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 93 | 94 | # Theme options are theme-specific and customize the look and feel of a theme 95 | # further. For a list of options available for each theme, see the 96 | # documentation. 97 | # 98 | # html_theme_options = {} 99 | 100 | # Add any paths that contain custom static files (such as style sheets) here, 101 | # relative to this directory. They are copied after the builtin static files, 102 | # so a file named "default.css" will overwrite the builtin "default.css". 
103 | html_static_path = ['_static'] 104 | 105 | 106 | # -- Options for HTMLHelp output ------------------------------------------ 107 | 108 | # Output file base name for HTML help builder. 109 | htmlhelp_basename = 'MLCheatsheetdoc' 110 | 111 | 112 | # -- Options for LaTeX output --------------------------------------------- 113 | 114 | latex_elements = { 115 | # The paper size ('letterpaper' or 'a4paper'). 116 | # 117 | # 'papersize': 'letterpaper', 118 | 119 | # The font size ('10pt', '11pt' or '12pt'). 120 | # 121 | # 'pointsize': '10pt', 122 | 123 | # Additional stuff for the LaTeX preamble. 124 | # 125 | # 'preamble': '', 126 | 127 | # Latex figure (float) alignment 128 | # 129 | # 'figure_align': 'htbp', 130 | } 131 | 132 | # Grouping the document tree into LaTeX files. List of tuples 133 | # (source start file, target name, title, 134 | # author, documentclass [howto, manual, or own class]). 135 | latex_documents = [ 136 | (master_doc, 'MLCheatsheet.tex', 'ML Cheatsheet Documentation', 137 | 'Team', 'manual'), 138 | ] 139 | 140 | 141 | # -- Options for manual page output --------------------------------------- 142 | 143 | # One entry per manual page. List of tuples 144 | # (source start file, name, description, authors, manual section). 145 | man_pages = [ 146 | (master_doc, 'mlcheatsheet', 'Machine Learning Cheatsheet Documentation', 147 | [author], 1) 148 | ] 149 | 150 | 151 | # -- Options for Texinfo output ------------------------------------------- 152 | 153 | # Grouping the document tree into Texinfo files. List of tuples 154 | # (source start file, target name, title, author, 155 | # dir menu entry, description, category) 156 | texinfo_documents = [ 157 | (master_doc, 'MLCheatsheet', 'ML Cheatsheet Documentation', 158 | author, 'Contributors', 'Glossary of machine learning terms and concepts.', 159 | 'Miscellaneous'), 160 | ] 161 | 162 | 163 | 164 | # -- Options for Epub output ---------------------------------------------- 165 | 166 | # Bibliographic Dublin Core info. 167 | epub_title = project 168 | epub_author = author 169 | epub_publisher = author 170 | epub_copyright = copyright 171 | 172 | # The unique identifier of the text. This can be a ISBN number 173 | # or the project homepage. 174 | # 175 | # epub_identifier = '' 176 | 177 | # A unique identification for the text. 178 | # 179 | # epub_uid = '' 180 | 181 | # A list of files that should not be packed into the epub file. 182 | epub_exclude_files = ['search.html'] 183 | 184 | 185 | from recommonmark.parser import CommonMarkParser 186 | 187 | source_parsers = { 188 | '.md': CommonMarkParser, 189 | } 190 | 191 | source_suffix = ['.rst', '.md'] 192 | 193 | html_theme_options = { 194 | 'collapse_navigation': False, 195 | 'display_version': False 196 | # 'logo_only': True, 197 | } 198 | 199 | def setup(app): 200 | #app.add_stylesheet('theme_overrides.css') 201 | app.add_css_file('theme_overrides.css') 202 | 203 | #html_context = { 204 | # 'css_files': [ 205 | # '_static/theme_overrides.css', # override wide tables in RTD theme 206 | # ], 207 | # } 208 | -------------------------------------------------------------------------------- /docs/contribute.rst: -------------------------------------------------------------------------------- 1 | .. _contribute: 2 | 3 | ========== 4 | Contribute 5 | ========== 6 | 7 | Become a contributor! Check out our `github `_ for more information. 
8 | -------------------------------------------------------------------------------- /docs/figures/SimpleDiagram3_neural_networks.sdxml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/figures/SimpleDiagram3_neural_networks.sdxml -------------------------------------------------------------------------------- /docs/figures/activation_function_table.tgn: -------------------------------------------------------------------------------- 1 | {"rows_views":[[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}]],"model":{"rows":[[{"value":"Function","cspan":1,"rspan":1},{"value":"Derivative","cspan":1,"rspan":1}],[{"value":".. math::\n      r(x) = x + 1","cspan":1,"rspan":1},{"value":".. math::\n       r(x) = x + 1","cspan":1,"rspan":1}],[{"value":".. image:: images/sigmoid.png\n      :align: center\n      :width: 256 px\n      :height: 256 px","cspan":1,"rspan":1},{"value":".. image:: images/sigmoid_prime.png\n      :align: center\n      :width: 256 px\n      :height: 256 px","cspan":1,"rspan":1}],[{"value":".. literalinclude:: ../code/activation_functions.py\n      :pyobject: sigmoid","cspan":1,"rspan":1},{"value":".. 
literalinclude:: ../code/activation_functions.py\n      :pyobject: sigmoid_prime\n","cspan":1,"rspan":1}]]},"theme":null,"fixed_layout":false} -------------------------------------------------------------------------------- /docs/figures/calculus_symbol_table.tgn: -------------------------------------------------------------------------------- 1 | {"rows_views":[[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}]],"model":{"row
s":[[{"value":"Symbol","cspan":1,"rspan":1},{"value":"Name","cspan":1,"rspan":1},{"value":"Description","cspan":1,"rspan":1},{"value":"Example","cspan":1,"rspan":1}],[{"value":"x'","cspan":1,"rspan":1},{"value":"derivative","cspan":1,"rspan":1},{"value":"first derivative","cspan":1,"rspan":1},{"value":"(x^2)' = 2x","cspan":1,"rspan":1}],[{"value":"x''","cspan":1,"rspan":1},{"value":"second derivative","cspan":1,"rspan":1},{"value":"second derivative","cspan":1,"rspan":1},{"value":"(x^2)'' = 2","cspan":1,"rspan":1}],[{"value":"lim(x-->0)","cspan":1,"rspan":1},{"value":"limit","cspan":1,"rspan":1},{"value":"function value as x approaches 0","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"∇","cspan":1,"rspan":1},{"value":"nabla","cspan":1,"rspan":1},{"value":"gradient","cspan":1,"rspan":1},{"value":"∇f(a,b,c)","cspan":1,"rspan":1}]]},"theme":null,"fixed_layout":false} -------------------------------------------------------------------------------- /docs/figures/forward_prop_matrix_dimensions_table.tgn: -------------------------------------------------------------------------------- 1 | {"rows_views":[[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"
lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":
"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}]],"model":{"rows":[[{"value":"**Var**","cspan":1,"rspan":1},{"value":"**Name**","cspan":1,"rspan":1},{"value":"**Dimensions**","cspan":1,"rspan":1},{"value":"**Explanation**","cspan":1,"rspan":1}],[{"value":"``X``","cspan":1,"rspan":1},{"value":"Input","cspan":1,"rspan":1},{"value":"(3, 1)","cspan":1,"rspan":1},{"value":"Includes 3 rows of training data, and each row has 1 attribute (height, price, etc.)","cspan":1,"rspan":1}],[{"value":"``Wh``","cspan":1,"rspan":1},{"value":"Hidden weights","cspan":1,"rspan":1},{"value":"(1, 2)","cspan":1,"rspan":1},{"value":"These dimensions are based on number of rows equals the number of attributes for the observations in our training set. The number columns equals the number of neurons in the hidden layer. The dimensions of the weights matrix between two layers is determined by the sizes of the two layers it connects. There is one weight for every input-to-neuron connection between the layers.","cspan":1,"rspan":1}],[{"value":"``Bh``","cspan":1,"rspan":1},{"value":"Hidden bias","cspan":1,"rspan":1},{"value":"(1, 2)","cspan":1,"rspan":1},{"value":"Each neuron in the hidden layer has is own bias constant. This bias matrix is added to the weighted input matrix before the hidden layer applies ReLU.","cspan":1,"rspan":1}],[{"value":"``Zh``","cspan":1,"rspan":1},{"value":"Hidden weighted input","cspan":1,"rspan":1},{"value":"(1, 2)","cspan":1,"rspan":1},{"value":"Computed by taking the dot product of X and Wh. The dimensions (1,2) are required by the rules of matrix multiplication. Zh takes the rows of in the inputs matrix and the columns of weights matrix. We then add the hidden layer bias matrix Bh.","cspan":1,"rspan":1}],[{"value":"``H``","cspan":1,"rspan":1},{"value":"Hidden activations","cspan":1,"rspan":1},{"value":"(3, 2)","cspan":1,"rspan":1},{"value":"Computed by applying the Relu function to Zh. The dimensions are (3,2) — the number of rows matches the number of training samples and the number of columns equals the number of neurons. Each column holds all the activations for a specific neuron.","cspan":1,"rspan":1}],[{"value":"``Wo``","cspan":1,"rspan":1},{"value":"Output weights","cspan":1,"rspan":1},{"value":"(2, 2)","cspan":1,"rspan":1},{"value":"The number of rows matches the number of hidden layer neurons and the number of columns equals the number of output layer neurons. There is one weight for every hidden-neuron-to-output-neuron connection between the layers.","cspan":1,"rspan":1}],[{"value":"``Bo``","cspan":1,"rspan":1},{"value":"Output bias","cspan":1,"rspan":1},{"value":"(1, 2)","cspan":1,"rspan":1},{"value":"There is one column for every neuron in the output layer.","cspan":1,"rspan":1}],[{"value":"``Zo``","cspan":1,"rspan":1},{"value":"Output weighted input","cspan":1,"rspan":1},{"value":"(3, 2)","cspan":1,"rspan":1},{"value":"Computed by taking the dot product of H and Wo and then adding the output layer bias Bo. 
The dimensions are (3,2) representing the rows of in the hidden layer matrix and the columns of output layer weights matrix.","cspan":1,"rspan":1}],[{"value":"``O``","cspan":1,"rspan":1},{"value":"Output activations","cspan":1,"rspan":1},{"value":"(3, 2)","cspan":1,"rspan":1},{"value":"Each row represents a prediction for a single observation in our training set. Each column is a unique attribute we want to predict. Examples of two-column output predictions could be a company's sales and units sold, or a person's height and weight.","cspan":1,"rspan":1}]]},"theme":null,"fixed_layout":false} -------------------------------------------------------------------------------- /docs/figures/linear_regression_companies_sales.tgn: -------------------------------------------------------------------------------- 1 | {"rows_views":[[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"
text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}]],"model":{"rows":[[{"value":"Company","cspan":1,"rspan":1},{"value":"TV","cspan":1,"rspan":1},{"value":"Radio","cspan":1,"rspan":1},{"value":"News","cspan":1,"rspan":1},{"value":"Units","cspan":1,"rspan":1}],[{"value":"Amazon","cspan":1,"rspan":1},{"value":"230.1","cspan":1,"rspan":1},{"value":"37.8","cspan":1,"rspan":1},{"value":"69.1","cspan":1,"rspan":1},{"value":"22.1","cspan":1,"rspan":1}],[{"value":"Google","cspan":1,"rspan":1},{"value":"44.5","cspan":1,"rspan":1},{"value":"39.3","cspan":1,"rspan":1},{"value":"23.1","cspan":1,"rspan":1},{"value":"10.4","cspan":1,"rspan":1}],[{"value":"Facebook","cspan":1,"rspan":1},{"value":"17.2","cspan":1,"rspan":1},{"value":"45.9","cspan":1,"rspan":1},{"value":"34.7","cspan":1,"rspan":1},{"value":"18.3","cspan":1,"rspan":1}],[{"value":"Apple","cspan":1,"rspan":1},{"value":"151.5","cspan":1,"rspan":1},{"value":"41.3","cspan":1,"rspan":1},{"value":"13.2","cspan":1,"rspan":1},{"value":"18.5","cspan":1,"rspan":1}]]},"theme":null,"fixed_layout":false} -------------------------------------------------------------------------------- /docs/figures/linearalgebra.tgn: -------------------------------------------------------------------------------- 1 | 
{"rows_views":[[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_colo
r":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}]],"model":{"rows":[[{"value":"Symbol","cspan":1,"rspan":1},{"value":"Name","cspan":1,"rspan":1},{"value":"Description","cspan":1,"rspan":1},{"value":"Example","cspan":1,"rspan":1}],[{"value":"[ ] ","cspan":1,"rspan":1},{"value":"brackets","cspan":1,"rspan":1},{"value":"matrix or vector","cspan":1,"rspan":1},{"value":"v = [1 3 5]","cspan":1,"rspan":1}],[{"value":"\\cdot","cspan":1,"rspan":1},{"value":"dot","cspan":1,"rspan":1},{"value":"dot product ","cspan":1,"rspan":1},{"value":"(Z = X \\cdot W","cspan":1,"rspan":1}],[{"value":"\\odot","cspan":1,"rspan":1},{"value":"hadamard","cspan":1,"rspan":1},{"value":"hadamard product","cspan":1,"rspan":1},{"value":"A = B \\odot C","cspan":1,"rspan":1}],[{"value":"X^T","cspan":1,"rspan":1},{"value":"transpose","cspan":1,"rspan":1},{"value":"matrix transpose","cspan":1,"rspan":1},{"value":"W^T \\cdot X ","cspan":1,"rspan":1}],[{"value":"\\arrow x","cspan":1,"rspan":1},{"value":"vector","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1},{"value":"s = \\frac{1}{1+e^{-z}}","cspan":1,"rspan":1}],[{"value":"X","cspan":1,"rspan":1},{"value":"matrix","cspan":1,"rspan":1},{"value":"capitalized variables are matrices","cspan":1,"rspan":1},{"value":"X, W, B","cspan":1,"rspan":1}],[{"value":"\\hat x","cspan":1,"rspan":1},{"value":"unit vector","cspan":1,"rspan":1},{"value":"vector of magnitude 1","cspan":1,"rspan":1},{"value":"\\hat x = [0.2 0.5 0.3]","cspan":1,"rspan":1}]]},"theme":null,"fixed_layout":false} -------------------------------------------------------------------------------- /docs/figures/statistics_symbols_table.tgn: -------------------------------------------------------------------------------- 1 | 
{"rows_views":[[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"tex
t_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}]],"model":{"rows":[[{"value":"Symbol","cspan":1,"rspan":1},{"value":"Name","cspan":1,"rspan":1},{"value":"Description","cspan":1,"rspan":1},{"value":"Example","cspan":1,"rspan":1}],[{"value":"μ","cspan":1,"rspan":1},{"value":"population mean","cspan":1,"rspan":1},{"value":"mean of population values","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"x (line above)","cspan":1,"rspan":1},{"value":"sample mean","cspan":1,"rspan":1},{"value":"mean of subset of population","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"σ^2","cspan":1,"rspan":1},{"value":"population variance","cspan":1,"rspan":1},{"value":"variance of population value","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"s 2","cspan":1,"rspan":1},{"value":"sample variable","cspan":1,"rspan":1},{"value":"variance of subset of population","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"σX","cspan":1,"rspan":1},{"value":"standard deviation","cspan":1,"rspan":1},{"value":"population standard deviation","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"s","cspan":1,"rspan":1},{"value":"sample std dev","cspan":1,"rspan":1},{"value":"standard deviation of sample","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"ρX,Y","cspan":1,"rspan":1},{"value":"correlation","cspan":1,"rspan":1},{"value":"correlation of 
variables X and Y","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"x (squiggle)","cspan":1,"rspan":1},{"value":"median","cspan":1,"rspan":1},{"value":"median of sample/population","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}]]},"theme":null,"fixed_layout":false} -------------------------------------------------------------------------------- /docs/generative_algos.rst: -------------------------------------------------------------------------------- 1 | .. _generative_algos: 2 | 3 | ===================== 4 | Generative Algorithms 5 | ===================== 6 | 7 | Be the first to `contribute! `__ 8 | 9 | 10 | .. rubric:: References 11 | 12 | .. [1] Example Reference 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/gradient_descent.rst: -------------------------------------------------------------------------------- 1 | .. _gradient_descent: 2 | 3 | ================ 4 | Gradient Descent 5 | ================ 6 | 7 | Gradient descent is an optimization algorithm used to minimize some function by iteratively moving in the direction of steepest descent as defined by the negative of the gradient. In machine learning, we use gradient descent to update the :ref:`parameters ` of our model. Parameters refer to coefficients in :doc:`linear_regression` and :ref:`weights ` in neural networks. 8 | 9 | 10 | Introduction 11 | ============ 12 | 13 | Consider the 3-dimensional graph below in the context of a cost function. Our goal is to move from the mountain in the top right corner (high cost) to the dark blue sea in the bottom left (low cost). The arrows represent the direction of steepest descent (negative gradient) from any given point--the direction that decreases the cost function as quickly as possible. `Source `_ 14 | 15 | .. image:: images/gradient_descent.png 16 | :align: center 17 | 18 | Starting at the top of the mountain, we take our first step downhill in the direction specified by the negative gradient. Next we recalculate the negative gradient (passing in the coordinates of our new point) and take another step in the direction it specifies. We continue this process iteratively until we get to the bottom of our graph, or to a point where we can no longer move downhill--a local minimum. `image source `_. 19 | 20 | .. image:: images/gradient_descent_demystified.png 21 | :align: center 22 | 23 | Learning rate 24 | ============= 25 | 26 | The size of these steps is called the *learning rate*. With a high learning rate we can cover more ground each step, but we risk overshooting the lowest point since the slope of the hill is constantly changing. With a very low learning rate, we can confidently move in the direction of the negative gradient since we are recalculating it so frequently. A low learning rate is more precise, but calculating the gradient is time-consuming, so it will take us a very long time to get to the bottom. 27 | 28 | 29 | Cost function 30 | ============= 31 | 32 | A :ref:`cost_function` tells us "how good" our model is at making predictions for a given set of parameters. The cost function has its own curve and its own gradients. The slope of this curve tells us how to update our parameters to make the model more accurate. 33 | 34 | 35 | Step-by-step 36 | ============ 37 | 38 | Now let's run gradient descent using our new cost function. There are two parameters in our cost function we can control: :math:`m` (weight) and :math:`b` (bias). 
Since we need to consider the impact each one has on the final prediction, we need to use partial derivatives. We calculate the partial derivatives of the cost function with respect to each parameter and store the results in a gradient. 39 | 40 | .. rubric:: Math 41 | 42 | Given the cost function: 43 | 44 | .. math:: 45 | 46 | f(m,b) = \frac{1}{N} \sum_{i=1}^{N} (y_i - (mx_i + b))^2 47 | 48 | The gradient can be calculated as: 49 | 50 | .. math:: 51 | 52 | f'(m,b) = 53 | \begin{bmatrix} 54 | \frac{df}{dm}\\ 55 | \frac{df}{db}\\ 56 | \end{bmatrix} 57 | = 58 | \begin{bmatrix} 59 | \frac{1}{N} \sum -2x_i(y_i - (mx_i + b)) \\ 60 | \frac{1}{N} \sum -2(y_i - (mx_i + b)) \\ 61 | \end{bmatrix} 62 | 63 | To solve for the gradient, we iterate through our data points using our new :math:`m` and :math:`b` values and compute the partial derivatives. This new gradient tells us the slope of our cost function at our current position (current parameter values) and the direction we should move to update our parameters. The size of our update is controlled by the learning rate. 64 | 65 | 66 | .. rubric:: Code 67 | 68 | :: 69 | 70 | def update_weights(m, b, X, Y, learning_rate): 71 | m_deriv = 0 72 | b_deriv = 0 73 | N = len(X) 74 | for i in range(N): 75 | # Calculate partial derivatives 76 | # -2x(y - (mx + b)) 77 | m_deriv += -2*X[i] * (Y[i] - (m*X[i] + b)) 78 | 79 | # -2(y - (mx + b)) 80 | b_deriv += -2*(Y[i] - (m*X[i] + b)) 81 | 82 | # We subtract because the derivatives point in direction of steepest ascent 83 | m -= (m_deriv / float(N)) * learning_rate 84 | b -= (b_deriv / float(N)) * learning_rate 85 | 86 | return m, b 87 | 88 | 89 | .. rubric:: References 90 | 91 | .. [1] http://ruder.io/optimizing-gradient-descent 92 | -------------------------------------------------------------------------------- /docs/images/autoencoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/autoencoder.png -------------------------------------------------------------------------------- /docs/images/autoencoder_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/autoencoder_2.png -------------------------------------------------------------------------------- /docs/images/autoencoder_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/autoencoder_architecture.png -------------------------------------------------------------------------------- /docs/images/backprop_3_equations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/backprop_3_equations.png -------------------------------------------------------------------------------- /docs/images/backprop_ff_equations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/backprop_ff_equations.png -------------------------------------------------------------------------------- /docs/images/backprop_final_3_deriv_equations.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/backprop_final_3_deriv_equations.png -------------------------------------------------------------------------------- /docs/images/backprop_visually.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/backprop_visually.png -------------------------------------------------------------------------------- /docs/images/boosting-sequence-models.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/boosting-sequence-models.PNG -------------------------------------------------------------------------------- /docs/images/boosting_error_iteration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/boosting_error_iteration.png -------------------------------------------------------------------------------- /docs/images/calculus_slope_intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/calculus_slope_intro.png -------------------------------------------------------------------------------- /docs/images/cnn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/cnn.jpg -------------------------------------------------------------------------------- /docs/images/cnn_filter_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/cnn_filter_output.png -------------------------------------------------------------------------------- /docs/images/cross_entropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/cross_entropy.png -------------------------------------------------------------------------------- /docs/images/decision_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/decision_tree.png -------------------------------------------------------------------------------- /docs/images/dropout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/dropout.png -------------------------------------------------------------------------------- /docs/images/dropout_net.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/dropout_net.png 
-------------------------------------------------------------------------------- /docs/images/dynamic_resizing_neural_network_1_obs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/dynamic_resizing_neural_network_1_obs.png -------------------------------------------------------------------------------- /docs/images/dynamic_resizing_neural_network_4_obs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/dynamic_resizing_neural_network_4_obs.png -------------------------------------------------------------------------------- /docs/images/earlystopping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/earlystopping.png -------------------------------------------------------------------------------- /docs/images/elu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/elu.png -------------------------------------------------------------------------------- /docs/images/elu_prime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/elu_prime.png -------------------------------------------------------------------------------- /docs/images/fc_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/fc_layer.png -------------------------------------------------------------------------------- /docs/images/gan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/gan.png -------------------------------------------------------------------------------- /docs/images/gradient_accumulation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/gradient_accumulation.png -------------------------------------------------------------------------------- /docs/images/gradient_descent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/gradient_descent.png -------------------------------------------------------------------------------- /docs/images/gradient_descent_demystified.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/gradient_descent_demystified.png -------------------------------------------------------------------------------- /docs/images/grid_search_cross_validation.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/grid_search_cross_validation.png -------------------------------------------------------------------------------- /docs/images/gru_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/gru_structure.png -------------------------------------------------------------------------------- /docs/images/integral_as_change_in_antriderivative.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/integral_as_change_in_antriderivative.png -------------------------------------------------------------------------------- /docs/images/integral_as_rectangular_strips.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/integral_as_rectangular_strips.png -------------------------------------------------------------------------------- /docs/images/integral_definition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/integral_definition.png -------------------------------------------------------------------------------- /docs/images/khan_academy_matrix_product.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/khan_academy_matrix_product.png -------------------------------------------------------------------------------- /docs/images/leakyrelu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/leakyrelu.png -------------------------------------------------------------------------------- /docs/images/leakyrelu_prime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/leakyrelu_prime.png -------------------------------------------------------------------------------- /docs/images/learned_regression_line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/learned_regression_line.png -------------------------------------------------------------------------------- /docs/images/linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear.png -------------------------------------------------------------------------------- /docs/images/linear_prime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_prime.png -------------------------------------------------------------------------------- 
/docs/images/linear_regression_3d_plane_mlr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_3d_plane_mlr.png -------------------------------------------------------------------------------- /docs/images/linear_regression_line_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_line_1.png -------------------------------------------------------------------------------- /docs/images/linear_regression_line_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_line_2.png -------------------------------------------------------------------------------- /docs/images/linear_regression_line_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_line_3.png -------------------------------------------------------------------------------- /docs/images/linear_regression_line_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_line_4.png -------------------------------------------------------------------------------- /docs/images/linear_regression_line_intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_line_intro.png -------------------------------------------------------------------------------- /docs/images/linear_regression_training_cost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_training_cost.png -------------------------------------------------------------------------------- /docs/images/log_vs_neglog.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/log_vs_neglog.gif -------------------------------------------------------------------------------- /docs/images/logistic_cost_function_joined.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_cost_function_joined.png -------------------------------------------------------------------------------- /docs/images/logistic_cost_function_vectorized.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_cost_function_vectorized.png -------------------------------------------------------------------------------- /docs/images/logistic_regression_binary_decision_boundary.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_regression_binary_decision_boundary.png -------------------------------------------------------------------------------- /docs/images/logistic_regression_exam_scores_scatter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_regression_exam_scores_scatter.png -------------------------------------------------------------------------------- /docs/images/logistic_regression_final_decision_boundary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_regression_final_decision_boundary.png -------------------------------------------------------------------------------- /docs/images/logistic_regression_loss_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_regression_loss_history.png -------------------------------------------------------------------------------- /docs/images/logistic_regression_scatter_w_decision_boundary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_regression_scatter_w_decision_boundary.png -------------------------------------------------------------------------------- /docs/images/logistic_regression_sigmoid_w_threshold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_regression_sigmoid_w_threshold.png -------------------------------------------------------------------------------- /docs/images/lstm_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/lstm_structure.png -------------------------------------------------------------------------------- /docs/images/maxpool.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/maxpool.png -------------------------------------------------------------------------------- /docs/images/memoization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/memoization.png -------------------------------------------------------------------------------- /docs/images/mlp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/mlp.jpg -------------------------------------------------------------------------------- /docs/images/multiple_regression_error_history.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/multiple_regression_error_history.png -------------------------------------------------------------------------------- /docs/images/neural_network_matrix_weighted_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/neural_network_matrix_weighted_input.png -------------------------------------------------------------------------------- /docs/images/neural_network_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/neural_network_simple.png -------------------------------------------------------------------------------- /docs/images/neural_network_w_matrices.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/neural_network_w_matrices.png -------------------------------------------------------------------------------- /docs/images/neuron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/neuron.png -------------------------------------------------------------------------------- /docs/images/ng_cost_function_logistic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/ng_cost_function_logistic.png -------------------------------------------------------------------------------- /docs/images/nn_with_matrices_displayed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/nn_with_matrices_displayed.png -------------------------------------------------------------------------------- /docs/images/optimizers.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/optimizers.gif -------------------------------------------------------------------------------- /docs/images/regularization-dropout.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/regularization-dropout.PNG -------------------------------------------------------------------------------- /docs/images/relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/relu.png -------------------------------------------------------------------------------- /docs/images/relu_prime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/relu_prime.png 
-------------------------------------------------------------------------------- /docs/images/rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/rnn.png -------------------------------------------------------------------------------- /docs/images/rnn_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/rnn_layer.png -------------------------------------------------------------------------------- /docs/images/sigmoid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/sigmoid.png -------------------------------------------------------------------------------- /docs/images/sigmoid_prime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/sigmoid_prime.png -------------------------------------------------------------------------------- /docs/images/simple_nn_diagram_zo_zh_defined.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/simple_nn_diagram_zo_zh_defined.png -------------------------------------------------------------------------------- /docs/images/slope_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/slope_formula.png -------------------------------------------------------------------------------- /docs/images/svm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/svm.png -------------------------------------------------------------------------------- /docs/images/svm_linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/svm_linear.png -------------------------------------------------------------------------------- /docs/images/svm_nonlinear_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/svm_nonlinear_1.png -------------------------------------------------------------------------------- /docs/images/svm_nonlinear_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/svm_nonlinear_2.png -------------------------------------------------------------------------------- /docs/images/svm_nonlinear_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/svm_nonlinear_3.png 
-------------------------------------------------------------------------------- /docs/images/tanh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/tanh.png -------------------------------------------------------------------------------- /docs/images/tanh_prime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/tanh_prime.png -------------------------------------------------------------------------------- /docs/images/vae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/vae.png -------------------------------------------------------------------------------- /docs/images/vector_field.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/vector_field.png -------------------------------------------------------------------------------- /docs/images/vectors_geometry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/vectors_geometry.png -------------------------------------------------------------------------------- /docs/images/y1andy2_logistic_function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/y1andy2_logistic_function.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | ========================= 3 | Machine Learning Glossary 4 | ========================= 5 | 6 | Brief visual explanations of machine learning concepts with diagrams, code examples and links to resources for learning more. 7 | 8 | .. warning:: 9 | 10 | If you find errors, please raise an `issue `_ or `contribute `_ a better definition! 11 | 12 | .. toctree:: 13 | :caption: Basics 14 | :maxdepth: 1 15 | 16 | linear_regression 17 | gradient_descent 18 | logistic_regression 19 | glossary 20 | 21 | .. toctree:: 22 | :caption: Math 23 | :maxdepth: 1 24 | 25 | calculus 26 | linear_algebra 27 | Probability (TODO) 28 | Statistics (TODO) 29 | math_notation 30 | 31 | .. toctree:: 32 | :maxdepth: 1 33 | :caption: Neural Networks 34 | 35 | nn_concepts 36 | forwardpropagation 37 | backpropagation 38 | activation_functions 39 | layers 40 | loss_functions 41 | optimizers 42 | regularization 43 | architectures 44 | 45 | .. toctree:: 46 | :maxdepth: 1 47 | :caption: Algorithms (TODO) 48 | 49 | Classification 50 | Clustering 51 | Regression 52 | Reinforcement Learning 53 | 54 | .. toctree:: 55 | :maxdepth: 1 56 | :caption: Resources 57 | :titlesonly: 58 | 59 | datasets 60 | libraries 61 | papers 62 | Other 63 | 64 | .. toctree:: 65 | :maxdepth: 1 66 | :caption: Contributing 67 | :titlesonly: 68 | 69 | How to contribute 70 | 71 | 72 | .. https://en.wikipedia.org/wiki/Outline_of_machine_learning 73 | 74 | .. Indices and tables 75 | .. 
------------------ 76 | 77 | .. * :ref:`genindex` 78 | .. * :ref:`modindex` 79 | .. * :ref:`search` 80 | -------------------------------------------------------------------------------- /docs/loss_functions.rst: -------------------------------------------------------------------------------- 1 | .. _cost_function: 2 | 3 | ============== 4 | Loss Functions 5 | ============== 6 | 7 | .. contents:: :local: 8 | 9 | 10 | .. _loss_cross_entropy: 11 | 12 | Cross-Entropy 13 | ============= 14 | 15 | Cross-entropy loss, or log loss, measures the performance of a classification model whose output is a probability value between 0 and 1. Cross-entropy loss increases as the predicted probability diverges from the actual label. So predicting a probability of .012 when the actual observation label is 1 would be bad and result in a high loss value. A perfect model would have a log loss of 0. 16 | 17 | .. image:: images/cross_entropy.png 18 | :align: center 19 | 20 | The graph above shows the range of possible loss values given a true observation (isDog = 1). As the predicted probability approaches 1, log loss slowly decreases. As the predicted probability decreases, however, the log loss increases rapidly. Log loss penalizes both types of errors, but especially those predictions that are confident and wrong! 21 | 22 | .. note:: 23 | 24 | Cross-entropy and log loss are slightly different depending on context, but in machine learning when calculating error rates between 0 and 1 they resolve to the same thing. 25 | 26 | .. rubric:: Code 27 | 28 | .. literalinclude:: ../code/loss_functions.py 29 | :pyobject: CrossEntropy 30 | 31 | .. rubric:: Math 32 | 33 | In binary classification, where the number of classes :math:`M` equals 2, cross-entropy can be calculated as: 34 | 35 | .. math:: 36 | 37 | -{(y\log(p) + (1 - y)\log(1 - p))} 38 | 39 | If :math:`M > 2` (i.e. multiclass classification), we calculate a separate loss for each class label per observation and sum the result. 40 | 41 | .. math:: 42 | 43 | -\sum_{c=1}^My_{o,c}\log(p_{o,c}) 44 | 45 | .. note:: 46 | 47 | - M - number of classes (dog, cat, fish) 48 | - log - the natural log 49 | - y - binary indicator (0 or 1) if class label :math:`c` is the correct classification for observation :math:`o` 50 | - p - predicted probability observation :math:`o` is of class :math:`c` 51 | 52 | 53 | .. _hinge_loss: 54 | 55 | Hinge 56 | ===== 57 | 58 | Used for classification. 59 | 60 | .. rubric:: Code 61 | 62 | .. literalinclude:: ../code/loss_functions.py 63 | :pyobject: Hinge 64 | 65 | 66 | .. _huber_loss: 67 | 68 | Huber 69 | ===== 70 | 71 | Typically used for regression. It's less sensitive to outliers than the MSE as it treats error as square only inside an interval. 72 | 73 | .. math:: 74 | 75 | L_{\delta}=\left\{\begin{matrix} 76 | \frac{1}{2}(y - \hat{y})^{2} & if \left | (y - \hat{y}) \right | < \delta\\ 77 | \delta ((y - \hat{y}) - \frac1 2 \delta) & otherwise 78 | \end{matrix}\right. 79 | 80 | .. rubric:: Code 81 | 82 | .. literalinclude:: ../code/loss_functions.py 83 | :pyobject: Huber 84 | 85 | Further information can be found at `Huber Loss in Wikipedia`_. 86 | 87 | .. _`Huber Loss in Wikipedia`: https://en.wikipedia.org/wiki/Huber_loss 88 | 89 | .. _kl_divergence: 90 | 91 | Kullback-Leibler 92 | ================ 93 | 94 | .. rubric:: Code 95 | 96 | .. literalinclude:: ../code/loss_functions.py 97 | :pyobject: KLDivergence 98 | 99 | .. _rmse: 100 | 101 | RMSE 102 | ======== 103 | 104 | Root Mean Square Error 105 | 106 | .. 
math:: 107 | 108 | RMSE = \sqrt{\frac{1}{m}\sum^{m}_{i=1}(h(x^{(i)})-y^{(i)})^2} 109 | 110 | .. line-block:: 111 | 112 | RMSE - root mean square error 113 | m - number of samples 114 | :math:`x^{(i)}` - i-th sample from dataset 115 | :math:`h(x^{(i)})` - prediction for i-th sample (thesis) 116 | :math:`y^{(i)}` - ground truth label for i-th sample 117 | 118 | 119 | .. rubric:: Code 120 | 121 | .. literalinclude:: ../code/loss_functions.py 122 | :pyobject: root_mean_square_error 123 | 124 | 125 | .. _mae: 126 | 127 | MAE (L1) 128 | ======== 129 | 130 | Mean Absolute Error, or L1 loss. Excellent overview below [6] and [10]. 131 | 132 | .. math:: 133 | 134 | MAE = \frac{1}{m}\sum^{m}_{i=1}|h(x^{(i)})-y^{(i)}| 135 | 136 | .. line-block:: 137 | 138 | MAE - mean absolute error 139 | m - number of samples 140 | :math:`x^{(i)}` - i-th sample from dataset 141 | :math:`h(x^{(i)})` - prediction for i-th sample (thesis) 142 | :math:`y^{(i)}` - ground truth label for i-th sample 143 | 144 | .. rubric:: Code 145 | 146 | .. literalinclude:: ../code/loss_functions.py 147 | :pyobject: L1 148 | 149 | 150 | .. _mse: 151 | 152 | MSE (L2) 153 | ======== 154 | 155 | Mean Squared Error, or L2 loss. Excellent overview below [6] and [10]. 156 | 157 | .. math:: 158 | 159 | MSE = \frac{1}{m}\sum^{m}_{i=1}(y^{(i)} - \hat{y}^{(i)})^2 160 | 161 | .. line-block:: 162 | 163 | MSE - mean square error 164 | m - number of samples 165 | :math:`y^{(i)}` - ground truth label for i-th sample 166 | :math:`\hat{y}^{(i)}` - predicted label for i-th sample 167 | 168 | .. literalinclude:: ../code/loss_functions.py 169 | :language: python 170 | :pyobject: MSE 171 | 172 | .. literalinclude:: ../code/loss_functions.py 173 | :language: python 174 | :pyobject: MSE_prime 175 | 176 | 177 | .. rubric:: References 178 | 179 | .. [1] https://en.m.wikipedia.org/wiki/Cross_entropy 180 | .. [2] https://www.kaggle.com/wiki/LogarithmicLoss 181 | .. [3] https://en.wikipedia.org/wiki/Loss_functions_for_classification 182 | .. [4] http://www.exegetic.biz/blog/2015/12/making-sense-logarithmic-loss/ 183 | .. [5] http://neuralnetworksanddeeplearning.com/chap3.html 184 | .. [6] http://rishy.github.io/ml/2015/07/28/l1-vs-l2-loss/ 185 | .. [7] https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient 186 | .. [8] https://en.wikipedia.org/wiki/Huber_loss 187 | .. [9] https://en.wikipedia.org/wiki/Hinge_loss 188 | .. [10] http://www.chioka.in/differences-between-l1-and-l2-as-loss-function-and-regularization/ 189 | -------------------------------------------------------------------------------- /docs/math_notation.rst: -------------------------------------------------------------------------------- 1 | .. _math_notation: 2 | 3 | ======== 4 | Notation 5 | ======== 6 | 7 | Commonly used math symbols in machine learning texts. 8 | 9 | .. contents:: :local: 10 | 11 | .. note:: 12 | 13 | Use the `table generator `_ to quickly add new symbols. 14 | Import current tables into tablesgenerator from ``figures/*.tgn``. Export and save your changes. Also 15 | see helpful `multiline editing `_ in Sublime. 
16 | 17 | 18 | Algebra 19 | ------- 20 | 21 | +--------------------+--------------------+-----------------------+-------------------------+ 22 | | **Symbol** | **Name** | **Description** | **Example** | 23 | +--------------------+--------------------+-----------------------+-------------------------+ 24 | | :math:`(f ∘ g)` | composite function | a nested function | (f ∘ g)(x) = f(g(x)) | 25 | +--------------------+--------------------+-----------------------+-------------------------+ 26 | | :math:`∆` | delta | change / difference | ∆x = x_1 - x_0 | 27 | +--------------------+--------------------+-----------------------+-------------------------+ 28 | | :math:`e` | Euler's number | e = 2.718281828 | s = \frac{1}{1+e^{-z}} | 29 | +--------------------+--------------------+-----------------------+-------------------------+ 30 | | :math:`\sum` | summation | sum of all values | ∑ x_i = x_1 + x_2 + x_3 | 31 | +--------------------+--------------------+-----------------------+-------------------------+ 32 | | :math:`\prod` | capital pi | product of all values | ∏ x_i = x_1∙x_2∙x_3 | 33 | +--------------------+--------------------+-----------------------+-------------------------+ 34 | | :math:`\epsilon` | epsilon | tiny number near 0 | lr = 1e-4 | 35 | +--------------------+--------------------+-----------------------+-------------------------+ 36 | 37 | 38 | Calculus 39 | -------- 40 | 41 | +--------------------+-------------------+----------------------------------+-------------+ 42 | | **Symbol** | **Name** | **Description** | **Example** | 43 | +--------------------+-------------------+----------------------------------+-------------+ 44 | | :math:`x'` | derivative | first derivative | (x^2)' = 2x | 45 | +--------------------+-------------------+----------------------------------+-------------+ 46 | | :math:`x''` | second derivative | second derivative | (x^2)'' = 2 | 47 | +--------------------+-------------------+----------------------------------+-------------+ 48 | | :math:`\lim` | limit | function value as x approaches 0 | | 49 | +--------------------+-------------------+----------------------------------+-------------+ 50 | | :math:`∇` | nabla | gradient | ∇f(a,b,c) | 51 | +--------------------+-------------------+----------------------------------+-------------+ 52 | 53 | 54 | Linear algebra 55 | -------------- 56 | 57 | +-------------------+-------------+------------------------------------+---------------------------------+ 58 | | **Symbol** | **Name** | **Description** | **Example** | 59 | +-------------------+-------------+------------------------------------+---------------------------------+ 60 | | :math:`[ ]` | brackets | matrix or vector | :math:`M = [1 3 5]` | 61 | +-------------------+-------------+------------------------------------+---------------------------------+ 62 | | :math:`\cdot` | dot | dot product | :math:`(Z = X \cdot W` | 63 | +-------------------+-------------+------------------------------------+---------------------------------+ 64 | | :math:`\odot` | hadamard | hadamard product | :math:`A = B \odot C` | 65 | +-------------------+-------------+------------------------------------+---------------------------------+ 66 | | :math:`X^T` | transpose | matrix transpose | :math:`W^T \cdot X` | 67 | +-------------------+-------------+------------------------------------+---------------------------------+ 68 | | :math:`\vec x` | vector | vector | :math:`v = [1 2 3]` | 69 | 
+-------------------+-------------+------------------------------------+---------------------------------+ 70 | | :math:`X` | matrix | capitalized variables are matrices | :math:`X, W, B` | 71 | +-------------------+-------------+------------------------------------+---------------------------------+ 72 | | :math:`\hat x` | unit vector | vector of magnitude 1 | :math:`\hat x = [0.2 0.5 0.3]` | 73 | +-------------------+-------------+------------------------------------+---------------------------------+ 74 | 75 | 76 | Probability 77 | ----------- 78 | 79 | +-------------+---------------------+--------------------------+-----------------------+ 80 | | **Symbol** | **Name** | **Description** | **Example** | 81 | +-------------+---------------------+--------------------------+-----------------------+ 82 | | :math:`P(A)`| probability | probability of event A | P(x=1) = 0.5 | 83 | +-------------+---------------------+--------------------------+-----------------------+ 84 | 85 | 86 | Set theory 87 | ---------- 88 | 89 | +------------+---------------------+-----------------------------+-----------------------+ 90 | | **Symbol** | **Name** | **Description** | **Example** | 91 | +------------+---------------------+-----------------------------+-----------------------+ 92 | | :math:`{ }`| set | list of distinct elements | S = {1, 5, 7, 9} | 93 | +------------+---------------------+-----------------------------+-----------------------+ 94 | 95 | 96 | Statistics 97 | ---------- 98 | 99 | +------------------+---------------------+----------------------------------+-----------------------+ 100 | | **Symbol** | **Name** | **Description** | **Example** | 101 | +------------------+---------------------+----------------------------------+-----------------------+ 102 | | :math:`μ` | population mean | mean of population values | | 103 | +------------------+---------------------+----------------------------------+-----------------------+ 104 | | :math:`\bar x` | sample mean | mean of subset of population | | 105 | +------------------+---------------------+----------------------------------+-----------------------+ 106 | | :math:`σ^2` | population variance | variance of population value | | 107 | +------------------+---------------------+----------------------------------+-----------------------+ 108 | | :math:`s^2` | sample variance | variance of subset of population | | 109 | +------------------+---------------------+----------------------------------+-----------------------+ 110 | | :math:`σ_X` | standard deviation | population standard deviation | | 111 | +------------------+---------------------+----------------------------------+-----------------------+ 112 | | :math:`s` | sample std dev | standard deviation of sample | | 113 | +------------------+---------------------+----------------------------------+-----------------------+ 114 | | :math:`ρX` | correlation | correlation of variables X and Y | | 115 | +------------------+---------------------+----------------------------------+-----------------------+ 116 | | :math:`\tilde x` | median | median value of variable x | | 117 | +------------------+---------------------+----------------------------------+-----------------------+ 118 | 119 | 120 | .. rubric:: References 121 | 122 | .. [1] http://www.tablesgenerator.com/text_tables 123 | .. 
[2] http://www.rapidtables.com/math/symbols/Basic_Math_Symbols.htm 124 | 125 | -------------------------------------------------------------------------------- /docs/nn_concepts.rst: -------------------------------------------------------------------------------- 1 | .. _nn_concepts: 2 | 3 | ======== 4 | Concepts 5 | ======== 6 | 7 | .. contents:: :local: 8 | 9 | 10 | Neural Network 11 | ============== 12 | 13 | Neural networks are a class of machine learning algorithms used to model complex patterns in datasets using multiple hidden layers and non-linear activation functions. A neural network takes an input, passes it through multiple layers of hidden neurons (mini-functions with unique coefficients that must be learned), and outputs a prediction representing the combined input of all the neurons. 14 | 15 | .. image:: images/neural_network_w_matrices.png 16 | :align: center 17 | 18 | Neural networks are trained iteratively using optimization techniques like gradient descent. After each cycle of training, an error metric is calculated based on the difference between prediction and target. The derivatives of this error metric are calculated and propagated back through the network using a technique called backpropagation. Each neuron's coefficients (weights) are then adjusted relative to how much they contributed to the total error. This process is repeated iteratively until the network error drops below an acceptable threshold. 19 | 20 | 21 | Neuron 22 | ====== 23 | 24 | A neuron takes a group of weighted inputs, applies an activation function, and returns an output. 25 | 26 | .. image:: images/neuron.png 27 | :align: center 28 | 29 | Inputs to a neuron can either be features from a training set or outputs from a previous layer’s neurons. Weights are applied to the inputs as they travel along synapses to reach the neuron. The neuron then applies an activation function to the “sum of weighted inputs” from each incoming synapse and passes the result on to all the neurons in the next layer. 30 | 31 | 32 | 33 | Synapse 34 | ======= 35 | 36 | Synapses are like roads in a neural network. They connect inputs to neurons, neurons to neurons, and neurons to outputs. In order to get from one neuron to another, you have to travel along the synapse paying the “toll” (weight) along the way. Each connection between two neurons has a unique synapse with a unique weight attached to it. When we talk about updating weights in a network, we’re really talking about adjusting the weights on these synapses. 37 | 38 | 39 | .. _nn_weights: 40 | 41 | Weights 42 | ======= 43 | 44 | Weights are values that control the strength of the connection between two neurons. That is, inputs are typically multiplied by weights, and that defines how much influence the input will have on the output. In other words: when the inputs are transmitted between neurons, the weights are applied to the inputs along with an additional value (the bias) 45 | 46 | .. _nn_bias: 47 | 48 | Bias 49 | ==== 50 | 51 | Bias terms are additional constants attached to neurons and added to the weighted input before the activation function is applied. Bias terms help models represent patterns that do not necessarily pass through the origin. For example, if all your features were 0, would your output also be zero? Is it possible there is some base value upon which your features have an effect? Bias terms typically accompany weights and must also be learned by your model. 52 | 53 | 54 | Layers 55 | ====== 56 | 57 | .. 
image:: images/neural_network_simple.png 58 | :align: center 59 | 60 | .. rubric:: Input Layer 61 | 62 | Holds the data your model will train on. Each neuron in the input layer represents a unique attribute in your dataset (e.g. height, hair color, etc.). 63 | 64 | .. rubric:: Hidden Layer 65 | 66 | Sits between the input and output layers and applies an activation function before passing on the results. There are often multiple hidden layers in a network. In traditional networks, hidden layers are typically fully-connected layers — each neuron receives input from all the previous layer’s neurons and sends its output to every neuron in the next layer. This contrasts with how convolutional layers work, where each neuron sends its output to only some of the neurons in the next layer. 67 | 68 | .. rubric:: Output Layer 69 | 70 | The final layer in a network. It receives input from the previous hidden layer, optionally applies an activation function, and returns an output representing your model’s prediction. 71 | 72 | 73 | 74 | Weighted Input 75 | ============== 76 | 77 | A neuron’s input equals the sum of weighted outputs from all neurons in the previous layer. Each input is multiplied by the weight associated with the synapse connecting the input to the current neuron. If there are 3 inputs or neurons in the previous layer, each neuron in the current layer will have 3 distinct weights — one for each synapse. 78 | 79 | **Single Input** 80 | 81 | .. math:: 82 | 83 | Z &= Input \cdot Weight \\ 84 | &= X W 85 | 86 | **Multiple Inputs** 87 | 88 | .. math:: 89 | 90 | Z &= \sum_{i=1}^{n}x_i w_i \\ 91 | &= x_1 w_1 + x_2 w_2 + x_3 w_3 92 | 93 | 94 | Notice that it’s exactly the same equation we use with linear regression! In fact, a neural network with a single neuron is the same as linear regression! The only difference is that the neural network post-processes the weighted input with an activation function. 95 | 96 | 97 | 98 | Activation Functions 99 | ==================== 100 | 101 | Activation functions live inside neural network layers and modify the data they receive before passing it to the next layer. Activation functions give neural networks their power — allowing them to model complex non-linear relationships. By modifying inputs with non-linear functions, neural networks can model highly complex relationships between features. Popular activation functions include :ref:`relu ` and :ref:`sigmoid `. 102 | 103 | Activation functions typically have the following properties: 104 | 105 | * **Non-linear** - In linear regression we’re limited to a prediction equation that looks like a straight line. This is nice for simple datasets with a one-to-one relationship between inputs and outputs, but what if the patterns in our dataset were non-linear? (e.g. :math:`x^2`, sin, log). To model these relationships we need a non-linear prediction equation.¹ Activation functions provide this non-linearity. 106 | 107 | * **Continuously differentiable** — To improve our model with gradient descent, we need our output to have a nice slope so we can compute error derivatives with respect to weights. If our neuron instead output 0 or 1 (perceptron), we wouldn’t know in which direction to update our weights to reduce our error. 108 | 109 | * **Fixed Range** — Activation functions typically squash the input data into a narrow range, which makes training the model more stable and efficient.
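To make the weighted input and activation concrete, here is a minimal NumPy sketch; the shapes, names, and random data are illustrative assumptions, not code taken from this repository's code files. It computes :math:`Z = XW + b` for one fully-connected layer and squashes the result with a sigmoid activation:

.. code-block:: python

    import numpy as np

    def sigmoid(z):
        # Squash the weighted input into the (0, 1) range
        return 1 / (1 + np.exp(-z))

    X = np.random.randn(4, 3)   # 4 observations, 3 input features
    W = np.random.randn(3, 2)   # one weight per input feature for each of 2 neurons
    b = np.zeros((1, 2))        # one bias per neuron

    Z = X @ W + b               # weighted input for every neuron in the layer
    A = sigmoid(Z)              # activations passed on to the next layer
    print(A.shape)              # (4, 2)

Stacking several such layers, each with its own weights, bias, and non-linear activation, is what lets the network model non-linear relationships.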
110 | 111 | 112 | Loss Functions 113 | ============== 114 | 115 | A loss function, or cost function, is a wrapper around our model's predict function that tells us "how good" the model is at making predictions for a given set of parameters. The loss function has its own curve and its own derivatives. The slope of this curve tells us how to change our parameters to make the model more accurate! We use the model to make predictions and the cost function to update our parameters. The cost function can take a variety of forms, as there are many different cost functions available. Popular loss functions include :ref:`mse` and :ref:`Cross-entropy Loss `. 116 | 117 | 118 | Optimization Algorithms 119 | ======================= 120 | 121 | Be the first to `contribute! `__ 122 | 123 | 124 | Gradient Accumulation 125 | ===================== 126 | 127 | Gradient accumulation is a mechanism to split the batch of samples—used for training a neural network—into several mini-batches that will be run sequentially. 128 | 129 | This is used to enable batch sizes that require more GPU memory than is available. Gradient accumulation makes this possible by using mini-batches small enough to fit in the available GPU memory. 130 | 131 | Gradient accumulation means running all mini-batches sequentially (generally on the same GPU) while accumulating their calculated gradients and not updating the model variables - the weights and biases of the model. 132 | The model variables must not be updated during the accumulation in order to ensure all mini-batches use the same model variable values to calculate their gradients. 133 | Only after accumulating the gradients of all those mini-batches will we generate and apply the updates for the model variables. 134 | 135 | This results in the same updates for the model parameters as if we were to use the global batch. 136 | 137 | .. image:: images/gradient_accumulation.png 138 | :align: center 139 | 140 | More details, a technical and algorithmic deep dive, how-to tutorials, and examples can be found at [2]. 141 | 142 | 143 | 144 | .. rubric:: References 145 | 146 | .. [1] http://sebastianruder.com/optimizing-gradient-descent/ 147 | .. [2] https://github.com/run-ai/runai/tree/master/runai/ga/ 148 | -------------------------------------------------------------------------------- /docs/optimizers.rst: -------------------------------------------------------------------------------- 1 | .. _optimizers: 2 | 3 | ========== 4 | Optimizers 5 | ========== 6 | 7 | .. rubric:: What is an Optimizer? 8 | 9 | It is very important to tweak the weights of the model during the training process to make our predictions as correct and optimized as possible. But how exactly do you do that? How do you change the parameters of your model, by how much, and when? 10 | 11 | The best answer to all of the above questions is *optimizers*. They tie together the loss function and model parameters by updating the model in response to the output of the loss function. In simpler terms, optimizers shape and mold your model into its most accurate possible form by futzing with the weights. The loss function is the guide to the terrain, telling the optimizer when it’s moving in the right or wrong direction. 12 | 13 | Below is a list of example optimizers. 14 | 15 | .. contents:: :local: 16 | 17 | ..
image:: images/optimizers.gif 18 | :align: center 19 | 20 | Image Credit: `CS231n `_ 21 | 22 | Adagrad 23 | ------- 24 | 25 | Adagrad (short for adaptive gradient) adaptively sets the learning rate for each parameter. 26 | 27 | - Parameters that have higher gradients or frequent updates should have a slower learning rate so that we do not overshoot the minimum value. 28 | - Parameters that have low gradients or infrequent updates should have a faster learning rate so that they get trained quickly. 29 | - It divides the learning rate by the sum of squares of all previous gradients of the parameter. 30 | - When the sum of the squared past gradients has a high value, it basically divides the learning rate by a high value, so the learning rate will become less. 31 | - Similarly, if the sum of the squared past gradients has a low value, it divides the learning rate by a lower value, so the learning rate will become high. 32 | - This implies that the learning rate is inversely proportional to the sum of the squares of all the previous gradients of the parameter. 33 | 34 | .. math:: 35 | 36 | g_{t}^{i} = \frac{\partial \mathcal{J}(w_{t}^{i})}{\partial W} \\ 37 | W = W - \alpha \frac{g_{t}^{i}}{\sqrt{\sum_{r=1}^{t}\left ( g_{r}^{i} \right )^{2} + \varepsilon }} 38 | 39 | .. note:: 40 | 41 | - :math:`g_{t}^{i}` - the gradient of the parameter :math:`w^{i}` at iteration t 42 | - :math:`\alpha` - the learning rate 43 | - :math:`\epsilon` - very small value to avoid dividing by zero 44 | 45 | .. literalinclude:: ../code/optimizers.py 46 | :language: python 47 | :pyobject: Adagrad 48 | 49 | Adadelta 50 | -------- 51 | 52 | AdaDelta belongs to the family of stochastic gradient descent algorithms that provide adaptive techniques for hyperparameter tuning. Adadelta is probably short for ‘adaptive delta’, where delta here refers to the difference between the current weight and the newly updated weight. 53 | 54 | The main disadvantage of Adagrad is its accumulation of the squared gradients. During the training process, the accumulated sum keeps growing. From the above formula we can see that, as the accumulated sum increases, the learning rate shrinks and eventually becomes infinitesimally small, at which point the algorithm is no longer able to acquire additional knowledge. 55 | 56 | Adadelta is a more robust extension of Adagrad that adapts learning rates based on a moving window of gradient updates, instead of accumulating all past gradients. This way, Adadelta continues learning even when many updates have been done. 57 | 58 | With Adadelta, we do not even need to set a default learning rate, as it has been eliminated from the update rule. 59 | 60 | The update rule looks something like this: 61 | 62 | .. math:: 63 | 64 | v_t &= \rho v_{t-1} + (1-\rho) (\nabla_\theta J( \theta))^2 \\ 65 | \Delta\theta &= \dfrac{\sqrt{w_t + \epsilon}}{\sqrt{v_t + \epsilon}} \nabla_\theta J( \theta) \\ 66 | \theta &= \theta - \eta \Delta\theta \\ 67 | w_t &= \rho w_{t-1} + (1-\rho) \Delta\theta^2 68 | 69 | .. literalinclude:: ../code/optimizers.py 70 | :language: python 71 | :pyobject: Adadelta 72 | 73 | Adam 74 | ---- 75 | 76 | Adaptive Moment Estimation (Adam) combines ideas from both RMSProp and Momentum. It computes adaptive learning rates for each parameter and works as follows. 77 | 78 | - First, it computes the exponentially weighted average of past gradients (:math:`v_{dW}`). 79 | - Second, it computes the exponentially weighted average of the squares of past gradients (:math:`s_{dW}`).
80 | - Third, these averages have a bias towards zero, and to counteract this, a bias correction is applied (:math:`v_{dW}^{corrected}`, :math:`s_{dW}^{corrected}`). 81 | - Lastly, the parameters are updated using the information from the calculated averages. 82 | 83 | .. math:: 84 | 85 | 86 | v_{dW} = \beta_1 v_{dW} + (1 - \beta_1) \frac{\partial \mathcal{J} }{ \partial W } \\ 87 | s_{dW} = \beta_2 s_{dW} + (1 - \beta_2) (\frac{\partial \mathcal{J} }{\partial W })^2 \\ 88 | v^{corrected}_{dW} = \frac{v_{dW}}{1 - (\beta_1)^t} \\ 89 | s^{corrected}_{dW} = \frac{s_{dW}}{1 - (\beta_2)^t} \\ 90 | W = W - \alpha \frac{v^{corrected}_{dW}}{\sqrt{s^{corrected}_{dW}} + \varepsilon} 91 | 92 | .. note:: 93 | 94 | - :math:`v_{dW}` - the exponentially weighted average of past gradients 95 | - :math:`s_{dW}` - the exponentially weighted average of past squares of gradients 96 | - :math:`\beta_1` - hyperparameter to be tuned 97 | - :math:`\beta_2` - hyperparameter to be tuned 98 | - :math:`\frac{\partial \mathcal{J} }{ \partial W }` - cost gradient with respect to current layer 99 | - :math:`W` - the weight matrix (parameter to be updated) 100 | - :math:`\alpha` - the learning rate 101 | - :math:`\epsilon` - very small value to avoid dividing by zero 102 | 103 | 104 |
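As a rough illustration of the equations above, the Adam update for a single weight tensor might be sketched as follows. This is a simplified sketch, not the implementation shipped in this repository's ``optimizers.py``; the function name and single-tensor setup are assumptions made for clarity.

.. code-block:: python

    import numpy as np

    def adam_update(w, grad, v, s, t, alpha=0.001,
                    beta1=0.9, beta2=0.999, epsilon=1e-8):
        """One Adam step for a single weight tensor (illustrative sketch)."""
        # Exponentially weighted averages of the gradient and of its square
        v = beta1 * v + (1 - beta1) * grad
        s = beta2 * s + (1 - beta2) * grad ** 2

        # Bias correction counteracts the zero initialization of v and s
        v_corrected = v / (1 - beta1 ** t)
        s_corrected = s / (1 - beta2 ** t)

        # Parameter update
        w = w - alpha * v_corrected / (np.sqrt(s_corrected) + epsilon)
        return w, v, s

Here ``v`` and ``s`` start as zero arrays with the same shape as ``w``, and ``t`` starts at 1 and is incremented after every step.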
105 | Conjugate Gradients 106 | ------------------- 107 | 108 | Be the first to `contribute! `__ 109 | 110 | 111 | .. _optimizers_lbfgs: 112 | 113 | BFGS 114 | ---- 115 | 116 | Be the first to `contribute! `__ 117 | 118 | 119 | Momentum 120 | -------- 121 | 122 | Used in conjunction with Stochastic Gradient Descent (SGD) or Mini-Batch Gradient Descent, Momentum takes into account 123 | past gradients to smooth out the update. This is seen in variable :math:`v`, which is an exponentially weighted average 124 | of the gradient on previous steps. This results in minimizing oscillations and faster convergence. 125 | 126 | .. math:: 127 | 128 | v_{dW} = \beta v_{dW} + (1 - \beta) \frac{\partial \mathcal{J} }{ \partial W } \\ 129 | W = W - \alpha v_{dW} 130 | 131 | .. note:: 132 | 133 | - :math:`v` - the exponentially weighted average of past gradients 134 | - :math:`\frac{\partial \mathcal{J} }{ \partial W }` - cost gradient with respect to current layer weight tensor 135 | - :math:`W` - weight tensor 136 | - :math:`\beta` - hyperparameter to be tuned 137 | - :math:`\alpha` - the learning rate 138 | 139 | Nesterov Momentum 140 | ----------------- 141 | 142 | Be the first to `contribute! `__ 143 | 144 | 145 | Newton's Method 146 | --------------- 147 | 148 | Be the first to `contribute! `__ 149 | 150 | 151 | RMSProp 152 | ------- 153 | 154 | Another adaptive learning rate optimization algorithm, Root Mean Square Prop (RMSProp) works by keeping an exponentially weighted average of the squares of past gradients. 155 | RMSProp then divides the learning rate by this average to speed up convergence. 156 | 157 | 158 | .. math:: 159 | 160 | 161 | s_{dW} = \beta s_{dW} + (1 - \beta) (\frac{\partial \mathcal{J} }{\partial W })^2 \\ 162 | W = W - \alpha \frac{\frac{\partial \mathcal{J} }{\partial W }}{\sqrt{s_{dW}} + \varepsilon} 163 | 164 | .. note:: 165 | 166 | - :math:`s` - the exponentially weighted average of past squares of gradients 167 | - :math:`\frac{\partial \mathcal{J} }{\partial W }` - cost gradient with respect to current layer weight tensor 168 | - :math:`W` - weight tensor 169 | - :math:`\beta` - hyperparameter to be tuned 170 | - :math:`\alpha` - the learning rate 171 | - :math:`\epsilon` - very small value to avoid dividing by zero 172 | 173 | SGD 174 | --- 175 | 176 | SGD stands for Stochastic Gradient Descent. In Stochastic Gradient Descent, a few samples are selected randomly instead of the whole dataset for each iteration. In Gradient Descent, there is a term called “batch” which denotes the total number of samples from a dataset used to calculate the gradient for each iteration. In typical Gradient Descent optimization, like Batch Gradient Descent, the batch is taken to be the whole dataset. Using the whole dataset is useful for getting to the minima in a less noisy and less random manner, but problems arise when our datasets get really huge. 177 | 178 | This problem is solved by Stochastic Gradient Descent. In SGD, only a single, randomly selected sample is used to perform each iteration. 179 | 180 | Since only one sample from the dataset is chosen at random for each iteration, the path taken by the algorithm to reach the minima is usually noisier than for typical Gradient Descent. But that doesn’t matter much, because the exact path taken by the algorithm does not matter as long as we reach the minima, usually with significantly shorter training time. 181 | 182 | .. literalinclude:: ../code/optimizers.py 183 | :language: python 184 | :pyobject: SGD 185 | 186 | 187 | .. rubric:: References 188 | 189 | .. [1] https://ruder.io/optimizing-gradient-descent/ 190 | .. [2] http://www.deeplearningbook.org/contents/optimization.html 191 | .. [3] https://arxiv.org/pdf/1502.03167.pdf 192 | -------------------------------------------------------------------------------- /docs/other_content.rst: -------------------------------------------------------------------------------- 1 | .. _content: 2 | 3 | ============= 4 | Other Content 5 | ============= 6 | 7 | Books, blogs, courses and more forked from josephmisiti's `awesome machine learning `_ 8 | 9 | ..
contents:: :local: 10 | 11 | Blogs 12 | ===== 13 | 14 | Data Science 15 | ------------ 16 | 17 | - https://jeremykun.com/ 18 | - http://iamtrask.github.io/ 19 | - http://blog.explainmydata.com/ 20 | - http://andrewgelman.com/ 21 | - http://simplystatistics.org/ 22 | - http://www.evanmiller.org/ 23 | - http://jakevdp.github.io/ 24 | - http://blog.yhat.com/ 25 | - http://wesmckinney.com 26 | - http://www.overkillanalytics.net/ 27 | - http://newton.cx/~peter/ 28 | - http://mbakker7.github.io/exploratory_computing_with_python/ 29 | - https://sebastianraschka.com/blog/index.html 30 | - http://camdavidsonpilon.github.io/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/ 31 | - http://colah.github.io/ 32 | - http://www.thomasdimson.com/ 33 | - http://blog.smellthedata.com/ 34 | - https://sebastianraschka.com/ 35 | - http://dogdogfish.com/ 36 | - http://www.johnmyleswhite.com/ 37 | - http://drewconway.com/zia/ 38 | - http://bugra.github.io/ 39 | - http://opendata.cern.ch/ 40 | - https://alexanderetz.com/ 41 | - http://www.sumsar.net/ 42 | - https://www.countbayesie.com 43 | - http://blog.kaggle.com/ 44 | - http://www.danvk.org/ 45 | - http://hunch.net/ 46 | - http://www.randalolson.com/blog/ 47 | - https://www.johndcook.com/blog/r_language_for_programmers/ 48 | - http://www.dataschool.io/ 49 | 50 | 51 | Machine learning 52 | ---------------- 53 | 54 | - `OpenAI `__ 55 | - `Distill `__ 56 | - `Andrej Karpathy Blog `__ 57 | - `Colah's Blog `__ 58 | - `WildML `__ 59 | - `FastML `__ 60 | - `TheMorningPaper `__ 61 | 62 | 63 | Math 64 | ---- 65 | 66 | - http://www.sumsar.net/ 67 | - http://allendowney.blogspot.ca/ 68 | - https://healthyalgorithms.com/ 69 | - https://petewarden.com/ 70 | - http://mrtz.org/blog/ 71 | 72 | 73 | 74 | Books 75 | ===== 76 | 77 | Machine learning 78 | ---------------- 79 | 80 | - `Real World Machine Learning `__ [Free Chapters] 81 | - `An Introduction To Statistical Learning `__ - Book + R Code 82 | - `Elements of Statistical Learning `__ - Book 83 | - `Probabilistic Programming & Bayesian Methods for Hackers `__ - Book + IPython Notebooks 84 | - `Think Bayes `__ - Book + Python Code 85 | - `Information Theory, Inference, and Learning Algorithms `__ 86 | - `Gaussian Processes for Machine Learning `__ 87 | - `Data Intensive Text Processing w/ MapReduce `__ 88 | - `Reinforcement Learning: - An Introduction `__ 89 | - `Mining Massive Datasets `__ 90 | - `A First Encounter with Machine Learning `__ 91 | - `Pattern Recognition and Machine Learning `__ 92 | - `Machine Learning & Bayesian Reasoning `__ 93 | - `Introduction to Machine Learning `__ - Alex Smola and S.V.N. Vishwanathan 94 | - `A Probabilistic Theory of Pattern Recognition `__ 95 | - `Introduction to Information Retrieval `__ 96 | - `Forecasting: principles and practice `__ 97 | - `Practical Artificial Intelligence Programming in Java `__ 98 | - `Introduction to Machine Learning `__ - Amnon Shashua 99 | - `Reinforcement Learning `__ 100 | - `Machine Learning `__ 101 | - `A Quest for AI `__ 102 | - `Introduction to Applied Bayesian Statistics and Estimation for Social Scientists `__ - Scott M. 
Lynch 103 | - `Bayesian Modeling, Inference and Prediction `__ 104 | - `A Course in Machine Learning `__ 105 | - `Machine Learning, Neural and Statistical Classification `__ 106 | - `Bayesian Reasoning and Machine Learning `__ Book+MatlabToolBox 107 | - `R Programming for Data Science `__ 108 | - `Data Mining - Practical Machine Learning Tools and Techniques `__ Book 109 | 110 | 111 | Deep learning 112 | ------------- 113 | 114 | - `Deep Learning - An MIT Press book `__ 115 | - `Coursera Course Book on NLP `__ 116 | - `NLTK `__ 117 | - `NLP w/ Python `__ 118 | - `Foundations of Statistical Natural Language Processing `__ 119 | - `An Introduction to Information Retrieval `__ 120 | - `A Brief Introduction to Neural Networks `__ 121 | - `Neural Networks and Deep Learning `__ 122 | 123 | 124 | Probability & Statistics 125 | ------------------------ 126 | 127 | - `Think Stats `__ - Book + Python Code 128 | - `From Algorithms to Z-Scores `__ - Book 129 | - `The Art of R Programming `__ 130 | - `Introduction to statistical thought `__ 131 | - `Basic Probability Theory `__ 132 | - `Introduction to probability `__ - By Dartmouth College 133 | - `Principle of Uncertainty `__ 134 | - `Probability & Statistics Cookbook `__ 135 | - `Advanced Data Analysis From An Elementary Point of View `__ 136 | - `Introduction to Probability `__ - Book and course by MIT 137 | - `The Elements of Statistical Learning: Data Mining, Inference, and Prediction. `__ -Book 138 | - `An Introduction to Statistical Learning with Applications in R `__ - Book 139 | - `Learning Statistics Using R `__ 140 | - `Introduction to Probability and Statistics Using R `__ - Book 141 | - `Advanced R Programming `__ - Book 142 | - `Practical Regression and Anova using R `__ - Book 143 | - `R practicals `__ - Book 144 | - `The R Inferno `__ - Book 145 | 146 | Linear Algebra 147 | -------------- 148 | 149 | - `Linear Algebra Done Wrong `__ 150 | - `Linear Algebra, Theory, and Applications `__ 151 | - `Convex Optimization `__ 152 | - `Applied Numerical Computing `__ 153 | - `Applied Numerical Linear Algebra `__ 154 | 155 | 156 | Courses 157 | ======= 158 | 159 | - `CS231n, Convolutional Neural Networks for Visual Recognition, Stanford University `__ 160 | - `CS224d, Deep Learning for Natural Language Processing, Stanford University `__ 161 | - `Oxford Deep NLP 2017, Deep Learning for Natural Language Processing, University of Oxford `__ 162 | - `Artificial Intelligence (Columbia University) `__ - free 163 | - `Machine Learning (Columbia University) `__ - free 164 | - `Machine Learning (Stanford University) `__ - free 165 | - `Neural Networks for Machine Learning (University of Toronto) `__ - free 166 | - `Machine Learning Specialization (University of Washington) `__ - Courses: Machine Learning Foundations: A Case Study Approach, Machine Learning: Regression, Machine Learning: Classification, Machine Learning: Clustering & Retrieval, Machine Learning: Recommender Systems & Dimensionality Reduction,Machine Learning Capstone: An Intelligent Application with Deep Learning; free 167 | - `Machine Learning Course (2014-15 session) (by Nando de Freitas, University of Oxford) `__ - Lecture slides and video recordings. 168 | - `Learning from Data (by Yaser S. 
Abu-Mostafa, Caltech) `__ - Lecture videos available 169 | 170 | 171 | Podcasts 172 | ======== 173 | 174 | - `The O'Reilly Data Show `__ 175 | - `Partially Derivative `__ 176 | - `The Talking Machines `__ 177 | - `The Data Skeptic `__ 178 | - `Linear Digressions `__ 179 | - `Data Stories `__ 180 | - `Learning Machines 101 `__ 181 | - `Not So Standard Deviations `__ 182 | - `TWIMLAI `__ 183 | -_`Machine Learning Guide `_ 184 | 185 | 186 | Tutorials 187 | ========= 188 | 189 | Be the first to `contribute! `__ 190 | 191 | 192 | -------------------------------------------------------------------------------- /docs/probability.rst: -------------------------------------------------------------------------------- 1 | .. _probability: 2 | 3 | =========== 4 | Probability 5 | =========== 6 | 7 | .. contents:: :local: 8 | 9 | Basic concepts in probability for machine learning. 10 | 11 | This cheatsheet is a 10-page reference in probability that covers a semester's worth of introductory probability. 12 | 13 | The cheatsheet is based off of Harvard's introductory probability course, Stat 110. It is co-authored by former Stat 110 Teaching Fellow William Chen and Stat 110 Professor Joe Blitzstein. 14 | 15 | Links 16 | ------- 17 | 18 | * [Probability Cheatsheet PDF](http://www.wzchen.com/probability-cheatsheet/) 19 | 20 | 21 | Screenshots 22 | ------- 23 | 24 | ![First Page](http://i.imgur.com/Oa73huL.jpg) 25 | ![Second Page](http://i.imgur.com/dyvW2rB.jpg) 26 | 27 | 28 | License 29 | ------- 30 | 31 | This work is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.][by-nc-sa]. 32 | 33 | [![Creative Commons License][by-nc-sa-img]][by-nc-sa] 34 | 35 | .. rubric:: References 36 | 37 | .. [1] Example 38 | -------------------------------------------------------------------------------- /docs/regression_algos.rst: -------------------------------------------------------------------------------- 1 | .. _regression_algos: 2 | 3 | ===================== 4 | Regression Algorithms 5 | ===================== 6 | 7 | .. contents:: :local: 8 | 9 | 10 | Ordinary Least Squares 11 | ====================== 12 | 13 | OLS is the method with which linear regression is performed. The square of the difference from the mean is taken for every data point, and the summed loss function is to be minimized. 14 | 15 | .. math:: 16 | l = \sum_{i=1}^n (y_i - \bar{y})^2 17 | 18 | 19 | 20 | Polynomial 21 | ========== 22 | 23 | Polynomial regression is a modification of linear regression where the existing features are mapped to a polynomial form. The problem is still a linear regression problem, but the input vector is now mapped to a higher dimensional vector which serves as a pseudo-input vector of sorts. 24 | 25 | .. math:: 26 | 27 | \textbf{x} = (x_0, x_1) \rightarrow \textbf{x'} = (x_0, x^2_0, x_1, x^2_1, x_0x_1) 28 | 29 | 30 | Lasso 31 | ===== 32 | 33 | Lasso Regression tries to reduce the ordinary least squares error similar to vanilla regression, but adds an extra term. The sum of the :math:`L_1` norm for every data point multiplied by a hyperparameter :math:`\alpha` is used. This reduces model complexity and prevents overfitting. 34 | 35 | .. math:: 36 | 37 | l = \sum_{i=1}^n (y_i - \tilde{y})^2 + \alpha \sum_{j=1}^p |w_j| 38 | 39 | 40 | Ridge 41 | ===== 42 | 43 | Ridge regression is similar to lasso regression, but the regularization term uses the :math:`L_2` norm instead. 44 | 45 | .. 
math:: 46 | 47 | l = \sum_{i=1}^n (y_i - \tilde{y})^2 + \alpha \sum_{j=1}^p w^2_j 48 | 49 | 50 | 51 | Stepwise 52 | ======== 53 | 54 | Stepwise regression or spline regression helps us fit a piecewise function to the data. It is usually used with linear models, but it can be generalized to higher degrees as well. The regression equation takes the form of 55 | 56 | .. math:: 57 | y = ax + b(x-\bar{x})H_{\alpha}+c 58 | 59 | where :math:`H_{\alpha}` is the shifted Heaviside step function, having its discontinuity at :math:`\alpha`. 60 | 61 | 62 | .. rubric:: References 63 | 64 | .. [1] https://www.analyticsvidhya.com/blog/2015/08/comprehensive-guide-regression/ 65 | .. [2] http://machinelearningmastery.com/a-tour-of-machine-learning-algorithms/ 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /docs/reinforcement_learning.rst: -------------------------------------------------------------------------------- 1 | .. _reinforcement_learning: 2 | 3 | ====================== 4 | Reinforcement Learning 5 | ====================== 6 | 7 | In machine learning, supervised learning is sometimes contrasted with unsupervised learning. This is a useful distinction, but there are some problem domains that share characteristics with each without fitting exactly into either category. In cases where the algorithm does not have explicit labels but does receive a form of feedback, we are dealing with a third and distinct paradigm of machine learning - reinforcement learning. 8 | 9 | A programmatic and theoretical introduction to reinforcement learning: https://spinningup.openai.com/ 10 | 11 | There are different problem types and algorithms, but all reinforcement learning problems have the following aspects in common: 12 | 13 | * an **agent** - the algorithm or "AI" responsible for making decisions 14 | 15 | * an **environment**, consisting of different **states** in which the agent may find itself 16 | 17 | * a **reward** signal which is returned by the environment as a function of the current state 18 | 19 | * **actions**, each of which takes the agent from one state to another 20 | 21 | * a **policy**, i.e. a mapping from states to actions that defines the agent's behavior 22 | 23 | The goal of reinforcement learning is to learn the optimal policy, that is, the policy that maximizes expected (discounted) cumulative reward. 24 | 25 | Many RL algorithms will include a value function or a Q-function. A value function gives the expected cumulative reward for each state under the current policy. In other words, it answers the question, "If I begin in state :math:`i` and follow my policy, what will be my expected reward?" 26 | 27 | In most algorithms, expected cumulative reward is discounted by some factor :math:`\gamma \in (0, 1)`; a typical value for :math:`\gamma` is 0.9. In addition to more accurately modeling the behavior of humans and other animals, :math:`\gamma < 1` helps to ensure that algorithms converge even when there is no terminal state or when the terminal state is never found (because otherwise expected cumulative reward may also become infinite). 28 | 29 | Note on Terminology 30 | ------------------- 31 | 32 | For mostly historical reasons, engineering and operations research use different words to talk about the same concepts. For example, the general field of reinforcement learning itself is sometimes referred to as optimal control, approximate dynamic programming, or neuro-dynamic programming.\ :sup:`1` 33 | 34 | Exploration vs. 
Exploitation 35 | ---------------------------- 36 | 37 | One dilemma inherent to the RL problem setting is the tension between the desire to choose the best known option and the need to try something new in order to discover other options that may be even better. Choosing the best known action is known as exploitation, while choosing a different action is known as exploration. 38 | 39 | Typically, this is solved by adding to the policy a small probability of exploration. For example, the policy could be to choose the optimal action (optimal with regard to what is known) with probability 0.95, and to explore by randomly choosing some other action with probability 0.05 (if uniform across all remaining actions: probability 0.05/(n-1), where n is the number of actions). 40 | 41 | MDPs and Tabular methods 42 | ------------------------ 43 | 44 | Many problems can be effectively modeled as Markov Decision Processes (MDPs), or more generally as `Partially Observable Markov Decision Processes (POMDPs) `. That is, we have 45 | 46 | * a set of states :math:`S` 47 | * a set of actions :math:`A` 48 | * a set of conditional state transition probabilities :math:`T` 49 | * a reward function :math:`R: S \times A \rightarrow \mathbb{R}` 50 | * a set of observations :math:`\Omega` 51 | * a set of conditional observation probabilities :math:`O` 52 | * a discount factor :math:`\gamma \in [0, 1]` 53 | 54 | Given these things, the goal is to choose the action at each time step which will maximize :math:`E \left[ \sum_{t=0}^{\infty} \gamma^t r_t \right]`, the expected discounted reward. 55 | 56 | Monte Carlo methods 57 | ------------------- 58 | 59 | One possible approach is to run a large number of simulations to learn the optimal policy :math:`\pi^*`. This is good for cases where we know the environment and can run many simulations reasonably quickly. For example, it is fairly trivial to compute an optimal policy for the card game `21 (blackjack) ` by running many simulations, and the same is true for most simple games. 60 | 61 | Temporal-Difference Learning 62 | ---------------------------- 63 | 64 | TODO 65 | 66 | Planning 67 | -------- 68 | 69 | TODO 70 | 71 | On-Policy vs. Off-Policy Learning 72 | --------------------------------- 73 | 74 | TODO 75 | 76 | Model-Free vs. Model-Based Approaches 77 | ------------------------------------- 78 | 79 | TODO 80 | 81 | Imitation Learning 82 | ------------------ 83 | 84 | TODO 85 | 86 | Q-Learning 87 | ---------- 88 | 89 | Q-Learning is a model-free RL algorithm that iteratively updates the Q-values toward their optimal values. It is an off-policy method that selects the optimal action based on the current estimate of Q\* rather than following the current policy. 90 | 91 | The Q-Learning algorithm is: 92 | 93 | #. Initialize t = 0. 94 | #. Start at initial state s\ :sub:`t` = 0. 95 | #. The agent chooses a\ :sub:`t` = ɛ-greedy 96 | action. 97 | #. For given a\ :sub:`t`, the agent retrieves 98 | the reward r\ :sub:`t+1` as well as the next 99 | state s\ :sub:`t+1`. 100 | #. Get (but do not perform) the next action 101 | a\ :sub:`t+1` = 102 | argmax\ :sub:`a∈A`\ Q(s\ :sub:`t+1`, a). 103 | #. Compute the TD target y\ :sub:`t` = 104 | r\ :sub:`t+1` + γ · Q(s\ :sub:`t+1`, 105 | a\ :sub:`t+1`), where γ is the discount 106 | factor. 107 | #. Calculate the TD error δ = y\ :sub:`t` − 108 | Q(s\ :sub:`t`, a\ :sub:`t`). 109 | #. Update Q(s\ :sub:`t`, a\ :sub:`t`) ← 110 | Q(s\ :sub:`t`, a\ :sub:`t`) + α\ :sub:`t` · 111 | δ, where α\ :sub:`t` is the step size 112 | (learning rate) at t. 113 | #. 
Update t ← t + 1 and repeat step 3-9 until 114 | Q(s, a) converge. 115 | 116 | Epsilon-Greedy Algorithm 117 | 118 | .. math:: 119 | 120 | \begin{equation} 121 | a_{t} = \begin{cases} 122 | argmax_{a∈A} & \text{if } p = 1 - e \\ 123 | random\, action\ &\text{otherwise} 124 | \end{cases} 125 | \end{equation} 126 | 127 | The agent performs optimal action for exploitation or random action for exploration during training. It acts randomly in the beginning with the ɛ = 1 and chooses the best action based on the Q function with a decreasing ɛ capped at some small constant not equal to zero. 128 | 129 | Q-Table / Q-Matrix 130 | 131 | +-------------+---------------+---------------+-----+---------------+ 132 | | | a\ :sub:`1` | a\ :sub:`2` | ... | a\ :sub:`n` | 133 | +-------------+---------------+---------------+-----+---------------+ 134 | | s\ :sub:`1` | Q | Q | ... | Q | 135 | | | (s\ :sub:`1`, | (s\ :sub:`1`, | | (s\ :sub:`1`, | 136 | | | a\ :sub:`1`) | a\ :sub:`2`) | | a\ :sub:`3`) | 137 | +-------------+---------------+---------------+-----+---------------+ 138 | | s\ :sub:`2` | Q | Q | ... | Q | 139 | | | (s\ :sub:`2`, | (s\ :sub:`2`, | | (s\ :sub:`2`, | 140 | | | a\ :sub:`1`) | a\ :sub:`2`) | | a\ :sub:`3`) | 141 | +-------------+---------------+---------------+-----+---------------+ 142 | | ... | ... | ... | ... | ... | 143 | +-------------+---------------+---------------+-----+---------------+ 144 | | s\ :sub:`m` | Q | Q | ... | Q | 145 | | | (s\ :sub:`m`, | (s\ :sub:`m`, | | (s\ :sub:`m`, | 146 | | | a\ :sub:`1`) | a\ :sub:`2`) | | a\ :sub:`3`) | 147 | +-------------+---------------+---------------+-----+---------------+ 148 | 149 | It's a lookup table storing the action-value function Q(s, a) for state-action pairs where there are M states and n actions. We can initialize the Q(s, a) arbitrarily except s = terminal state. For s = final state, we set it equal to the reward on that state. 150 | 151 | Reasons of using Q Learning are: 152 | 153 | - It’s applicable for the discrete action space of our environment. 154 | - When we don’t have the true MDP model: transitional probability matrix and rewards (Model-Free Setting). 155 | - It's able to learn from incomplete episodes because of TD learning. 156 | 157 | Drawbacks of Q Learning are: 158 | 159 | - When the state space and action space are continuous and extremely large, due to the curse of dimensionality, it’s nearly impossible to maintain a Q-matrix when the data is large. 160 | - Using a Q-table is unable to infer optimal action for unseen states. 161 | 162 | Deep Q-Learning 163 | --------------- 164 | 165 | Deep Q-learning pursues the same general methods as Q-learning. Its innovation is to add a neural network, which makes it possible to learn a very complex Q-function. This makes it very powerful, especially because it makes a large body of well-developed theory and tools for deep learning useful to reinforcement learning problems. 
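To make the tabular Q-Learning algorithm above concrete, here is a minimal sketch. The ``env`` object and its ``reset()``/``step()`` interface (returning the next state, the reward, and a done flag, in the style of the classic OpenAI Gym API) are assumptions for illustration, not part of this repository:

.. code-block:: python

    import numpy as np

    def q_learning(env, n_states, n_actions, episodes=500,
                   alpha=0.1, gamma=0.9, epsilon=0.1):
        """Tabular Q-Learning with an epsilon-greedy behavior policy (sketch)."""
        Q = np.zeros((n_states, n_actions))
        for _ in range(episodes):
            state = env.reset()
            done = False
            while not done:
                # Epsilon-greedy action selection
                if np.random.rand() < epsilon:
                    action = np.random.randint(n_actions)
                else:
                    action = int(np.argmax(Q[state]))

                next_state, reward, done = env.step(action)

                # Off-policy TD target: bootstrap from the greedy action in the next state
                td_target = reward + gamma * (1 - done) * np.max(Q[next_state])
                td_error = td_target - Q[state, action]
                Q[state, action] += alpha * td_error

                state = next_state
        return Q

Deep Q-Learning replaces the table ``Q`` with a neural network that approximates :math:`Q(s, a)` and is trained to minimize the same TD error.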
166 | 167 | Examples of Applications 168 | ------------------------ 169 | 170 | * `Getting Started With OpenAI Gym: Creating Custom Gym Environments `_ 171 | 172 | * `What Is Q-Learning: The Best Guide To Understand Q-Learning (Simplilearn) `_ 173 | 174 | * `REINFORCEMENT LEARNING (DQN) TUTORIAL (PyTorch) `_ 175 | 176 | * `QWOP Game AI (DQN/DDQN) `_ 177 | 178 | Links 179 | ----- 180 | 181 | * `Practical Applications of Reinforcement Learning (tTowards Data Science) `_ 182 | 183 | * `Reinforcement learning (GeeksforGeeks) `_ 184 | 185 | * `Reinforcement Learning Algorithms: An Intuitive Overview (SmartLabAI) `_ 186 | 187 | * `Q-learning(Wikipedia) `_ 188 | 189 | * `Epsilon-Greedy Algorithm in Reinforcement Learning (GeeksforGeeks) `_ 190 | 191 | * `OpenAI Gym Documentation `_ 192 | 193 | * `Stable-Baselines3 Documentation `_ 194 | 195 | * `David Silver Teaching Material `_ 196 | 197 | 198 | 199 | .. rubric:: References 200 | 201 | .. [1] https://en.wikipedia.org/wiki/Reinforcement_learning#Introduction 202 | .. [2] Reinforcement Learning: An Introduction (Sutton and Barto, 2018) 203 | .. [3] Silver, David. "Lecture 5: Model-Free Control." UCL, Computer Sci. Dep. Reinf Learn. Lect. (2015): 101-140. 204 | .. [4] En.wikipedia.org. 2022. Q-learning - Wikipedia. [online] Available at: [Accessed 15 June 2022]. 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /docs/statistics.rst: -------------------------------------------------------------------------------- 1 | .. _statistics: 2 | 3 | ========== 4 | Statistics 5 | ========== 6 | 7 | Basic concepts in statistics for machine learning. 8 | 9 | 10 | 11 | .. rubric:: References 12 | 13 | .. [1] Example 14 | -------------------------------------------------------------------------------- /docs/training.rst: -------------------------------------------------------------------------------- 1 | .. _probability: 2 | 3 | ================ 4 | Training (empty) 5 | ================ 6 | 7 | .. contents:: :local: 8 | 9 | 10 | Combating Overfitting 11 | ===================== 12 | 13 | Cross-validation 14 | ---------------- 15 | 16 | Be the first to `contribute! `__ 17 | 18 | Validation Set 19 | -------------- 20 | 21 | Be the first to `contribute! `__ 22 | 23 | Test Set 24 | -------- 25 | 26 | Be the first to `contribute! `__ 27 | 28 | 29 | 30 | Hyperparameter Tuning 31 | ===================== 32 | 33 | Learning Rate 34 | ------------- 35 | 36 | Be the first to `contribute! `__ 37 | 38 | Optimizers 39 | ---------- 40 | 41 | Be the first to `contribute! `__ 42 | 43 | 44 | 45 | Model Evaluation 46 | ================ 47 | 48 | 49 | Bias-Variance Tradeoff 50 | ---------------------- 51 | 52 | Be the first to `contribute! `__ 53 | 54 | Loss Functions 55 | -------------- 56 | 57 | Be the first to `contribute! `__ 58 | 59 | Precision vs Recall 60 | ------------------- 61 | 62 | Be the first to `contribute! `__ 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | .. rubric:: References 73 | 74 | .. 
[1] Example 75 | -------------------------------------------------------------------------------- /notebooks/rnn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "\n" 12 | ] 13 | } 14 | ], 15 | "metadata": { 16 | "kernelspec": { 17 | "display_name": "Python 3", 18 | "language": "python", 19 | "name": "python3" 20 | }, 21 | "language_info": { 22 | "codemirror_mode": { 23 | "name": "ipython", 24 | "version": 2 25 | }, 26 | "file_extension": ".py", 27 | "mimetype": "text/x-python", 28 | "name": "python", 29 | "nbconvert_exporter": "python", 30 | "pygments_lexer": "ipython2", 31 | "version": "2.7.6" 32 | } 33 | }, 34 | "nbformat": 4, 35 | "nbformat_minor": 0 36 | } --------------------------------------------------------------------------------