├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── code ├── activation_functions.py ├── autoencoder.py ├── cnn.py ├── decision_tree.py ├── gan.py ├── id3_decision_tree_simple.py ├── knn.py ├── layers.py ├── logistic_regression.py ├── logistic_regression_scipy.py ├── loss_functions.py ├── mlp.py ├── nn_matrix.py ├── nn_simple.py ├── optimizers.py ├── random_forest_classifier.py ├── rnn.py └── vae.py ├── docs ├── .vscode │ └── settings.json ├── Makefile ├── _static │ └── theme_overrides.css ├── activation_functions.rst ├── applications.rst ├── architectures.rst ├── backpropagation.rst ├── build.bat ├── calculus.rst ├── classification_algos.rst ├── clustering_algos.rst ├── conf.py ├── contribute.rst ├── datasets.rst ├── figures │ ├── SimpleDiagram3_neural_networks.sdxml │ ├── activation_function_table.tgn │ ├── calculus_symbol_table.tgn │ ├── forward_prop_matrix_dimensions_table.tgn │ ├── linear_regression_companies_sales.tgn │ ├── linearalgebra.tgn │ └── statistics_symbols_table.tgn ├── forwardpropagation.rst ├── generative_algos.rst ├── glossary.rst ├── gradient_descent.rst ├── images │ ├── autoencoder.png │ ├── autoencoder_2.png │ ├── autoencoder_architecture.png │ ├── backprop_3_equations.png │ ├── backprop_ff_equations.png │ ├── backprop_final_3_deriv_equations.png │ ├── backprop_visually.png │ ├── boosting-sequence-models.PNG │ ├── boosting_error_iteration.png │ ├── calculus_slope_intro.png │ ├── cnn.jpg │ ├── cnn_filter_output.png │ ├── cross_entropy.png │ ├── decision_tree.png │ ├── dropout.png │ ├── dropout_net.png │ ├── dynamic_resizing_neural_network_1_obs.png │ ├── dynamic_resizing_neural_network_4_obs.png │ ├── earlystopping.png │ ├── elu.png │ ├── elu_prime.png │ ├── fc_layer.png │ ├── gan.png │ ├── gradient_accumulation.png │ ├── gradient_descent.png │ ├── gradient_descent_demystified.png │ ├── grid_search_cross_validation.png │ ├── gru_structure.png │ ├── integral_as_change_in_antriderivative.png │ ├── integral_as_rectangular_strips.png │ ├── integral_definition.png │ ├── khan_academy_matrix_product.png │ ├── leakyrelu.png │ ├── leakyrelu_prime.png │ ├── learned_regression_line.png │ ├── linear.png │ ├── linear_prime.png │ ├── linear_regression_3d_plane_mlr.png │ ├── linear_regression_line_1.png │ ├── linear_regression_line_2.png │ ├── linear_regression_line_3.png │ ├── linear_regression_line_4.png │ ├── linear_regression_line_intro.png │ ├── linear_regression_training_cost.png │ ├── log_vs_neglog.gif │ ├── logistic_cost_function_joined.png │ ├── logistic_cost_function_vectorized.png │ ├── logistic_regression_binary_decision_boundary.png │ ├── logistic_regression_exam_scores_scatter.png │ ├── logistic_regression_final_decision_boundary.png │ ├── logistic_regression_loss_history.png │ ├── logistic_regression_scatter_w_decision_boundary.png │ ├── logistic_regression_sigmoid_w_threshold.png │ ├── lstm_structure.png │ ├── maxpool.png │ ├── memoization.png │ ├── mlp.jpg │ ├── multiple_regression_error_history.png │ ├── neural_network_matrix_weighted_input.png │ ├── neural_network_simple.png │ ├── neural_network_w_matrices.png │ ├── neuron.png │ ├── ng_cost_function_logistic.png │ ├── nn_with_matrices_displayed.png │ ├── optimizers.gif │ ├── regularization-dropout.PNG │ ├── relu.png │ ├── relu_prime.png │ ├── rnn.png │ ├── rnn_layer.png │ ├── sigmoid.png │ ├── sigmoid_prime.png │ ├── simple_nn_diagram_zo_zh_defined.png │ ├── slope_formula.png │ ├── svm.png │ ├── svm_linear.png │ ├── svm_nonlinear_1.png │ ├── svm_nonlinear_2.png │ ├── svm_nonlinear_3.png │ ├── 
tanh.png │ ├── tanh_prime.png │ ├── vae.png │ ├── vector_field.png │ ├── vectors_geometry.png │ └── y1andy2_logistic_function.png ├── index.rst ├── layers.rst ├── libraries.rst ├── linear_algebra.rst ├── linear_regression.rst ├── logistic_regression.rst ├── loss_functions.rst ├── math_notation.rst ├── nn_concepts.rst ├── optimizers.rst ├── other_content.rst ├── papers.rst ├── probability.rst ├── regression_algos.rst ├── regularization.rst ├── reinforcement_learning.rst ├── statistics.rst └── training.rst └── notebooks └── rnn.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *~ 3 | _build/ 4 | docs/_build/ 5 | venv/ 6 | .idea 7 | sphinxenv/ -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "restructuredtext.confPath": "${workspaceFolder}/docs" 3 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Brendan Fortuner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Glossary 2 | 3 | ## Looking for fellow maintainers! 4 | Apologies for my non-responsiveness. :( I've been heads down at Cruise, building ML infra for self-driving cars, and haven't reviewed this repo in forever. Looks like we're getting `54k monthly active users` now and I think the repo deserves more attention. Let me know if you would be interested in joining as a maintainer with privileges to merge PRs. 5 | 6 | [View The Glossary](http://ml-cheatsheet.readthedocs.io/en/latest/) 7 | 8 | ## How To Contribute 9 | 10 | 1. Clone Repo 11 | ``` 12 | git clone https://github.com/bfortuner/ml-glossary.git 13 | ``` 14 | 15 | 2. Install Dependencies 16 | ``` 17 | # Assumes you have the usual suspects installed: numpy, scipy, etc.. 18 | pip install sphinx sphinx-autobuild 19 | pip install sphinx_rtd_theme 20 | pip install recommonmark 21 | ``` 22 | If you are using Python 3, use: 23 | ``` 24 | pip3 install sphinx sphinx-autobuild 25 | pip3 install sphinx_rtd_theme 26 | pip3 install recommonmark 27 | ``` 28 | 3. 
Preview Changes 29 | 30 | If you are using make, build with: 31 | 32 | ``` 33 | cd ml-glossary 34 | cd docs 35 | make html 36 | ``` 37 | 38 | On Windows, use: 39 | 40 | ``` 41 | cd ml-glossary 42 | cd docs 43 | build.bat html 44 | ``` 45 | 46 | 47 | 4. Verify your changes by opening the `index.html` file in `_build/` 48 | 49 | 5. [Submit Pull Request](https://help.github.com/articles/creating-a-pull-request/) 50 | 51 | 52 | ### Short for time? 53 | 54 | Feel free to raise an [issue](https://github.com/bfortuner/ml-glossary/issues) to correct errors or contribute content without a pull request. 55 | 56 | 57 | ## Style Guide 58 | 59 | Each entry in the glossary MUST include the following at a minimum: 60 | 61 | 1. **Concise explanation** - as short as possible, but no shorter 62 | 2. **Citations** - Papers, Tutorials, etc. 63 | 64 | Excellent entries will also include: 65 | 66 | 1. **Visuals** - diagrams, charts, animations, images 67 | 2. **Code** - python/numpy snippets, classes, or functions (see the example snippet below) 68 | 3. **Equations** - Formatted with LaTeX 69 | 70 | The goal of the glossary is to present content in the most accessible way possible, with a heavy emphasis on visuals and interactive diagrams. That said, in the spirit of rapid prototyping, it's okay to submit a "rough draft" without visuals or code. We expect other readers will enhance your submission over time. 71 | 72 | 73 | ## Why RST and not Markdown? 74 | 75 | RST has more features. For large and complex documentation projects, it's the logical choice. 76 | 77 | * https://eli.thegreenplace.net/2017/restructuredtext-vs-markdown-for-technical-documentation/ 78 | 79 | 80 | ## Top Contributors 81 | 82 | We're big fans of [Distill](http://distill.pub/prize) and we like their idea of offering prizes for high-quality submissions. We don't have as much money as they do, but we'd still like to reward contributors in some way for contributing to the glossary. For instance, a cheatsheet cryptocurrency where tokens equal commits ;). Let us know if you have better ideas. In the end, this is an open-source project and we hope contributing to a repository of concise, accessible machine learning knowledge is enough incentive on its own! 
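
## Example Code Snippet

The Style Guide above asks for short python/numpy snippets. As a rough illustration (mirroring `code/activation_functions.py`), a glossary entry's code block might look like this:

```
import numpy as np

def sigmoid(z):
    # squash the weighted input z into the range (0, 1)
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_prime(z):
    # derivative of the sigmoid, used during backpropagation
    s = sigmoid(z)
    return s * (1 - s)

# usage: probabilities for a batch of weighted inputs
print(sigmoid(np.array([-2.0, 0.0, 2.0])))  # ~[0.12, 0.5, 0.88]
```

Keep snippets self-contained and runnable so readers can paste them straight into a notebook.
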
83 | 84 | 85 | ## Tips and Tricks 86 | 87 | * [Adding equations](http://www.sphinx-doc.org/en/stable/ext/math.html) 88 | * [Working with Jupyter Notebook](http://louistiao.me/posts/demos/ipython-notebook-demo/) 89 | * Quickstart with Jupyter notebook template 90 | * Graphs and charts 91 | * Importing images 92 | * Linking to code 93 | 94 | 95 | ## Resources 96 | 97 | * [Desmos Graphing Tool](https://www.desmos.com/calculator) 98 | * [3D Graphing Tool](https://www.geogebra.org/3d) 99 | * [How To Submit Pull Requests](https://help.github.com/articles/creating-a-pull-request/) 100 | * [RST Cheatsheet](http://docutils.sourceforge.net/docs/user/rst/quickref.html) 101 | * [Markdown Cheatsheet](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) 102 | * [Citation Generator](http://www.citationmachine.net) 103 | * [MathJax Cheatsheet](https://math.meta.stackexchange.com/questions/5020/mathjax-basic-tutorial-and-quick-reference) 104 | * [Embedding Math Equations](http://www.sphinx-doc.org/en/stable/ext/math.html) 105 | * [Sphinx Tutorial](https://pythonhosted.org/an_example_pypi_project/sphinx.html) 106 | * [Sphinx Docs](http://www.sphinx-doc.org/en/stable/markup/code.html) 107 | * [Sphinx Cheatsheet](http://openalea.gforge.inria.fr/doc/openalea/doc/_build/html/source/sphinx/rest_syntax.html) 108 | -------------------------------------------------------------------------------- /code/activation_functions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | ### Note ### 5 | 6 | # z is weighted input 7 | 8 | 9 | ### Functions ### 10 | 11 | def linear(z,m): 12 | return m*z 13 | 14 | def elu(z,alpha): 15 | return z if z >= 0 else alpha*(np.exp(z) - 1) 16 | 17 | def leakyrelu(z, alpha): 18 | return max(alpha * z, z) 19 | 20 | def relu(z): 21 | return max(0, z) 22 | 23 | def sigmoid(z): 24 | return 1.0 / (1 + np.exp(-z)) 25 | 26 | def tanh(z): 27 | return (np.exp(z) - np.exp(-z)) / (np.exp(z) + np.exp(-z)) 28 | 29 | 30 | 31 | 32 | ### Derivatives ### 33 | 34 | def linear_prime(z,m): 35 | return m 36 | 37 | def elu_prime(z,alpha): 38 | return 1 if z > 0 else alpha*np.exp(z) 39 | 40 | def leakyrelu_prime(z, alpha): 41 | return 1 if z > 0 else alpha 42 | 43 | def sigmoid_prime(z): 44 | return sigmoid(z) * (1-sigmoid(z)) 45 | 46 | def relu_prime(z): 47 | return 1 if z > 0 else 0 48 | 49 | def tanh_prime(z): 50 | return 1 - np.power(tanh(z), 2) 51 | 52 | -------------------------------------------------------------------------------- /code/autoencoder.py: -------------------------------------------------------------------------------- 1 | 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | 6 | class Autoencoder(nn.Module): 7 | def __init__(self, in_shape): 8 | super().__init__() 9 | c,h,w = in_shape 10 | self.encoder = nn.Sequential( 11 | nn.Linear(c*h*w, 128), 12 | nn.ReLU(), 13 | nn.Linear(128, 64), 14 | nn.ReLU(), 15 | nn.Linear(64, 12), 16 | nn.ReLU() 17 | ) 18 | self.decoder = nn.Sequential( 19 | nn.Linear(12, 64), 20 | nn.ReLU(), 21 | nn.Linear(64, 128), 22 | nn.ReLU(), 23 | nn.Linear(128, c*h*w), 24 | nn.Sigmoid() 25 | ) 26 | 27 | def forward(self, x): 28 | bs,c,h,w = x.size() 29 | x = x.view(bs, -1) 30 | x = self.encoder(x) 31 | x = self.decoder(x) 32 | x = x.view(bs, c, h, w) 33 | return x 34 | 35 | 36 | class ConvAutoencoder(nn.Module): 37 | def __init__(self, in_shape): 38 | super().__init__() 39 | c,h,w = in_shape 40 | self.encoder = nn.Sequential( 41 | nn.Conv2d(c, 16, kernel_size=3, 
stride=1, padding=1), # b, 16, 32, 32 42 | nn.ReLU(), 43 | nn.MaxPool2d(kernel_size=2, stride=2), # b, 16, 16, 16 44 | nn.Conv2d(16, 8, kernel_size=3, stride=1, padding=1), # b, 8, 16, 16 45 | nn.ReLU(), 46 | nn.MaxPool2d(kernel_size=2, stride=2) # b, 8, 8, 8 47 | ) 48 | self.decoder = nn.Sequential( 49 | nn.ConvTranspose2d(8, 16, kernel_size=3, stride=2, padding=0), # 16, 17, 17 50 | nn.ReLU(), 51 | nn.ConvTranspose2d(16, c, kernel_size=3, stride=2, padding=1), # 3, 33, 33 52 | CenterCrop(h, w), # 3, 32, 32 53 | nn.Sigmoid() 54 | ) 55 | 56 | def forward(self, x): 57 | x = self.encoder(x) 58 | x = self.decoder(x) 59 | return x 60 | 61 | 62 | def train(net, loader, loss_func, optimizer): 63 | net.train() 64 | for inputs, _ in loader: 65 | inputs = Variable(inputs) 66 | 67 | output = net(inputs) 68 | loss = loss_func(output, inputs) 69 | 70 | optimizer.zero_grad() 71 | loss.backward() 72 | optimizer.step() 73 | -------------------------------------------------------------------------------- /code/cnn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.autograd import Variable 3 | 4 | 5 | class CNN(nn.Module): 6 | def __init__(self, in_shape, n_classes): 7 | super().__init__() 8 | c, w, h = in_shape 9 | pool_layers = 2 10 | fc_h = int(h / 2**pool_layers) 11 | fc_w = int(w / 2**pool_layers) 12 | self.features = nn.Sequential( 13 | *conv_bn_relu(c, 16, kernel_size=1, stride=1, padding=0), 14 | *conv_bn_relu(16, 32, kernel_size=3, stride=1, padding=1), 15 | nn.MaxPool2d(kernel_size=2, stride=2), #size/2 16 | *conv_bn_relu(32, 64, kernel_size=3, stride=1, padding=1), 17 | nn.MaxPool2d(kernel_size=2, stride=2), #size/2 18 | ) 19 | self.classifier = nn.Sequential( 20 | *linear_bn_relu_drop(64 * fc_h * fc_w, 128, dropout=0.5), 21 | nn.Linear(128, n_classes), 22 | nn.Softmax(dim=1) 23 | ) 24 | 25 | def forward(self, x): 26 | x = self.features(x) 27 | x = x.view(x.size(0), -1) 28 | x = self.classifier(x) 29 | return x 30 | 31 | def conv_bn_relu(in_chans, out_chans, kernel_size=3, stride=1, 32 | padding=1, bias=False): 33 | return [ 34 | nn.Conv2d(in_chans, out_chans, kernel_size=kernel_size, 35 | stride=stride, padding=padding, bias=bias), 36 | nn.BatchNorm2d(out_chans), 37 | nn.ReLU(inplace=True), 38 | ] 39 | 40 | def linear_bn_relu_drop(in_chans, out_chans, dropout=0.5, bias=False): 41 | layers = [ 42 | nn.Linear(in_chans, out_chans, bias=bias), 43 | nn.BatchNorm1d(out_chans), 44 | nn.ReLU(inplace=True) 45 | ] 46 | if dropout > 0: 47 | layers.append(nn.Dropout(dropout)) 48 | return layers 49 | 50 | def train(net, loader, loss_func, optimizer): 51 | net.train() 52 | n_batches = len(loader) 53 | for inputs, targets in loader: 54 | inputs = Variable(inputs) 55 | targets = Variable(targets) 56 | 57 | output = net(inputs) 58 | loss = loss_func(output, targets) 59 | 60 | optimizer.zero_grad() 61 | loss.backward() 62 | optimizer.step() 63 | -------------------------------------------------------------------------------- /code/decision_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import stats 3 | from abc import ABCMeta 4 | from typing import List 5 | 6 | 7 | class TreeNode: 8 | def __init__(self, data_idx, depth, child_lst=[]): 9 | self.data_idx = data_idx 10 | self.depth = depth 11 | self.child = child_lst 12 | self.label = None 13 | self.split_col = None 14 | self.child_cate_order = None 15 | 16 | def set_attribute(self, split_col, 
child_cate_order=None): 17 | self.split_col = split_col 18 | self.child_cate_order = child_cate_order 19 | 20 | def set_label(self, label): 21 | self.label = label 22 | 23 | 24 | class DecisionTree(metaclass=ABCMeta): 25 | def __init__(self, max_depth, min_sample_leaf, min_split_criterion=1e-4, verbose=False): 26 | self.max_depth = max_depth 27 | self.min_sample_leaf = min_sample_leaf 28 | self.verbose = verbose 29 | self.min_split_criterion = min_split_criterion 30 | self.root = None 31 | self.data = None 32 | self.labels = None 33 | self.feature_num = None 34 | 35 | def fit(self, X, y): 36 | """ 37 | X: train data, dimensition [num_sample, num_feature] 38 | y: label, dimension [num_sample, ] 39 | """ 40 | self.data = X 41 | self.labels = y 42 | num_sample, num_feature = X.shape 43 | self.feature_num = num_feature 44 | data_idx = list(range(num_sample)) 45 | self.root = TreeNode(data_idx=data_idx, depth=0, child_lst=[]) 46 | queue = [self.root] 47 | while queue: 48 | node = queue.pop(0) 49 | if node.depth>self.max_depth or len(node.data_idx)==1: 50 | self.set_label(node) 51 | else: 52 | child_nodes = self.split_node(node) 53 | if not child_nodes: 54 | self.set_label(node) 55 | else: 56 | queue.extend(child_nodes) 57 | 58 | def predict(self, X): 59 | num_sample, num_feature = X.shape 60 | labels = [] 61 | for idx in range(num_sample): 62 | x = X[idx] 63 | node = self.root 64 | while node.child: 65 | node = self.get_nex_node(node, x) 66 | labels.append(node.label) 67 | return labels 68 | 69 | @classmethod 70 | def get_split_criterion(self, node, child_node_lst): 71 | pass 72 | 73 | def set_label(self, node): 74 | target_Y = self.labels[node.data_idx] 75 | target_label = stats.mode(target_Y).mode[0] 76 | node.set_label(label=target_label) 77 | 78 | @classmethod 79 | def split_node(self, node): 80 | pass 81 | 82 | @classmethod 83 | def get_nex_node(self, node, x): 84 | pass 85 | 86 | 87 | class ID3DecisionTree(DecisionTree): 88 | 89 | def split_node(self, node): 90 | child_node_lst = [] 91 | child_cate_order = [] 92 | informatin_gain = 0 93 | split_col = None 94 | for col_idx in range(self.feature_num): 95 | current_child_cate_order = list(np.unique(self.data[node.data_idx][:, col_idx])) 96 | current_child_node_lst = [] 97 | for col_value in current_child_cate_order: 98 | data_idx = np.intersect1d(node.data_idx, np.where(self.data[:, col_idx] == col_value)) 99 | current_child_node_lst.append( 100 | TreeNode( 101 | data_idx=data_idx, 102 | depth=node.depth+1 103 | ) 104 | ) 105 | current_gain = self.get_split_criterion(node, current_child_node_lst) 106 | if current_gain > informatin_gain: 107 | informatin_gain = current_gain 108 | child_node_lst = current_child_node_lst 109 | child_cate_order = current_child_cate_order 110 | split_col = col_idx 111 | if informatin_gain List[TreeNode]: 176 | child_node_lst = [] 177 | child_cate_order = None 178 | gini_index = float("inf") 179 | split_col = None 180 | for col_idx in range(self.feature_num): 181 | current_child_cate_order = list(np.unique(self.data[node.data_idx][:, col_idx])) 182 | current_child_cate_order.sort() 183 | for col_value in current_child_cate_order: 184 | left_data_idx = np.intersect1d(node.data_idx, np.where(self.data[:, col_idx] <= col_value)) 185 | right_data_idx = np.intersect1d(node.data_idx, np.where(self.data[:, col_idx] > col_value)) 186 | current_child_node_lst = [] 187 | if len(left_data_idx) != 0: 188 | left_tree = TreeNode( 189 | data_idx=left_data_idx, 190 | depth=node.depth+1, 191 | ) 192 | 
current_child_node_lst.append(left_tree) 193 | if len(right_data_idx) != 0: 194 | right_tree = TreeNode( 195 | data_idx=right_data_idx, 196 | depth=node.depth+1, 197 | ) 198 | current_child_node_lst.append(right_tree) 199 | current_gini_index = self.get_split_criterion(node, current_child_node_lst) 200 | if current_gini_index < gini_index: 201 | gini_index = current_gini_index 202 | child_node_lst = current_child_node_lst 203 | child_cate_order = col_value 204 | split_col = col_idx 205 | node.child = child_node_lst 206 | node.set_attribute(split_col=split_col, child_cate_order=child_cate_order) 207 | return child_node_lst 208 | 209 | def get_split_criterion(self, node, child_node_lst): 210 | total = len(node.data_idx) 211 | split_criterion = 0 212 | for child_node in child_node_lst: 213 | impurity = self.get_impurity(child_node.data_idx) 214 | split_criterion += len(child_node.data_idx) / float(total) * impurity 215 | return split_criterion 216 | 217 | def get_impurity(self, data_ids): 218 | target_y = self.labels[data_ids] 219 | total = len(target_y) 220 | if self.tree_type == "regression": 221 | res = 0 222 | mean_y = np.mean(target_y) 223 | for y in target_y: 224 | res += (y - mean_y) ** 2 / total 225 | elif self.tree_type == "classification": 226 | if self.split_criterion == "gini": 227 | res = 1 228 | unique_y = np.unique(target_y) 229 | for y in unique_y: 230 | num = len(np.where(target_y==y)[0]) 231 | res -= (num/float(total))**2 232 | elif self.split_criterion == "entropy": 233 | unique, count = np.unique(target_y, return_counts=True) 234 | res = 0 235 | for c in count: 236 | p = float(c) / total 237 | res -= p * np.log(p) 238 | return res 239 | 240 | def get_nex_node(self, node: TreeNode, x: np.array): 241 | col_value = x[node.split_col] 242 | if col_value> node.child_cate_order: 243 | index = 1 244 | else: 245 | index = 0 246 | return node.child[index] 247 | 248 | 249 | if __name__ == "__main__": 250 | # ID3: only categorical features 251 | from sklearn.model_selection import train_test_split 252 | from sklearn.metrics import classification_report, mean_squared_error 253 | from sklearn import datasets 254 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 255 | dataset = datasets.load_iris() 256 | 257 | 258 | # ############################# 259 | # ========== Config ========== 260 | # ############################# 261 | all_categorical_feature = True 262 | max_depth = 3 263 | min_sample_leaf = 4 264 | split_criterion = "entropy" 265 | # tree_type = "classification" 266 | tree_type = "regression" 267 | # ########################### 268 | 269 | # convert continuous feature to categorical features 270 | if all_categorical_feature: 271 | f = lambda x: int(x) 272 | func = np.vectorize(f) 273 | X = func(dataset.data) 274 | else: 275 | X = dataset.data 276 | 277 | Y = dataset.target 278 | X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.8) 279 | 280 | if tree_type == "classification": 281 | model = DecisionTreeClassifier(criterion=split_criterion, max_depth=max_depth, min_samples_leaf=min_sample_leaf) 282 | else: 283 | model = DecisionTreeRegressor(max_depth=max_depth, min_samples_leaf=min_sample_leaf) 284 | model.fit(X_train, y_train) 285 | y_pred = model.predict(X_test) 286 | if tree_type == "classification": 287 | print(classification_report(y_true=y_test, y_pred=y_pred)) 288 | else: 289 | print(mean_squared_error(y_test, y_pred)) 290 | # 291 | # model = ID3DecisionTree(max_depth=max_depth, min_sample_leaf=min_sample_leaf, verbose=True) 292 
| # model = C45DecisionTree(max_depth=max_depth, min_sample_leaf=min_sample_leaf, verbose=True) 293 | model = CART(max_depth=max_depth, min_sample_leaf=min_sample_leaf, verbose=True, tree_type=tree_type, split_criterion=split_criterion) 294 | model.fit(X_train, y_train) 295 | y_pred = model.predict(X_test) 296 | if tree_type == "classification": 297 | print(classification_report(y_true=y_test, y_pred=y_pred)) 298 | else: 299 | print(mean_squared_error(y_test, y_pred)) -------------------------------------------------------------------------------- /code/gan.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Generator(nn.Module): 5 | def __init__(self): 6 | super() 7 | self.net = nn.Sequential( 8 | nn.ConvTranspose2d( 200, 32 * 8, 4, 1, 0, bias=False), 9 | nn.BatchNorm2d(32 * 8), 10 | nn.ReLU(), 11 | nn.ConvTranspose2d(32 * 8, 32 * 4, 4, 2, 1, bias=False), 12 | nn.BatchNorm2d(32 * 4), 13 | nn.ReLU(), 14 | nn.ConvTranspose2d( 32 * 4, 32 * 2, 4, 2, 1, bias=False), 15 | nn.BatchNorm2d(32 * 2), 16 | nn.ReLU(), 17 | nn.ConvTranspose2d( 32 * 2, 32, 4, 2, 1, bias=False), 18 | nn.BatchNorm2d(32), 19 | nn.ReLU(), 20 | nn.ConvTranspose2d( 32, 1, 4, 2, 1, bias=False), 21 | nn.Tanh() 22 | ) 23 | def forward(self, tens): 24 | return self.net(tens) 25 | 26 | class Discriminator(nn.Module): 27 | def __init__(self): 28 | super() 29 | self.net = nn.Sequential( 30 | nn.Conv2d(1, 32, 4, 2, 1, bias=False), 31 | nn.LeakyReLU(0.2), 32 | nn.Conv2d(32, 32 * 2, 4, 2, 1, bias=False), 33 | nn.BatchNorm2d(32 * 2), 34 | nn.LeakyReLU(0.2), 35 | nn.Conv2d(32 * 2, 32 * 4, 4, 2, 1, bias=False), 36 | nn.BatchNorm2d(32 * 4), 37 | nn.LeakyReLU(0.2), 38 | # state size. (32*4) x 8 x 8 39 | nn.Conv2d(32 * 4, 32 * 8, 4, 2, 1, bias=False), 40 | nn.BatchNorm2d(32 * 8), 41 | nn.LeakyReLU(0.2), 42 | # state size. 
(32*8) x 4 x 4 43 | nn.Conv2d(32 * 8, 1, 4, 1, 0, bias=False), 44 | nn.Sigmoid() 45 | ) 46 | 47 | def forward(self, tens): 48 | return self.net(tens) 49 | 50 | def train(netD, netG, loader, loss_func, optimizerD, optimizerG, num_epochs): 51 | netD.train() 52 | netG.train() 53 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 54 | for epoch in range(num_epochs): 55 | for i, data in enumerate(loader, 0): 56 | netD.zero_grad() 57 | realtens = data[0].to(device) 58 | b_size = realtens.size(0) 59 | label = torch.full((b_size,), 1, dtype=torch.float, device=device) # gen labels 60 | output = netD(realtens) 61 | errD_real = loss_func(output, label) 62 | errD_real.backward() # backprop discriminator fake and real based on label 63 | noise = torch.randn(b_size, 200, 1, 1, device=device) 64 | fake = netG(noise) 65 | label.fill_(0) 66 | output = netD(fake.detach()).view(-1) 67 | errD_fake = loss_func(output, label) 68 | errD_fake.backward() # backprop discriminator fake and real based on label 69 | errD = errD_real + errD_fake # discriminator error 70 | optimizerD.step() 71 | netG.zero_grad() 72 | label.fill_(1) 73 | output = netD(fake).view(-1) 74 | errG = loss_func(output, label) # generator error 75 | errG.backward() 76 | optimizerG.step() 77 | -------------------------------------------------------------------------------- /code/id3_decision_tree_simple.py: -------------------------------------------------------------------------------- 1 | """Numpy Implementation of ID3 Decision Tree Classifier.""" 2 | import numpy as np 3 | from collections import Counter 4 | 5 | 6 | class id3_Classifier(): 7 | """ 8 | The ID3 classifier is based on information gain to split. 9 | 10 | Usage: 11 | model = id3_tree_classifier(least_children_num = 4, verbose=True) 12 | model.fit(X_train,y) 13 | model.predict(X_test) 14 | """ 15 | 16 | def __init__(self, least_children_num, verbose=True): 17 | """Constructor.""" 18 | self.least_children_num = least_children_num 19 | self.verbose = verbose 20 | 21 | def fit(self, tmp_x, tmp_y): 22 | """Fit function.""" 23 | def fit_tree(tmp_x, tmp_y): 24 | # Exit condition: 25 | if len(tmp_y) < self.least_children_num or len(np.unique(tmp_y)) == 1: 26 | 27 | if self.verbose: 28 | print('exit condition:') 29 | print('tmp_y:') 30 | print(tmp_y) 31 | 32 | mode_val = self._mode(tmp_y.flatten().tolist()) 33 | return([np.nan, mode_val, np.nan, np.nan]) 34 | 35 | # Otherwise Split: 36 | if self.verbose: 37 | print("start....subset Y len {}".format(len(tmp_y))) 38 | split_row, split_col = self._decide_split(tmp_x, tmp_y) 39 | if not split_row and not split_col: 40 | print('no better split...return mode') 41 | mode_val = self._mode(tmp_y.flatten().tolist()) 42 | return([np.nan, mode_val, np.nan, np.nan]) 43 | 44 | if self.verbose: 45 | print("split on:") 46 | print(split_row, split_col) 47 | split_vec = tmp_x[:, split_col] 48 | split_val = tmp_x[split_row, split_col] 49 | left_ind = np.where(split_vec < split_val)[0].tolist() 50 | right_ind = np.where(split_vec >= split_val)[0].tolist() 51 | left_dat, left_y = tmp_x[left_ind, :], tmp_y[left_ind, ] 52 | right_dat, right_y = tmp_x[right_ind, :], tmp_y[right_ind, ] 53 | 54 | left_tree = fit_tree(left_dat, left_y) 55 | right_tree = fit_tree(right_dat, right_y) 56 | 57 | if isinstance(left_tree, list): 58 | len_l_tree = 1 59 | else: 60 | len_l_tree = left_tree.shape[0] 61 | 62 | root = [split_col, split_val, 1, len_l_tree + 1] 63 | return(np.vstack([root, left_tree, right_tree])) 64 | tree = fit_tree(tmp_x, tmp_y) 65 | self.tree = tree 
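# How the fitted tree is stored: `self.tree` is a numpy array with one row per node,
# in the format [split_col, split_val, left_offset, right_offset]. Internal nodes use
# offset 1 for the left child and len(left_subtree) + 1 for the right child, both
# relative to the node's own row; leaves are stored as [np.nan, predicted_label,
# np.nan, np.nan], which is the exit condition `predict` checks while walking the array.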
66 | 67 | 68 | def _decide_split(self, x, y): 69 | """ 70 | Given subset of X,Y, 71 | search for the best splitting node based on: information gain. 72 | """ 73 | def _entropy(tmp_y): 74 | """Key Metrics of building a decision tree use Shannon Entropy.""" 75 | tmp_ent = 0 76 | for uni_y in np.unique(tmp_y): 77 | p = len(tmp_y[tmp_y == uni_y]) / len(tmp_y) 78 | tmp_ent -= (p * np.log2(p)) 79 | return tmp_ent 80 | 81 | m, n = x.shape 82 | best_gain = 0 83 | split_row, split_col = None, None 84 | 85 | previous_entropy = _entropy(y) 86 | for col in range(n): 87 | tmp_vec = x[:, col].ravel() 88 | for row in range(m): 89 | val = tmp_vec[row] 90 | # >= & < is the convention here: 91 | if val != np.max(tmp_vec) and val != np.min(tmp_vec): 92 | left_b = np.where(tmp_vec < val)[0].tolist() 93 | right_b = np.where(tmp_vec >= val)[0].tolist() 94 | 95 | new_ent = (len(y[left_b]) / len(y)) * _entropy(y[left_b]) + \ 96 | (len(y[right_b]) / len(y)) * _entropy(y[right_b]) 97 | info_gain = previous_entropy - new_ent 98 | 99 | if info_gain > best_gain: 100 | split_row, split_col = row, col 101 | best_gain = info_gain 102 | if self.verbose: 103 | print('better gain:{}'.format(best_gain)) 104 | print() 105 | return split_row, split_col 106 | 107 | def _mode(self, x_list): 108 | """Calculate the mode for splitting.""" 109 | return Counter(x_list).most_common(1)[0][0] 110 | 111 | def predict(self, tmp_test_array): 112 | """Wrap-up fun for prediction.""" 113 | def _query(tree, tmp_test_array): 114 | """Prediction for single example.""" 115 | assert len(tmp_test_array.shape) == 2, \ 116 | "Make sure your test data is 2d array" 117 | 118 | if isinstance(tree,list): 119 | start_node = tree # only the 1 row in data 120 | else: 121 | start_node = tree[0,:] # Iteratively hit first row 122 | 123 | test_feat, test_val, left_tree_jump, right_tree_jump = \ 124 | start_node[0], start_node[1], start_node[2], start_node[3] 125 | 126 | if np.isnan(test_feat) and np.isnan(left_tree_jump) and \ 127 | np.isnan(right_tree_jump): 128 | 129 | pred = test_val 130 | return pred 131 | 132 | if tmp_test_array[0, int(test_feat)] < test_val: 133 | # If <, go left branch: 134 | jump_loc = left_tree_jump 135 | pred = _query(tree[int(jump_loc):, ], tmp_test_array) 136 | 137 | else: 138 | # If >=, go right branch: 139 | jump_loc = right_tree_jump 140 | pred = _query(tree[int(jump_loc):, ], tmp_test_array) 141 | 142 | return pred 143 | 144 | assert len(tmp_test_array.shape) == 2, \ 145 | "Make sure test data is 2d-array" 146 | result = [] 147 | 148 | for i in range(tmp_test_array.shape[0]): 149 | inp = tmp_test_array[i, :].reshape(1, -1) 150 | result.append(_query(self.tree, inp)) 151 | return result 152 | -------------------------------------------------------------------------------- /code/knn.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from math import sqrt 3 | 4 | 5 | def euclidean_distance(point1, point2): 6 | distance = 0 7 | for i in range(len(point1)): 8 | distance +=(point1[i] - point2[i]) ** 2 9 | return sqrt(distance) 10 | 11 | 12 | def mean(labels): 13 | return sum(labels) / len(labels) 14 | 15 | 16 | def mode(labels): 17 | return Counter(labels).most_common(1)[0][0] 18 | 19 | 20 | def KNN(training_data, target, k, func): 21 | """ 22 | training_data: all training data point 23 | target: new point 24 | k: user-defined constant, number of closest training data 25 | func: functions used to get the the target label 26 | """ 27 | # Step one: calculate the 
Euclidean distance between the new point and all training data 28 | neighbors= [] 29 | for index, data in enumerate(training_data): 30 | # distance between the target data and the current example from the data. 31 | distance = euclidean_distance(data[:-1], target) 32 | neighbors.append((distance, index)) 33 | 34 | # Step two: pick the top-K closest training data 35 | sorted_neighbors = sorted(neighbors) 36 | k_nearest = sorted_neighbors[:k] 37 | 38 | # Get the labels of the selected K entries 39 | k_nearest_labels = [training_data[i][1] for distance, i in k_nearest] 40 | 41 | # Step three: For regression problem, take the average of the labels as the result; 42 | # for classification problem, take the most common label of these labels as the result. 43 | return k_nearest, func(k_nearest_labels) 44 | 45 | 46 | def main(): 47 | """ 48 | # Regression Data(Column 0 : Height(inch), Column 1: Weight(lb)) 49 | """ 50 | reg_data = [ 51 | [73.84, 241.89], 52 | [68.78, 162.31], 53 | [74.11, 212.74], 54 | [71.73, 220.04], 55 | [69.88, 206.34], 56 | [67.25, 152.21], 57 | [63.45, 156.39] 58 | ] 59 | 60 | target_data = [70] 61 | reg_k_nearest_neighbors, reg_prediction = KNN( 62 | reg_data, target_data, k=3, func=mean 63 | ) 64 | print(reg_prediction) 65 | ''' 66 | # Classification Data( Column 0: age, Column 1:like paragliding or not ) 67 | ''' 68 | clf_data = [ 69 | [26, 1], 70 | [20, 1], 71 | [22, 1], 72 | [19, 1], 73 | [28, 0], 74 | [33, 0], 75 | [30, 0], 76 | [50, 0], 77 | ] 78 | target_data2 = [32] 79 | clf_k_nearest_neighbors, clf_prediction = KNN( 80 | clf_data, target_data2, k=3, func=mode 81 | ) 82 | print(clf_prediction) 83 | 84 | 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /code/layers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from scipy.special import softmax 4 | from scipy.special import expit 5 | from typing import List 6 | 7 | 8 | def BatchNorm(): 9 | # From https://wiseodd.github.io/techblog/2016/07/04/batchnorm/ 10 | # TODO: Add doctring for variable names. Add momentum to init. 11 | def __init__(self): 12 | pass 13 | 14 | def forward(self, X, gamma, beta): 15 | mu = np.mean(X, axis=0) 16 | var = np.var(X, axis=0) 17 | 18 | X_norm = (X - mu) / np.sqrt(var + 1e-8) 19 | out = gamma * X_norm + beta 20 | 21 | cache = (X, X_norm, mu, var, gamma, beta) 22 | 23 | return out, cache, mu, var 24 | 25 | def backward(self, dout, cache): 26 | X, X_norm, mu, var, gamma, beta = cache 27 | 28 | N, D = X.shape 29 | 30 | X_mu = X - mu 31 | std_inv = 1. / np.sqrt(var + 1e-8) 32 | 33 | dX_norm = dout * gamma 34 | dvar = np.sum(dX_norm * X_mu, axis=0) * -.5 * std_inv**3 35 | dmu = np.sum(dX_norm * -std_inv, axis=0) + dvar * np.mean(-2. 
* X_mu, axis=0) 36 | 37 | dX = (dX_norm * std_inv) + (dvar * 2 * X_mu / N) + (dmu / N) 38 | dgamma = np.sum(dout * X_norm, axis=0) 39 | dbeta = np.sum(dout, axis=0) 40 | 41 | return dX, dgamma, dbeta 42 | 43 | 44 | class RNN: 45 | def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, batch_size=1) -> None: 46 | self.input_dim = input_dim 47 | self.hidden_dim = hidden_dim 48 | self.out_dim = output_dim 49 | self.batch_size = batch_size 50 | # initialization 51 | self.params = self._init_params() 52 | self.hidden_state = self._init_hidden_state() 53 | 54 | def _init_params(self) -> List[np.array]: 55 | scale = 0.01 56 | Waa = np.random.normal(scale=scale, size=[self.hidden_dim, self.hidden_dim]) 57 | Wax = np.random.normal(scale=scale, size=[self.hidden_dim, self.input_dim]) 58 | Wy = np.random.normal(scale=scale, size=[self.out_dim, self.hidden_dim]) 59 | ba = np.zeros(shape=[self.hidden_dim, 1]) 60 | by = np.zeros(shape=[self.out_dim, 1]) 61 | return [Waa, Wax, Wy, ba, by] 62 | 63 | def _init_hidden_state(self) -> np.array: 64 | return np.zeros(shape=[self.hidden_dim, self.batch_size]) 65 | 66 | def forward(self, input_vector: np.array) -> np.array: 67 | """ 68 | input_vector: 69 | dimension: [num_steps, self.input_dim, self.batch_size] 70 | out_vector: 71 | dimension: [num_steps, self.output_dim, self.batch_size] 72 | """ 73 | Waa, Wax, Wy, ba, by = self.params 74 | output_vector = [] 75 | for vector in input_vector: 76 | self.hidden_state = np.tanh( 77 | np.dot(Waa, self.hidden_state) + np.dot(Wax, vector) + ba 78 | ) 79 | y = softmax( 80 | np.dot(Wy, self.hidden_state) + by 81 | ) 82 | output_vector.append(y) 83 | return np.array(output_vector) 84 | 85 | 86 | class GRU: 87 | def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, batch_size=1) -> None: 88 | self.input_dim = input_dim 89 | self.hidden_dim = hidden_dim 90 | self.out_dim = output_dim 91 | self.batch_size = batch_size 92 | # initialization 93 | self.params = self._init_params() 94 | self.hidden_state = self._init_hidden_state() 95 | 96 | def _init_params(self) -> List[np.array]: 97 | scale = 0.01 98 | def param_single_layer(): 99 | w = np.random.normal(scale=scale, size=(self.hidden_dim, self.hidden_dim+input_dim)) 100 | b = np.zeros(shape=[self.hidden_dim, 1]) 101 | return w, b 102 | 103 | # reset, update gate 104 | Wr, br = param_single_layer() 105 | Wu, bu = param_single_layer() 106 | # output layer 107 | Wy = np.random.normal(scale=scale, size=[self.out_dim, self.hidden_dim]) 108 | by = np.zeros(shape=[self.out_dim, 1]) 109 | return [Wr, br, Wu, bu, Wy, by] 110 | 111 | def _init_hidden_state(self) -> np.array: 112 | return np.zeros(shape=[self.hidden_dim, self.batch_size]) 113 | 114 | def forward(self, input_vector: np.array) -> np.array: 115 | """ 116 | input_vector: 117 | dimension: [num_steps, self.input_dim, self.batch_size] 118 | out_vector: 119 | dimension: [num_steps, self.output_dim, self.batch_size] 120 | """ 121 | Wr, br, Wu, bu, Wy, by = self.params 122 | output_vector = [] 123 | for vector in input_vector: 124 | # expit in scipy is sigmoid function 125 | reset_gate = expit( 126 | np.dot(Wr, np.concatenate([self.hidden_state, vector], axis=0)) + br 127 | ) 128 | update_gate = expit( 129 | np.dot(Wu, np.concatenate([self.hidden_state, vector], axis=0)) + bu 130 | ) 131 | candidate_hidden = np.tanh( 132 | reset_gate * self.hidden_state 133 | ) 134 | self.hidden_state = update_gate * self.hidden_state + (1-update_gate) * candidate_hidden 135 | y = softmax( 136 | np.dot(Wy, 
self.hidden_state) + by 137 | ) 138 | output_vector.append(y) 139 | return np.array(output_vector) 140 | 141 | 142 | class LSTM: 143 | def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, batch_size=1) -> None: 144 | self.input_dim = input_dim 145 | self.hidden_dim = hidden_dim 146 | self.out_dim = output_dim 147 | self.batch_size = batch_size 148 | # initialization 149 | self.params = self._init_params() 150 | self.hidden_state = self._init_hidden_state() 151 | self.memory_state = self._init_hidden_state() 152 | 153 | def _init_params(self) -> List[np.array]: 154 | scale = 0.01 155 | def param_single_layer(): 156 | w = np.random.normal(scale=scale, size=(self.hidden_dim, self.hidden_dim+input_dim)) 157 | b = np.zeros(shape=[self.hidden_dim, 1]) 158 | return w, b 159 | 160 | # forget, input, output gate + candidate memory state 161 | Wf, bf = param_single_layer() 162 | Wi, bi = param_single_layer() 163 | Wo, bo = param_single_layer() 164 | Wc, bc = param_single_layer() 165 | # output layer 166 | Wy = np.random.normal(scale=scale, size=[self.out_dim, self.hidden_dim]) 167 | by = np.zeros(shape=[self.out_dim, 1]) 168 | return [Wf, bf, Wi, bi, Wo, bo, Wc, bc, Wy, by] 169 | 170 | def _init_hidden_state(self) -> np.array: 171 | return np.zeros(shape=[self.hidden_dim, self.batch_size]) 172 | 173 | def forward(self, input_vector: np.array) -> np.array: 174 | """ 175 | input_vector: 176 | dimension: [num_steps, self.input_dim, self.batch_size] 177 | out_vector: 178 | dimension: [num_steps, self.output_dim, self.batch_size] 179 | """ 180 | Wf, bf, Wi, bi, Wo, bo, Wc, bc, Wy, by = self.params 181 | output_vector = [] 182 | for vector in input_vector: 183 | # expit in scipy is sigmoid function 184 | foget_gate = expit( 185 | np.dot(Wf, np.concatenate([self.hidden_state, vector], axis=0)) + bf 186 | ) 187 | input_gate = expit( 188 | np.dot(Wi, np.concatenate([self.hidden_state, vector], axis=0)) + bi 189 | ) 190 | output_gate = expit( 191 | np.dot(Wo, np.concatenate([self.hidden_state, vector], axis=0)) + bo 192 | ) 193 | candidate_memory = np.tanh( 194 | np.dot(Wc, np.concatenate([self.hidden_state, vector], axis=0)) + bc 195 | ) 196 | self.memory_state = foget_gate * self.memory_state + input_gate * candidate_memory 197 | self.hidden_state = output_gate * np.tanh(self.memory_state) 198 | y = softmax( 199 | np.dot(Wy, self.hidden_state) + by 200 | ) 201 | output_vector.append(y) 202 | return np.array(output_vector) 203 | 204 | 205 | def Adagrad(data): 206 | pass 207 | 208 | 209 | def Adam(data): 210 | pass 211 | 212 | 213 | def LBFGS(data): 214 | pass 215 | 216 | 217 | def RMSProp(data): 218 | pass 219 | 220 | 221 | # def SGD(data, batch_size, lr): 222 | # N = len(data) 223 | # np.random.shuffle(data) 224 | # mini_batches = np.array([data[i:i+batch_size] 225 | # for i in range(0, N, batch_size)]) 226 | # for X,y in mini_batches: 227 | # backprop(X, y, lr) 228 | 229 | 230 | def SGD_Momentum(): 231 | pass 232 | 233 | 234 | if __name__ == "__main__": 235 | input_data = np.array([ 236 | [ 237 | [1, 3] 238 | , [2, 4] 239 | , [3, 6] 240 | ] 241 | , [ 242 | [4, 3] 243 | , [3, 4] 244 | , [1, 5] 245 | ] 246 | ]) 247 | batch_size = 2 248 | input_dim = 3 249 | output_dim = 4 250 | hidden_dim = 5 251 | time_step = 2 252 | # rnn = RNN(input_dim=input_dim, batch_size=batch_size, output_dim=output_dim, hidden_dim=hidden_dim) 253 | # output_vector = rnn.forward(input_vector=input_data) 254 | # print("RNN:") 255 | # print(f"Input data dimensions: {input_data.shape}") 256 | # print(f"Output data 
dimensions {output_vector.shape}") 257 | rnn = GRU(input_dim=input_dim, batch_size=batch_size, output_dim=output_dim, hidden_dim=hidden_dim) 258 | output_vector = rnn.forward(input_vector=input_data) 259 | print("LSTM:") 260 | print(f"Input data dimensions: {input_data.shape}") 261 | print(f"Output data dimensions {output_vector.shape}") -------------------------------------------------------------------------------- /code/logistic_regression.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy 3 | from activation_functions import sigmoid, sigmoid_prime 4 | 5 | 6 | def predict(features, weights): 7 | ''' 8 | Returns 1D array of probabilities 9 | that the class label == 1 10 | ''' 11 | z = np.dot(features, weights) 12 | return sigmoid(z) 13 | 14 | 15 | def cost_function(features, labels, weights): 16 | ''' 17 | Using Mean Absolute Error 18 | 19 | Features:(100,3) 20 | Labels: (100,1) 21 | Weights:(3,1) 22 | Returns 1D matrix of predictions 23 | Cost = (labels*log(predictions) + (1-labels)*log(1-predictions) ) / len(labels) 24 | ''' 25 | observations = len(labels) 26 | 27 | predictions = predict(features, weights) 28 | 29 | #Take the error when label=1 30 | class1_cost = -labels*np.log(predictions) 31 | 32 | #Take the error when label=0 33 | class2_cost = (1-labels)*np.log(1-predictions) 34 | 35 | #Take the sum of both costs 36 | cost = class1_cost - class2_cost 37 | 38 | #Take the average cost 39 | cost = cost.sum() / observations 40 | 41 | return cost 42 | 43 | 44 | def update_weights(features, labels, weights, lr): 45 | ''' 46 | Vectorized Gradient Descent 47 | 48 | Features:(200, 3) 49 | Labels: (200, 1) 50 | Weights:(3, 1) 51 | ''' 52 | N = len(features) 53 | 54 | #1 - Get Predictions 55 | predictions = predict(features, weights) 56 | 57 | #2 Transpose features from (200, 3) to (3, 200) 58 | # So we can multiply w the (200,1) cost matrix. 
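    #   (For sigmoid + cross-entropy the gradient simplifies to
    #    dJ/dw = (1/N) * features.T.dot(predictions - labels),
    #    which is exactly what the next two steps compute.)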
59 | # Returns a (3,1) matrix holding 3 partial derivatives -- 60 | # one for each feature -- representing the aggregate 61 | # slope of the cost function across all observations 62 | gradient = np.dot(features.T, predictions - labels) 63 | 64 | #3 Take the average cost derivative for each feature 65 | gradient /= N 66 | 67 | #4 - Multiply the gradient by our learning rate 68 | gradient *= lr 69 | 70 | #5 - Subtract from our weights to minimize cost 71 | weights -= gradient 72 | 73 | return weights 74 | 75 | 76 | def decision_boundary(prob): 77 | return 1 if prob >= .5 else 0 78 | 79 | 80 | def classify(predictions): 81 | ''' 82 | input - N element array of predictions between 0 and 1 83 | output - N element array of 0s (False) and 1s (True) 84 | ''' 85 | decision_boundary = np.vectorize(decision_boundary) 86 | return decision_boundary(predictions).flatten() 87 | 88 | 89 | def train(features, labels, weights, lr, iters): 90 | cost_history = [] 91 | 92 | for i in range(iters): 93 | weights = update_weights(features, labels, weights, lr) 94 | 95 | #Calculate error for auditing purposes 96 | cost = cost_function(features, labels, weights) 97 | cost_history.append(cost) 98 | 99 | # Log Progress 100 | if i % 1000 == 0: 101 | print "iter: "+str(i) + " cost: "+str(cost) 102 | 103 | return weights, cost_history 104 | 105 | 106 | def accuracy(predicted_labels, actual_labels): 107 | diff = predicted_labels - actual_labels 108 | return 1.0 - (float(np.count_nonzero(diff)) / len(diff)) 109 | 110 | 111 | def plot_decision_boundary(trues, falses): 112 | fig = plt.figure() 113 | ax = fig.add_subplot(111) 114 | 115 | no_of_preds = len(trues) + len(falses) 116 | 117 | ax.scatter([i for i in range(len(trues))], trues, s=25, c='b', marker="o", label='Trues') 118 | ax.scatter([i for i in range(len(falses))], falses, s=25, c='r', marker="s", label='Falses') 119 | 120 | plt.legend(loc='upper right'); 121 | ax.set_title("Decision Boundary") 122 | ax.set_xlabel('N/2') 123 | ax.set_ylabel('Predicted Probability') 124 | plt.axhline(.5, color='black') 125 | plt.show() 126 | -------------------------------------------------------------------------------- /code/logistic_regression_scipy.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | from sklearn.linear_model import LogisticRegression 3 | from sklearn.cross_validation import train_test_split 4 | 5 | # Normalize grades to values between 0 and 1 for more efficient computation 6 | normalized_range = sklearn.preprocessing.MinMaxScaler(feature_range=(-1,1)) 7 | 8 | # Extract Features + Labels 9 | labels.shape = (100,) #scikit expects this 10 | features = normalized_range.fit_transform(features) 11 | 12 | # Create Test/Train 13 | features_train,features_test,labels_train,labels_test = train_test_split(features,labels,test_size=0.4) 14 | 15 | # Scikit Logistic Regression 16 | scikit_log_reg = LogisticRegression() 17 | scikit_log_reg.fit(features_train,labels_train) 18 | 19 | #Score is Mean Accuracy 20 | scikit_score = clf.score(features_test,labels_test) 21 | print 'Scikit score: ', scikit_score 22 | 23 | #Our Mean Accuracy 24 | observations, features, labels, weights = run() 25 | probabilities = predict(features, weights).flatten() 26 | classifications = classifier(probabilities) 27 | our_acc = accuracy(classifications,labels.flatten()) 28 | print 'Our score: ',our_acc 29 | -------------------------------------------------------------------------------- /code/loss_functions.py: 
-------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | ### Note ### 5 | 6 | # yHat is prediction 7 | # y is the target (true label) 8 | 9 | 10 | ### Functions ### 11 | 12 | def CrossEntropy(yHat, y): 13 | if y == 1: 14 | return -log(yHat) 15 | else: 16 | return -log(1 - yHat) 17 | 18 | 19 | def Dice(yHat, y): 20 | total = np.sum(y, dim=1) + np.sum(yHat, dim=1) 21 | intersection = np.sum(y * yHat, dim=1) 22 | dice = (2.0 * intersection) / (total + 1e-7) 23 | return np.mean(dice) 24 | 25 | 26 | def Hinge(yHat, y): 27 | return np.max(0, y - (1-2*y)*yHat) 28 | 29 | 30 | def Huber(yHat, y, delta=1.): 31 | return np.where(np.abs(y-yHat) < delta,.5*(y-yHat)**2 , delta*(np.abs(y-yHat)-0.5*delta)) 32 | 33 | 34 | def KLDivergence(yHat, y): 35 | """ 36 | :param yHat: 37 | :param y: 38 | :return: KLDiv(yHat || y) 39 | """ 40 | return np.sum(yHat * np.log((yHat / y))) 41 | 42 | 43 | def L1(yHat, y): 44 | return np.sum(np.absolute(yHat - y)) / y.size 45 | 46 | def root_mean_square_error(y_hat: np.ndarray, y: np.ndarray) -> float: 47 | return np.sqrt(np.sum((y_hat - y)**2) / y.size) 48 | 49 | def L2(yHat, y): 50 | return np.sum((yHat - y)**2) 51 | 52 | 53 | def MLE(yHat, y): 54 | pass 55 | 56 | 57 | def MSE(yHat, y): 58 | return np.sum((yHat - y)**2) / y.size 59 | 60 | 61 | ### Derivatives ### 62 | 63 | def MSE_prime(yHat, y): 64 | return yHat - y 65 | -------------------------------------------------------------------------------- /code/mlp.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torch.autograd import Variable 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | # import torchvision module to handle image manipulation 8 | import torchvision 9 | import torchvision.transforms as transforms 10 | 11 | # calculate train time, writing train data to files etc. 
12 | import time 13 | import pandas as pd 14 | import json 15 | 16 | 17 | class MLP(nn.Module): 18 | def __init__(self): 19 | super(MLP,self).__init__() 20 | # define layers 21 | self.fc1 = nn.Linear(in_features=28*28, out_features=500) 22 | self.fc2 = nn.Linear(in_features=500, out_features=200) 23 | self.fc3 = nn.Linear(in_features=200, out_features=100) 24 | self.out = nn.Linear(in_features=100, out_features=10) 25 | 26 | 27 | def forward(self, t): 28 | # fc1 make input 1 dimentional 29 | t = t.view(-1,28*28) 30 | t = self.fc1(t) 31 | t = F.relu(t) 32 | # fc2 33 | t = self.fc2(t) 34 | t = F.relu(t) 35 | # fc3 36 | t = self.fc3(t) 37 | t = F.relu(t) 38 | # output 39 | t = self.out(t) 40 | return t 41 | 42 | def train(net, loader, loss_func, optimizer): 43 | net.train() 44 | n_batches = len(loader) 45 | for inputs, targets in loader: 46 | inputs = Variable(inputs) 47 | targets = Variable(targets) 48 | 49 | output = net(inputs) 50 | loss = loss_func(output, targets) 51 | 52 | optimizer.zero_grad() 53 | loss.backward() 54 | optimizer.step() 55 | # print statistics 56 | running_loss = loss.item() 57 | print('Training loss: %.3f' %( running_loss)) 58 | 59 | def main(): 60 | train_set = torchvision.datasets.FashionMNIST( 61 | root = './FMNIST', 62 | train = True, 63 | download = False, 64 | transform = transforms.Compose([ 65 | transforms.ToTensor() 66 | ]) 67 | ) 68 | mlp = MLP() 69 | loader = torch.utils.data.DataLoader(train_set, batch_size = 1000) 70 | optimizer = optim.Adam(mlp.parameters(), lr=0.01) 71 | loss_func=nn.CrossEntropyLoss() 72 | for i in range(0,15): 73 | train(mlp,loader,loss_func,optimizer) 74 | print("Finished Training") 75 | torch.save(mlp.state_dict(), "./mlpmodel.pt") 76 | test_set = torchvision.datasets.FashionMNIST( 77 | root = './FMNIST', 78 | train = False, 79 | download = False, 80 | transform = transforms.Compose([ 81 | transforms.ToTensor() 82 | ]) 83 | ) 84 | testloader = torch.utils.data.DataLoader(test_set, batch_size=4) 85 | correct = 0 86 | total = 0 87 | with torch.no_grad(): 88 | for data in testloader: 89 | images, labels = data 90 | outputs = mlp(images) 91 | _, predicted = torch.max(outputs.data, 1) 92 | total += labels.size(0) 93 | correct += (predicted == labels).sum().item() 94 | print('Accuracy of the network on the 10000 test images: %d %%' % ( 95 | 100 * correct / total)) 96 | 97 | 98 | main() 99 | -------------------------------------------------------------------------------- /code/nn_matrix.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | # Neural Network w Matrices 5 | 6 | INPUT_LAYER_SIZE = 1 7 | HIDDEN_LAYER_SIZE = 2 8 | OUTPUT_LAYER_SIZE = 2 9 | 10 | def init_weights(): 11 | Wh = np.random.randn(INPUT_LAYER_SIZE, HIDDEN_LAYER_SIZE) * \ 12 | np.sqrt(2.0/INPUT_LAYER_SIZE) 13 | Wo = np.random.randn(HIDDEN_LAYER_SIZE, OUTPUT_LAYER_SIZE) * \ 14 | np.sqrt(2.0/HIDDEN_LAYER_SIZE) 15 | 16 | 17 | def init_bias(): 18 | Bh = np.full((1, HIDDEN_LAYER_SIZE), 0.1) 19 | Bo = np.full((1, OUTPUT_LAYER_SIZE), 0.1) 20 | return Bh, Bo 21 | 22 | def relu(Z): 23 | return np.maximum(0, Z) 24 | 25 | def relu_prime(Z): 26 | ''' 27 | Z - weighted input matrix 28 | 29 | Returns gradient of Z where all 30 | negative values are set to 0 and 31 | all positive values set to 1 32 | ''' 33 | Z[Z < 0] = 0 34 | Z[Z > 0] = 1 35 | return Z 36 | 37 | def cost(yHat, y): 38 | cost = np.sum((yHat - y)**2) / 2.0 39 | return cost 40 | 41 | def cost_prime(yHat, y): 42 | return yHat - y 43 | 44 | def 
feed_forward(X): 45 | ''' 46 | X - input matrix 47 | Zh - hidden layer weighted input 48 | Zo - output layer weighted input 49 | H - hidden layer activation 50 | y - output layer 51 | yHat - output layer predictions 52 | ''' 53 | 54 | # Hidden layer 55 | Zh = np.dot(X, Wh) + Bh 56 | H = relu(Zh) 57 | 58 | # Output layer 59 | Zo = np.dot(H, Wo) + Bo 60 | yHat = relu(Zo) 61 | return yHat 62 | 63 | def backprop(X, y, lr): 64 | 65 | yHat = feed_forward(X) 66 | 67 | # Layer Error 68 | Eo = (yHat - y) * relu_prime(Zo) 69 | Eh = np.dot(Eo, Wo.T) * relu_prime(Zh) 70 | 71 | # Cost derivative for weights 72 | dWo = np.dot(H.T, Eo) 73 | dWh = np.dot(X.T, Eh) 74 | 75 | # Cost derivative for bias 76 | dBo = np.sum(Eo, axis=0, keepdims=True) 77 | dBh = np.sum(Eh, axis=0, keepdims=True) 78 | 79 | # Update weights 80 | Wo -= lr * dWo 81 | Wh -= lr * dWh 82 | 83 | # Update biases 84 | Bo -= lr * dBo 85 | Bh -= lr * dBh 86 | 87 | 88 | -------------------------------------------------------------------------------- /code/nn_simple.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | def relu(z): 5 | return max(0,z) 6 | 7 | def feed_forward(x, Wh, Wo): 8 | # Hidden layer 9 | Zh = x * Wh 10 | H = relu(Zh) 11 | 12 | # Output layer 13 | Zo = H * Wo 14 | output = relu(Zo) 15 | return output 16 | 17 | def relu_prime(z): 18 | if z > 0: 19 | return 1 20 | return 0 21 | 22 | def cost(yHat, y): 23 | return 0.5 * (yHat - y)**2 24 | 25 | def cost_prime(yHat, y): 26 | return yHat - y 27 | 28 | def backprop(x, y, Wh, Wo, lr): 29 | yHat = feed_forward(x, Wh, Wo) 30 | 31 | # Layer Error 32 | Eo = (yHat - y) * relu_prime(Zo) 33 | Eh = Eo * Wo * relu_prime(Zh) 34 | 35 | # Cost derivative for weights 36 | dWo = Eo * H 37 | dWh = Eh * x 38 | 39 | # Update weights 40 | Wh -= lr * dWh 41 | Wo -= lr * dWo 42 | -------------------------------------------------------------------------------- /code/optimizers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | 5 | def Adadelta(weights, sqrs, deltas, rho, batch_size): 6 | eps_stable = 1e-5 7 | for weight, sqr, delta in zip(weights, sqrs, deltas): 8 | g = weight.grad / batch_size 9 | sqr[:] = rho * sqr + (1. - rho) * nd.square(g) 10 | cur_delta = nd.sqrt(delta + eps_stable) / nd.sqrt(sqr + eps_stable) * g 11 | delta[:] = rho * delta + (1. - rho) * cur_delta * cur_delta 12 | # update weight in place. 
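        # (this snippet appears to assume MXNet-style arrays: `nd` would be mxnet.ndarray
        #  and each `weight` carries a populated `.grad`; cur_delta is the classic Adadelta
        #  step RMS[delta] / RMS[grad] * grad)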
13 | weight[:] -= cur_delta 14 | 15 | 16 | def Adagrad(data): # assumes `weights`, `lr`, `epsilon`, `num_iterations` and a `compute_gradients` helper are defined by the surrounding training code 17 | gradient_sums = np.zeros(weights.shape[0]) 18 | for t in range(num_iterations): 19 | gradients = compute_gradients(data, weights) 20 | gradient_sums += gradients ** 2 21 | gradient_update = gradients / (np.sqrt(gradient_sums + epsilon)) 22 | weights = weights - lr * gradient_update 23 | return weights 24 | 25 | 26 | def Adam(data): 27 | pass 28 | 29 | 30 | def LBFGS(data): 31 | pass 32 | 33 | 34 | def RMSProp(data): 35 | pass 36 | 37 | 38 | def SGD(data, batch_size, lr): 39 | N = len(data) 40 | np.random.shuffle(data) 41 | mini_batches = np.array([data[i:i+batch_size] 42 | for i in range(0, N, batch_size)]) 43 | for X,y in mini_batches: 44 | backprop(X, y, lr) 45 | 46 | 47 | def SGD_Momentum(): 48 | pass 49 | 50 | -------------------------------------------------------------------------------- /code/random_forest_classifier.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_breast_cancer 2 | import numpy as np 3 | from collections import Counter 4 | import multiprocessing as mp 5 | import scipy 6 | import time 7 | 8 | # Basic ID3 Tree 9 | class id3_tree(): 10 | 'Implementation of ID3 Decision Tree in Python, majorly in NumPy' 11 | def __init__(self,least_children_num,verbose=True): 12 | self.least_children_num = least_children_num 13 | self.verbose = verbose 14 | 15 | def fit(self,tmp_x,tmp_y): 16 | def fit_tree(tmp_x,tmp_y): 17 | # Exit Condition 0: 18 | # Exit Condition 1: 19 | if \ 20 | len(tmp_y) < self.least_children_num or len(np.unique(tmp_y))==1: 21 | 22 | if self.verbose: 23 | print('exit condition:') 24 | print('tmp_y:') 25 | print(tmp_y) 26 | 27 | mode_val = self.mode(tmp_y.flatten().tolist()) 28 | return([np.nan, mode_val, np.nan, np.nan]) # Leaf Node: format [feat,splitval,] 29 | 30 | # Otherwise Split: 31 | if self.verbose: 32 | print("start....subset Y len {}".format(len(tmp_y))) 33 | 34 | 35 | split_row,split_col = self.decide_split_data(tmp_x,tmp_y) 36 | 37 | if not split_row and not split_col: 38 | mode_val = self.mode(tmp_y.flatten().tolist()) 39 | return([np.nan, mode_val, np.nan, np.nan]) 40 | 41 | if self.verbose: 42 | print("split on:") 43 | print(split_row,split_col) 44 | 45 | split_vec = tmp_x[:,split_col] 46 | split_val = tmp_x[split_row,split_col] 47 | # Recursively Split to left and right branches: 48 | left_ind = np.where(split_vec<split_val)[0].tolist() 49 | right_ind = np.where(split_vec>=split_val)[0].tolist() 50 | left_dat,left_y = tmp_x[left_ind,:],tmp_y[left_ind,] 51 | right_dat,right_y = tmp_x[right_ind,:],tmp_y[right_ind,] 52 | 53 | left_tree = fit_tree(left_dat,left_y) 54 | right_tree = fit_tree(right_dat,right_y) 55 | 56 | if isinstance(left_tree, list): # If list, tree len 1 57 | len_l_tree = 1 58 | else: 59 | len_l_tree = left_tree.shape[0] # If array, tree len >1 60 | 61 | root = [split_col,split_val,1,len_l_tree+1] # Format [split_col, split_val, left_tree_relative_idx, right_tree_relative_idx] 62 | return(np.vstack([root,left_tree,right_tree])) 63 | 64 | tree = fit_tree(tmp_x,tmp_y) 65 | self.tree = tree 66 | 67 | def decide_split_data(self,x,y): 68 | 'Given subset of X,Y, search for the best splitting node based on: information gain' 69 | def entropy(tmp_y): 70 | 'Key Metrics of building a decision tree. 
Specifically Shannon Entropy' 71 | tmp_ent = 0 72 | for uni_y in np.unique(tmp_y): 73 | p = len(tmp_y[tmp_y==uni_y])/len(tmp_y) 74 | tmp_ent -= (p*np.log2(p)) 75 | return tmp_ent 76 | 77 | m,n = x.shape 78 | best_gain = 0 79 | split_row, split_col = None,None 80 | 81 | previous_entropy = entropy(y) 82 | for col in range(n): 83 | tmp_vec = x[:,col].ravel() 84 | 85 | for row in range(m): 86 | val = tmp_vec[row] 87 | # >= & < is my convention here: 88 | if val!=np.max(tmp_vec) and val!= np.min(tmp_vec): 89 | left_b = np.where(tmp_vec < val)[0].tolist() 90 | right_b = np.where(tmp_vec >= val)[0].tolist() 91 | 92 | # new entropy is the weighted average entropy from each of the subsets 93 | new_ent = \ 94 | (len(y[left_b])/len(y))*entropy(y[left_b]) + \ 95 | (len(y[right_b])/len(y))*entropy(y[right_b]) 96 | 97 | info_gain = previous_entropy - new_ent 98 | 99 | if info_gain > best_gain: 100 | split_row, split_col = row,col 101 | best_gain = info_gain 102 | if self.verbose: 103 | print('better gain:{}'.format(best_gain)) 104 | print() 105 | 106 | return split_row, split_col 107 | 108 | def mode(self, x_list): 109 | 'calculate the mode' 110 | return Counter(x_list).most_common(1)[0][0] 111 | 112 | def predict(self, tmp_test_array): 113 | 'Wrap-up function for prediction' 114 | def query(tree,tmp_test_array): 115 | 'Test for single example' 116 | assert len(tmp_test_array.shape) == 2, "Make sure your test data is 2d array" 117 | 118 | if isinstance(tree,list): 119 | start_node = tree # only the 1 row in data 120 | else: 121 | start_node = tree[0,:] # Iteratively hit first row 122 | test_feat,test_val,left_tree_jump,right_tree_jump = start_node[0],start_node[1],start_node[2],start_node[3] 123 | # Exit Condition: 124 | if np.isnan(test_feat) and np.isnan(left_tree_jump) and np.isnan(right_tree_jump): 125 | pred = test_val 126 | return pred 127 | #Test: 128 | if tmp_test_array[0,int(test_feat)] < test_val: 129 | # If <, go left branch: 130 | jump_loc = left_tree_jump 131 | pred = query(tree[int(jump_loc):,],tmp_test_array) 132 | else: 133 | # If >=, go right branch: 134 | jump_loc = right_tree_jump 135 | pred = query(tree[int(jump_loc):,],tmp_test_array) 136 | return pred 137 | assert len(tmp_test_array.shape) == 2, "Make sure your test data is 2d array" 138 | result = [] 139 | for i in range(tmp_test_array.shape[0]): 140 | inp = tmp_test_array[i,:].reshape(1,-1) 141 | result.append(query(self.tree,inp)) 142 | return result 143 | 144 | 145 | 146 | # RF using ID-3 tree: 147 | class RandomForestClassification(): 148 | """ 149 | Python implementation of random forest classifier 150 | using id3 as the base tree 151 | with parallel processing 152 | """ 153 | def __init__ ( 154 | self, 155 | n_tree, 156 | min_leaf_num, # to control overfit 157 | criteria = 'entropy', # currently only supports entropy 158 | max_features = 'auto', # 'auto' uses sqrt(number of features); otherwise treated as the proportion of features to sample 159 | n_workers = 1, 160 | verbose = True 161 | 162 | ): 163 | self.n_tree = n_tree 164 | self.min_leaf_num = min_leaf_num 165 | self.criteria = criteria 166 | self.max_features = max_features 167 | self.n_workers = n_workers 168 | self.verbose = verbose 169 | 170 | 171 | def fit_single(self,data): 172 | """ 173 | Single ID3 Tree Fitting 174 | """ 175 | X = data[0] 176 | y = data[1] 177 | tmp_X,tmp_y,feat_choose = self.random_find_feature(X,y) 178 | model = id3_tree(least_children_num = self.min_leaf_num,verbose=False) 179 | model.fit(tmp_X,tmp_y) 180 | return model,feat_choose 181 | 182 | def fit_rf(self,X,y): 183 | """ 184 | Forest 185 | """ 186 |
data = [X,y] 187 | with mp.Pool(self.n_workers) as p: 188 | model_list = p.map(self.fit_single,[data]*self.n_tree) 189 | 190 | self.model_list = model_list 191 | 192 | 193 | def predict_rf(self,X): 194 | """ 195 | Forest Prediction 196 | taking the vote of each tree 197 | """ 198 | result_list = [] 199 | for model_stuff in self.model_list: 200 | print('.') 201 | single_model,single_feat_choose = model_stuff 202 | 203 | res = single_model.predict(X[:,single_feat_choose]) 204 | result_list.append(res) 205 | 206 | return scipy.stats.mode(np.array(result_list),axis=0).mode.tolist()[0] # Take the vote 207 | 208 | 209 | def random_find_feature(self,X,y): 210 | """ 211 | Randomly select subset of features for each tree 212 | """ 213 | 214 | if self.max_features == 'auto': 215 | n_feat_dat = X.shape[1] 216 | n_feat_choose = int(round(np.sqrt(n_feat_dat))) 217 | else: 218 | n_feat_dat = X.shape[1] 219 | n_feat_choose = int(n_feat_dat*self.max_features) 220 | 221 | feat_choose = np.random.choice(range(n_feat_dat),size=n_feat_choose,replace=False).tolist() 222 | feat_choose = sorted(feat_choose) # Important to sort this in order otherwise will confuse the model 223 | print("feat_chosen:{}".format(feat_choose)) 224 | return X[:,feat_choose],y,feat_choose 225 | 226 | 227 | if __name__ == "__main__": 228 | # ID3: only categorical features 229 | from sklearn.model_selection import train_test_split 230 | from sklearn.metrics import classification_report 231 | from sklearn import datasets 232 | from sklearn.tree import DecisionTreeClassifier 233 | dataset = datasets.load_iris() 234 | all_categorical_feature = True 235 | 236 | # convert continuous feature to categorical features 237 | if all_categorical_feature: 238 | f = lambda x: int(x) 239 | func = np.vectorize(f) 240 | X = func(dataset.data) 241 | else: 242 | X = dataset.data 243 | 244 | Y = dataset.target 245 | X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.8) 246 | # config 247 | max_depth = 3 248 | min_sample_leaf = 4 249 | 250 | model = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, min_samples_leaf=min_sample_leaf) 251 | model.fit(X_train, y_train) 252 | y_pred = model.predict(X_test) 253 | print(classification_report(y_true=y_test, y_pred=y_pred)) 254 | # 255 | # model = ID3DecisionTree(max_depth=max_depth, min_sample_leaf=min_sample_leaf, verbose=True) 256 | # model = C45DecisionTree(max_depth=max_depth, min_sample_leaf=min_sample_leaf, verbose=True) 257 | model = RandomForestClassification( 258 | n_tree=5, 259 | min_leaf_num=min_sample_leaf, 260 | n_workers=5 261 | ) 262 | model.fit_rf(X_train, y_train) 263 | y_pred = model.predict_rf(X_test) 264 | print(classification_report(y_true=y_test, y_pred=y_pred)) 265 | 266 | 267 | -------------------------------------------------------------------------------- /code/rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | #from torch.autograd import Variable 4 | 5 | class RNN(nn.Module): 6 | def __init__(self, n_classes): 7 | super().__init__() 8 | self.hid_fc = nn.Linear(185, 128) 9 | self.out_fc = nn.Linear(185, n_classes) 10 | self.softmax = nn.LogSoftmax() 11 | 12 | def forward(self, inputs, hidden): 13 | inputs = inputs.view(1,-1) 14 | combined = torch.cat([inputs, hidden], dim=1) 15 | hid_out = self.hid_fc(combined) 16 | out = self.out_fc(combined) 17 | out = self.softmax(out) 18 | return out, hid_out 19 | 20 | def train(model, inputs, targets): 21 | for i in 
range(len(inputs)): 22 | target = Variable(targets[i]) 23 | name = inputs[i] 24 | hidden = Variable(torch.zeros(1,128)) 25 | model.zero_grad() 26 | 27 | for char in name: 28 | input_ = Variable(torch.FloatTensor(char)) 29 | pred, hidden = model(input_, hidden) 30 | 31 | loss = criterion(pred, target) 32 | loss.backward() 33 | 34 | for p in model.parameters(): 35 | p.data.add_(-.001, p.grad.data) 36 | -------------------------------------------------------------------------------- /code/vae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | 6 | class VAE(nn.Module): 7 | def __init__(self, in_shape, n_latent): 8 | super().__init__() 9 | self.in_shape = in_shape 10 | self.n_latent = n_latent 11 | c,h,w = in_shape 12 | self.z_dim = h//2**2 # receptive field downsampled 2 times 13 | self.encoder = nn.Sequential( 14 | nn.BatchNorm2d(c), 15 | nn.Conv2d(c, 32, kernel_size=4, stride=2, padding=1), # 32, 16, 16 16 | nn.BatchNorm2d(32), 17 | nn.LeakyReLU(), 18 | nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=1), # 32, 8, 8 19 | nn.BatchNorm2d(64), 20 | nn.LeakyReLU(), 21 | ) 22 | self.z_mean = nn.Linear(64 * self.z_dim**2, n_latent) 23 | self.z_var = nn.Linear(64 * self.z_dim**2, n_latent) 24 | self.z_develop = nn.Linear(n_latent, 64 * self.z_dim**2) 25 | self.decoder = nn.Sequential( 26 | nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=0), 27 | nn.BatchNorm2d(32), 28 | nn.ReLU(), 29 | nn.ConvTranspose2d(32, 1, kernel_size=3, stride=2, padding=1), 30 | CenterCrop(h,w), 31 | nn.Sigmoid() 32 | ) 33 | 34 | def sample_z(self, mean, logvar): 35 | stddev = torch.exp(0.5 * logvar) 36 | noise = Variable(torch.randn(stddev.size())) 37 | return (noise * stddev) + mean 38 | 39 | def encode(self, x): 40 | x = self.encoder(x) 41 | x = x.view(x.size(0), -1) 42 | mean = self.z_mean(x) 43 | var = self.z_var(x) 44 | return mean, var 45 | 46 | def decode(self, z): 47 | out = self.z_develop(z) 48 | out = out.view(z.size(0), 64, self.z_dim, self.z_dim) 49 | out = self.decoder(out) 50 | return out 51 | 52 | def forward(self, x): 53 | mean, logvar = self.encode(x) 54 | z = self.sample_z(mean, logvar) 55 | out = self.decode(z) 56 | return out, mean, logvar 57 | 58 | 59 | def vae_loss(output, input, mean, logvar, loss_func): 60 | recon_loss = loss_func(output, input) 61 | kl_loss = torch.mean(0.5 * torch.sum( 62 | torch.exp(logvar) + mean**2 - 1. - logvar, 1)) 63 | return recon_loss + kl_loss 64 | 65 | def train(model, loader, loss_func, optimizer): 66 | model.train() 67 | for inputs, _ in loader: 68 | inputs = Variable(inputs) 69 | 70 | output, mean, logvar = model(inputs) 71 | loss = vae_loss(output, inputs, mean, logvar, loss_func) 72 | 73 | optimizer.zero_grad() 74 | loss.backward() 75 | optimizer.step() 76 | -------------------------------------------------------------------------------- /docs/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "git.ignoreLimitWarning": true 3 | } -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = AIGlossary 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | livehtml: 22 | sphinx-autobuild . ../docs/_build/html 23 | -------------------------------------------------------------------------------- /docs/_static/theme_overrides.css: -------------------------------------------------------------------------------- 1 | /* override table width restrictions */ 2 | @media screen and (min-width: 767px) { 3 | 4 | .wy-table-responsive table td { 5 | /* !important prevents the common CSS stylesheets from overriding 6 | this as on RTD they are loaded after this stylesheet */ 7 | white-space: normal !important; 8 | } 9 | 10 | .wy-table-responsive { 11 | overflow: visible !important; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /docs/applications.rst: -------------------------------------------------------------------------------- 1 | .. _applications: 2 | 3 | ============ 4 | Applications 5 | ============ 6 | 7 | .. contents:: :local: 8 | 9 | 10 | Anomaly Detection 11 | ================= 12 | 13 | Be the first to `contribute! `__ 14 | 15 | 16 | 17 | Computer Vision 18 | =============== 19 | 20 | Classification 21 | -------------- 22 | 23 | Be the first to `contribute! `__ 24 | 25 | Object Detection 26 | ---------------- 27 | 28 | Be the first to `contribute! `__ 29 | 30 | Segmentation 31 | ------------ 32 | 33 | Be the first to `contribute! `__ 34 | 35 | 36 | 37 | Natural Language 38 | ================ 39 | 40 | Dialog Systems 41 | -------------- 42 | 43 | Be the first to `contribute! `__ 44 | 45 | Machine Translation 46 | ------------------- 47 | 48 | Be the first to `contribute! `__ 49 | 50 | Speech Recognition 51 | ------------------ 52 | 53 | Be the first to `contribute! `__ 54 | 55 | Text Summarization 56 | ------------------ 57 | 58 | Be the first to `contribute! `__ 59 | 60 | Question Answering 61 | ------------------ 62 | 63 | Be the first to `contribute! `__ 64 | 65 | 66 | 67 | Recommender Systems 68 | =================== 69 | 70 | Be the first to `contribute! `__ 71 | 72 | 73 | 74 | Time-Series 75 | =========== 76 | 77 | Be the first to `contribute! `__ 78 | 79 | 80 | .. rubric:: References 81 | 82 | .. [1] Example Reference 83 | -------------------------------------------------------------------------------- /docs/backpropagation.rst: -------------------------------------------------------------------------------- 1 | .. _backpropagation: 2 | 3 | =============== 4 | Backpropagation 5 | =============== 6 | 7 | .. contents:: :local: 8 | 9 | The goals of backpropagation are straightforward: adjust each weight in the network in proportion to how much it contributes to overall error. If we iteratively reduce each weight's error, eventually we’ll have a series of weights that produce good predictions. 10 | 11 | 12 | Chain rule refresher 13 | ==================== 14 | 15 | As seen above, foward propagation can be viewed as a long series of nested equations. 
If you think of feed forward this way, then backpropagation is merely an application of :ref:`chain_rule` to find the :ref:`derivative` of cost with respect to any variable in the nested equation. Given a forward propagation function: 16 | 17 | .. math:: 18 | 19 | f(x) = A(B(C(x))) 20 | 21 | A, B, and C are activation functions at different layers. Using the chain rule we easily calculate the derivative of :math:`f(x)` with respect to :math:`x`: 22 | 23 | .. math:: 24 | 25 | f'(x) = f'(A) \cdot A'(B) \cdot B'(C) \cdot C'(x) 26 | 27 | How about the derivative with respect to B? To find the derivative with respect to B you can pretend :math:`B(C(x))` is a constant, replace it with a placeholder variable B, and proceed to find the derivative normally with respect to B. 28 | 29 | .. math:: 30 | 31 | f'(B) = f'(A) \cdot A'(B) 32 | 33 | This simple technique extends to any variable within a function and allows us to precisely pinpoint the exact impact each variable has on the total output. 34 | 35 | 36 | 37 | Applying the chain rule 38 | ======================= 39 | 40 | Let's use the chain rule to calculate the derivative of cost with respect to any weight in the network. The chain rule will help us identify how much each weight contributes to our overall error and the direction to update each weight to reduce our error. Here are the equations we need to make a prediction and calculate total error, or cost: 41 | 42 | .. image:: images/backprop_ff_equations.png 43 | :align: center 44 | 45 | Given a network consisting of a single neuron, total cost could be calculated as: 46 | 47 | .. math:: 48 | 49 | Cost = C(R(Z(X W))) 50 | 51 | Using the chain rule we can easily find the derivative of Cost with respect to weight W. 52 | 53 | .. math:: 54 | 55 | C'(W) &= C'(R) \cdot R'(Z) \cdot Z'(W) \\ 56 | &= (\hat{y} -y) \cdot R'(Z) \cdot X 57 | 58 | Now that we have an equation to calculate the derivative of cost with respect to any weight, let's go back to our toy neural network example above 59 | 60 | .. image:: images/simple_nn_diagram_zo_zh_defined.png 61 | :align: center 62 | 63 | What is the derivative of cost with respect to :math:`W_o`? 64 | 65 | .. math:: 66 | 67 | C'(W_O) &= C'(\hat{y}) \cdot \hat{y}'(Z_O) \cdot Z_O'(W_O) \\ 68 | &= (\hat{y} - y) \cdot R'(Z_O) \cdot H 69 | 70 | And how about with respect to :math:`W_h`? To find out we just keep going further back in our function applying the chain rule recursively until we get to the function that has the Wh term. 71 | 72 | .. math:: 73 | 74 | C'(W_h) &= C'(\hat{y}) \cdot O'(Z_o) \cdot Z_o'(H) \cdot H'(Z_h) \cdot Z_h'(W_h) \\ 75 | &= (\hat{y} - y) \cdot R'(Z_o) \cdot W_o \cdot R'(Z_h) \cdot X 76 | 77 | And just for fun, what if our network had 10 hidden layers. What is the derivative of cost for the first weight :math:`w_1`? 78 | 79 | .. math:: 80 | 81 | C'(w_1) = \frac{dC}{d\hat{y}} \cdot \frac{d\hat{y}}{dZ_{11}} \cdot \frac{dZ_{11}}{dH_{10}} \cdot \\ \frac{dH_{10}}{dZ_{10}} \cdot \frac{dZ_{10}}{dH_9} \cdot \frac{dH_9}{dZ_9} \cdot \frac{dZ_9}{dH_8} \cdot \frac{dH_8}{dZ_8} \cdot \frac{dZ_8}{dH_7} \cdot \frac{dH_7}{dZ_7} \cdot \\ \frac{dZ_7}{dH_6} \cdot \frac{dH_6}{dZ_6} \cdot \frac{dZ_6}{dH_5} \cdot \frac{dH_5}{dZ_5} \cdot \frac{dZ_5}{dH_4} \cdot \frac{dH_4}{dZ_4} \cdot \frac{dZ_4}{dH_3} \cdot \\ \frac{dH_3}{dZ_3} \cdot \frac{dZ_3}{dH_2} \cdot \frac{dH_2}{dZ_2} \cdot \frac{dZ_2}{dH_1} \cdot \frac{dH_1}{dZ_1} \cdot \frac{dZ_1}{dW_1} 82 | 83 | See the pattern? 
The number of calculations required to compute cost derivatives increases as our network grows deeper. Notice also the redundancy in our derivative calculations. Each layer's cost derivative appends two new terms to the terms that have already been calculated by the layers above it. What if there was a way to save our work somehow and avoid these duplicate calculations? 84 | 85 | 86 | 87 | Saving work with memoization 88 | ============================ 89 | 90 | Memoization is a computer science term which simply means: don’t recompute the same thing over and over. In memoization we store previously computed results to avoid recalculating the same function. It's handy for speeding up recursive functions of which backpropagation is one. Notice the pattern in the derivative equations below. 91 | 92 | 93 | .. image:: images/memoization.png 94 | :align: center 95 | 96 | Each of these layers is recomputing the same derivatives! Instead of writing out long derivative equations for every weight, we can use memoization to save our work as we backprop error through the network. To do this, we define 3 equations (below), which together encapsulate all the calculations needed for backpropagation. The math is the same, but the equations provide a nice shorthand we can use to track which calculations we've already performed and save our work as we move backwards through the network. 97 | 98 | .. image:: images/backprop_3_equations.png 99 | :align: center 100 | 101 | We first calculate the output layer error and pass the result to the hidden layer before it. After calculating the hidden layer error, we pass its error value back to the previous hidden layer before it. And so on and so forth. As we move back through the network we apply the 3rd formula at every layer to calculate the derivative of cost with respect to that layer's weights. This resulting derivative tells us in which direction to adjust our weights to reduce overall cost. 102 | 103 | .. note:: 104 | 105 | The term *layer error* refers to the derivative of cost with respect to a layer's *input*. It answers the question: how does the cost function output change when the input to that layer changes? 106 | 107 | .. rubric:: Output layer error 108 | 109 | To calculate output layer error we need to find the derivative of cost with respect to the output layer input, :math:`Z_o`. It answers the question — how is the final layer's weighted input impacting overall error in the network? The derivative is then: 110 | 111 | .. math:: 112 | 113 | C'(Z_o) = (\hat{y} - y) \cdot R'(Z_o) 114 | 115 | To simplify notation, ML practitioners typically replace the :math:`(\hat{y}-y) * R'(Zo)` sequence with the term :math:`E_o`. So our formula for output layer error equals: 116 | 117 | .. math:: 118 | 119 | E_o = (\hat{y} - y) \cdot R'(Z_o) 120 | 121 | .. rubric:: Hidden layer error 122 | 123 | To calculate hidden layer error we need to find the derivative of cost with respect to the hidden layer input, :math:`Z_h`. 124 | 125 | .. math:: 126 | 127 | C'(Z_h) = (\hat{y} - y) \cdot R'(Z_o) \cdot W_o \cdot R'(Z_h) 128 | 129 | Next we can swap in the :math:`E_o` term above to avoid duplication and create a new simplified equation for hidden layer error: 130 | 131 | .. math:: 132 | 133 | E_h = E_o \cdot W_o \cdot R'(Z_h) 134 | 135 | This formula is at the core of backpropagation. We calculate the current layer's error, and pass the weighted error back to the previous layer, continuing the process until we arrive at our first hidden layer.
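To make these three steps concrete, here is a minimal NumPy sketch of one backward sweep through the toy two-layer network above. It assumes the forward-pass values (``X``, ``Zh``, ``H``, ``Zo``, ``yHat``), the weight matrices ``Wh`` and ``Wo``, a ``relu_prime`` helper, and a learning rate ``lr`` are already in scope, so it illustrates the equations rather than serving as a complete implementation.

.. code-block:: python

    # Layer errors: derivative of cost with respect to each layer's weighted input
    Eo = (yHat - y) * relu_prime(Zo)         # output layer error
    Eh = np.dot(Eo, Wo.T) * relu_prime(Zh)   # hidden layer error, reusing Eo

    # Cost derivative for each weight matrix: layer error times layer input
    dWo = np.dot(H.T, Eo)
    dWh = np.dot(X.T, Eh)

    # Gradient descent step on the weights
    Wo -= lr * dWo
    Wh -= lr * dWh
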
Along the way we update the weights using the derivative of cost with respect to each weight. 136 | 137 | .. rubric:: Derivative of cost with respect to any weight 138 | 139 | Let’s return to our formula for the derivative of cost with respect to the output layer weight :math:`W_o`.  140 | 141 | .. math:: 142 | 143 | C'(W_O) = (\hat{y} - y) \cdot R'(Z_O) \cdot H 144 | 145 | We know we can replace the first part with our equation for output layer error :math:`E_o`. H represents the hidden layer activation. 146 | 147 | .. math:: 148 | 149 | C'(W_o) = E_o \cdot H 150 | 151 | So to find the derivative of cost with respect to any weight in our network, we simply multiply the corresponding layer's error times its input (the previous layer's output). 152 | 153 | .. math:: 154 | 155 | C'(w) = CurrentLayerError \cdot CurrentLayerInput 156 | 157 | .. note:: 158 | 159 | *Input* refers to the activation from the previous layer, not the weighted input, Z. 160 | 161 | .. rubric:: Summary 162 | 163 | Here are the final 3 equations that together form the foundation of backpropagation. 164 | 165 | .. image:: images/backprop_final_3_deriv_equations.png 166 | :align: center 167 | 168 | Here is the process visualized using our toy neural network example above. 169 | 170 | .. image:: images/backprop_visually.png 171 | :align: center 172 | 173 | Code example 174 | ============ 175 | 176 | .. literalinclude:: ../code/nn_simple.py 177 | :language: python 178 | :lines: 17-41 179 | 180 | 181 | 182 | .. rubric:: References 183 | 184 | .. [1] Example 185 | -------------------------------------------------------------------------------- /docs/build.bat: -------------------------------------------------------------------------------- 1 | @echo OFF 2 | 3 | set SPHINXOPTS=" " 4 | set SPHINXBUILD=sphinx-build 5 | set SOURCEDIR=. 6 | set BUILDDIR=_build/html 7 | 8 | 9 | if "%1"=="" ( 10 | echo "Usage : build.bat html" 11 | ) else ( 12 | %SPHINXBUILD% -b "%1" %SOURCEDIR% %BUILDDIR% 13 | ) 14 | 15 | -------------------------------------------------------------------------------- /docs/clustering_algos.rst: -------------------------------------------------------------------------------- 1 | .. _clustering_algos: 2 | 3 | ===================== 4 | Clustering Algorithms 5 | ===================== 6 | 7 | 8 | Centroid 9 | ======== 10 | 11 | Be the first to `contribute! `__ 12 | 13 | Density 14 | ======= 15 | 16 | Be the first to `contribute! `__ 17 | 18 | Distribution 19 | ============ 20 | 21 | Be the first to `contribute! `__ 22 | 23 | Hierarchical 24 | ============ 25 | 26 | Be the first to `contribute! `__ 27 | 28 | K-Means 29 | ======== 30 | 31 | Be the first to `contribute! `__ 32 | 33 | Mean shift 34 | ========== 35 | 36 | Be the first to `contribute! `__ 37 | 38 | 39 | .. rubric:: References 40 | 41 | .. [1] https://en.wikipedia.org/wiki/Cluster_analysis 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # AI Glossary documentation build configuration file, created by 5 | # sphinx-quickstart on Tue Apr 11 17:53:13 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 
12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.mathjax', 35 | 'sphinx.ext.githubpages'] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # The suffix(es) of source filenames. 41 | # You can specify multiple suffix as a list of string: 42 | # 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = '.rst' 45 | 46 | # The master toctree document. 47 | master_doc = 'index' 48 | 49 | # General information about the project. 50 | project = 'ML Glossary' 51 | copyright = '2017' 52 | author = 'Team' 53 | 54 | # The version info for the project you're documenting, acts as replacement for 55 | # |version| and |release|, also used in various other places throughout the 56 | # built documents. 57 | # 58 | # The short X.Y version. 59 | version = '' 60 | # The full version, including alpha/beta/rc tags. 61 | release = '' 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This patterns also effect to html_static_path and html_extra_path 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 74 | 75 | 76 | # The name of the Pygments (syntax highlighting) style to use. 77 | pygments_style = 'sphinx' 78 | 79 | # If true, `todo` and `todoList` produce output, else they produce nothing. 80 | todo_include_todos = False 81 | 82 | 83 | # -- Options for HTML output ---------------------------------------------- 84 | 85 | # The theme to use for HTML and HTML Help pages. See the documentation for 86 | # a list of builtin themes. 87 | # 88 | import sphinx_rtd_theme 89 | 90 | html_theme = "sphinx_rtd_theme" #'alabaster' 91 | 92 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 93 | 94 | # Theme options are theme-specific and customize the look and feel of a theme 95 | # further. For a list of options available for each theme, see the 96 | # documentation. 97 | # 98 | # html_theme_options = {} 99 | 100 | # Add any paths that contain custom static files (such as style sheets) here, 101 | # relative to this directory. They are copied after the builtin static files, 102 | # so a file named "default.css" will overwrite the builtin "default.css". 
103 | html_static_path = ['_static'] 104 | 105 | 106 | # -- Options for HTMLHelp output ------------------------------------------ 107 | 108 | # Output file base name for HTML help builder. 109 | htmlhelp_basename = 'MLCheatsheetdoc' 110 | 111 | 112 | # -- Options for LaTeX output --------------------------------------------- 113 | 114 | latex_elements = { 115 | # The paper size ('letterpaper' or 'a4paper'). 116 | # 117 | # 'papersize': 'letterpaper', 118 | 119 | # The font size ('10pt', '11pt' or '12pt'). 120 | # 121 | # 'pointsize': '10pt', 122 | 123 | # Additional stuff for the LaTeX preamble. 124 | # 125 | # 'preamble': '', 126 | 127 | # Latex figure (float) alignment 128 | # 129 | # 'figure_align': 'htbp', 130 | } 131 | 132 | # Grouping the document tree into LaTeX files. List of tuples 133 | # (source start file, target name, title, 134 | # author, documentclass [howto, manual, or own class]). 135 | latex_documents = [ 136 | (master_doc, 'MLCheatsheet.tex', 'ML Cheatsheet Documentation', 137 | 'Team', 'manual'), 138 | ] 139 | 140 | 141 | # -- Options for manual page output --------------------------------------- 142 | 143 | # One entry per manual page. List of tuples 144 | # (source start file, name, description, authors, manual section). 145 | man_pages = [ 146 | (master_doc, 'mlcheatsheet', 'Machine Learning Cheatsheet Documentation', 147 | [author], 1) 148 | ] 149 | 150 | 151 | # -- Options for Texinfo output ------------------------------------------- 152 | 153 | # Grouping the document tree into Texinfo files. List of tuples 154 | # (source start file, target name, title, author, 155 | # dir menu entry, description, category) 156 | texinfo_documents = [ 157 | (master_doc, 'MLCheatsheet', 'ML Cheatsheet Documentation', 158 | author, 'Contributors', 'Glossary of machine learning terms and concepts.', 159 | 'Miscellaneous'), 160 | ] 161 | 162 | 163 | 164 | # -- Options for Epub output ---------------------------------------------- 165 | 166 | # Bibliographic Dublin Core info. 167 | epub_title = project 168 | epub_author = author 169 | epub_publisher = author 170 | epub_copyright = copyright 171 | 172 | # The unique identifier of the text. This can be a ISBN number 173 | # or the project homepage. 174 | # 175 | # epub_identifier = '' 176 | 177 | # A unique identification for the text. 178 | # 179 | # epub_uid = '' 180 | 181 | # A list of files that should not be packed into the epub file. 182 | epub_exclude_files = ['search.html'] 183 | 184 | 185 | from recommonmark.parser import CommonMarkParser 186 | 187 | source_parsers = { 188 | '.md': CommonMarkParser, 189 | } 190 | 191 | source_suffix = ['.rst', '.md'] 192 | 193 | html_theme_options = { 194 | 'collapse_navigation': False, 195 | 'display_version': False 196 | # 'logo_only': True, 197 | } 198 | 199 | def setup(app): 200 | #app.add_stylesheet('theme_overrides.css') 201 | app.add_css_file('theme_overrides.css') 202 | 203 | #html_context = { 204 | # 'css_files': [ 205 | # '_static/theme_overrides.css', # override wide tables in RTD theme 206 | # ], 207 | # } 208 | -------------------------------------------------------------------------------- /docs/contribute.rst: -------------------------------------------------------------------------------- 1 | .. _contribute: 2 | 3 | ========== 4 | Contribute 5 | ========== 6 | 7 | Become a contributor! Check out our `github `_ for more information. 
8 | -------------------------------------------------------------------------------- /docs/figures/SimpleDiagram3_neural_networks.sdxml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/figures/SimpleDiagram3_neural_networks.sdxml -------------------------------------------------------------------------------- /docs/figures/activation_function_table.tgn: -------------------------------------------------------------------------------- 1 | {"rows_views":[[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}]],"model":{"rows":[[{"value":"Function","cspan":1,"rspan":1},{"value":"Derivative","cspan":1,"rspan":1}],[{"value":".. math::\n      r(x) = x + 1","cspan":1,"rspan":1},{"value":".. math::\n       r(x) = x + 1","cspan":1,"rspan":1}],[{"value":".. image:: images/sigmoid.png\n      :align: center\n      :width: 256 px\n      :height: 256 px","cspan":1,"rspan":1},{"value":".. image:: images/sigmoid_prime.png\n      :align: center\n      :width: 256 px\n      :height: 256 px","cspan":1,"rspan":1}],[{"value":".. literalinclude:: ../code/activation_functions.py\n      :pyobject: sigmoid","cspan":1,"rspan":1},{"value":".. 
literalinclude:: ../code/activation_functions.py\n      :pyobject: sigmoid_prime\n","cspan":1,"rspan":1}]]},"theme":null,"fixed_layout":false} -------------------------------------------------------------------------------- /docs/figures/calculus_symbol_table.tgn: -------------------------------------------------------------------------------- 1 | {"rows_views":[[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}]],"model":{"row
s":[[{"value":"Symbol","cspan":1,"rspan":1},{"value":"Name","cspan":1,"rspan":1},{"value":"Description","cspan":1,"rspan":1},{"value":"Example","cspan":1,"rspan":1}],[{"value":"x'","cspan":1,"rspan":1},{"value":"derivative","cspan":1,"rspan":1},{"value":"first derivative","cspan":1,"rspan":1},{"value":"(x^2)' = 2x","cspan":1,"rspan":1}],[{"value":"x''","cspan":1,"rspan":1},{"value":"second derivative","cspan":1,"rspan":1},{"value":"second derivative","cspan":1,"rspan":1},{"value":"(x^2)'' = 2","cspan":1,"rspan":1}],[{"value":"lim(x-->0)","cspan":1,"rspan":1},{"value":"limit","cspan":1,"rspan":1},{"value":"function value as x approaches 0","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"∇","cspan":1,"rspan":1},{"value":"nabla","cspan":1,"rspan":1},{"value":"gradient","cspan":1,"rspan":1},{"value":"∇f(a,b,c)","cspan":1,"rspan":1}]]},"theme":null,"fixed_layout":false} -------------------------------------------------------------------------------- /docs/figures/forward_prop_matrix_dimensions_table.tgn: -------------------------------------------------------------------------------- 1 | {"rows_views":[[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"
lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":
"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}]],"model":{"rows":[[{"value":"**Var**","cspan":1,"rspan":1},{"value":"**Name**","cspan":1,"rspan":1},{"value":"**Dimensions**","cspan":1,"rspan":1},{"value":"**Explanation**","cspan":1,"rspan":1}],[{"value":"``X``","cspan":1,"rspan":1},{"value":"Input","cspan":1,"rspan":1},{"value":"(3, 1)","cspan":1,"rspan":1},{"value":"Includes 3 rows of training data, and each row has 1 attribute (height, price, etc.)","cspan":1,"rspan":1}],[{"value":"``Wh``","cspan":1,"rspan":1},{"value":"Hidden weights","cspan":1,"rspan":1},{"value":"(1, 2)","cspan":1,"rspan":1},{"value":"These dimensions are based on number of rows equals the number of attributes for the observations in our training set. The number columns equals the number of neurons in the hidden layer. The dimensions of the weights matrix between two layers is determined by the sizes of the two layers it connects. There is one weight for every input-to-neuron connection between the layers.","cspan":1,"rspan":1}],[{"value":"``Bh``","cspan":1,"rspan":1},{"value":"Hidden bias","cspan":1,"rspan":1},{"value":"(1, 2)","cspan":1,"rspan":1},{"value":"Each neuron in the hidden layer has is own bias constant. This bias matrix is added to the weighted input matrix before the hidden layer applies ReLU.","cspan":1,"rspan":1}],[{"value":"``Zh``","cspan":1,"rspan":1},{"value":"Hidden weighted input","cspan":1,"rspan":1},{"value":"(1, 2)","cspan":1,"rspan":1},{"value":"Computed by taking the dot product of X and Wh. The dimensions (1,2) are required by the rules of matrix multiplication. Zh takes the rows of in the inputs matrix and the columns of weights matrix. We then add the hidden layer bias matrix Bh.","cspan":1,"rspan":1}],[{"value":"``H``","cspan":1,"rspan":1},{"value":"Hidden activations","cspan":1,"rspan":1},{"value":"(3, 2)","cspan":1,"rspan":1},{"value":"Computed by applying the Relu function to Zh. The dimensions are (3,2) — the number of rows matches the number of training samples and the number of columns equals the number of neurons. Each column holds all the activations for a specific neuron.","cspan":1,"rspan":1}],[{"value":"``Wo``","cspan":1,"rspan":1},{"value":"Output weights","cspan":1,"rspan":1},{"value":"(2, 2)","cspan":1,"rspan":1},{"value":"The number of rows matches the number of hidden layer neurons and the number of columns equals the number of output layer neurons. There is one weight for every hidden-neuron-to-output-neuron connection between the layers.","cspan":1,"rspan":1}],[{"value":"``Bo``","cspan":1,"rspan":1},{"value":"Output bias","cspan":1,"rspan":1},{"value":"(1, 2)","cspan":1,"rspan":1},{"value":"There is one column for every neuron in the output layer.","cspan":1,"rspan":1}],[{"value":"``Zo``","cspan":1,"rspan":1},{"value":"Output weighted input","cspan":1,"rspan":1},{"value":"(3, 2)","cspan":1,"rspan":1},{"value":"Computed by taking the dot product of H and Wo and then adding the output layer bias Bo. 
The dimensions are (3,2) representing the rows of in the hidden layer matrix and the columns of output layer weights matrix.","cspan":1,"rspan":1}],[{"value":"``O``","cspan":1,"rspan":1},{"value":"Output activations","cspan":1,"rspan":1},{"value":"(3, 2)","cspan":1,"rspan":1},{"value":"Each row represents a prediction for a single observation in our training set. Each column is a unique attribute we want to predict. Examples of two-column output predictions could be a company's sales and units sold, or a person's height and weight.","cspan":1,"rspan":1}]]},"theme":null,"fixed_layout":false} -------------------------------------------------------------------------------- /docs/figures/linear_regression_companies_sales.tgn: -------------------------------------------------------------------------------- 1 | {"rows_views":[[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"
text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}}]],"model":{"rows":[[{"value":"Company","cspan":1,"rspan":1},{"value":"TV","cspan":1,"rspan":1},{"value":"Radio","cspan":1,"rspan":1},{"value":"News","cspan":1,"rspan":1},{"value":"Units","cspan":1,"rspan":1}],[{"value":"Amazon","cspan":1,"rspan":1},{"value":"230.1","cspan":1,"rspan":1},{"value":"37.8","cspan":1,"rspan":1},{"value":"69.1","cspan":1,"rspan":1},{"value":"22.1","cspan":1,"rspan":1}],[{"value":"Google","cspan":1,"rspan":1},{"value":"44.5","cspan":1,"rspan":1},{"value":"39.3","cspan":1,"rspan":1},{"value":"23.1","cspan":1,"rspan":1},{"value":"10.4","cspan":1,"rspan":1}],[{"value":"Facebook","cspan":1,"rspan":1},{"value":"17.2","cspan":1,"rspan":1},{"value":"45.9","cspan":1,"rspan":1},{"value":"34.7","cspan":1,"rspan":1},{"value":"18.3","cspan":1,"rspan":1}],[{"value":"Apple","cspan":1,"rspan":1},{"value":"151.5","cspan":1,"rspan":1},{"value":"41.3","cspan":1,"rspan":1},{"value":"13.2","cspan":1,"rspan":1},{"value":"18.5","cspan":1,"rspan":1}]]},"theme":null,"fixed_layout":false} -------------------------------------------------------------------------------- /docs/figures/linearalgebra.tgn: -------------------------------------------------------------------------------- 1 | 
{"rows_views":[[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_colo
r":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}]],"model":{"rows":[[{"value":"Symbol","cspan":1,"rspan":1},{"value":"Name","cspan":1,"rspan":1},{"value":"Description","cspan":1,"rspan":1},{"value":"Example","cspan":1,"rspan":1}],[{"value":"[ ] ","cspan":1,"rspan":1},{"value":"brackets","cspan":1,"rspan":1},{"value":"matrix or vector","cspan":1,"rspan":1},{"value":"v = [1 3 5]","cspan":1,"rspan":1}],[{"value":"\\cdot","cspan":1,"rspan":1},{"value":"dot","cspan":1,"rspan":1},{"value":"dot product ","cspan":1,"rspan":1},{"value":"(Z = X \\cdot W","cspan":1,"rspan":1}],[{"value":"\\odot","cspan":1,"rspan":1},{"value":"hadamard","cspan":1,"rspan":1},{"value":"hadamard product","cspan":1,"rspan":1},{"value":"A = B \\odot C","cspan":1,"rspan":1}],[{"value":"X^T","cspan":1,"rspan":1},{"value":"transpose","cspan":1,"rspan":1},{"value":"matrix transpose","cspan":1,"rspan":1},{"value":"W^T \\cdot X ","cspan":1,"rspan":1}],[{"value":"\\arrow x","cspan":1,"rspan":1},{"value":"vector","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1},{"value":"s = \\frac{1}{1+e^{-z}}","cspan":1,"rspan":1}],[{"value":"X","cspan":1,"rspan":1},{"value":"matrix","cspan":1,"rspan":1},{"value":"capitalized variables are matrices","cspan":1,"rspan":1},{"value":"X, W, B","cspan":1,"rspan":1}],[{"value":"\\hat x","cspan":1,"rspan":1},{"value":"unit vector","cspan":1,"rspan":1},{"value":"vector of magnitude 1","cspan":1,"rspan":1},{"value":"\\hat x = [0.2 0.5 0.3]","cspan":1,"rspan":1}]]},"theme":null,"fixed_layout":false} -------------------------------------------------------------------------------- /docs/figures/statistics_symbols_table.tgn: -------------------------------------------------------------------------------- 1 | 
{"rows_views":[[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"middle","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"tex
t_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}],[{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}},{"style":{"borders":"lrtb","font_style":{},"text_color":"","bg_color":"","halign":"left","valign":"top","padding":{"top":10,"right":5,"bottom":10,"left":5}}}]],"model":{"rows":[[{"value":"Symbol","cspan":1,"rspan":1},{"value":"Name","cspan":1,"rspan":1},{"value":"Description","cspan":1,"rspan":1},{"value":"Example","cspan":1,"rspan":1}],[{"value":"μ","cspan":1,"rspan":1},{"value":"population mean","cspan":1,"rspan":1},{"value":"mean of population values","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"x (line above)","cspan":1,"rspan":1},{"value":"sample mean","cspan":1,"rspan":1},{"value":"mean of subset of population","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"σ^2","cspan":1,"rspan":1},{"value":"population variance","cspan":1,"rspan":1},{"value":"variance of population value","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"s 2","cspan":1,"rspan":1},{"value":"sample variable","cspan":1,"rspan":1},{"value":"variance of subset of population","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"σX","cspan":1,"rspan":1},{"value":"standard deviation","cspan":1,"rspan":1},{"value":"population standard deviation","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"s","cspan":1,"rspan":1},{"value":"sample std dev","cspan":1,"rspan":1},{"value":"standard deviation of sample","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"ρX,Y","cspan":1,"rspan":1},{"value":"correlation","cspan":1,"rspan":1},{"value":"correlation of 
variables X and Y","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}],[{"value":"x (squiggle)","cspan":1,"rspan":1},{"value":"median","cspan":1,"rspan":1},{"value":"median of sample/population","cspan":1,"rspan":1},{"value":"","cspan":1,"rspan":1}]]},"theme":null,"fixed_layout":false} -------------------------------------------------------------------------------- /docs/generative_algos.rst: -------------------------------------------------------------------------------- 1 | .. _generative_algos: 2 | 3 | ===================== 4 | Generative Algorithms 5 | ===================== 6 | 7 | Be the first to `contribute! `__ 8 | 9 | 10 | .. rubric:: References 11 | 12 | .. [1] Example Reference 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/gradient_descent.rst: -------------------------------------------------------------------------------- 1 | .. _gradient_descent: 2 | 3 | ================ 4 | Gradient Descent 5 | ================ 6 | 7 | Gradient descent is an optimization algorithm used to minimize some function by iteratively moving in the direction of steepest descent as defined by the negative of the gradient. In machine learning, we use gradient descent to update the :ref:`parameters ` of our model. Parameters refer to coefficients in :doc:`linear_regression` and :ref:`weights ` in neural networks. 8 | 9 | 10 | Introduction 11 | ============ 12 | 13 | Consider the 3-dimensional graph below in the context of a cost function. Our goal is to move from the mountain in the top right corner (high cost) to the dark blue sea in the bottom left (low cost). The arrows represent the direction of steepest descent (negative gradient) from any given point--the direction that decreases the cost function as quickly as possible. `Source `_ 14 | 15 | .. image:: images/gradient_descent.png 16 | :align: center 17 | 18 | Starting at the top of the mountain, we take our first step downhill in the direction specified by the negative gradient. Next we recalculate the negative gradient (passing in the coordinates of our new point) and take another step in the direction it specifies. We continue this process iteratively until we get to the bottom of our graph, or to a point where we can no longer move downhill--a local minimum. `image source `_. 19 | 20 | .. image:: images/gradient_descent_demystified.png 21 | :align: center 22 | 23 | Learning rate 24 | ============= 25 | 26 | The size of these steps is called the *learning rate*. With a high learning rate we can cover more ground each step, but we risk overshooting the lowest point since the slope of the hill is constantly changing. With a very low learning rate, we can confidently move in the direction of the negative gradient since we are recalculating it so frequently. A low learning rate is more precise, but calculating the gradient is time-consuming, so it will take us a very long time to get to the bottom. 27 | 28 | 29 | Cost function 30 | ============= 31 | 32 | A :ref:`cost_function` tells us "how good" our model is at making predictions for a given set of parameters. The cost function has its own curve and its own gradients. The slope of this curve tells us how to update our parameters to make the model more accurate. 33 | 34 | 35 | Step-by-step 36 | ============ 37 | 38 | Now let's run gradient descent using our new cost function. There are two parameters in our cost function we can control: :math:`m` (weight) and :math:`b` (bias). 
Since we need to consider the impact each one has on the final prediction, we need to use partial derivatives. We calculate the partial derivatives of the cost function with respect to each parameter and store the results in a gradient. 39 | 40 | .. rubric:: Math 41 | 42 | Given the cost function: 43 | 44 | .. math:: 45 | 46 | f(m,b) = \frac{1}{N} \sum_{i=1}^{N} (y_i - (mx_i + b))^2 47 | 48 | The gradient can be calculated as: 49 | 50 | .. math:: 51 | 52 | f'(m,b) = 53 | \begin{bmatrix} 54 | \frac{df}{dm}\\ 55 | \frac{df}{db}\\ 56 | \end{bmatrix} 57 | = 58 | \begin{bmatrix} 59 | \frac{1}{N} \sum -2x_i(y_i - (mx_i + b)) \\ 60 | \frac{1}{N} \sum -2(y_i - (mx_i + b)) \\ 61 | \end{bmatrix} 62 | 63 | To solve for the gradient, we iterate through our data points using our new :math:`m` and :math:`b` values and compute the partial derivatives. This new gradient tells us the slope of our cost function at our current position (current parameter values) and the direction we should move to update our parameters. The size of our update is controlled by the learning rate. 64 | 65 | 66 | .. rubric:: Code 67 | 68 | :: 69 | 70 | def update_weights(m, b, X, Y, learning_rate): 71 | m_deriv = 0 72 | b_deriv = 0 73 | N = len(X) 74 | for i in range(N): 75 | # Calculate partial derivatives 76 | # -2x(y - (mx + b)) 77 | m_deriv += -2*X[i] * (Y[i] - (m*X[i] + b)) 78 | 79 | # -2(y - (mx + b)) 80 | b_deriv += -2*(Y[i] - (m*X[i] + b)) 81 | 82 | # We subtract because the derivatives point in direction of steepest ascent 83 | m -= (m_deriv / float(N)) * learning_rate 84 | b -= (b_deriv / float(N)) * learning_rate 85 | 86 | return m, b 87 | 88 | 89 | .. rubric:: References 90 | 91 | .. [1] http://ruder.io/optimizing-gradient-descent 92 | -------------------------------------------------------------------------------- /docs/images/autoencoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/autoencoder.png -------------------------------------------------------------------------------- /docs/images/autoencoder_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/autoencoder_2.png -------------------------------------------------------------------------------- /docs/images/autoencoder_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/autoencoder_architecture.png -------------------------------------------------------------------------------- /docs/images/backprop_3_equations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/backprop_3_equations.png -------------------------------------------------------------------------------- /docs/images/backprop_ff_equations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/backprop_ff_equations.png -------------------------------------------------------------------------------- /docs/images/backprop_final_3_deriv_equations.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/backprop_final_3_deriv_equations.png -------------------------------------------------------------------------------- /docs/images/backprop_visually.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/backprop_visually.png -------------------------------------------------------------------------------- /docs/images/boosting-sequence-models.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/boosting-sequence-models.PNG -------------------------------------------------------------------------------- /docs/images/boosting_error_iteration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/boosting_error_iteration.png -------------------------------------------------------------------------------- /docs/images/calculus_slope_intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/calculus_slope_intro.png -------------------------------------------------------------------------------- /docs/images/cnn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/cnn.jpg -------------------------------------------------------------------------------- /docs/images/cnn_filter_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/cnn_filter_output.png -------------------------------------------------------------------------------- /docs/images/cross_entropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/cross_entropy.png -------------------------------------------------------------------------------- /docs/images/decision_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/decision_tree.png -------------------------------------------------------------------------------- /docs/images/dropout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/dropout.png -------------------------------------------------------------------------------- /docs/images/dropout_net.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/dropout_net.png 
-------------------------------------------------------------------------------- /docs/images/dynamic_resizing_neural_network_1_obs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/dynamic_resizing_neural_network_1_obs.png -------------------------------------------------------------------------------- /docs/images/dynamic_resizing_neural_network_4_obs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/dynamic_resizing_neural_network_4_obs.png -------------------------------------------------------------------------------- /docs/images/earlystopping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/earlystopping.png -------------------------------------------------------------------------------- /docs/images/elu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/elu.png -------------------------------------------------------------------------------- /docs/images/elu_prime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/elu_prime.png -------------------------------------------------------------------------------- /docs/images/fc_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/fc_layer.png -------------------------------------------------------------------------------- /docs/images/gan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/gan.png -------------------------------------------------------------------------------- /docs/images/gradient_accumulation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/gradient_accumulation.png -------------------------------------------------------------------------------- /docs/images/gradient_descent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/gradient_descent.png -------------------------------------------------------------------------------- /docs/images/gradient_descent_demystified.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/gradient_descent_demystified.png -------------------------------------------------------------------------------- /docs/images/grid_search_cross_validation.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/grid_search_cross_validation.png -------------------------------------------------------------------------------- /docs/images/gru_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/gru_structure.png -------------------------------------------------------------------------------- /docs/images/integral_as_change_in_antriderivative.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/integral_as_change_in_antriderivative.png -------------------------------------------------------------------------------- /docs/images/integral_as_rectangular_strips.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/integral_as_rectangular_strips.png -------------------------------------------------------------------------------- /docs/images/integral_definition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/integral_definition.png -------------------------------------------------------------------------------- /docs/images/khan_academy_matrix_product.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/khan_academy_matrix_product.png -------------------------------------------------------------------------------- /docs/images/leakyrelu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/leakyrelu.png -------------------------------------------------------------------------------- /docs/images/leakyrelu_prime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/leakyrelu_prime.png -------------------------------------------------------------------------------- /docs/images/learned_regression_line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/learned_regression_line.png -------------------------------------------------------------------------------- /docs/images/linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear.png -------------------------------------------------------------------------------- /docs/images/linear_prime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_prime.png -------------------------------------------------------------------------------- 
/docs/images/linear_regression_3d_plane_mlr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_3d_plane_mlr.png -------------------------------------------------------------------------------- /docs/images/linear_regression_line_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_line_1.png -------------------------------------------------------------------------------- /docs/images/linear_regression_line_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_line_2.png -------------------------------------------------------------------------------- /docs/images/linear_regression_line_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_line_3.png -------------------------------------------------------------------------------- /docs/images/linear_regression_line_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_line_4.png -------------------------------------------------------------------------------- /docs/images/linear_regression_line_intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_line_intro.png -------------------------------------------------------------------------------- /docs/images/linear_regression_training_cost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/linear_regression_training_cost.png -------------------------------------------------------------------------------- /docs/images/log_vs_neglog.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/log_vs_neglog.gif -------------------------------------------------------------------------------- /docs/images/logistic_cost_function_joined.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_cost_function_joined.png -------------------------------------------------------------------------------- /docs/images/logistic_cost_function_vectorized.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_cost_function_vectorized.png -------------------------------------------------------------------------------- /docs/images/logistic_regression_binary_decision_boundary.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_regression_binary_decision_boundary.png -------------------------------------------------------------------------------- /docs/images/logistic_regression_exam_scores_scatter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_regression_exam_scores_scatter.png -------------------------------------------------------------------------------- /docs/images/logistic_regression_final_decision_boundary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_regression_final_decision_boundary.png -------------------------------------------------------------------------------- /docs/images/logistic_regression_loss_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_regression_loss_history.png -------------------------------------------------------------------------------- /docs/images/logistic_regression_scatter_w_decision_boundary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_regression_scatter_w_decision_boundary.png -------------------------------------------------------------------------------- /docs/images/logistic_regression_sigmoid_w_threshold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/logistic_regression_sigmoid_w_threshold.png -------------------------------------------------------------------------------- /docs/images/lstm_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/lstm_structure.png -------------------------------------------------------------------------------- /docs/images/maxpool.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/maxpool.png -------------------------------------------------------------------------------- /docs/images/memoization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/memoization.png -------------------------------------------------------------------------------- /docs/images/mlp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/mlp.jpg -------------------------------------------------------------------------------- /docs/images/multiple_regression_error_history.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/multiple_regression_error_history.png -------------------------------------------------------------------------------- /docs/images/neural_network_matrix_weighted_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/neural_network_matrix_weighted_input.png -------------------------------------------------------------------------------- /docs/images/neural_network_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/neural_network_simple.png -------------------------------------------------------------------------------- /docs/images/neural_network_w_matrices.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/neural_network_w_matrices.png -------------------------------------------------------------------------------- /docs/images/neuron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/neuron.png -------------------------------------------------------------------------------- /docs/images/ng_cost_function_logistic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/ng_cost_function_logistic.png -------------------------------------------------------------------------------- /docs/images/nn_with_matrices_displayed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/nn_with_matrices_displayed.png -------------------------------------------------------------------------------- /docs/images/optimizers.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/optimizers.gif -------------------------------------------------------------------------------- /docs/images/regularization-dropout.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/regularization-dropout.PNG -------------------------------------------------------------------------------- /docs/images/relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/relu.png -------------------------------------------------------------------------------- /docs/images/relu_prime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/relu_prime.png 
-------------------------------------------------------------------------------- /docs/images/rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/rnn.png -------------------------------------------------------------------------------- /docs/images/rnn_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/rnn_layer.png -------------------------------------------------------------------------------- /docs/images/sigmoid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/sigmoid.png -------------------------------------------------------------------------------- /docs/images/sigmoid_prime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/sigmoid_prime.png -------------------------------------------------------------------------------- /docs/images/simple_nn_diagram_zo_zh_defined.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/simple_nn_diagram_zo_zh_defined.png -------------------------------------------------------------------------------- /docs/images/slope_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/slope_formula.png -------------------------------------------------------------------------------- /docs/images/svm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/svm.png -------------------------------------------------------------------------------- /docs/images/svm_linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/svm_linear.png -------------------------------------------------------------------------------- /docs/images/svm_nonlinear_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/svm_nonlinear_1.png -------------------------------------------------------------------------------- /docs/images/svm_nonlinear_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/svm_nonlinear_2.png -------------------------------------------------------------------------------- /docs/images/svm_nonlinear_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/svm_nonlinear_3.png 
-------------------------------------------------------------------------------- /docs/images/tanh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/tanh.png -------------------------------------------------------------------------------- /docs/images/tanh_prime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/tanh_prime.png -------------------------------------------------------------------------------- /docs/images/vae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/vae.png -------------------------------------------------------------------------------- /docs/images/vector_field.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/vector_field.png -------------------------------------------------------------------------------- /docs/images/vectors_geometry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/vectors_geometry.png -------------------------------------------------------------------------------- /docs/images/y1andy2_logistic_function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bfortuner/ml-glossary/ad889a823beee92b7ac1e8c92e85a8ed57d64994/docs/images/y1andy2_logistic_function.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | ========================= 3 | Machine Learning Glossary 4 | ========================= 5 | 6 | Brief visual explanations of machine learning concepts with diagrams, code examples and links to resources for learning more. 7 | 8 | .. warning:: 9 | 10 | If you find errors, please raise an `issue `_ or `contribute `_ a better definition! 11 | 12 | .. toctree:: 13 | :caption: Basics 14 | :maxdepth: 1 15 | 16 | linear_regression 17 | gradient_descent 18 | logistic_regression 19 | glossary 20 | 21 | .. toctree:: 22 | :caption: Math 23 | :maxdepth: 1 24 | 25 | calculus 26 | linear_algebra 27 | Probability (TODO) 28 | Statistics (TODO) 29 | math_notation 30 | 31 | .. toctree:: 32 | :maxdepth: 1 33 | :caption: Neural Networks 34 | 35 | nn_concepts 36 | forwardpropagation 37 | backpropagation 38 | activation_functions 39 | layers 40 | loss_functions 41 | optimizers 42 | regularization 43 | architectures 44 | 45 | .. toctree:: 46 | :maxdepth: 1 47 | :caption: Algorithms (TODO) 48 | 49 | Classification 50 | Clustering 51 | Regression 52 | Reinforcement Learning 53 | 54 | .. toctree:: 55 | :maxdepth: 1 56 | :caption: Resources 57 | :titlesonly: 58 | 59 | datasets 60 | libraries 61 | papers 62 | Other 63 | 64 | .. toctree:: 65 | :maxdepth: 1 66 | :caption: Contributing 67 | :titlesonly: 68 | 69 | How to contribute 70 | 71 | 72 | .. https://en.wikipedia.org/wiki/Outline_of_machine_learning 73 | 74 | .. Indices and tables 75 | .. 
------------------ 76 | 77 | .. * :ref:`genindex` 78 | .. * :ref:`modindex` 79 | .. * :ref:`search` 80 | -------------------------------------------------------------------------------- /docs/loss_functions.rst: -------------------------------------------------------------------------------- 1 | .. _cost_function: 2 | 3 | ============== 4 | Loss Functions 5 | ============== 6 | 7 | .. contents:: :local: 8 | 9 | 10 | .. _loss_cross_entropy: 11 | 12 | Cross-Entropy 13 | ============= 14 | 15 | Cross-entropy loss, or log loss, measures the performance of a classification model whose output is a probability value between 0 and 1. Cross-entropy loss increases as the predicted probability diverges from the actual label. So predicting a probability of .012 when the actual observation label is 1 would be bad and result in a high loss value. A perfect model would have a log loss of 0. 16 | 17 | .. image:: images/cross_entropy.png 18 | :align: center 19 | 20 | The graph above shows the range of possible loss values given a true observation (isDog = 1). As the predicted probability approaches 1, log loss slowly decreases. As the predicted probability decreases, however, the log loss increases rapidly. Log loss penalizes both types of errors, but especially those predictions that are confident and wrong! 21 | 22 | .. note:: 23 | 24 | Cross-entropy and log loss are slightly different depending on context, but in machine learning when calculating error rates between 0 and 1 they resolve to the same thing. 25 | 26 | .. rubric:: Code 27 | 28 | .. literalinclude:: ../code/loss_functions.py 29 | :pyobject: CrossEntropy 30 | 31 | .. rubric:: Math 32 | 33 | In binary classification, where the number of classes :math:`M` equals 2, cross-entropy can be calculated as: 34 | 35 | .. math:: 36 | 37 | -{(y\log(p) + (1 - y)\log(1 - p))} 38 | 39 | If :math:`M > 2` (i.e. multiclass classification), we calculate a separate loss for each class label per observation and sum the result. 40 | 41 | .. math:: 42 | 43 | -\sum_{c=1}^My_{o,c}\log(p_{o,c}) 44 | 45 | .. note:: 46 | 47 | - M - number of classes (dog, cat, fish) 48 | - log - the natural log 49 | - y - binary indicator (0 or 1) if class label :math:`c` is the correct classification for observation :math:`o` 50 | - p - predicted probability observation :math:`o` is of class :math:`c` 51 | 52 | 53 | .. _hinge_loss: 54 | 55 | Hinge 56 | ===== 57 | 58 | Used for classification. 59 | 60 | .. rubric:: Code 61 | 62 | .. literalinclude:: ../code/loss_functions.py 63 | :pyobject: Hinge 64 | 65 | 66 | .. _huber_loss: 67 | 68 | Huber 69 | ===== 70 | 71 | Typically used for regression. It's less sensitive to outliers than the MSE as it treats error as square only inside an interval. 72 | 73 | .. math:: 74 | 75 | L_{\delta}=\left\{\begin{matrix} 76 | \frac{1}{2}(y - \hat{y})^{2} & if \left | (y - \hat{y}) \right | < \delta\\ 77 | \delta ((y - \hat{y}) - \frac1 2 \delta) & otherwise 78 | \end{matrix}\right. 79 | 80 | .. rubric:: Code 81 | 82 | .. literalinclude:: ../code/loss_functions.py 83 | :pyobject: Huber 84 | 85 | Further information can be found at `Huber Loss in Wikipedia`_. 86 | 87 | .. _`Huber Loss in Wikipedia`: https://en.wikipedia.org/wiki/Huber_loss 88 | 89 | .. _kl_divergence: 90 | 91 | Kullback-Leibler 92 | ================ 93 | 94 | .. rubric:: Code 95 | 96 | .. literalinclude:: ../code/loss_functions.py 97 | :pyobject: KLDivergence 98 | 99 | .. _rmse: 100 | 101 | RMSE 102 | ======== 103 | 104 | Root Mean Square Error 105 | 106 | .. 
math:: 107 | 108 | RMSE = \sqrt{\frac{1}{m}\sum^{m}_{i=1}(h(x^{(i)})-y^{(i)})^2} 109 | 110 | .. line-block:: 111 | 112 | RMSE - root mean square error 113 | m - number of samples 114 | :math:`x^{(i)}` - i-th sample from dataset 115 | :math:`h(x^{(i)})` - prediction for i-th sample (thesis) 116 | :math:`y^{(i)}` - ground truth label for i-th sample 117 | 118 | 119 | .. rubric:: Code 120 | 121 | .. literalinclude:: ../code/loss_functions.py 122 | :pyobject: root_mean_square_error 123 | 124 | 125 | .. _mae: 126 | 127 | MAE (L1) 128 | ======== 129 | 130 | Mean Absolute Error, or L1 loss. Excellent overview below [6] and [10]. 131 | 132 | .. math:: 133 | 134 | MAE = \frac{1}{m}\sum^{m}_{i=1}|h(x^{(i)})-y^{(i)}| 135 | 136 | .. line-block:: 137 | 138 | MAE - mean absolute error 139 | m - number of samples 140 | :math:`x^{(i)}` - i-th sample from dataset 141 | :math:`h(x^{(i)})` - prediction for i-th sample (thesis) 142 | :math:`y^{(i)}` - ground truth label for i-th sample 143 | 144 | .. rubric:: Code 145 | 146 | .. literalinclude:: ../code/loss_functions.py 147 | :pyobject: L1 148 | 149 | 150 | .. _mse: 151 | 152 | MSE (L2) 153 | ======== 154 | 155 | Mean Squared Error, or L2 loss. Excellent overview below [6] and [10]. 156 | 157 | .. math:: 158 | 159 | MSE = \frac{1}{m}\sum^{m}_{i=1}(y^{(i)} - \hat{y}^{(i)})^2 160 | 161 | .. line-block:: 162 | 163 | MSE - mean square error 164 | m - number of samples 165 | :math:`y^{(i)}` - ground truth label for i-th sample 166 | :math:`\hat{y}^{(i)}` - predicted label for i-th sample 167 | 168 | .. literalinclude:: ../code/loss_functions.py 169 | :language: python 170 | :pyobject: MSE 171 | 172 | .. literalinclude:: ../code/loss_functions.py 173 | :language: python 174 | :pyobject: MSE_prime 175 | 176 | 177 | .. rubric:: References 178 | 179 | .. [1] https://en.m.wikipedia.org/wiki/Cross_entropy 180 | .. [2] https://www.kaggle.com/wiki/LogarithmicLoss 181 | .. [3] https://en.wikipedia.org/wiki/Loss_functions_for_classification 182 | .. [4] http://www.exegetic.biz/blog/2015/12/making-sense-logarithmic-loss/ 183 | .. [5] http://neuralnetworksanddeeplearning.com/chap3.html 184 | .. [6] http://rishy.github.io/ml/2015/07/28/l1-vs-l2-loss/ 185 | .. [7] https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient 186 | .. [8] https://en.wikipedia.org/wiki/Huber_loss 187 | .. [9] https://en.wikipedia.org/wiki/Hinge_loss 188 | .. [10] http://www.chioka.in/differences-between-l1-and-l2-as-loss-function-and-regularization/ 189 | -------------------------------------------------------------------------------- /docs/math_notation.rst: -------------------------------------------------------------------------------- 1 | .. _math_notation: 2 | 3 | ======== 4 | Notation 5 | ======== 6 | 7 | Commonly used math symbols in machine learning texts. 8 | 9 | .. contents:: :local: 10 | 11 | .. note:: 12 | 13 | Use the `table generator `_ to quickly add new symbols. 14 | Import current tables into tablesgenerator from ``figures/*.tgn``. Export and save your changes. Also 15 | see helpful `multiline editing `_ in Sublime. 
16 | 17 | 18 | Algebra 19 | ------- 20 | 21 | +--------------------+--------------------+-----------------------+-------------------------+ 22 | | **Symbol** | **Name** | **Description** | **Example** | 23 | +--------------------+--------------------+-----------------------+-------------------------+ 24 | | :math:`(f ∘ g)` | composite function | a nested function | (f ∘ g)(x) = f(g(x)) | 25 | +--------------------+--------------------+-----------------------+-------------------------+ 26 | | :math:`∆` | delta | change / difference | ∆x = x_1 - x_0 | 27 | +--------------------+--------------------+-----------------------+-------------------------+ 28 | | :math:`e` | Euler's number | e = 2.718281828 | s = \frac{1}{1+e^{-z}} | 29 | +--------------------+--------------------+-----------------------+-------------------------+ 30 | | :math:`\sum` | summation | sum of all values | ∑ x_i = x_1 + x_2 + x_3 | 31 | +--------------------+--------------------+-----------------------+-------------------------+ 32 | | :math:`\prod` | capital pi | product of all values | ∏ x_i = x_1∙x_2∙x_3 | 33 | +--------------------+--------------------+-----------------------+-------------------------+ 34 | | :math:`\epsilon` | epsilon | tiny number near 0 | lr = 1e-4 | 35 | +--------------------+--------------------+-----------------------+-------------------------+ 36 | 37 | 38 | Calculus 39 | -------- 40 | 41 | +--------------------+-------------------+----------------------------------+-------------+ 42 | | **Symbol** | **Name** | **Description** | **Example** | 43 | +--------------------+-------------------+----------------------------------+-------------+ 44 | | :math:`x'` | derivative | first derivative | (x^2)' = 2x | 45 | +--------------------+-------------------+----------------------------------+-------------+ 46 | | :math:`x''` | second derivative | second derivative | (x^2)'' = 2 | 47 | +--------------------+-------------------+----------------------------------+-------------+ 48 | | :math:`\lim` | limit | function value as x approaches 0 | | 49 | +--------------------+-------------------+----------------------------------+-------------+ 50 | | :math:`∇` | nabla | gradient | ∇f(a,b,c) | 51 | +--------------------+-------------------+----------------------------------+-------------+ 52 | 53 | 54 | Linear algebra 55 | -------------- 56 | 57 | +-------------------+-------------+------------------------------------+---------------------------------+ 58 | | **Symbol** | **Name** | **Description** | **Example** | 59 | +-------------------+-------------+------------------------------------+---------------------------------+ 60 | | :math:`[ ]` | brackets | matrix or vector | :math:`M = [1 3 5]` | 61 | +-------------------+-------------+------------------------------------+---------------------------------+ 62 | | :math:`\cdot` | dot | dot product | :math:`(Z = X \cdot W` | 63 | +-------------------+-------------+------------------------------------+---------------------------------+ 64 | | :math:`\odot` | hadamard | hadamard product | :math:`A = B \odot C` | 65 | +-------------------+-------------+------------------------------------+---------------------------------+ 66 | | :math:`X^T` | transpose | matrix transpose | :math:`W^T \cdot X` | 67 | +-------------------+-------------+------------------------------------+---------------------------------+ 68 | | :math:`\vec x` | vector | vector | :math:`v = [1 2 3]` | 69 | 
+-------------------+-------------+------------------------------------+---------------------------------+ 70 | | :math:`X` | matrix | capitalized variables are matrices | :math:`X, W, B` | 71 | +-------------------+-------------+------------------------------------+---------------------------------+ 72 | | :math:`\hat x` | unit vector | vector of magnitude 1 | :math:`\hat x = [0.2 0.5 0.3]` | 73 | +-------------------+-------------+------------------------------------+---------------------------------+ 74 | 75 | 76 | Probability 77 | ----------- 78 | 79 | +-------------+---------------------+--------------------------+-----------------------+ 80 | | **Symbol** | **Name** | **Description** | **Example** | 81 | +-------------+---------------------+--------------------------+-----------------------+ 82 | | :math:`P(A)`| probability | probability of event A | P(x=1) = 0.5 | 83 | +-------------+---------------------+--------------------------+-----------------------+ 84 | 85 | 86 | Set theory 87 | ---------- 88 | 89 | +------------+---------------------+-----------------------------+-----------------------+ 90 | | **Symbol** | **Name** | **Description** | **Example** | 91 | +------------+---------------------+-----------------------------+-----------------------+ 92 | | :math:`{ }`| set | list of distinct elements | S = {1, 5, 7, 9} | 93 | +------------+---------------------+-----------------------------+-----------------------+ 94 | 95 | 96 | Statistics 97 | ---------- 98 | 99 | +------------------+---------------------+----------------------------------+-----------------------+ 100 | | **Symbol** | **Name** | **Description** | **Example** | 101 | +------------------+---------------------+----------------------------------+-----------------------+ 102 | | :math:`μ` | population mean | mean of population values | | 103 | +------------------+---------------------+----------------------------------+-----------------------+ 104 | | :math:`\bar x` | sample mean | mean of subset of population | | 105 | +------------------+---------------------+----------------------------------+-----------------------+ 106 | | :math:`σ^2` | population variance | variance of population value | | 107 | +------------------+---------------------+----------------------------------+-----------------------+ 108 | | :math:`s^2` | sample variance | variance of subset of population | | 109 | +------------------+---------------------+----------------------------------+-----------------------+ 110 | | :math:`σ_X` | standard deviation | population standard deviation | | 111 | +------------------+---------------------+----------------------------------+-----------------------+ 112 | | :math:`s` | sample std dev | standard deviation of sample | | 113 | +------------------+---------------------+----------------------------------+-----------------------+ 114 | | :math:`ρX` | correlation | correlation of variables X and Y | | 115 | +------------------+---------------------+----------------------------------+-----------------------+ 116 | | :math:`\tilde x` | median | median value of variable x | | 117 | +------------------+---------------------+----------------------------------+-----------------------+ 118 | 119 | 120 | .. rubric:: References 121 | 122 | .. [1] http://www.tablesgenerator.com/text_tables 123 | .. 
[2] http://www.rapidtables.com/math/symbols/Basic_Math_Symbols.htm 124 | 125 | -------------------------------------------------------------------------------- /docs/nn_concepts.rst: -------------------------------------------------------------------------------- 1 | .. _nn_concepts: 2 | 3 | ======== 4 | Concepts 5 | ======== 6 | 7 | .. contents:: :local: 8 | 9 | 10 | Neural Network 11 | ============== 12 | 13 | Neural networks are a class of machine learning algorithms used to model complex patterns in datasets using multiple hidden layers and non-linear activation functions. A neural network takes an input, passes it through multiple layers of hidden neurons (mini-functions with unique coefficients that must be learned), and outputs a prediction representing the combined input of all the neurons. 14 | 15 | .. image:: images/neural_network_w_matrices.png 16 | :align: center 17 | 18 | Neural networks are trained iteratively using optimization techniques like gradient descent. After each cycle of training, an error metric is calculated based on the difference between prediction and target. The derivatives of this error metric are calculated and propagated back through the network using a technique called backpropagation. Each neuron's coefficients (weights) are then adjusted relative to how much they contributed to the total error. This process is repeated iteratively until the network error drops below an acceptable threshold. 19 | 20 | 21 | Neuron 22 | ====== 23 | 24 | A neuron takes a group of weighted inputs, applies an activation function, and returns an output. 25 | 26 | .. image:: images/neuron.png 27 | :align: center 28 | 29 | Inputs to a neuron can either be features from a training set or outputs from a previous layer’s neurons. Weights are applied to the inputs as they travel along synapses to reach the neuron. The neuron then applies an activation function to the “sum of weighted inputs” from each incoming synapse and passes the result on to all the neurons in the next layer. 30 | 31 | 32 | 33 | Synapse 34 | ======= 35 | 36 | Synapses are like roads in a neural network. They connect inputs to neurons, neurons to neurons, and neurons to outputs. In order to get from one neuron to another, you have to travel along the synapse paying the “toll” (weight) along the way. Each connection between two neurons has a unique synapse with a unique weight attached to it. When we talk about updating weights in a network, we’re really talking about adjusting the weights on these synapses. 37 | 38 | 39 | .. _nn_weights: 40 | 41 | Weights 42 | ======= 43 | 44 | Weights are values that control the strength of the connection between two neurons. That is, inputs are typically multiplied by weights, and that defines how much influence the input will have on the output. In other words: when the inputs are transmitted between neurons, the weights are applied to the inputs along with an additional value (the bias) 45 | 46 | .. _nn_bias: 47 | 48 | Bias 49 | ==== 50 | 51 | Bias terms are additional constants attached to neurons and added to the weighted input before the activation function is applied. Bias terms help models represent patterns that do not necessarily pass through the origin. For example, if all your features were 0, would your output also be zero? Is it possible there is some base value upon which your features have an effect? Bias terms typically accompany weights and must also be learned by your model. 52 | 53 | 54 | Layers 55 | ====== 56 | 57 | .. 
image:: images/neural_network_simple.png 58 | :align: center 59 | 60 | .. rubric:: Input Layer 61 | 62 | Holds the data your model will train on. Each neuron in the input layer represents a unique attribute in your dataset (e.g. height, hair color, etc.). 63 | 64 | .. rubric:: Hidden Layer 65 | 66 | Sits between the input and output layers and applies an activation function before passing on the results. There are often multiple hidden layers in a network. In traditional networks, hidden layers are typically fully-connected layers — each neuron receives input from all the previous layer’s neurons and sends its output to every neuron in the next layer. This contrasts with how convolutional layers work, where each neuron sends its output to only some of the neurons in the next layer. 67 | 68 | .. rubric:: Output Layer 69 | 70 | The final layer in a network. It receives input from the previous hidden layer, optionally applies an activation function, and returns an output representing your model’s prediction. 71 | 72 | 73 | 74 | Weighted Input 75 | ============== 76 | 77 | A neuron’s input equals the sum of weighted outputs from all neurons in the previous layer. Each input is multiplied by the weight associated with the synapse connecting the input to the current neuron. If there are 3 inputs or neurons in the previous layer, each neuron in the current layer will have 3 distinct weights — one for each synapse. 78 | 79 | **Single Input** 80 | 81 | .. math:: 82 | 83 | Z &= Input \cdot Weight \\ 84 | &= X W 85 | 86 | **Multiple Inputs** 87 | 88 | .. math:: 89 | 90 | Z &= \sum_{i=1}^{n}x_i w_i \\ 91 | &= x_1 w_1 + x_2 w_2 + x_3 w_3 92 | 93 | 94 | Notice that it’s exactly the same equation we use with linear regression! In fact, a neural network with a single neuron is the same as linear regression! The only difference is that the neural network post-processes the weighted input with an activation function. 95 | 96 | 97 | 98 | Activation Functions 99 | ==================== 100 | 101 | Activation functions live inside neural network layers and modify the data they receive before passing it to the next layer. Activation functions give neural networks their power — allowing them to model complex non-linear relationships. By modifying inputs with non-linear functions, neural networks can model highly complex relationships between features. Popular activation functions include :ref:`relu ` and :ref:`sigmoid `. 102 | 103 | Activation functions typically have the following properties: 104 | 105 | * **Non-linear** - In linear regression we’re limited to a prediction equation that looks like a straight line. This is nice for simple datasets with a one-to-one relationship between inputs and outputs, but what if the patterns in our dataset were non-linear? (e.g. :math:`x^2`, sin, log). To model these relationships we need a non-linear prediction equation.¹ Activation functions provide this non-linearity. 106 | 107 | * **Continuously differentiable** — To improve our model with gradient descent, we need our output to have a nice slope so we can compute error derivatives with respect to weights. If our neuron instead output 0 or 1 (perceptron), we wouldn’t know in which direction to update our weights to reduce our error. 108 | 109 | * **Fixed Range** — Activation functions typically squash the input data into a narrow range, which makes training the model more stable and efficient.
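To make the weighted input and activation concrete, here is a minimal NumPy sketch; the shapes, names, and random data are illustrative assumptions, not code taken from this repository's code files. It computes :math:`Z = XW + b` for one fully-connected layer and squashes the result with a sigmoid activation:

.. code-block:: python

    import numpy as np

    def sigmoid(z):
        # Squash the weighted input into the (0, 1) range
        return 1 / (1 + np.exp(-z))

    X = np.random.randn(4, 3)   # 4 observations, 3 input features
    W = np.random.randn(3, 2)   # one weight per input feature for each of 2 neurons
    b = np.zeros((1, 2))        # one bias per neuron

    Z = X @ W + b               # weighted input for every neuron in the layer
    A = sigmoid(Z)              # activations passed on to the next layer
    print(A.shape)              # (4, 2)

Stacking several such layers, each with its own weights, bias, and non-linear activation, is what lets the network model non-linear relationships.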
110 | 111 | 112 | Loss Functions 113 | ============== 114 | 115 | A loss function, or cost function, is a wrapper around our model's predict function that tells us "how good" the model is at making predictions for a given set of parameters. The loss function has its own curve and its own derivatives. The slope of this curve tells us how to change our parameters to make the model more accurate! We use the model to make predictions and the cost function to update our parameters. The cost function can take a variety of forms, as there are many different cost functions available. Popular loss functions include :ref:`mse` and :ref:`Cross-entropy Loss `. 116 | 117 | 118 | Optimization Algorithms 119 | ======================= 120 | 121 | Be the first to `contribute! `__ 122 | 123 | 124 | Gradient Accumulation 125 | ===================== 126 | 127 | Gradient accumulation is a mechanism to split the batch of samples—used for training a neural network—into several mini-batches that will be run sequentially. 128 | 129 | This is used to enable batch sizes that require more GPU memory than is available. Gradient accumulation makes this possible by using mini-batches small enough to fit in the available GPU memory. 130 | 131 | Gradient accumulation means running all mini-batches sequentially (generally on the same GPU) while accumulating their calculated gradients and not updating the model variables - the weights and biases of the model. 132 | The model variables must not be updated during the accumulation in order to ensure all mini-batches use the same model variable values to calculate their gradients. 133 | Only after accumulating the gradients of all those mini-batches will we generate and apply the updates for the model variables. 134 | 135 | This results in the same updates for the model parameters as if we were to use the global batch. 136 | 137 | .. image:: images/gradient_accumulation.png 138 | :align: center 139 | 140 | More details, a technical and algorithmic deep dive, how-to tutorials, and examples can be found at [2]. 141 | 142 | 143 | 144 | .. rubric:: References 145 | 146 | .. [1] http://sebastianruder.com/optimizing-gradient-descent/ 147 | .. [2] https://github.com/run-ai/runai/tree/master/runai/ga/ 148 | -------------------------------------------------------------------------------- /docs/optimizers.rst: -------------------------------------------------------------------------------- 1 | .. _optimizers: 2 | 3 | ========== 4 | Optimizers 5 | ========== 6 | 7 | .. rubric:: What is an Optimizer? 8 | 9 | It is very important to tweak the weights of the model during the training process to make our predictions as correct and optimized as possible. But how exactly do you do that? How do you change the parameters of your model, by how much, and when? 10 | 11 | The best answer to all of the above questions is *optimizers*. They tie together the loss function and model parameters by updating the model in response to the output of the loss function. In simpler terms, optimizers shape and mold your model into its most accurate possible form by futzing with the weights. The loss function is the guide to the terrain, telling the optimizer when it’s moving in the right or wrong direction. 12 | 13 | Below is a list of example optimizers. 14 | 15 | .. contents:: :local: 16 | 17 | ..
image:: images/optimizers.gif 18 | :align: center 19 | 20 | Image Credit: `CS231n `_ 21 | 22 | Adagrad 23 | ------- 24 | 25 | Adagrad (short for adaptive gradient) adaptively sets the learning rate for each parameter. 26 | 27 | - Parameters that have higher gradients or frequent updates should have a slower learning rate so that we do not overshoot the minimum value. 28 | - Parameters that have low gradients or infrequent updates should have a faster learning rate so that they get trained quickly. 29 | - It divides the learning rate by the sum of squares of all previous gradients of the parameter. 30 | - When the sum of the squared past gradients has a high value, it basically divides the learning rate by a high value, so the learning rate will become less. 31 | - Similarly, if the sum of the squared past gradients has a low value, it divides the learning rate by a lower value, so the learning rate will become high. 32 | - This implies that the learning rate is inversely proportional to the sum of the squares of all the previous gradients of the parameter. 33 | 34 | .. math:: 35 | 36 | g_{t}^{i} = \frac{\partial \mathcal{J}(w_{t}^{i})}{\partial W} \\ 37 | W = W - \alpha \frac{g_{t}^{i}}{\sqrt{\sum_{r=1}^{t}\left ( g_{r}^{i} \right )^{2} + \varepsilon }} 38 | 39 | .. note:: 40 | 41 | - :math:`g_{t}^{i}` - the gradient of the parameter :math:`w^{i}` at iteration t 42 | - :math:`\alpha` - the learning rate 43 | - :math:`\epsilon` - very small value to avoid dividing by zero 44 | 45 | .. literalinclude:: ../code/optimizers.py 46 | :language: python 47 | :pyobject: Adagrad 48 | 49 | Adadelta 50 | -------- 51 | 52 | AdaDelta belongs to the family of stochastic gradient descent algorithms that provide adaptive techniques for hyperparameter tuning. Adadelta is probably short for ‘adaptive delta’, where delta here refers to the difference between the current weight and the newly updated weight. 53 | 54 | The main disadvantage of Adagrad is its accumulation of the squared gradients. During the training process, the accumulated sum keeps growing. From the above formula we can see that, as the accumulated sum increases, the learning rate shrinks and eventually becomes infinitesimally small, at which point the algorithm is no longer able to acquire additional knowledge. 55 | 56 | Adadelta is a more robust extension of Adagrad that adapts learning rates based on a moving window of gradient updates, instead of accumulating all past gradients. This way, Adadelta continues learning even when many updates have been done. 57 | 58 | With Adadelta, we do not even need to set a default learning rate, as it has been eliminated from the update rule. 59 | 60 | The update rule looks something like this: 61 | 62 | .. math:: 63 | 64 | v_t &= \rho v_{t-1} + (1-\rho) (\nabla_\theta J( \theta))^2 \\ 65 | \Delta\theta &= \dfrac{\sqrt{w_t + \epsilon}}{\sqrt{v_t + \epsilon}} \nabla_\theta J( \theta) \\ 66 | \theta &= \theta - \eta \Delta\theta \\ 67 | w_t &= \rho w_{t-1} + (1-\rho) \Delta\theta^2 68 | 69 | .. literalinclude:: ../code/optimizers.py 70 | :language: python 71 | :pyobject: Adadelta 72 | 73 | Adam 74 | ---- 75 | 76 | Adaptive Moment Estimation (Adam) combines ideas from both RMSProp and Momentum. It computes adaptive learning rates for each parameter and works as follows. 77 | 78 | - First, it computes the exponentially weighted average of past gradients (:math:`v_{dW}`). 79 | - Second, it computes the exponentially weighted average of the squares of past gradients (:math:`s_{dW}`).
80 | - Third, these averages have a bias towards zero, and to counteract this, a bias correction is applied (:math:`v_{dW}^{corrected}`, :math:`s_{dW}^{corrected}`). 81 | - Lastly, the parameters are updated using the information from the calculated averages. 82 | 83 | .. math:: 84 | 85 | 86 | v_{dW} = \beta_1 v_{dW} + (1 - \beta_1) \frac{\partial \mathcal{J} }{ \partial W } \\ 87 | s_{dW} = \beta_2 s_{dW} + (1 - \beta_2) (\frac{\partial \mathcal{J} }{\partial W })^2 \\ 88 | v^{corrected}_{dW} = \frac{v_{dW}}{1 - (\beta_1)^t} \\ 89 | s^{corrected}_{dW} = \frac{s_{dW}}{1 - (\beta_2)^t} \\ 90 | W = W - \alpha \frac{v^{corrected}_{dW}}{\sqrt{s^{corrected}_{dW}} + \varepsilon} 91 | 92 | .. note:: 93 | 94 | - :math:`v_{dW}` - the exponentially weighted average of past gradients 95 | - :math:`s_{dW}` - the exponentially weighted average of past squares of gradients 96 | - :math:`\beta_1` - hyperparameter to be tuned 97 | - :math:`\beta_2` - hyperparameter to be tuned 98 | - :math:`\frac{\partial \mathcal{J} }{ \partial W }` - cost gradient with respect to current layer 99 | - :math:`W` - the weight matrix (parameter to be updated) 100 | - :math:`\alpha` - the learning rate 101 | - :math:`\epsilon` - very small value to avoid dividing by zero 102 | 103 | 104 |
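As a rough illustration of the equations above, the Adam update for a single weight tensor might be sketched as follows. This is a simplified sketch, not the implementation shipped in this repository's ``optimizers.py``; the function name and single-tensor setup are assumptions made for clarity.

.. code-block:: python

    import numpy as np

    def adam_update(w, grad, v, s, t, alpha=0.001,
                    beta1=0.9, beta2=0.999, epsilon=1e-8):
        """One Adam step for a single weight tensor (illustrative sketch)."""
        # Exponentially weighted averages of the gradient and of its square
        v = beta1 * v + (1 - beta1) * grad
        s = beta2 * s + (1 - beta2) * grad ** 2

        # Bias correction counteracts the zero initialization of v and s
        v_corrected = v / (1 - beta1 ** t)
        s_corrected = s / (1 - beta2 ** t)

        # Parameter update
        w = w - alpha * v_corrected / (np.sqrt(s_corrected) + epsilon)
        return w, v, s

Here ``v`` and ``s`` start as zero arrays with the same shape as ``w``, and ``t`` starts at 1 and is incremented after every step.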
105 | Conjugate Gradients 106 | ------------------- 107 | 108 | Be the first to `contribute! `__ 109 | 110 | 111 | .. _optimizers_lbfgs: 112 | 113 | BFGS 114 | ---- 115 | 116 | Be the first to `contribute! `__ 117 | 118 | 119 | Momentum 120 | -------- 121 | 122 | Used in conjunction with Stochastic Gradient Descent (SGD) or Mini-Batch Gradient Descent, Momentum takes into account 123 | past gradients to smooth out the update. This is seen in variable :math:`v`, which is an exponentially weighted average 124 | of the gradient on previous steps. This results in minimizing oscillations and faster convergence. 125 | 126 | .. math:: 127 | 128 | v_{dW} = \beta v_{dW} + (1 - \beta) \frac{\partial \mathcal{J} }{ \partial W } \\ 129 | W = W - \alpha v_{dW} 130 | 131 | .. note:: 132 | 133 | - :math:`v` - the exponentially weighted average of past gradients 134 | - :math:`\frac{\partial \mathcal{J} }{ \partial W }` - cost gradient with respect to current layer weight tensor 135 | - :math:`W` - weight tensor 136 | - :math:`\beta` - hyperparameter to be tuned 137 | - :math:`\alpha` - the learning rate 138 | 139 | Nesterov Momentum 140 | ----------------- 141 | 142 | Be the first to `contribute! `__ 143 | 144 | 145 | Newton's Method 146 | --------------- 147 | 148 | Be the first to `contribute! `__ 149 | 150 | 151 | RMSProp 152 | ------- 153 | 154 | Another adaptive learning rate optimization algorithm, Root Mean Square Prop (RMSProp) works by keeping an exponentially weighted average of the squares of past gradients. 155 | RMSProp then divides the learning rate by this average to speed up convergence. 156 | 157 | 158 | .. math:: 159 | 160 | 161 | s_{dW} = \beta s_{dW} + (1 - \beta) (\frac{\partial \mathcal{J} }{\partial W })^2 \\ 162 | W = W - \alpha \frac{\frac{\partial \mathcal{J} }{\partial W }}{\sqrt{s_{dW}} + \varepsilon} 163 | 164 | .. note:: 165 | 166 | - :math:`s` - the exponentially weighted average of past squares of gradients 167 | - :math:`\frac{\partial \mathcal{J} }{\partial W }` - cost gradient with respect to current layer weight tensor 168 | - :math:`W` - weight tensor 169 | - :math:`\beta` - hyperparameter to be tuned 170 | - :math:`\alpha` - the learning rate 171 | - :math:`\epsilon` - very small value to avoid dividing by zero 172 | 173 | SGD 174 | --- 175 | 176 | SGD stands for Stochastic Gradient Descent. In Stochastic Gradient Descent, a few samples are selected randomly instead of the whole dataset for each iteration. In Gradient Descent, there is a term called “batch” which denotes the total number of samples from a dataset used to calculate the gradient for each iteration. In typical Gradient Descent optimization, like Batch Gradient Descent, the batch is taken to be the whole dataset. Using the whole dataset is useful for getting to the minima in a less noisy and less random manner, but problems arise when our datasets get really huge. 177 | 178 | This problem is solved by Stochastic Gradient Descent. In SGD, only a single, randomly selected sample is used to perform each iteration. 179 | 180 | Since only one sample from the dataset is chosen at random for each iteration, the path taken by the algorithm to reach the minima is usually noisier than for typical Gradient Descent. But that doesn’t matter much, because the exact path taken by the algorithm does not matter as long as we reach the minima, usually with significantly shorter training time. 181 | 182 | .. literalinclude:: ../code/optimizers.py 183 | :language: python 184 | :pyobject: SGD 185 | 186 | 187 | .. rubric:: References 188 | 189 | .. [1] https://ruder.io/optimizing-gradient-descent/ 190 | .. [2] http://www.deeplearningbook.org/contents/optimization.html 191 | .. [3] https://arxiv.org/pdf/1502.03167.pdf 192 | -------------------------------------------------------------------------------- /docs/other_content.rst: -------------------------------------------------------------------------------- 1 | .. _content: 2 | 3 | ============= 4 | Other Content 5 | ============= 6 | 7 | Books, blogs, courses and more forked from josephmisiti's `awesome machine learning `_ 8 | 9 | ..
contents:: :local: 10 | 11 | Blogs 12 | ===== 13 | 14 | Data Science 15 | ------------ 16 | 17 | - https://jeremykun.com/ 18 | - http://iamtrask.github.io/ 19 | - http://blog.explainmydata.com/ 20 | - http://andrewgelman.com/ 21 | - http://simplystatistics.org/ 22 | - http://www.evanmiller.org/ 23 | - http://jakevdp.github.io/ 24 | - http://blog.yhat.com/ 25 | - http://wesmckinney.com 26 | - http://www.overkillanalytics.net/ 27 | - http://newton.cx/~peter/ 28 | - http://mbakker7.github.io/exploratory_computing_with_python/ 29 | - https://sebastianraschka.com/blog/index.html 30 | - http://camdavidsonpilon.github.io/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/ 31 | - http://colah.github.io/ 32 | - http://www.thomasdimson.com/ 33 | - http://blog.smellthedata.com/ 34 | - https://sebastianraschka.com/ 35 | - http://dogdogfish.com/ 36 | - http://www.johnmyleswhite.com/ 37 | - http://drewconway.com/zia/ 38 | - http://bugra.github.io/ 39 | - http://opendata.cern.ch/ 40 | - https://alexanderetz.com/ 41 | - http://www.sumsar.net/ 42 | - https://www.countbayesie.com 43 | - http://blog.kaggle.com/ 44 | - http://www.danvk.org/ 45 | - http://hunch.net/ 46 | - http://www.randalolson.com/blog/ 47 | - https://www.johndcook.com/blog/r_language_for_programmers/ 48 | - http://www.dataschool.io/ 49 | 50 | 51 | Machine learning 52 | ---------------- 53 | 54 | - `OpenAI `__ 55 | - `Distill `__ 56 | - `Andrej Karpathy Blog `__ 57 | - `Colah's Blog `__ 58 | - `WildML `__ 59 | - `FastML `__ 60 | - `TheMorningPaper `__ 61 | 62 | 63 | Math 64 | ---- 65 | 66 | - http://www.sumsar.net/ 67 | - http://allendowney.blogspot.ca/ 68 | - https://healthyalgorithms.com/ 69 | - https://petewarden.com/ 70 | - http://mrtz.org/blog/ 71 | 72 | 73 | 74 | Books 75 | ===== 76 | 77 | Machine learning 78 | ---------------- 79 | 80 | - `Real World Machine Learning `__ [Free Chapters] 81 | - `An Introduction To Statistical Learning `__ - Book + R Code 82 | - `Elements of Statistical Learning `__ - Book 83 | - `Probabilistic Programming & Bayesian Methods for Hackers `__ - Book + IPython Notebooks 84 | - `Think Bayes `__ - Book + Python Code 85 | - `Information Theory, Inference, and Learning Algorithms `__ 86 | - `Gaussian Processes for Machine Learning `__ 87 | - `Data Intensive Text Processing w/ MapReduce `__ 88 | - `Reinforcement Learning: - An Introduction `__ 89 | - `Mining Massive Datasets `__ 90 | - `A First Encounter with Machine Learning `__ 91 | - `Pattern Recognition and Machine Learning `__ 92 | - `Machine Learning & Bayesian Reasoning `__ 93 | - `Introduction to Machine Learning `__ - Alex Smola and S.V.N. Vishwanathan 94 | - `A Probabilistic Theory of Pattern Recognition `__ 95 | - `Introduction to Information Retrieval `__ 96 | - `Forecasting: principles and practice `__ 97 | - `Practical Artificial Intelligence Programming in Java `__ 98 | - `Introduction to Machine Learning `__ - Amnon Shashua 99 | - `Reinforcement Learning `__ 100 | - `Machine Learning `__ 101 | - `A Quest for AI `__ 102 | - `Introduction to Applied Bayesian Statistics and Estimation for Social Scientists `__ - Scott M. 
Lynch 103 | - `Bayesian Modeling, Inference and Prediction `__ 104 | - `A Course in Machine Learning `__ 105 | - `Machine Learning, Neural and Statistical Classification `__ 106 | - `Bayesian Reasoning and Machine Learning `__ Book+MatlabToolBox 107 | - `R Programming for Data Science `__ 108 | - `Data Mining - Practical Machine Learning Tools and Techniques `__ Book 109 | 110 | 111 | Deep learning 112 | ------------- 113 | 114 | - `Deep Learning - An MIT Press book `__ 115 | - `Coursera Course Book on NLP `__ 116 | - `NLTK `__ 117 | - `NLP w/ Python `__ 118 | - `Foundations of Statistical Natural Language Processing `__ 119 | - `An Introduction to Information Retrieval `__ 120 | - `A Brief Introduction to Neural Networks `__ 121 | - `Neural Networks and Deep Learning `__ 122 | 123 | 124 | Probability & Statistics 125 | ------------------------ 126 | 127 | - `Think Stats `__ - Book + Python Code 128 | - `From Algorithms to Z-Scores `__ - Book 129 | - `The Art of R Programming `__ 130 | - `Introduction to statistical thought `__ 131 | - `Basic Probability Theory `__ 132 | - `Introduction to probability `__ - By Dartmouth College 133 | - `Principle of Uncertainty `__ 134 | - `Probability & Statistics Cookbook `__ 135 | - `Advanced Data Analysis From An Elementary Point of View `__ 136 | - `Introduction to Probability `__ - Book and course by MIT 137 | - `The Elements of Statistical Learning: Data Mining, Inference, and Prediction. `__ -Book 138 | - `An Introduction to Statistical Learning with Applications in R `__ - Book 139 | - `Learning Statistics Using R `__ 140 | - `Introduction to Probability and Statistics Using R `__ - Book 141 | - `Advanced R Programming `__ - Book 142 | - `Practical Regression and Anova using R `__ - Book 143 | - `R practicals `__ - Book 144 | - `The R Inferno `__ - Book 145 | 146 | Linear Algebra 147 | -------------- 148 | 149 | - `Linear Algebra Done Wrong `__ 150 | - `Linear Algebra, Theory, and Applications `__ 151 | - `Convex Optimization `__ 152 | - `Applied Numerical Computing `__ 153 | - `Applied Numerical Linear Algebra `__ 154 | 155 | 156 | Courses 157 | ======= 158 | 159 | - `CS231n, Convolutional Neural Networks for Visual Recognition, Stanford University `__ 160 | - `CS224d, Deep Learning for Natural Language Processing, Stanford University `__ 161 | - `Oxford Deep NLP 2017, Deep Learning for Natural Language Processing, University of Oxford `__ 162 | - `Artificial Intelligence (Columbia University) `__ - free 163 | - `Machine Learning (Columbia University) `__ - free 164 | - `Machine Learning (Stanford University) `__ - free 165 | - `Neural Networks for Machine Learning (University of Toronto) `__ - free 166 | - `Machine Learning Specialization (University of Washington) `__ - Courses: Machine Learning Foundations: A Case Study Approach, Machine Learning: Regression, Machine Learning: Classification, Machine Learning: Clustering & Retrieval, Machine Learning: Recommender Systems & Dimensionality Reduction,Machine Learning Capstone: An Intelligent Application with Deep Learning; free 167 | - `Machine Learning Course (2014-15 session) (by Nando de Freitas, University of Oxford) `__ - Lecture slides and video recordings. 168 | - `Learning from Data (by Yaser S. 
Abu-Mostafa, Caltech) `__ - Lecture videos available 169 | 170 | 171 | Podcasts 172 | ======== 173 | 174 | - `The O'Reilly Data Show `__ 175 | - `Partially Derivative `__ 176 | - `The Talking Machines `__ 177 | - `The Data Skeptic `__ 178 | - `Linear Digressions `__ 179 | - `Data Stories `__ 180 | - `Learning Machines 101 `__ 181 | - `Not So Standard Deviations `__ 182 | - `TWIMLAI `__ 183 | -_`Machine Learning Guide `_ 184 | 185 | 186 | Tutorials 187 | ========= 188 | 189 | Be the first to `contribute! `__ 190 | 191 | 192 | -------------------------------------------------------------------------------- /docs/probability.rst: -------------------------------------------------------------------------------- 1 | .. _probability: 2 | 3 | =========== 4 | Probability 5 | =========== 6 | 7 | .. contents:: :local: 8 | 9 | Basic concepts in probability for machine learning. 10 | 11 | This cheatsheet is a 10-page reference in probability that covers a semester's worth of introductory probability. 12 | 13 | The cheatsheet is based off of Harvard's introductory probability course, Stat 110. It is co-authored by former Stat 110 Teaching Fellow William Chen and Stat 110 Professor Joe Blitzstein. 14 | 15 | Links 16 | ------- 17 | 18 | * [Probability Cheatsheet PDF](http://www.wzchen.com/probability-cheatsheet/) 19 | 20 | 21 | Screenshots 22 | ------- 23 | 24 | ![First Page](http://i.imgur.com/Oa73huL.jpg) 25 | ![Second Page](http://i.imgur.com/dyvW2rB.jpg) 26 | 27 | 28 | License 29 | ------- 30 | 31 | This work is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.][by-nc-sa]. 32 | 33 | [![Creative Commons License][by-nc-sa-img]][by-nc-sa] 34 | 35 | .. rubric:: References 36 | 37 | .. [1] Example 38 | -------------------------------------------------------------------------------- /docs/regression_algos.rst: -------------------------------------------------------------------------------- 1 | .. _regression_algos: 2 | 3 | ===================== 4 | Regression Algorithms 5 | ===================== 6 | 7 | .. contents:: :local: 8 | 9 | 10 | Ordinary Least Squares 11 | ====================== 12 | 13 | OLS is the method with which linear regression is performed. The square of the difference from the mean is taken for every data point, and the summed loss function is to be minimized. 14 | 15 | .. math:: 16 | l = \sum_{i=1}^n (y_i - \bar{y})^2 17 | 18 | 19 | 20 | Polynomial 21 | ========== 22 | 23 | Polynomial regression is a modification of linear regression where the existing features are mapped to a polynomial form. The problem is still a linear regression problem, but the input vector is now mapped to a higher dimensional vector which serves as a pseudo-input vector of sorts. 24 | 25 | .. math:: 26 | 27 | \textbf{x} = (x_0, x_1) \rightarrow \textbf{x'} = (x_0, x^2_0, x_1, x^2_1, x_0x_1) 28 | 29 | 30 | Lasso 31 | ===== 32 | 33 | Lasso Regression tries to reduce the ordinary least squares error similar to vanilla regression, but adds an extra term. The sum of the :math:`L_1` norm for every data point multiplied by a hyperparameter :math:`\alpha` is used. This reduces model complexity and prevents overfitting. 34 | 35 | .. math:: 36 | 37 | l = \sum_{i=1}^n (y_i - \tilde{y})^2 + \alpha \sum_{j=1}^p |w_j| 38 | 39 | 40 | Ridge 41 | ===== 42 | 43 | Ridge regression is similar to lasso regression, but the regularization term uses the :math:`L_2` norm instead. 44 | 45 | .. 
math:: 46 | 47 | l = \sum_{i=1}^n (y_i - \tilde{y})^2 + \alpha \sum_{j=1}^p w^2_j 48 | 49 | 50 | 51 | Stepwise 52 | ======== 53 | 54 | Stepwise regression or spline regression helps us fit a piecewise function to the data. It is usually used with linear models, but it can be generalized to higher degrees as well. The regression equation takes the form of 55 | 56 | .. math:: 57 | y = ax + b(x-\bar{x})H_{\alpha}+c 58 | 59 | where :math:`H_{\alpha}` is the shifted Heaviside step function, having its discontinuity at :math:`\alpha`. 60 | 61 | 62 | .. rubric:: References 63 | 64 | .. [1] https://www.analyticsvidhya.com/blog/2015/08/comprehensive-guide-regression/ 65 | .. [2] http://machinelearningmastery.com/a-tour-of-machine-learning-algorithms/ 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /docs/reinforcement_learning.rst: -------------------------------------------------------------------------------- 1 | .. _reinforcement_learning: 2 | 3 | ====================== 4 | Reinforcement Learning 5 | ====================== 6 | 7 | In machine learning, supervised learning is sometimes contrasted with unsupervised learning. This is a useful distinction, but there are some problem domains that share characteristics with each without fitting exactly into either category. In cases where the algorithm does not have explicit labels but does receive a form of feedback, we are dealing with a third and distinct paradigm of machine learning - reinforcement learning. 8 | 9 | A programmatic and theoretical introduction to reinforcement learning: https://spinningup.openai.com/ 10 | 11 | There are different problem types and algorithms, but all reinforcement learning problems have the following aspects in common: 12 | 13 | * an **agent** - the algorithm or "AI" responsible for making decisions 14 | 15 | * an **environment**, consisting of different **states** in which the agent may find itself 16 | 17 | * a **reward** signal which is returned by the environment as a function of the current state 18 | 19 | * **actions**, each of which takes the agent from one state to another 20 | 21 | * a **policy**, i.e. a mapping from states to actions that defines the agent's behavior 22 | 23 | The goal of reinforcement learning is to learn the optimal policy, that is, the policy that maximizes expected (discounted) cumulative reward. 24 | 25 | Many RL algorithms will include a value function or a Q-function. A value function gives the expected cumulative reward for each state under the current policy. In other words, it answers the question, "If I begin in state :math:`i` and follow my policy, what will be my expected reward?" 26 | 27 | In most algorithms, expected cumulative reward is discounted by some factor :math:`\gamma \in (0, 1)`; a typical value for :math:`\gamma` is 0.9. In addition to more accurately modeling the behavior of humans and other animals, :math:`\gamma < 1` helps to ensure that algorithms converge even when there is no terminal state or when the terminal state is never found (because otherwise expected cumulative reward may also become infinite). 28 | 29 | Note on Terminology 30 | ------------------- 31 | 32 | For mostly historical reasons, engineering and operations research use different words to talk about the same concepts. For example, the general field of reinforcement learning itself is sometimes referred to as optimal control, approximate dynamic programming, or neuro-dynamic programming.\ :sup:`1` 33 | 34 | Exploration vs. 
Exploitation 35 | ---------------------------- 36 | 37 | One dilemma inherent to the RL problem setting is the tension between the desire to choose the best known option and the need to try something new in order to discover other options that may be even better. Choosing the best known action is known as exploitation, while choosing a different action is known as exploration. 38 | 39 | Typically, this is solved by adding to the policy a small probability of exploration. For example, the policy could be to choose the optimal action (optimal with regard to what is known) with probability 0.95, and to explore by randomly choosing some other action with probability 0.05 (if uniform across all remaining actions: probability 0.05/(n-1), where n is the number of actions). 40 | 41 | MDPs and Tabular methods 42 | ------------------------ 43 | 44 | Many problems can be effectively modeled as Markov Decision Processes (MDPs), or more generally as `Partially Observable Markov Decision Processes (POMDPs) `. That is, we have 45 | 46 | * a set of states :math:`S` 47 | * a set of actions :math:`A` 48 | * a set of conditional state transition probabilities :math:`T` 49 | * a reward function :math:`R: S \times A \rightarrow \mathbb{R}` 50 | * a set of observations :math:`\Omega` 51 | * a set of conditional observation probabilities :math:`O` 52 | * a discount factor :math:`\gamma \in [0, 1]` 53 | 54 | Given these things, the goal is to choose the action at each time step which will maximize :math:`E \left[ \sum_{t=0}^{\infty} \gamma^t r_t \right]`, the expected discounted reward. 55 | 56 | Monte Carlo methods 57 | ------------------- 58 | 59 | One possible approach is to run a large number of simulations to learn the optimal policy :math:`\pi^*`. This is good for cases where we know the environment and can run many simulations reasonably quickly. For example, it is fairly trivial to compute an optimal policy for the card game `21 (blackjack) ` by running many simulations, and the same is true for most simple games. 60 | 61 | Temporal-Difference Learning 62 | ---------------------------- 63 | 64 | TODO 65 | 66 | Planning 67 | -------- 68 | 69 | TODO 70 | 71 | On-Policy vs. Off-Policy Learning 72 | --------------------------------- 73 | 74 | TODO 75 | 76 | Model-Free vs. Model-Based Approaches 77 | ------------------------------------- 78 | 79 | TODO 80 | 81 | Imitation Learning 82 | ------------------ 83 | 84 | TODO 85 | 86 | Q-Learning 87 | ---------- 88 | 89 | Q-Learning is a model-free RL algorithm that iteratively updates the Q-values toward their optimal values. It is an off-policy method that selects the optimal action based on the current estimate of Q\* rather than following the current policy. 90 | 91 | The Q-Learning algorithm is: 92 | 93 | #. Initialize t = 0. 94 | #. Start at initial state s\ :sub:`t` = 0. 95 | #. The agent chooses a\ :sub:`t` = ɛ-greedy 96 | action. 97 | #. For given a\ :sub:`t`, the agent retrieves 98 | the reward r\ :sub:`t+1` as well as the next 99 | state s\ :sub:`t+1`. 100 | #. Get (but do not perform) the next action 101 | a\ :sub:`t+1` = 102 | argmax\ :sub:`a∈A`\ Q(s\ :sub:`t+1`, a). 103 | #. Compute the TD target y\ :sub:`t` = 104 | r\ :sub:`t+1` + γ · Q(s\ :sub:`t+1`, 105 | a\ :sub:`t+1`), where γ is the discount 106 | factor. 107 | #. Calculate the TD error δ = y\ :sub:`t` − 108 | Q(s\ :sub:`t`, a\ :sub:`t`). 109 | #. Update Q(s\ :sub:`t`, a\ :sub:`t`) ← 110 | Q(s\ :sub:`t`, a\ :sub:`t`) + α\ :sub:`t` · 111 | δ, where α\ :sub:`t` is the step size 112 | (learning rate) at t. 113 | #. 
Update t ← t + 1 and repeat step 3-9 until 114 | Q(s, a) converge. 115 | 116 | Epsilon-Greedy Algorithm 117 | 118 | .. math:: 119 | 120 | \begin{equation} 121 | a_{t} = \begin{cases} 122 | argmax_{a∈A} & \text{if } p = 1 - e \\ 123 | random\, action\ &\text{otherwise} 124 | \end{cases} 125 | \end{equation} 126 | 127 | The agent performs optimal action for exploitation or random action for exploration during training. It acts randomly in the beginning with the ɛ = 1 and chooses the best action based on the Q function with a decreasing ɛ capped at some small constant not equal to zero. 128 | 129 | Q-Table / Q-Matrix 130 | 131 | +-------------+---------------+---------------+-----+---------------+ 132 | | | a\ :sub:`1` | a\ :sub:`2` | ... | a\ :sub:`n` | 133 | +-------------+---------------+---------------+-----+---------------+ 134 | | s\ :sub:`1` | Q | Q | ... | Q | 135 | | | (s\ :sub:`1`, | (s\ :sub:`1`, | | (s\ :sub:`1`, | 136 | | | a\ :sub:`1`) | a\ :sub:`2`) | | a\ :sub:`3`) | 137 | +-------------+---------------+---------------+-----+---------------+ 138 | | s\ :sub:`2` | Q | Q | ... | Q | 139 | | | (s\ :sub:`2`, | (s\ :sub:`2`, | | (s\ :sub:`2`, | 140 | | | a\ :sub:`1`) | a\ :sub:`2`) | | a\ :sub:`3`) | 141 | +-------------+---------------+---------------+-----+---------------+ 142 | | ... | ... | ... | ... | ... | 143 | +-------------+---------------+---------------+-----+---------------+ 144 | | s\ :sub:`m` | Q | Q | ... | Q | 145 | | | (s\ :sub:`m`, | (s\ :sub:`m`, | | (s\ :sub:`m`, | 146 | | | a\ :sub:`1`) | a\ :sub:`2`) | | a\ :sub:`3`) | 147 | +-------------+---------------+---------------+-----+---------------+ 148 | 149 | It's a lookup table storing the action-value function Q(s, a) for state-action pairs where there are M states and n actions. We can initialize the Q(s, a) arbitrarily except s = terminal state. For s = final state, we set it equal to the reward on that state. 150 | 151 | Reasons of using Q Learning are: 152 | 153 | - It’s applicable for the discrete action space of our environment. 154 | - When we don’t have the true MDP model: transitional probability matrix and rewards (Model-Free Setting). 155 | - It's able to learn from incomplete episodes because of TD learning. 156 | 157 | Drawbacks of Q Learning are: 158 | 159 | - When the state space and action space are continuous and extremely large, due to the curse of dimensionality, it’s nearly impossible to maintain a Q-matrix when the data is large. 160 | - Using a Q-table is unable to infer optimal action for unseen states. 161 | 162 | Deep Q-Learning 163 | --------------- 164 | 165 | Deep Q-learning pursues the same general methods as Q-learning. Its innovation is to add a neural network, which makes it possible to learn a very complex Q-function. This makes it very powerful, especially because it makes a large body of well-developed theory and tools for deep learning useful to reinforcement learning problems. 
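To make the tabular Q-Learning algorithm above concrete, here is a minimal sketch. The ``env`` object and its ``reset()``/``step()`` interface (returning the next state, the reward, and a done flag, in the style of the classic OpenAI Gym API) are assumptions for illustration, not part of this repository:

.. code-block:: python

    import numpy as np

    def q_learning(env, n_states, n_actions, episodes=500,
                   alpha=0.1, gamma=0.9, epsilon=0.1):
        """Tabular Q-Learning with an epsilon-greedy behavior policy (sketch)."""
        Q = np.zeros((n_states, n_actions))
        for _ in range(episodes):
            state = env.reset()
            done = False
            while not done:
                # Epsilon-greedy action selection
                if np.random.rand() < epsilon:
                    action = np.random.randint(n_actions)
                else:
                    action = int(np.argmax(Q[state]))

                next_state, reward, done = env.step(action)

                # Off-policy TD target: bootstrap from the greedy action in the next state
                td_target = reward + gamma * (1 - done) * np.max(Q[next_state])
                td_error = td_target - Q[state, action]
                Q[state, action] += alpha * td_error

                state = next_state
        return Q

Deep Q-Learning replaces the table ``Q`` with a neural network that approximates :math:`Q(s, a)` and is trained to minimize the same TD error.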
166 | 167 | Examples of Applications 168 | ------------------------ 169 | 170 | * `Getting Started With OpenAI Gym: Creating Custom Gym Environments `_ 171 | 172 | * `What Is Q-Learning: The Best Guide To Understand Q-Learning (Simplilearn) `_ 173 | 174 | * `REINFORCEMENT LEARNING (DQN) TUTORIAL (PyTorch) `_ 175 | 176 | * `QWOP Game AI (DQN/DDQN) `_ 177 | 178 | Links 179 | ----- 180 | 181 | * `Practical Applications of Reinforcement Learning (tTowards Data Science) `_ 182 | 183 | * `Reinforcement learning (GeeksforGeeks) `_ 184 | 185 | * `Reinforcement Learning Algorithms: An Intuitive Overview (SmartLabAI) `_ 186 | 187 | * `Q-learning(Wikipedia) `_ 188 | 189 | * `Epsilon-Greedy Algorithm in Reinforcement Learning (GeeksforGeeks) `_ 190 | 191 | * `OpenAI Gym Documentation `_ 192 | 193 | * `Stable-Baselines3 Documentation `_ 194 | 195 | * `David Silver Teaching Material `_ 196 | 197 | 198 | 199 | .. rubric:: References 200 | 201 | .. [1] https://en.wikipedia.org/wiki/Reinforcement_learning#Introduction 202 | .. [2] Reinforcement Learning: An Introduction (Sutton and Barto, 2018) 203 | .. [3] Silver, David. "Lecture 5: Model-Free Control." UCL, Computer Sci. Dep. Reinf Learn. Lect. (2015): 101-140. 204 | .. [4] En.wikipedia.org. 2022. Q-learning - Wikipedia. [online] Available at: [Accessed 15 June 2022]. 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /docs/statistics.rst: -------------------------------------------------------------------------------- 1 | .. _statistics: 2 | 3 | ========== 4 | Statistics 5 | ========== 6 | 7 | Basic concepts in statistics for machine learning. 8 | 9 | 10 | 11 | .. rubric:: References 12 | 13 | .. [1] Example 14 | -------------------------------------------------------------------------------- /docs/training.rst: -------------------------------------------------------------------------------- 1 | .. _probability: 2 | 3 | ================ 4 | Training (empty) 5 | ================ 6 | 7 | .. contents:: :local: 8 | 9 | 10 | Combating Overfitting 11 | ===================== 12 | 13 | Cross-validation 14 | ---------------- 15 | 16 | Be the first to `contribute! `__ 17 | 18 | Validation Set 19 | -------------- 20 | 21 | Be the first to `contribute! `__ 22 | 23 | Test Set 24 | -------- 25 | 26 | Be the first to `contribute! `__ 27 | 28 | 29 | 30 | Hyperparameter Tuning 31 | ===================== 32 | 33 | Learning Rate 34 | ------------- 35 | 36 | Be the first to `contribute! `__ 37 | 38 | Optimizers 39 | ---------- 40 | 41 | Be the first to `contribute! `__ 42 | 43 | 44 | 45 | Model Evaluation 46 | ================ 47 | 48 | 49 | Bias-Variance Tradeoff 50 | ---------------------- 51 | 52 | Be the first to `contribute! `__ 53 | 54 | Loss Functions 55 | -------------- 56 | 57 | Be the first to `contribute! `__ 58 | 59 | Precision vs Recall 60 | ------------------- 61 | 62 | Be the first to `contribute! `__ 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | .. rubric:: References 73 | 74 | .. 
[1] Example 75 | -------------------------------------------------------------------------------- /notebooks/rnn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "\n" 12 | ] 13 | } 14 | ], 15 | "metadata": { 16 | "kernelspec": { 17 | "display_name": "Python 3", 18 | "language": "python", 19 | "name": "python3" 20 | }, 21 | "language_info": { 22 | "codemirror_mode": { 23 | "name": "ipython", 24 | "version": 2 25 | }, 26 | "file_extension": ".py", 27 | "mimetype": "text/x-python", 28 | "name": "python", 29 | "nbconvert_exporter": "python", 30 | "pygments_lexer": "ipython2", 31 | "version": "2.7.6" 32 | } 33 | }, 34 | "nbformat": 4, 35 | "nbformat_minor": 0 36 | } --------------------------------------------------------------------------------